Tuesday, April 8, 2008

Data Analysis Using R

Copy and paste stuff. Very useful and I don't want it buried in seas of files.

Basics

· R is case sensitive, object-oriented

· Commands are separated by a newline or a semi-colon (;). The semi-colon at the end of a line can be omitted.

· A comment begins with # regardless of its location. The single quotes ('') and double quotes ("") are used interchangeably.

· Packages contain data sets and functions, and are loaded through library().

· Objects include vectors, lists, data frames, matrices (array), and factors.

· An R list is an object consisting of an ordered collection of objects known as its components. Lists are a general form of vector in which the various elements need not be of the same type, and are often themselves vectors or lists.

· Data frames are matrix-like structures, in which the columns can be of different types. A data frame is a list with class "data.frame".

· A factor is a vector object used to specify a discrete classification (grouping) of the components of other vectors of the same length.

· Matrices or more generally arrays are multi-dimensional generalizations of vectors. An array can be considered as a multiply subscripted collection of data entries.

· The "pi" is the constant 3.141592654. The "NA" indicates a missing value (default).

· Abbreviations used below: "pkg" or "p" (package); "d" or "df" (data frame); "m" (matrix); "v" (vector); "url" (URL); "f" or "file" (file); "obj" (object); "fit" (fitted model); "n" (number); "s" (string).

Basic Commands

· * quit(); q()

· * help(command); help.start()

· * search(); help.search()

· * dir(); methods()

· * library(p); identify(); attach(); detach()

· * remove(); rm()

· * start:end; c(); rep(); seq()

· * scan(); print(); str(); ls()

· * cat(); cat("concatenate", c, "and print", "\t")

· * options(prompt='.', continue="///", digits=10); getOption("width")

· * source(); source.url() /* run commands in a file */

·

Simple examples

· library() # list packages available

· library(car) # load a package

· list(data()) # list data sets in the current package

· summary(Davis)

· list(Davis)

· list(Davis$weight)

· stem(Davis[,2]) # equal to stem(Davis$weight)

· stem(Davis$height, scale=4)

· boxplot(Davis$weight)

· w<-Davis$weight

· h<-Davis$height

· plot(w ~ h)

· cor(Davis[,c(2:3)])

· cor.test(w,h)

· t.test(Davis[,2], mu=65)

· t.test(Davis$height, Davis$weight, mu=100, paired=FALSE)

· var.test(Davis$height, Davis$weight)

· d<-read.csv("c:/temp/R/nes.csv", header=TRUE)

· list(names(d)) # list variable names

OPERATOR/FUNCTION

Operators

· * <- (left assignment), -> (right assignment)

· * +, -, *, /, ^, %% (modulus)

· * >, >=, <, <=, == (equal), != (not equal)

· * & (and), | (or)

· * %*% (matrix product); %/% (integer division)

· * %o% (Outer product); %x% (Kronecker product)

· * %in% (Matching operator);

Functions

* abs(); sin(); cos(); tan(); exp(); sqrt(); min(); max()

* log(); log(v,10); log10(); log2(); log(v, base=10)

* mean(); sum(); median(); range(); var(); sd()

* rank(); ave(v, group); by(group)

* c(a, b, c); c(start:end); seq(start, end); seq(10, 100, by=5)

* rep(n, time); rep(7, 3); rep(start:end, time)

* rep(1:3, c(2,2,2)); rep(1:3, each=2); rep(1:3, c(1:3))

* seq(1,4); seq(1,10, by=2); seq(0,1, length=10)

* length(), sort(), order(); rev(v) ## to reverse

* dnorm(1.96); dt(1.96, 100); df(1.96, 1, 100); dchisq(1.96, 10)

* pnorm(1.96); pt(1.96, 100); pf(1.96, 1, 100); pchisq(1.96, 10)

* rpois(n, lambda); rnorm(n); rt(n, df); rt(n, df=c(1:10)); rexp(n)

* substring(s, start, stop); substr(s, start, stop); nchar(s)

* date()

* mode() ## type of object

INPUT OUTPUT

Reading Text Files

* source(f); /* to execute commands in the file */

* read.table(f); read.table.url(url)

* download.file(url); url.show(url)

* m<-read.table("f:/temp/cigar.txt", header=TRUE)

* m<-read.table('f:/temp/cigar.txt')

* names(m)<-c("a", "b", "c")

* read.csv(f, header=TRUE, sep=",", quote="\"", dec=".")

* read.csv2(f, header=TRUE, sep=";", quote="\"", dec=",")

* read.delim(f, header=TRUE, sep="\t", quote="\"", dec=".")

* read.delim2(f, header=TRUE, sep="\t", quote="\"", dec=",")

* m<-read.csv("nes2.csv", header=TRUE)

* read.fwf(file, widths=c(3,5,3), header=FALSE, sep="", as.is=FALSE)

* as.is=TRUE; as.is=T # not to be converted into a factor

* na.strings<-c(".", "NA", "", "#") # characters for missing

* cnt=count.fields(df); which(cnt==7);

Reading Data Frames

* load(d);

* data(d);

data(d, package="p")

* data.frame(v1, v2) /* to make a data frame out of vectors */

* m3<-data.frame(as.matrix(m[,2:4]))

* m2<-edit(m); m2<-edit(data.frame(m)) # modify the dataframe

* data.entry(df)

Handling Data

* m2<-match(v1, v2, nomatch=0) # data merging

* m2<-match(m[,1], m[,3])

merge(df1, df2, by="name") #merge two data frames by common column

Writing Data

* cat(); print()

* cat("y x1 x2", "2 4 2", "5 2 7", file="sample.txt", sep="\n")

* write(obj, f)

* write.table(df, file='firms.csv', sep=",", row.names=NA, col.names=NA)

* save(obj, file=f); save.image(f)

* sink(); format()

MATRICES

Defining Matrices

* m<-c(1, 2, 3, 4); c(1, 2, 3, 4)->m; assign("m", c(1, 2, 3, 4))

* m<-data.frame(column1=c(1,2,3), column2=c(4,5,6)); ## 3 by 2 (3 rows, 2 columns)

* rep(c(1,2,3), 2); rep(c(1,2,3), each=2);

* rep(c(1,2,3), c(2,2,2)); m<-c(c1=15, c2=54, c3=50)

* seq(1,4); seq(1,10, by=2); seq(0,1, length=10);

* intm<-1:4; intm<-numeric(); intm[1]<-1; intm[2]<-2

* strm<-c("a", "b", "c") ## a character (string) vector

* blm<-c(T,F); blm<-v1>10; ## a boolean vector of TRUE and FALSE

* m<-scan()

* mm<-matrix(1:12,4); mm<-matrix(1:12, nrow=4)

* mm<-matrix(1:12, ncol=3); mm<-matrix(1:12, nrow=4)

* mm<-matrix(1:12, nrow=4, ncol=3); mm<-matrix(1:12, 4, 3)

* arrm<-array(1:10); arrm<-array(1:10, dim=c(2,5))

* cbind(); rbind(); gl(); expand.grid()

* list()

Referring Matrices

* m[,2]; v=m[2,]; m[-1, -3] ## to extract elements

* m[c(1, 5, 6)]; m2=m[-c(1, 5, 6)] ## to extract elements

* m<-c(c1=15, c2=54, c3=50); m<-c("c1", "c3")

* m2<-m$c2; m2<-m[,2]; m2<-m[,"c2"]; m2<-m[[2]]

* m[,3:5]; m3<-m[,c(3, 4, 5)]; m3<-m[,c("c3", "c4", "c5")]

* m<-c(4, 2, 4); names(m)<-c("Grape", "Pear", "Apple")

* m1$v2 /*variable 2 of the data frame 1*/

* which(); which.max(); which.min()

* attr(m, which); attributes(obj)

Matrix Functions

* t(); det(); rank(); eigen(); diag(); prod(); crossprod()

* sum(); mean(); var(); sd(); min(); max(); prod(); cumsum(); cumprod()

* is.na(m) ## to check if m contains a missing value

* rowSums(); colSums(); nrow(); ncol()

* dim(m); dimnames(m)

* merge(df1, df2)

* as.factor(); as.matrix(), as.vector(); /* conversion*/

* is.factor(); is.matrix(), is.vector();

* class(); unclass()

* na.omit(); na.fail(); unique(); table(); sample()

* as.array(); as.data.frame()

* as.numeric(); as.character(); as.logical(); as.complex()

REGRESSION

Ordinary Least Squares (OLS)

* lm(); glm()

* m.ols<-lm(v1~v2+v3, data=m) ## linear model

* lm(v1~v2+v3, data=m); summary(lm(v1~v2+v3, data=m)); summary(m.ols)

* names(m.ols); coef(m.ols); fitted(m.ols); resid(m.ols)

* predict(fit); AIC(fit); logLik(fit); deviance(fit)

* model.matrix(v1~v2+v3, data=m)

* m.ols2<-model.matrix(v1~v2+v3, data=m); summary(m.ols2)

Binary Response Regressions

* m.logit<-glm(v1~v2+v3,family=binomial(link=logit),data=m)

* summary(m.logit); coef(m.logit); fitted(m.logit); resid(m.logit)

* lsfit(v1,v2)

* nls(); m.nonlin<-lm(v1~v2+I(v2^2), data=m) ## I() is needed so that ^ means squaring inside a formula

* anova(m.ols, m.nonlin)

* m.qr<-qr(m) ## QR Decomposition of a Matrix

STATISTICS

Descriptives

* summary(m); fivenum(m)

* stem(v); boxplot(v); boxplot(v1, v2); hist(v)

* qqnorm(v); qqline(v)

* rug(); lines()

* table() /*to make a table*/

* tabulate()

Multivariate Analysis

* cor(m); cor(sqrt(m)) ## Pearson correlation

* cor.test(v1, v2)

* prcomp() /* Principal components in the mva package*/

* kmeans() /* Kmeans cluster analysis in the mva package*/

* factanal() /* Factor analysis in the mva package*/

* cancor() /* Canonical correlation in the mva package*/

Categorical Data Analysis

* chisq.test(v1,v2) ## Pearson Chi-squared Test

* fisher.test(v1,v2) ## Fisher Exact Test

* friedman.test(v1,v2) ## Friedman Test

* prop.test(); binom.test() ## sign test

* kruskal.test(v1,v2) ## Kruskal-Wallis Rank Sum Test

* wilcox.test(v1,v2) ## Wilcoxon Rank Sum (Mann-Whitney) Test

* ks.test(v1,v2) ## Two Sample Kolmogorov-Smirnov Test

* bartlett.test(v1,v2) ## Bartlett Test for Homogeneity of Variances

ANOVA

T-test

* t.test(v1,v2); t.test(v1,v2, var.equal=FALSE)

* t.test(v1,v2, mu=0, paired=FALSE)

* t.test(v1,v2, mu=10, paired=F, var.equal=T)

* power.t.test(v1,v2); pairwise.t.test()

* var.test(v1,v2) ## F test for equal variance

ANOVA

* m.anova<-aov(v1~v2+v3, data=m)

* aov(); anova()

* summary(m.anova)

* power.anova.test() ## Power calculations for balanced one-way ANOVA tests

PROGRAMMING

Modules

function_name<-function(arguments) {...}

mile.to.km<-function(mile) {mile*8/5}

km<-mile.to.km(c(35, 55, 75))

Flow Control

if (condition) {...} else if (condition) {...} else {...}

while (condition ) {...} # {} may be omitted for a single line expression

for (index in start:end) {...}

sum <- 0; for (i in 1:100) {sum <- sum + i} # initialize the accumulator first

repeat {...}

switch (statement, list)

Programming Functions

* expression(); parse(); deparse(); eval()

* optim() /* general-purpose optimization */

* nlm() /* Newton algorithm */

* lm() /* linear models */

* nls() /* nonlinear least squares model */

GRAPHICS

Plotting

* plot(y~x, data=m, pch=16) # plotting character (pch)

* pairs(m) # scatterplot matrix

* xyrange<-range(m) # to get range of m

* plot(y~x, data=m, xlim=xyrange, ylim=xyrange)

* abline(0,1)

* plot((0:10), sin((0:10)*pi), type="l") # "l" (lowercase L) joins the points with lines

* barplot(); boxplot(); stem(); hist();

* matplot() /* matrix plot */

* pairs(m) /* scatterplots */

* coplot() /* conditional plot */

* stripplot() /* strip plot */

* qqplot(); qqnorm(); qqline() /* quantile-quantile plot */

Options

* points() # to add points to a plot

* lines() # to add lines

* text() # to add texts

* mtext() # to add margin texts

* axis() # to control axis

* par(cex=1.25, mex=1.25)

* par(mfrow=c(2,2), mfcol=c(1,1))

No comments: