# R commands for data exploration and analysis of microarray data # using t-tests and limma. # Requires functions that can be linked from Luc Janss' and Dan Nettleton's websites. # data reading and loading functions # note: using the menu's first make R point to the directory with the data file. # Also load the limma package using the package -> load package menu. # Data reading: d<-read.table("gxdata.txt",header=TRUE) # The following loads the gx* functions from Luc Janss' website: source("http://www.lucjanss.com/Docs/arrayfunctions.txt") # The following loads (a.o.) Dan Nettleton's q.value and scale.norm function from his website: source("http://www.public.iastate.edu/~dnett/MicroShortCourse/rfunctions.txt") # 1. first exploration gx.stats(d) plot(d$SL1CH1,d$SL1CH2) hist(d$SL1CH1) boxplot(d[,2:17]) sum(d$SL1CH1<4000) sum(d$SL1CH1<8000) plot(log2(d$SL1CH1),log2(d$SL1CH2)) boxplot(log2(d[,2:17])) # 2. conversion to logs and M&A values d2<-gx.ma(d) names(d2)<-c("NAME", "SL1A", "SL1M", "SL2A", "SL2M", "SL3A", "SL3M", "SL4A", "SL4M", "SL5A", "SL5M", "SL6A", "SL6M", "SL7A", "SL7M", "SL8A", "SL8M") plot(d2$SL1A,d2$SL1M) boxplot(d2[,2:17]) # 3a. make ranking for slide 8, show top data rank<-order(abs(d2$SL8M),decreasing=T) d2[rank[1:20],c("NAME","SL8A","SL8M")] # 3b. plot the selected top 100 genes in different color k<-rep(1,length(d2$NAME)) k[rank[1:100]]<-2 plot(d2$SL8A,d2$SL8M,col=k) # 4a. lowess normalisation of all slides d3<-gx.lowess(d2) # 4b. show new top genes in old MA plot rank<-order(abs(d3$SL8M),decreasing=T) k<-rep(1,length(d2$NAME)) k[rank[1:100]]<-2 plot(d2$SL8A,d2$SL8M,col=k) # 5. demonstration of background substraction using Slide 1 data s1<-d[,c("NAME","SL1CH1","SL1CH2")] s1$SL1CH1<-s1$SL1CH1-rnorm(5008,mean=800,sd=100) s1$SL1CH2<-s1$SL1CH2-rnorm(5008,mean=800,sd=100) s1sub<-s1[(s1$SL1CH1>0 & s1$SL1CH2>0),] dim(s1sub) s1sub<-gx.ma(s1sub) s1sub<-gx.lowess(s1sub) plot(s1sub$SL1CH1,s1sub$SL1CH2) # 6a. t-test for first gene (first row) t.test(c(-d3[1,"SL1M"],d3[1,"SL2M"]),c(d3[1,"SL3M"],-d3[1,"SL4M"]), var.equal=TRUE) # 6b. p-values from t-tests for all genes (this one takes about a minute or so) pval <- rep(0,5008) for(i in 1:5008) { pval[i] <- t.test(c(-d3[i,"SL1M"],d3[i,"SL2M"]),c(d3[i,"SL3M"],-d3[i,"SL4M"]), var.equal=TRUE)$p.value } # 6c. show distribution of p and q values hist(pval) qval <- q.value(pval) hist(qval) plot(pval,qval) plot(pval[order(pval)]) plot(qval[order(qval)]) # 7. top lists based on q-value d3[order(qval)[1:20],] # all data d3$NAME[order(qval)[1:20]] # gene names only sum(qval<0.4) # number of q-values <0.4 d3$NAME[qval<0.4] # gene names with q-value <0.4 # 8. variance adjustment between slides using scale normalisation # these functions need a "clean" data sheet without gene names and only M value mval<-d3[,c("SL1M", "SL2M", "SL3M", "SL4M", "SL5M", "SL6M", "SL7M", "SL8M")] boxplot(mval) # before normalisation mval <- scale.norm(mval) boxplot(mval) # after normalisation # 9. clustering top 100 genes, using only M values mval<-d3[,c("SL1M", "SL2M", "SL3M", "SL4M", "SL5M", "SL6M", "SL7M", "SL8M")] clusdat <- mval[order(qval)[1:100],] out <- kmeans(clusdat,centers=5) plot(clusdat, col=out$cluster) D <- dist(clusdat) out <- hclust(D, method="average") plot(out, hang=-1, labels=FALSE) # 10a. Use of Limma with the mval data sheet design <- c(-1,1,1,-1,1,-1,1,-1) # mean design <- cbind(design,c(0,0,1,-1,0,0,1,-1)) # day7 effect design <- cbind(design,c(0,0,0,0,1,-1,1,-1)) # Atten effect out <- lmFit(mval,design) out # look at the limma output out <- eBayes(out) topTable(out,coef=2,number=10) # coef=2 refers to day7 effects topTable(out,coef=3,number=10) # coef=3 refers to atten effects # 10b. some more information from the Limma fit day7 <- out$coefficients[,2] # fitted effect for second model effect (here day7) pval2 <- out$p.value[,2] # p values second model effect (here day7) hist(log((out$sigma)**2)) # histogram of variances per gene hist(log(out$s2.post)) # histogram of variances after eBayes smoothing