#RUNNING THIS SCRIPT #the simplest way to run this script is to put all of the data needed into one directory or folder #open a terminal window and change directories to the data directory #type R results will save the R output in results #the plot will appear in an X window (if one is open) and in a postscript file called Rplots.ps #the ps file may be viewed by running ghostscript on a linux/unix machine #or clicking on the icon in mac os10 which generates a pdf file #PROGRAM NOTES #load the limma package #limma = linear models for microarray analysis #see http://bioinf.wehi.edu.au/limma/ for description library(limma) #we are using a small data set of 95 genes that have been extracted #from a much larger gene set to illustrate treatment of statistical variation #mRNA from leaf or root tissue has been labeled with Cy3 or Cy5 (red or green) #mixed and hybridized to slides. #the next command reads in a text file rootleafnew.txt that shows a list of the arrays to be read #these array files are gpr files that show one line for each gene, listing the gene (EST), # the array number, and then foreground and background values for red and green (Rf,Rb,Gf,Gb) #view r9.gpr to see the format of the data file # #each line in rootleafnew.txt shows label for root or leaf tissue #view the file rootleafnew.txt to see the experimental protocol targets <- readTargets("rootleafnew.txt") #the next line creates an instance of the RGList data object #the read.images command reads the data in all of the gpr files into the data object #the object is a table or matrix of the data in the files with the column of the gpr file name, #and the same data columns as in the gpr files RG <-read.maimages(targets$File.Name,columns=list(Rf="Rf",Gf="Gf",Rb="Rb",Gb="Gb")) #normally the Rb and Gb values would be subtracted from Rf and Gf #by one of the methods described in the text #our statistician says not to subtract background for this small a data set of 95 genes #note that the foreground values in RG would normally be changed by this next command RG <- backgroundCorrect(RG, method="none") #now the red and green data points on each array will be normalized #so that as the spread of data points is more uniform #using the Loess method described in the text and normalizing #within a sliding window interval of 0.4 along a plot of the red/green averages (A) - x axis #vs. log of the ratio of red to green (M) - y axis #a new data object MA with these normalized values is created MA <- normalizeWithinArrays(RG, method="loess", span=0.4) #now fit the normalized data to a linear model with the lmFit command #the c argument represents information on the dye swaps #here, the use of red and green labels for leaf and root on each array #fit will have a new set of expression values based on the analysis #described on page 635, Figure 13.10 #these values take into account all sources of variation in the data #(slide, dye) except for variation between root and leaf fit <- lmFit(MA, c(-1,1,-1,1)) #the data spread from other genes is now used to estimate how far each value #is deviating from the mean based on the statistical variation in the average gene #this is called an empirical Bayes fit to the data which "smoothes" the standard errors fit2 <- eBayes(fit) #topTable is now used to show a table of the 29 genes that are showing #the most significant variation between roots and leaves #shown in the table are gene, M value (log of R/G expression ratio), A value (average of R and G expression values) #the t statistic between root and leaf values, the probability of a larger t corrected for false discover rate #see web site on line 12 #B is a measure of false discovery rate and is the log odds that the gene is differentially expressed topTable(fit2, adjust="fdr", n=29) #plot an M/A plot as described above for the data before and after normalization #cex is a value to increase the size of the text labels on the plot plotMA(RG,cex=0.5) plotMA(MA,cex=0.5)