#RUNNING THIS SCRIPT
#the simplest way to run this script is to put all of the data needed into one directory or folder
#open a terminal window and change directories to the data directory
#type R <bookscript1.R --vanilla and the script will run automatically and display the results
#R <bookscript1.R --vanilla >results will save the R output in results
#the plot will appear in an X window (if one is open) and in a postscript file called Rplots.ps
#the ps file may be viewed by running ghostscript on a linux/unix machine
#or clicking on the icon in mac os10 which generates a pdf file

#PROGRAM NOTES
#load the limma package
#limma = linear models for microarray analysis
#see http://bioinf.wehi.edu.au/limma/ for description

library(limma)

#we are using a small data set of 95 genes that have been extracted 
#from a much larger gene set to illustrate treatment of statistical variation
#mRNA from leaf or root tissue has been labeled with Cy3 or Cy5 (red or green)
#mixed and hybridized to slides.
#the next command reads in a text file rootleafnew.txt that shows a list of the arrays to be read
#these array files are gpr files that show one line for each gene, listing the gene (EST),
# the array number, and then foreground and background values for red and green (Rf,Rb,Gf,Gb)
#view r9.gpr to see the format of the data file
#
#each line in rootleafnew.txt shows label for root or leaf tissue 
#view the file rootleafnew.txt to see the experimental protocol
 
targets <- readTargets("rootleafnew.txt") 

#the next line creates an instance of the RGList data object
#the read.images command reads the data in all of the gpr files into the data object
#the object is a table or matrix of the data in the files with the column of the gpr file name,
#and the same data columns as in the gpr files

RG <-read.maimages(targets$File.Name,columns=list(Rf="Rf",Gf="Gf",Rb="Rb",Gb="Gb"))  
#normally the Rb and Gb values would be subtracted from Rf and Gf  
#by one of the methods described in the text
#our statistician says not to subtract background for this small a data set of 95 genes
#note that the foreground values in RG would normally be changed by this next command

RG <- backgroundCorrect(RG, method="none") 

#now the red and green data points on each array will be normalized
#so that as the spread of data points is more uniform
#using the Loess method described in the text and normalizing 
#within a sliding window interval of 0.4 along a plot of the red/green averages (A) - x axis
#vs. log of the ratio of red to green (M) - y axis
#a new data object MA with these normalized values is created

MA <- normalizeWithinArrays(RG, method="loess", span=0.4) 

#now fit the normalized data to a linear model with the lmFit command
#the c argument represents information on the dye swaps
#here, the use of red and green labels for leaf and root on each array
#fit will have a new set of expression values based on the analysis
#described on page 635, Figure 13.10
#these values take into account all sources of variation in the data
#(slide, dye) except for variation between root and leaf

fit <- lmFit(MA, c(-1,1,-1,1)) 

#the data spread from other genes is now used to estimate how far each value
#is deviating from the mean based on the statistical variation in the average gene
#this is called an empirical Bayes fit to the data which "smoothes" the standard errors

fit2 <- eBayes(fit) 

#topTable is now used to show a table of the 29 genes that are showing
#the most significant variation between roots and leaves
#shown in the table are gene, M value (log of R/G expression ratio), A value (average of R and G expression values)
#the t statistic between root and leaf values, the probability of a larger t corrected for false discover rate
#see web site on line 12
#B is a measure of false discovery rate and is the log odds that the gene is differentially expressed

topTable(fit2, adjust="fdr", n=29) 

#plot an M/A plot as described above for the data before and after normalization
#cex is a value to increase the size of the text labels on the plot

plotMA(RG,cex=0.5) 
plotMA(MA,cex=0.5)