#the objective of this script is two fold: to illustrate how a mixed model works, and to perform a cluster analysis #the mixed model is designed to obtain a better prediction of gene expression values based on #both measurable effects and non measurable random variations that are beyond experimental control #such as all of the sources of variation between slides that influence a particular gene expression value #this script produces 3 different types of hierarchical clusters #based on how the distance between clusters is calculated #the small data set is that used in bookscript1 #the plots will appear in an X window (if one is open) #and a postscript file Rplots.ps is produced #the ps file may be viewed by running ghostscript on a linux/unix machine #or by clicking on the icon in mac os10 which generates a pdf file #type R results1 will save the R output in results1 #first load the library nlme, a package for linear and non linear mixed effects models #mixed models allow the inclusion of both fixed and random effects to assist with statistical evaluation #fixed effects include controllable effects such as treatment or tissue type #random effects include uncontrollable effects such as the variation in expression levels within a particular slide library( nlme ) #read the file rootleafnew.csv into the data object x; csv is a file with rows of data separated by commas #x is a matrix that will have the rows and columns of the csv file #the csv file was not used in bookscript1.R but has the same data as in the gpr files plus two extra comma-separated data sets #the second to the last one is log to the base 2 of the spot median (l2spmed) #the last column quantile is a quantile normalization between the slides as described in the text #the quantile value was obtained from analysis of a much larger data set with many genes #a statistically representative subset of 95 genes was then chosen for illustration purposes x <- read.csv( "rootleafnew.csv" ) #the next expression is a linear mixed effects modeling of the data in x using the quantile values in the last set in the csv file #note that dye, trt, and oligo are horizontal positions in the rows of the csv file #dye vs. trt and dye vs. oligo are fixed effects and are found by performing an ANOVA analysis on the data #as described on pp. 634-635 #the first "~" separates the value on the left (quantile) and the terms of the mixed model on the right separated by "+" #the last term list( slide = pdIdent( ~ -1 ) ) predicts a variable effect slide to slide #using this model means that we think that slide to slide variation is due to many random effects #technically, pdIdent( ~ -1 ) generates types of matrices that may be used to estimate #the variable component from the statistical variation of expression values within slides #a detailed explanation would take more space than we have here #for those interested, read more about linear (and non linear) mixed effect models (lme and nlme) # positive definite (pd) matrices, pd identity matrices and variance-covariance matrices tlme.out <- lme( quantile ~ dye*trt + dye*oligo , data = x , random = list( slide = pdIdent( ~ -1 ) ) ) #the resulting residue values from application of the mixed model are copied to object x x$resid <- resid( tlme.out ) #a new empty matrix for the residue expression values is produced y <- matrix( nrow = 8 , ncol = 95 ) #copy the residual values from x that correspond to each slide and dye combination into columns of y y[1,] <- x$resid[ x$slide == "s91" & x$dye == "cy3" ] y[2,] <- x$resid[ x$slide == "s91" & x$dye == "cy5" ] y[3,] <- x$resid[ x$slide == "s92" & x$dye == "cy3" ] y[4,] <- x$resid[ x$slide == "s92" & x$dye == "cy5" ] y[5,] <- x$resid[ x$slide == "s93" & x$dye == "cy3" ] y[6,] <- x$resid[ x$slide == "s93" & x$dye == "cy5" ] y[7,] <- x$resid[ x$slide == "s94" & x$dye == "cy3" ] y[8,] <- x$resid[ x$slide == "s94" & x$dye == "cy5" ] #create a vector of gene names from the x oligo column gene.labels <- as.character( unique( x$oligo ) ) #calculate Euclidian distances between genes d.x <- dist( t( y ) ) #calculate the absolute value of the correlation coefficient between genes dist.x <- as.dist( 1 - abs( cor( y ) ) ) #perform heirarchical clustering using the complete, single or average method of finding clusters #as described on pp. 638-639 and the Euclidian distances #hx, hx.s and hx.a are objects that receive the clustering information hx <- hclust( dist.x ) hx.s <- hclust( dist.x , method = "single" ) hx.a <- hclust( dist.x , method = "average" ) #make separate plots in the X window or the Rplots.ps file with x labels and y labels plot(hx, main = "Cluster diagram using Euclidian distance. Complete linkage.", xlab = "Gene Name", sub = "") plot(hx.s, main = "Cluster diagram using Euclidian distance. Single linkage.", xlab = "Gene Name", ylab = "Distance", sub = "Gene") plot(hx.a, main = "Cluster diagram using Euclidian distance. Average linkage.", xlab = "Gene Name", sub = "")