stat540-seminar9.R

Shaun Jackman — Mar 10, 2013, 2:39 PM

# STAT 540 Seminar 9
# Shaun Jackman

# Data Preparation
library('RColorBrewer')
library('car')
Loading required package: MASS
Loading required package: nnet
library('lattice')
library('limma')
library('pvclust')
library('xtable')
prDat <- read.table("../data/photoRec/GSE4051_data.txt")  # the whole enchilada
load("../data/photoRec/GSE4051_design.robj")  # load exp design as 'prDes'
sprDat <- t(scale(t(prDat)))

# Sample Clustering
# Hierarchical clustering for photoRec data
# compute pairwise distances
pr.dis = dist(t(sprDat), method = "euclidean")
pr.nms = with(prDes, paste(gType, devStage, sep = "_"))
# compute hierarchical clustering using different linkage types
pr.hc.s <- hclust(pr.dis, method = "single")
pr.hc.c <- hclust(pr.dis, method = "complete")
pr.hc.a <- hclust(pr.dis, method = "average")
pr.hc.w <- hclust(pr.dis, method = "ward")
# plot them
par(mar = c(0, 4, 4, 2))
par(mfrow = c(2, 2))
plot(pr.hc.s, labels = FALSE, main = "Single", xlab = "")
plot(pr.hc.c, labels = FALSE, main = "Complete", xlab = "")
plot(pr.hc.a, labels = FALSE, main = "Average", xlab = "")
plot(pr.hc.w, labels = FALSE, main = "Ward", xlab = "")

plot of chunk unnamed-chunk-1

# identify 8 clusters
par(mfrow = c(1, 1))
plot(pr.hc.w, labels = pr.nms, cex = 0.6, main = "Ward showing 10 clusters")
rect.hclust(pr.hc.w, k = 10)

plot of chunk unnamed-chunk-1

# Exercise: Play with the options of the heatmap function and compare the different heatmaps. Note that one can also use the original data prDat and set the option `scale="row". You will get the same heatmaps although the columns may be ordered diffently (use Colv=NA to suppress reordering).
heatmap(as.matrix(sprDat), Rowv = NA, Colv = NULL,
    hclustfun = function(x) hclust(x, method = "ward"),
    distfun = function(x) dist(x, method = "euclidean"),
    scale = "none", labCol = pr.nms, labRow = NA, margin = c(8, 1),
    ColSideColor = brewer.pal(11, "RdGy")[c(4, 7)][unclass(prDes$gType)])

plot of chunk unnamed-chunk-1


# Partitioning methods for photoRec data
# K-means clustering
# Objects in columns
set.seed(31)
samplecenters <- 5
pr.km <- kmeans(t(sprDat), centers = samplecenters, nstart = 50)
# We can look at the within sum of squares of each cluster
pr.km$withinss
[1] 120153  78227 110209 100197 133036
# We can look at the composition of each cluster
pr.kmTable = data.frame(devStage = prDes$devStage, cluster = pr.km$cluster)
prTable = xtable(with(pr.kmTable, table(devStage, cluster)), caption = "Number of samples from each develomental stage within each cluster")
align(prTable) <- "lccccc"
print(prTable, type = "html", caption.placement = "top")
<!-- html table generated in R 2.15.3 by xtable 1.7-1 package -->
<!-- Sun Mar 10 14:40:11 2013 -->
<TABLE border=1>
<CAPTION ALIGN="top"> Number of samples from each develomental stage within each cluster </CAPTION>
<TR> <TH>  </TH> <TH> 1 </TH> <TH> 2 </TH> <TH> 3 </TH> <TH> 4 </TH> <TH> 5 </TH>  </TR>
  <TR> <TD> E16 </TD> <TD align="center">   0 </TD> <TD align="center">   0 </TD> <TD align="center">   6 </TD> <TD align="center">   0 </TD> <TD align="center">   1 </TD> </TR>
  <TR> <TD> P2 </TD> <TD align="center">   4 </TD> <TD align="center">   0 </TD> <TD align="center">   0 </TD> <TD align="center">   0 </TD> <TD align="center">   4 </TD> </TR>
  <TR> <TD> P6 </TD> <TD align="center">   5 </TD> <TD align="center">   1 </TD> <TD align="center">   0 </TD> <TD align="center">   0 </TD> <TD align="center">   2 </TD> </TR>
  <TR> <TD> P10 </TD> <TD align="center">   1 </TD> <TD align="center">   2 </TD> <TD align="center">   0 </TD> <TD align="center">   3 </TD> <TD align="center">   2 </TD> </TR>
  <TR> <TD> 4_weeks </TD> <TD align="center">   0 </TD> <TD align="center">   2 </TD> <TD align="center">   1 </TD> <TD align="center">   5 </TD> <TD align="center">   0 </TD> </TR>
   </TABLE>
# Repeate the analysis using a different seed and check if you get the same clusters.

# PAM algorithm
pr.pam <- pam(pr.dis, k = samplecenters)
Error: could not find function "pam"
split(as.vector(unlist(prDes$devStage)), pr.pam$clustering)
Error: object 'pr.pam' not found
summary(pr.pam)
Error: object 'pr.pam' not found
par(mar = c(5, 1, 4, 4))
plot(pr.pam, main = "Silhouette Plot for 5 clusters")
Error: object 'pr.pam' not found
# Excersise: draw a plot with number of clusters in the x-axis and the average silhouette widths in the y-axis. Use the information obtained to determine if 5 was the best choice for the number of clusters.
averageSilhouetteWidth <- function(k) {
    pr.km <- kmeans(t(sprDat), centers = k, nstart = 50)
    pr.pam <- pam(pr.dis, k = k)
    return(pr.pam$silinfo$avg.width)
}
k <- 2:10
avg.width <- unlist(apply(as.matrix(k), 1, averageSilhouetteWidth))
Error: could not find function "pam"
xyplot(avg.width ~ k, data.frame(k=k, avg.width=avg.width),
    main='Average silhouette width vs number of clusters')
Error: object 'avg.width' not found
cat('The optimal number of clusters is', k[which.min(avg.width)])
Error: object 'avg.width' not found

# Gene clustering
# A smaller dataset
# We start by using different clustering algorithms to cluster the top 972 genes that showed differential expression across the different develomental stage (BH adjusted pvalue < 10-5).
mm <- model.matrix(~devStage, prDes)
fit <- eBayes(lmFit(prDat, mm))
tt <- topTable(fit, n=Inf, sort.by='none',
    coef = grep('devStage', colnames(mm)))
cat('Number of probes with an adjusted p-value < 1e-5:',
    sum(tt$adj.P.Val < 1e-5))
Number of probes with an adjusted p-value < 1e-5: 972
topDat <- sprDat[tt$adj.P.Val < 1e-5,]

# Hierarchical:
geneC.dis = dist(topDat, method = "euclidean")
geneC.hc.a <- hclust(geneC.dis, method = "average")
plot(geneC.hc.a, labels = FALSE, main = "Hierarchical with Average Linkage",
    xlab = "")

plot of chunk unnamed-chunk-1


# Partitioning
set.seed(1234)
genecenters <- 5
kmeans.genes <- kmeans(topDat, centers = genecenters)
clusterNum <- 1
plot(kmeans.genes$centers[clusterNum, ], ylim = c(-4, 4), type = "n", xlab = "Samples",
    ylab = "Relative expression")
matlines(y = t(topDat[kmeans.genes$cluster == clusterNum, ]), col = "grey")
points(kmeans.genes$centers[clusterNum, ], type = "l")
points(kmeans.genes$centers[clusterNum, ], col = prDes$devStage, pch = 20)

plot of chunk unnamed-chunk-1


# Or, probably more commonly used, we can see both dendograms using heatmaps (through hierarchical clustering):
heatmap(as.matrix(topDat), Rowv = NULL, Colv = NULL, hclustfun = function(x) hclust(x,
    method = "average"), distfun = function(x) dist(x, method = "euclidean"),
    labCol = pr.nms, labRow = NA, margin = c(8, 1), scale = "none", ColSideColor = brewer.pal(11,
        "RdGy")[c(2, 4, 7, 9, 11)][unclass(prDes$devStage)])

plot of chunk unnamed-chunk-1


# Redefining the attributes
oDat <- with(prDes, data.frame(sample, devStage, gType, probeset = factor(rep(rownames(topDat),
    each = ncol(topDat))), geneExp = as.vector(unlist(t(topDat)))))
devStageAvg = with(oDat, tapply(geneExp, list(probeset, devStage), mean))
heatmap(as.matrix(devStageAvg), Rowv = NULL, Colv = NA, hclustfun = function(x) hclust(x,
    method = "average"), distfun = function(x) dist(x, method = "euclidean"),
    labCol = colnames(devStageAvg), labRow = NA, margin = c(8, 1))

plot of chunk unnamed-chunk-1

geneDS.km <- kmeans(devStageAvg, centers = 4, nstart = 50)
# Look at all clusters
par(mfrow = c(2, 2))
for (clusterNum in 1:4) {
    # Set up the axes without plotting; ylim set based on trial run.
    plot(geneDS.km$centers[clusterNum, ], ylim = c(-4, 4), type = "n", xlab = "Develomental Stage",
        ylab = "Relative expression", axes = F, main = paste("Cluster", clusterNum,
            sep = " "))
    axis(2)
    axis(1, 1:5, c(levels(prDes$devStage)[1:4], "4W"), cex.axis = 0.9)
    # Plot the expression of all the genes in the selected cluster in grey.
    matlines(y = t(devStageAvg[geneDS.km$cluster == clusterNum, ]), col = "grey")
    # Add the cluster center. This is last so it isn't underneath the members
    points(geneDS.km$centers[clusterNum, ], type = "l")
    # Optional: points to show development stages.
    points(geneDS.km$centers[clusterNum, ], pch = 20)
}

plot of chunk unnamed-chunk-1

par(mfrow = c(1, 1))
plot(geneDS.km$centers[clusterNum, ], ylim = c(-4, 4), type = "n", xlab = "Develomental Stage",
    ylab = "Average expression", axes = F, main = "Clusters centers")
axis(2)
axis(1, 1:5, c(levels(prDes$devStage)[1:4], "4W"), cex.axis = 0.9)
for (clusterNum in 1:4) {
    points(geneDS.km$centers[clusterNum, ], type = "l", col = clusterNum, lwd = 2)
    points(geneDS.km$centers[clusterNum, ], col = clusterNum, pch = 20)
}

plot of chunk unnamed-chunk-1

cloud(devStageAvg[, "E16"] ~ devStageAvg[, "P6"] * devStageAvg[, "4_weeks"],
    col = geneDS.km$clust, xlab = "E16", ylab = "P6", zlab = "4_weeks")

plot of chunk unnamed-chunk-1


# Statistical measures to evaluate clusters
pvc <- pvclust(topDat, nboot = 100)
Bootstrap (r = 0.5)... Done.
Bootstrap (r = 0.6)... Done.
Bootstrap (r = 0.7)... Done.
Bootstrap (r = 0.8)... Done.
Bootstrap (r = 0.9)... Done.
Bootstrap (r = 1.0)... Done.
Bootstrap (r = 1.1)... Done.
Bootstrap (r = 1.2)... Done.
Bootstrap (r = 1.3)... Done.
Bootstrap (r = 1.4)... Done.
plot(pvc, labels = pr.nms, cex = 0.6)
pvrect(pvc, alpha = 0.95)

plot of chunk unnamed-chunk-1


# PCA (principal components analysis)
pcs <- prcomp(sprDat, center = F, scale = F)
plot(pcs)

plot of chunk unnamed-chunk-1

# append the rotations for the first 10 PCs to the phenodata
prDes <- cbind(prDes, pcs$rotation[prDes$sample, 1:10])
# scatter plot showing us how the first few PCs relate to covariates
plot(prDes[, 1:6], pch = 19, cex = 0.8)

plot of chunk unnamed-chunk-1

# plot data on first two PCs, colored by development stage
plot(prDes[, c("PC1", "PC2")], bg = prDes$devStage, pch = 21, cex = 1.5)
legend(list(x = 0.2, y = 0.3), as.character(levels(prDes$devStage)), pch = 21,
    pt.bg = c(1, 2, 3, 4, 5))

plot of chunk unnamed-chunk-1