Seminar 9 - Cluster Analysis and PCA

After loading the required libraries and the data, we begin our analysis.

Hierarchical clustering

# compute pairwise distances
pr.dis <- dist(t(sprDat), method = "euclidean")
# create a new factor representing the interaction of gType and devStage
prDes$grp <- with(prDes, interaction(gType, devStage))
summary(prDes$grp)
##     NrlKO.E16        wt.E16      NrlKO.P2         wt.P2      NrlKO.P6 
##             3             4             4             4             4 
##         wt.P6     NrlKO.P10        wt.P10 NrlKO.4_weeks    wt.4_weeks 
##             4             4             4             4             4
# compute hierarchical clustering using different linkage types
pr.hc.s <- hclust(pr.dis, method = "single")
pr.hc.c <- hclust(pr.dis, method = "complete")
pr.hc.a <- hclust(pr.dis, method = "average")
pr.hc.w <- hclust(pr.dis, method = "ward")

Plot:

op <- par(mar = c(0, 4, 4, 2), mfrow = c(2, 2))
plot(pr.hc.s, labels = FALSE, main = "Single", xlab = "")
plot(pr.hc.c, labels = FALSE, main = "Complete", xlab = "")
plot(pr.hc.a, labels = FALSE, main = "Average", xlab = "")
plot(pr.hc.w, labels = FALSE, main = "Ward", xlab = "")

plot of chunk unnamed-chunk-3

par(op)

For the Ward procedure we identify 10 cluster using the dendogram:

op <- par(mar = c(1, 4, 4, 1))
plot(pr.hc.w, labels = prDes$grp, cex = 0.6, main = "Ward showing 10 clusters")
rect.hclust(pr.hc.w, k = 10)

plot of chunk unnamed-chunk-4

par(op)

K-means clustering We set the number on cluster to be equal to 5:

set.seed(31)
k <- 5
pr.km <- kmeans(t(sprDat), centers = k, nstart = 50)
# within sum of squares of each cluster
pr.km$withinss
## [1] 120153  78227 110209 100197 133036
# composition of each cluster
pr.kmTable <- data.frame(devStage = prDes$devStage, cluster = pr.km$cluster)
prTable <- kable(with(pr.kmTable, table(devStage, cluster)))
## |id       |  1|  2|  3|  4|  5|
## |:--------|--:|--:|--:|--:|--:|
## |E16      |  0|  0|  6|  0|  1|
## |P2       |  4|  0|  0|  0|  4|
## |P6       |  5|  1|  0|  0|  2|
## |P10      |  1|  2|  0|  3|  2|
## |4_weeks  |  0|  2|  1|  5|  0|

Repeat the analysis using a different seed

set.seed(540)
k <- 5
pr.km <- kmeans(t(sprDat), centers = k, nstart = 50)
# within sum of squares of each cluster
pr.km$withinss
## [1] 133036 110209  78227 120153 100197
# composition of each cluster
pr.kmTable <- data.frame(devStage = prDes$devStage, cluster = pr.km$cluster)
prTable <- kable(with(pr.kmTable, table(devStage, cluster)))
## |id       |  1|  2|  3|  4|  5|
## |:--------|--:|--:|--:|--:|--:|
## |E16      |  1|  6|  0|  0|  0|
## |P2       |  4|  0|  0|  4|  0|
## |P6       |  2|  0|  1|  5|  0|
## |P10      |  2|  0|  2|  1|  3|
## |4_weeks  |  0|  1|  2|  0|  5|

PAM algorithm

set.seed(31)
pr.pam <- pam(pr.dis, k = k)
pr.pamTable <- data.frame(devStage = prDes$devStage, cluster = pr.pam$clustering)
pamTable <- xtable(with(pr.pamTable, table(devStage, cluster)), caption = "Number of samples from each develomental stage within each PAM cluster")

The five chosen medoids are:

summary(pr.pam)$medoids
## [1] "Sample_22" "Sample_8"  "Sample_3"  "Sample_39" "Sample_13"

Silhouette plot:

op <- par(mar = c(5, 1, 4, 4))
plot(pr.pam, main = "Silhouette Plot for 5 clusters")

plot of chunk unnamed-chunk-9

par(op)

Draw a plot with number of clusters in the x-axis and the average silhouette widths in the y-axis.

av.sil.Width <- function(k) {
    pr.km <- kmeans(t(sprDat), centers = k, nstart = 50)
    pr.pam <- pam(pr.dis, k = k)
    return(pr.pam$silinfo$avg.width)
}
k <- 2:15
avg.width <- apply(as.matrix(k), 1, FUN = av.sil.Width)
plot(k, avg.width, xlab = "number of clusters", ylab = "average silhouette", 
    type = "b", col = "red", main = "Average silhouette vs. number of clusters")

plot of chunk unnamed-chunk-10

For a common choice of k, compare the clustering across different methods

k <- 6
# compute pairwise distances
pr.dis <- dist(t(sprDat), method = "euclidean")
# compute hierarchical clustering using different linkage types
pr.hc.s <- hclust(pr.dis, method = "single")
pr.hc.c <- hclust(pr.dis, method = "complete")
pr.hc.a <- hclust(pr.dis, method = "average")
pr.hc.w <- hclust(pr.dis, method = "ward")
# which sample is in which cluster using the 4 methods
hclustdat <- data.frame(single = cutree(pr.hc.s, k), compl = cutree(pr.hc.c, 
    k), average = cutree(pr.hc.a, k), ward = cutree(pr.hc.w, k))
head(hclustdat)
##           single compl average ward
## Sample_20      1     1       1    1
## Sample_21      1     2       2    1
## Sample_22      1     1       1    1
## Sample_23      1     1       1    1
## Sample_16      2     3       3    2
## Sample_17      3     2       1    1
# k means
set.seed(15)
pr.km <- kmeans(t(sprDat), centers = k, nstart = 50)
# composition of each cluster
pr.kmTable <- data.frame(devStage = prDes$devStage, cluster = pr.km$cluster)
prTable <- xtable(with(pr.kmTable, table(devStage, cluster)), caption = "Number of samples from each develomental stage within each k-means cluster")
# PAM
pr.pam <- pam(pr.dis, k = k)
pr.pamTable <- data.frame(devStage = prDes$devStage, cluster = pr.pam$clustering)
pamTable <- xtable(with(pr.pamTable, table(devStage, cluster)), caption = "Number of samples from each develomental stage within each PAM cluster")

Gene clustering

DesMat <- model.matrix(~devStage, prDes)
fit <- lmFit(prDat, DesMat)
ebFit <- eBayes(fit)
top <- topTable(ebFit, n = Inf, coef = grep("devStage", colnames(ebFit)), p.value = 1e-05)
topGenes <- rownames(top)
topDat <- sprDat[topGenes, ]
geneC.dis <- dist(topDat, method = "euclidean")
geneC.hc.a <- hclust(geneC.dis, method = "average")
plot(geneC.hc.a, labels = FALSE, main = "Hierarchical with Average Linkage", 
    xlab = "")

plot of chunk unnamed-chunk-12

Partitioning

set.seed(1234)
k <- 5
kmeans.genes <- kmeans(topDat, centers = k)
clusterNum <- 1
plot(kmeans.genes$centers[clusterNum, ], ylim = c(-4, 4), type = "n", xlab = "Samples", 
    ylab = "Relative expression")
matlines(y = t(topDat[kmeans.genes$cluster == clusterNum, ]), col = "grey")
points(kmeans.genes$centers[clusterNum, ], type = "l")
points(kmeans.genes$centers[clusterNum, ], col = prDes$devStage, pch = 20)

plot of chunk unnamed-chunk-13

# heatmap
devStageCols <- brewer.pal(11, "RdGy")[c(2, 4, 7, 9, 11)]
heatmap(as.matrix(topDat), hclustfun = function(x) hclust(x, method = "average"), 
    labCol = prDes$grp, labRow = NA, margin = c(8, 1), scale = "none", ColSideColor = devStageCols[unclass(prDes$devStage)])
legend("topleft", levels(prDes$devStage), col = devStageCols, lty = 1, lwd = 5, 
    cex = 0.5)

plot of chunk unnamed-chunk-13

Redefining the attributes

annoTopDat <- stack(as.data.frame(topDat))  # stack probe data tall and skinny
annoTopDat$probeset <- rownames(topDat)  # add probeset ID as variable
## get info on gType and devStage, then average over reps within devStage
annoTopDat <- merge(annoTopDat, prDes, by.x = "ind", by.y = "sidChar")
devStageAvg <- ddply(annoTopDat, ~probeset, function(x) {
    avgByDevStage <- aggregate(values ~ devStage, x, mean)$values
    names(avgByDevStage) <- levels(x$devStage)
    avgByDevStage
})
## put probset info back into rownames
rownames(devStageAvg) <- devStageAvg$probeset
devStageAvg$probeset <- NULL
str(devStageAvg)
## 'data.frame':    972 obs. of  5 variables:
##  $ E16    : num  -0.628 1.235 -0.419 1.401 0.855 ...
##  $ P2     : num  -1.041 0.7 -0.918 0.737 0.74 ...
##  $ P6     : num  -0.214 -0.26 -0.744 -0.66 0.34 ...
##  $ P10    : num  0.722 -0.683 0.553 -0.779 -0.363 ...
##  $ 4_weeks: num  1.083 -0.838 1.475 -0.523 -1.464 ...
heatmap(as.matrix(devStageAvg), Colv = NA, hclustfun = function(x) hclust(x, 
    method = "average"), labCol = colnames(devStageAvg), labRow = NA, margin = c(8, 
    1))

plot of chunk unnamed-chunk-14

k <- 4
geneDS.km <- kmeans(devStageAvg, centers = k, nstart = 50)
clust.centers <- geneDS.km$centers
# Look at all clusters
op <- par(mfrow = c(2, 2))
for (clusterNum in 1:4) {
    plot(clust.centers[clusterNum, ], ylim = c(-4, 4), type = "n", xlab = "Develomental Stage", 
        ylab = "Relative expression", axes = F, main = paste("Cluster", clusterNum, 
            sep = " "))
    axis(2)
    axis(1, 1:5, c(colnames(clust.centers)[1:4], "4W"), cex.axis = 0.9)
    matlines(y = t(devStageAvg[geneDS.km$cluster == clusterNum, ]), col = "grey")
    points(clust.centers[clusterNum, ], type = "l")
    points(clust.centers[clusterNum, ], pch = 20)
}

plot of chunk unnamed-chunk-14


par(op)

Statistical measures to evaluate clusters

pvc <- pvclust(topDat, nboot = 100)
## Bootstrap (r = 0.5)... Done.
## Bootstrap (r = 0.6)... Done.
## Bootstrap (r = 0.7)... Done.
## Bootstrap (r = 0.8)... Done.
## Bootstrap (r = 0.9)... Done.
## Bootstrap (r = 1.0)... Done.
## Bootstrap (r = 1.1)... Done.
## Bootstrap (r = 1.2)... Done.
## Bootstrap (r = 1.3)... Done.
## Bootstrap (r = 1.4)... Done.
plot(pvc, labels = prDes$grp, cex = 0.6)
pvrect(pvc, alpha = 0.95)

plot of chunk unnamed-chunk-15

PCA (principal components analysis)

pcs <- prcomp(sprDat, center = F, scale = F)
# scree plot
plot(pcs)

plot of chunk unnamed-chunk-16

# append the rotations for the first 10 PCs to the phenodata
prinComp <- cbind(prDes, pcs$rotation[prDes$sidNum, 1:10])
# scatter plot showing us how the first few PCs relate to covariates
plot(prinComp[, c("sidNum", "devStage", "gType", "PC1", "PC2", "PC3")], pch = 19, 
    cex = 0.8)

plot of chunk unnamed-chunk-16

# plot data on first two PCs, colored by development stage
plot(prinComp[, c("PC1", "PC2")], bg = prDes$devStage, pch = 21, cex = 1.5)
legend(list(x = 0.2, y = 0.3), as.character(levels(prDes$devStage)), pch = 21, 
    pt.bg = c(1, 2, 3, 4, 5))

plot of chunk unnamed-chunk-16

Redo the graphs using lattice

pairs(prinComp[, c("sidNum", "devStage", "gType", "PC1", "PC2", "PC3")], panel = function(...) smoothScatter(..., 
    add = TRUE))
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009

plot of chunk unnamed-chunk-17


xyplot(prinComp[, c("PC2")] ~ prinComp[, c("PC1")], auto.key = T, groups = prDes$devStage)

plot of chunk unnamed-chunk-17