Seminar 9: Cluster Analysis and PCA

by Erica Acton

Load photoRec dataset.

prDat <- read.table("/home/eacton/R/stat540-2014-ACTON-ERICA/GSE4051_data.tsv", 
    header = TRUE, row.names = 1)
str(prDat, max.level = 0)
## 'data.frame':    29949 obs. of  39 variables:
prDes <- readRDS("/home/eacton/R/stat540-2014-ACTON-ERICA/GSE4051_design.rds")
str(prDes)
## 'data.frame':    39 obs. of  4 variables:
##  $ sidChar : chr  "Sample_20" "Sample_21" "Sample_22" "Sample_23" ...
##  $ sidNum  : num  20 21 22 23 16 17 6 24 25 26 ...
##  $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
##  $ gType   : Factor w/ 2 levels "wt","NrlKO": 1 1 1 1 2 2 2 1 1 1 ...

Rescale the rows.

sprDat <- (t(scale(t(prDat))))
str(sprDat, max.level = 0, give.attr = FALSE)
##  num [1:29949, 1:39] 0.0838 0.1758 0.7797 -0.3196 0.8358 ...

round(data.frame(avgBefore = rowMeans(head(prDat)), avgAfter = rowMeans(head(sprDat)), 
    varBefore = apply(head(prDat), 1, var), varAfter = apply(head(sprDat), 1, 
        var)), 2)
##              avgBefore avgAfter varBefore varAfter
## 1415670_at        7.22        0      0.02        1
## 1415671_at        9.37        0      0.35        1
## 1415672_at        9.70        0      0.15        1
## 1415673_at        8.42        0      0.03        1
## 1415674_a_at      8.47        0      0.02        1
## 1415675_at        9.67        0      0.03        1

Hierarchical Clustering

Computer pairwise distances.

pr.dis <- dist(t(sprDat), method = "euclidian")

Create a new factor representing the interation of gType and devStage.

prDes$grp <- with(prDes, interaction(gType, devStage))
summary(prDes$grp)
##        wt.E16     NrlKO.E16         wt.P2      NrlKO.P2         wt.P6 
##             4             3             4             4             4 
##      NrlKO.P6        wt.P10     NrlKO.P10    wt.4_weeks NrlKO.4_weeks 
##             4             4             4             4             4

Compute hierarchical clustering using different linkage types.

pr.hc.s <- hclust(pr.dis, method = "single")
pr.hc.c <- hclust(pr.dis, method = "complete")
pr.hc.a <- hclust(pr.dis, method = "average")
pr.hc.w <- hclust(pr.dis, method = "ward")
## The "ward" method has been renamed to "ward.D"; note new "ward.D2"

Plot the different hierarchical clustering types.

op <- par(mar = c(0, 4, 4, 2), mfrow = c(2, 2))

plot(pr.hc.s, labels = FALSE, main = "Single", xlab = "")
plot(pr.hc.c, labels = FALSE, main = "Complete", xlab = "")
plot(pr.hc.a, labels = FALSE, main = "Average", xlab = "")
plot(pr.hc.w, labels = FALSE, main = "Ward", xlab = "")

plot of chunk unnamed-chunk-7


par(op)

Identify 10 clusters.

op <- par(mar = c(1, 4, 4, 1))

plot(pr.hc.w, labels = prDes$grp, cex = 0.6, main = "Ward showing 10 Clusters")
rect.hclust(pr.hc.w, k = 10)

plot of chunk unnamed-chunk-8


par(op)

Heatmap example.

GreyFun <- colorRampPalette(brewer.pal(n = 9, "Greys"))
gTypeCols <- brewer.pal(n = 11, "RdGy")[c(4, 7)]
heatmap(as.matrix(sprDat), Rowv = NA, col = GreyFun(256), hclustfun = function(x) hclust(x, 
    method = "ward"), scale = "none", labCol = prDes$grp, labRow = NA, margins = c(8, 
    1), ColSideColor = gTypeCols[unclass(prDes$gType)])
## The "ward" method has been renamed to "ward.D"; note new "ward.D2"
legend("topright", legend = levels(prDes$gType), col = gTypeCols, lty = 1, lwd = 5, 
    cex = 0.5)

plot of chunk unnamed-chunk-9

Exercise: Playing with heatmaps.

GnBuFun <- colorRampPalette(brewer.pal(n = 9, "GnBu"))
gTypeCols <- brewer.pal(n = 9, "RdGy")[c(4, 7)]
heatmap(as.matrix(sprDat), Rowv = NA, col = GnBuFun(256), hclustfun = function(x) hclust(x, 
    method = "average"), scale = "none", labCol = prDes$grp, labRow = NA, margins = c(8, 
    1), ColSideColor = gTypeCols[unclass(prDes$gType)])
legend("topright", legend = levels(prDes$gType), col = gTypeCols, lty = 1, lwd = 5, 
    cex = 0.5)

plot of chunk unnamed-chunk-10

K-means Clustering

Choose parameters, including k.

set.seed(31)
k <- 5
pr.km <- kmeans(t(sprDat), centers = k, nstart = 50)

Look at the sum of squares of each cluster.

pr.km$withinss
## [1] 120153  78227 110209 100197 133036

Look at the composition of each cluster.

pr.kmTable <- data.frame(devStage = prDes$devStage, cluster = pr.km$cluster)
prTable <- xtable(with(pr.kmTable, table(devStage, cluster)), caption = "Number of samples from each developmental stage within each k-means cluster")

align(prTable) <- "lccccc"
print(prTable, type = "html", caption.placement = "top")
## <!-- html table generated in R 3.1.0 by xtable 1.7-3 package -->
## <!-- Mon Apr 14 00:27:17 2014 -->
## <TABLE border=1>
## <CAPTION ALIGN="top"> Number of samples from each developmental stage within each k-means cluster </CAPTION>
## <TR> <TH>  </TH> <TH> 1 </TH> <TH> 2 </TH> <TH> 3 </TH> <TH> 4 </TH> <TH> 5 </TH>  </TR>
##   <TR> <TD> E16 </TD> <TD align="center">   0 </TD> <TD align="center">   0 </TD> <TD align="center">   6 </TD> <TD align="center">   0 </TD> <TD align="center">   1 </TD> </TR>
##   <TR> <TD> P2 </TD> <TD align="center">   4 </TD> <TD align="center">   0 </TD> <TD align="center">   0 </TD> <TD align="center">   0 </TD> <TD align="center">   4 </TD> </TR>
##   <TR> <TD> P6 </TD> <TD align="center">   5 </TD> <TD align="center">   1 </TD> <TD align="center">   0 </TD> <TD align="center">   0 </TD> <TD align="center">   2 </TD> </TR>
##   <TR> <TD> P10 </TD> <TD align="center">   1 </TD> <TD align="center">   2 </TD> <TD align="center">   0 </TD> <TD align="center">   3 </TD> <TD align="center">   2 </TD> </TR>
##   <TR> <TD> 4_weeks </TD> <TD align="center">   0 </TD> <TD align="center">   2 </TD> <TD align="center">   1 </TD> <TD align="center">   5 </TD> <TD align="center">   0 </TD> </TR>
##    </TABLE>

PAM Algorithm

pr.pam <- pam(pr.dis, k = k)
pr.pamTable <- data.frame(devStage = prDes$devStage, cluster = pr.pam$clustering)
pamTable <- xtable(with(pr.pamTable, table(devStage, cluster)), caption = "Number of samples from each developmental stage within each PAM cluster")

align(pamTable) <- "lccccc"
print(pamTable, type = "html", caption.placement = "top")
## <!-- html table generated in R 3.1.0 by xtable 1.7-3 package -->
## <!-- Mon Apr 14 00:27:17 2014 -->
## <TABLE border=1>
## <CAPTION ALIGN="top"> Number of samples from each developmental stage within each PAM cluster </CAPTION>
## <TR> <TH>  </TH> <TH> 1 </TH> <TH> 2 </TH> <TH> 3 </TH> <TH> 4 </TH> <TH> 5 </TH>  </TR>
##   <TR> <TD> E16 </TD> <TD align="center">   6 </TD> <TD align="center">   1 </TD> <TD align="center">   0 </TD> <TD align="center">   0 </TD> <TD align="center">   0 </TD> </TR>
##   <TR> <TD> P2 </TD> <TD align="center">   0 </TD> <TD align="center">   1 </TD> <TD align="center">   7 </TD> <TD align="center">   0 </TD> <TD align="center">   0 </TD> </TR>
##   <TR> <TD> P6 </TD> <TD align="center">   3 </TD> <TD align="center">   2 </TD> <TD align="center">   3 </TD> <TD align="center">   0 </TD> <TD align="center">   0 </TD> </TR>
##   <TR> <TD> P10 </TD> <TD align="center">   0 </TD> <TD align="center">   2 </TD> <TD align="center">   1 </TD> <TD align="center">   1 </TD> <TD align="center">   4 </TD> </TR>
##   <TR> <TD> 4_weeks </TD> <TD align="center">   1 </TD> <TD align="center">   0 </TD> <TD align="center">   1 </TD> <TD align="center">   4 </TD> <TD align="center">   2 </TD> </TR>
##    </TABLE>

summary(pr.pam)
## Medoids:
##      ID              
## [1,] "3"  "Sample_22"
## [2,] "15" "Sample_8" 
## [3,] "13" "Sample_3" 
## [4,] "35" "Sample_39"
## [5,] "28" "Sample_13"
## Clustering vector:
## Sample_20 Sample_21 Sample_22 Sample_23 Sample_16 Sample_17  Sample_6 
##         1         1         1         1         2         1         1 
## Sample_24 Sample_25 Sample_26 Sample_27 Sample_14  Sample_3  Sample_5 
##         3         3         3         3         3         3         3 
##  Sample_8 Sample_28 Sample_29 Sample_30 Sample_31  Sample_1 Sample_10 
##         2         2         1         1         3         3         1 
##  Sample_4  Sample_7 Sample_32 Sample_33 Sample_34 Sample_35 Sample_13 
##         3         2         4         2         2         3         5 
## Sample_15 Sample_18 Sample_19 Sample_36 Sample_37 Sample_38 Sample_39 
##         5         5         5         4         4         4         4 
## Sample_11 Sample_12  Sample_2  Sample_9 
##         5         3         5         1 
## Objective function:
## build  swap 
## 136.7 136.0 
## 
## Numerical information per cluster:
##      size max_diss av_diss diameter separation
## [1,]   10    223.8   150.3    284.1      113.8
## [2,]    6    179.4   136.8    226.2      150.4
## [3,]   12    173.7   136.0    221.4      113.8
## [4,]    5    206.9   133.5    254.7      166.8
## [5,]    6    151.5   113.2    209.3      150.9
## 
## Isolated clusters:
##  L-clusters: character(0)
##  L*-clusters: character(0)
## 
## Silhouette plot information:
##           cluster neighbor sil_width
## Sample_23       1        3   0.27682
## Sample_21       1        4   0.26316
## Sample_6        1        5   0.19196
## Sample_22       1        3   0.18419
## Sample_17       1        5   0.15715
## Sample_9        1        5   0.09849
## Sample_29       1        4   0.05244
## Sample_20       1        5   0.04262
## Sample_10       1        5  -0.06535
## Sample_30       1        3  -0.11538
## Sample_34       2        3   0.25189
## Sample_16       2        3   0.23344
## Sample_7        2        3   0.19390
## Sample_8        2        3   0.19259
## Sample_33       2        3   0.16094
## Sample_28       2        3  -0.03305
## Sample_3        3        5   0.29346
## Sample_31       3        5   0.26948
## Sample_24       3        5   0.24758
## Sample_35       3        5   0.22591
## Sample_14       3        5   0.22389
## Sample_25       3        1   0.20491
## Sample_5        3        2   0.15337
## Sample_27       3        2   0.08856
## Sample_1        3        5   0.07989
## Sample_26       3        2   0.05258
## Sample_4        3        5   0.04629
## Sample_12       3        5   0.02340
## Sample_36       4        1   0.18923
## Sample_32       4        5   0.10747
## Sample_39       4        5   0.10633
## Sample_38       4        5   0.10633
## Sample_37       4        5  -0.02457
## Sample_13       5        3   0.31248
## Sample_2        5        4   0.24355
## Sample_15       5        3   0.21337
## Sample_11       5        4   0.18197
## Sample_18       5        1   0.17144
## Sample_19       5        3   0.10056
## Average silhouette width per cluster:
## [1] 0.10861 0.16662 0.15911 0.09696 0.20390
## Average silhouette width of total data set:
## [1] 0.1462
## 
## Available components:
## [1] "medoids"    "id.med"     "clustering" "objective"  "isolation" 
## [6] "clusinfo"   "silinfo"    "diss"       "call"

Silhouette plot.

op <- par(mar = c(5, 1, 4, 4))
plot(pr.pam, main = "Silhouette Plot for 5 Clusters")

plot of chunk unnamed-chunk-15

par(op)

Gene Clustering

Start with the top 972 genes that showed differential expression across the different developmental stage (BH adjusted p value < 10-5).

devDes <- model.matrix(~devStage, prDes)
fit <- lmFit(prDat, devDes)
ebFit <- eBayes(fit)

topDat <- topTable(ebFit, coef = grep("devStage", colnames(coef(ebFit))), p.value = 1e-05, 
    n = 972)

ttopDat <- sprDat[rownames(topDat), ]
head(ttopDat)
##            Sample_20 Sample_21 Sample_22 Sample_23 Sample_16 Sample_17
## 1440645_at   -0.8052   -0.8282   -0.7585   -0.8742   -0.3668   -0.6953
## 1421084_at   -1.2109   -1.5209   -0.9759   -1.5029   -0.9287   -1.5552
## 1451590_at   -1.1387   -1.1429   -1.1309   -1.0890   -1.2790   -1.1413
## 1428680_at   -0.7774   -0.9124   -0.8370   -0.7632   -0.3976   -0.9552
## 1450215_at   -1.5844   -1.5934   -0.9616   -1.3886   -1.6179   -1.5934
## 1416041_at   -0.4638   -0.8668   -0.5684   -0.6407   -0.5958   -0.3712
##            Sample_6 Sample_24 Sample_25 Sample_26 Sample_27 Sample_14
## 1440645_at  -0.8462   -0.5961   -0.5551   -0.5163   -0.4516   -0.6248
## 1421084_at  -1.4433   -0.8845   -1.0063   -0.3879   -0.9321   -1.3275
## 1451590_at  -1.3654   -1.1178   -0.9241   -0.7011   -0.9963   -0.9047
## 1428680_at  -0.8294   -0.8705   -0.6106   -0.4110   -0.5821   -0.5578
## 1450215_at  -1.6513   -0.6457   -0.8491   -0.7587   -0.8919   -0.5134
## 1416041_at  -0.6094   -0.6303   -0.3647   -0.6653   -0.5333   -0.7272
##            Sample_3 Sample_5 Sample_8 Sample_28 Sample_29 Sample_30
## 1440645_at  -0.5738  -0.5975  -0.2504  -0.62198   -0.7736  -0.77865
## 1421084_at  -1.2186  -1.2949  -1.0938  -0.02257    0.4234   0.31620
## 1451590_at  -1.0838  -0.9780  -0.8519  -0.53103   -0.0741   0.04158
## 1428680_at  -0.5997  -0.5662  -0.5368  -0.32553   -0.5703  -0.68438
## 1450215_at  -0.5713  -0.6071  -1.2521  -0.41268    0.2215   0.03786
## 1416041_at  -0.8526  -0.6965  -0.7097  -0.50924   -0.5224  -0.31923
##            Sample_31 Sample_1 Sample_10 Sample_4 Sample_7 Sample_32
## 1440645_at  -0.53070  -0.4574   -0.6263  -0.4624  -0.2633   0.07733
## 1421084_at   0.27761   0.3462    0.3248   0.3977  -0.5042   0.98089
## 1451590_at  -0.23949   0.1698    0.2928   0.2358  -0.4510   1.07636
## 1428680_at  -0.60221  -0.6500   -0.6173  -0.7590  -0.6215   0.75773
## 1450215_at  -0.08596   0.3500    0.7765   0.5505  -0.3712   1.22374
## 1416041_at  -0.13798  -0.7157   -0.6554  -0.7825  -0.6253  -0.10019
##            Sample_33 Sample_34 Sample_35 Sample_13 Sample_15 Sample_18
## 1440645_at   -0.2418  -0.24969   -0.2073   0.01983  -0.07648   -0.1965
## 1421084_at    0.8480   0.89512    1.0495   0.81793   0.94230    0.8908
## 1451590_at    0.1159   0.06932    1.0031   0.74661   0.73091    0.8670
## 1428680_at   -0.3255  -0.32385    0.2379   0.10878  -0.09412    0.3603
## 1450215_at    0.7341   0.48462    0.8895   0.87536   0.80004    0.8848
## 1416041_at   -0.6554  -0.63792   -0.4172   0.21302   0.16155    0.2738
##            Sample_19 Sample_36 Sample_37 Sample_38 Sample_39 Sample_11
## 1440645_at  -0.24250     2.092     1.996     2.030     2.003    1.9920
## 1421084_at   0.85224     1.041     1.242     1.024     1.217    0.9552
## 1451590_at   0.66287     1.427     1.547     1.422     1.474    1.3747
## 1428680_at  -0.03291     2.108     1.548     1.638     1.359    2.1831
## 1450215_at   0.66822     1.313     1.294     1.271     1.186    1.0119
## 1416041_at   0.04492     2.040     2.149     2.188     2.286    1.6838
##            Sample_12 Sample_2 Sample_9
## 1440645_at     1.452    2.099   1.3070
## 1421084_at     1.054    1.105   0.8094
## 1451590_at     1.500    1.385   0.9978
## 1428680_at     2.191    1.915   1.4075
## 1450215_at     1.120    0.913   0.7435
## 1416041_at     1.049    1.788   1.4977

Hierarchical:

geneC.dis <- dist(ttopDat, method = "euclidean")
geneC.hc.a <- hclust(geneC.dis, method = "average")

plot(geneC.hc.a, labels = FALSE, main = "Hierarchical with Average Linkage", 
    xlab = "")

plot of chunk unnamed-chunk-17

Partitioning:

set.seed(1234)
k <- 5
kmeans.genes <- kmeans(ttopDat, centers = k)

Choose desired cluster.

clusterNum <- 1

Set up axes. Plot the expression of all the genes in the selected cluster in grey. Add in the cluster center. Colour points to show the developmental stage.

plot(kmeans.genes$centers[clusterNum, ], ylim = c(-4, 4), type = "n", xlab = "Samples", 
    ylab = "Relative Expression")

matlines(y = t(ttopDat[kmeans.genes$cluster == clusterNum, ]), col = "grey")
points(kmeans.genes$centers[clusterNum, ], type = "l")
points(kmeans.genes$centers[clusterNum, ], col = prDes$devStage, pch = 20)

plot of chunk unnamed-chunk-20

Heatmaps (hierarchical):

devStageCols <- brewer.pal(n = 11, "RdGy")[c(2, 4, 7, 9, 11)]
heatmap(as.matrix(ttopDat), col = GreyFun(256), hclustfun = function(x) hclust(x, 
    method = "average"), scale = "none", labCol = prDes$grp, labRow = NA, margins = c(8, 
    1), ColSideColor = devStageCols[unclass(prDes$devStage)])
legend("topleft", levels(prDes$devStage), col = devStageCols, lty = 1, lwd = 5, 
    cex = 0.5)

plot of chunk unnamed-chunk-21

Redefining the Attributes

Define new attributes for a gene and estimate the parameters.

annoTopDat <- stack(as.data.frame(ttopDat))
annoTopDat$probeset <- rownames(ttopDat)

annoTopDat <- merge(annoTopDat, prDes, by.x = "ind", by.y = "sidChar")
devStageAvg <- ddply(annoTopDat, ~probeset, function(x) {
    avgbyDevStage <- aggregate(values ~ devStage, x, mean)$values
    names(avgbyDevStage) <- levels(x$devStage)
    avgbyDevStage
})

rownames(devStageAvg) <- devStageAvg$probeset
devStageAvg$probeset <- NULL
str(devStageAvg)
## 'data.frame':    972 obs. of  5 variables:
##  $ E16    : num  -0.628 1.235 -0.419 1.401 0.855 ...
##  $ P2     : num  -1.041 0.7 -0.918 0.737 0.74 ...
##  $ P6     : num  -0.214 -0.26 -0.744 -0.66 0.34 ...
##  $ P10    : num  0.722 -0.683 0.553 -0.779 -0.363 ...
##  $ 4_weeks: num  1.083 -0.838 1.475 -0.523 -1.464 ...

Make a heatmap.

heatmap(as.matrix(devStageAvg), Colv = NA, col = GreyFun(256), hclustfun = function(x) hclust(x, 
    method = "average"), labCol = colnames(devStageAvg), labRow = NA, margin = c(8, 
    1))

plot of chunk unnamed-chunk-23

Look at the relative expression in all clusters with respect to developmental stage as determined by kmeans.

k <- 4
geneDS.km <- kmeans(devStageAvg, centers = k, nstart = 50)
clust.centers <- geneDS.km$centers

Plot all cluster centers separately.

op <- par(mfrow = c(2, 2))
for (clusterNum in 1:4) {
    plot(clust.centers[clusterNum, ], ylim = c(-4, 4), type = "n", xlab = "Developmental Stage", 
        ylab = "Relative Expression", axes = F, main = paste("Cluster", clusterNum, 
            sep = ""))
    axis(2)
    axis(1, 1:5, c(colnames(clust.centers)[1:4], "4W"), cex.axis = 0.9)

    matlines(y = t(devStageAvg[geneDS.km$cluster == clusterNum, ]), col = "grey")
    points(clust.centers[clusterNum, ], type = "l")
    points(clust.centers[clusterNum, ], pch = 20)
}

plot of chunk unnamed-chunk-25

par(op)

Plot to compare all clusters' centers.

plot(clust.centers[clusterNum, ], ylim = c(-4, 4), type = "n", xlab = "Developmental Stage", 
    ylab = "Relative Expression", axes = F, main = "Clusters' Centers")
axis(2)
axis(1, 1:5, c(colnames(clust.centers)[1:4], "4W"), cex.axis = 0.9)

for (clusterNum in 1:4) {
    points(clust.centers[clusterNum, ], type = "l", col = clusterNum, lwd = 2)
    points(clust.centers[clusterNum, ], col = clusterNum, pch = 20)
}

plot of chunk unnamed-chunk-26

Plotting 3-dimensional clusters as determined by k-means.

cloud(devStageAvg[, "E16"] ~ devStageAvg[, "P6"] * devStageAvg[, "4_weeks"], 
    col = geneDS.km$clust, xlab = "E16", ylab = "P6", zlab = "4_weeks")

plot of chunk unnamed-chunk-27

Statistical Measures to Evaluate Clusters

Qualifying cluster membership.

pvc <- pvclust(ttopDat, nboot = 100)
## Bootstrap (r = 0.5)... Done.
## Bootstrap (r = 0.6)... Done.
## Bootstrap (r = 0.7)... Done.
## Bootstrap (r = 0.8)... Done.
## Bootstrap (r = 0.9)... Done.
## Bootstrap (r = 1.0)... Done.
## Bootstrap (r = 1.1)... Done.
## Bootstrap (r = 1.2)... Done.
## Bootstrap (r = 1.3)... Done.
## Bootstrap (r = 1.4)... Done.
plot(pvc, labels = prDes$grp, cex = 0.6)
pvrect(pvc, alpha = 0.95)

plot of chunk unnamed-chunk-28

PCA (Principal Components Analysis)

# scree plot
pcs <- prcomp(sprDat, center = F, scale = F)
plot(pcs)

plot of chunk unnamed-chunk-29

# scatterplot relating PCs to covariates
prinComp <- cbind(prDes, pcs$rotation[prDes$sidNum, 1:10])
plot(prinComp[, c("sidNum", "devStage", "gType", "PC1", "PC2", "PC3")], pch = 19, 
    cex = 0.8)

plot of chunk unnamed-chunk-29

# plot data on 2 PCs, coloured by devStage
plot(prinComp[, c("PC1", "PC2")], bg = prDes$devStage, pch = 21, cex = 1.5)
legend(list(x = 0.2, y = 0.3), as.character(levels(prDes$devStage)), pch = 21, 
    pt.bg = c(1, 2, 3, 4, 5))

plot of chunk unnamed-chunk-29