by Erica Acton
Load photoRec dataset.
prDat <- read.table("/home/eacton/R/stat540-2014-ACTON-ERICA/GSE4051_data.tsv",
header = TRUE, row.names = 1)
str(prDat, max.level = 0)
## 'data.frame': 29949 obs. of 39 variables:
prDes <- readRDS("/home/eacton/R/stat540-2014-ACTON-ERICA/GSE4051_design.rds")
str(prDes)
## 'data.frame': 39 obs. of 4 variables:
## $ sidChar : chr "Sample_20" "Sample_21" "Sample_22" "Sample_23" ...
## $ sidNum : num 20 21 22 23 16 17 6 24 25 26 ...
## $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
## $ gType : Factor w/ 2 levels "wt","NrlKO": 1 1 1 1 2 2 2 1 1 1 ...
Rescale the rows.
sprDat <- (t(scale(t(prDat))))
str(sprDat, max.level = 0, give.attr = FALSE)
## num [1:29949, 1:39] 0.0838 0.1758 0.7797 -0.3196 0.8358 ...
round(data.frame(avgBefore = rowMeans(head(prDat)), avgAfter = rowMeans(head(sprDat)),
varBefore = apply(head(prDat), 1, var), varAfter = apply(head(sprDat), 1,
var)), 2)
## avgBefore avgAfter varBefore varAfter
## 1415670_at 7.22 0 0.02 1
## 1415671_at 9.37 0 0.35 1
## 1415672_at 9.70 0 0.15 1
## 1415673_at 8.42 0 0.03 1
## 1415674_a_at 8.47 0 0.02 1
## 1415675_at 9.67 0 0.03 1
Computer pairwise distances.
pr.dis <- dist(t(sprDat), method = "euclidian")
Create a new factor representing the interation of gType and devStage.
prDes$grp <- with(prDes, interaction(gType, devStage))
summary(prDes$grp)
## wt.E16 NrlKO.E16 wt.P2 NrlKO.P2 wt.P6
## 4 3 4 4 4
## NrlKO.P6 wt.P10 NrlKO.P10 wt.4_weeks NrlKO.4_weeks
## 4 4 4 4 4
Compute hierarchical clustering using different linkage types.
pr.hc.s <- hclust(pr.dis, method = "single")
pr.hc.c <- hclust(pr.dis, method = "complete")
pr.hc.a <- hclust(pr.dis, method = "average")
pr.hc.w <- hclust(pr.dis, method = "ward")
## The "ward" method has been renamed to "ward.D"; note new "ward.D2"
Plot the different hierarchical clustering types.
op <- par(mar = c(0, 4, 4, 2), mfrow = c(2, 2))
plot(pr.hc.s, labels = FALSE, main = "Single", xlab = "")
plot(pr.hc.c, labels = FALSE, main = "Complete", xlab = "")
plot(pr.hc.a, labels = FALSE, main = "Average", xlab = "")
plot(pr.hc.w, labels = FALSE, main = "Ward", xlab = "")
par(op)
Identify 10 clusters.
op <- par(mar = c(1, 4, 4, 1))
plot(pr.hc.w, labels = prDes$grp, cex = 0.6, main = "Ward showing 10 Clusters")
rect.hclust(pr.hc.w, k = 10)
par(op)
Heatmap example.
GreyFun <- colorRampPalette(brewer.pal(n = 9, "Greys"))
gTypeCols <- brewer.pal(n = 11, "RdGy")[c(4, 7)]
heatmap(as.matrix(sprDat), Rowv = NA, col = GreyFun(256), hclustfun = function(x) hclust(x,
method = "ward"), scale = "none", labCol = prDes$grp, labRow = NA, margins = c(8,
1), ColSideColor = gTypeCols[unclass(prDes$gType)])
## The "ward" method has been renamed to "ward.D"; note new "ward.D2"
legend("topright", legend = levels(prDes$gType), col = gTypeCols, lty = 1, lwd = 5,
cex = 0.5)
GnBuFun <- colorRampPalette(brewer.pal(n = 9, "GnBu"))
gTypeCols <- brewer.pal(n = 9, "RdGy")[c(4, 7)]
heatmap(as.matrix(sprDat), Rowv = NA, col = GnBuFun(256), hclustfun = function(x) hclust(x,
method = "average"), scale = "none", labCol = prDes$grp, labRow = NA, margins = c(8,
1), ColSideColor = gTypeCols[unclass(prDes$gType)])
legend("topright", legend = levels(prDes$gType), col = gTypeCols, lty = 1, lwd = 5,
cex = 0.5)
Choose parameters, including k.
set.seed(31)
k <- 5
pr.km <- kmeans(t(sprDat), centers = k, nstart = 50)
Look at the sum of squares of each cluster.
pr.km$withinss
## [1] 120153 78227 110209 100197 133036
Look at the composition of each cluster.
pr.kmTable <- data.frame(devStage = prDes$devStage, cluster = pr.km$cluster)
prTable <- xtable(with(pr.kmTable, table(devStage, cluster)), caption = "Number of samples from each developmental stage within each k-means cluster")
align(prTable) <- "lccccc"
print(prTable, type = "html", caption.placement = "top")
## <!-- html table generated in R 3.1.0 by xtable 1.7-3 package -->
## <!-- Mon Apr 14 00:27:17 2014 -->
## <TABLE border=1>
## <CAPTION ALIGN="top"> Number of samples from each developmental stage within each k-means cluster </CAPTION>
## <TR> <TH> </TH> <TH> 1 </TH> <TH> 2 </TH> <TH> 3 </TH> <TH> 4 </TH> <TH> 5 </TH> </TR>
## <TR> <TD> E16 </TD> <TD align="center"> 0 </TD> <TD align="center"> 0 </TD> <TD align="center"> 6 </TD> <TD align="center"> 0 </TD> <TD align="center"> 1 </TD> </TR>
## <TR> <TD> P2 </TD> <TD align="center"> 4 </TD> <TD align="center"> 0 </TD> <TD align="center"> 0 </TD> <TD align="center"> 0 </TD> <TD align="center"> 4 </TD> </TR>
## <TR> <TD> P6 </TD> <TD align="center"> 5 </TD> <TD align="center"> 1 </TD> <TD align="center"> 0 </TD> <TD align="center"> 0 </TD> <TD align="center"> 2 </TD> </TR>
## <TR> <TD> P10 </TD> <TD align="center"> 1 </TD> <TD align="center"> 2 </TD> <TD align="center"> 0 </TD> <TD align="center"> 3 </TD> <TD align="center"> 2 </TD> </TR>
## <TR> <TD> 4_weeks </TD> <TD align="center"> 0 </TD> <TD align="center"> 2 </TD> <TD align="center"> 1 </TD> <TD align="center"> 5 </TD> <TD align="center"> 0 </TD> </TR>
## </TABLE>
pr.pam <- pam(pr.dis, k = k)
pr.pamTable <- data.frame(devStage = prDes$devStage, cluster = pr.pam$clustering)
pamTable <- xtable(with(pr.pamTable, table(devStage, cluster)), caption = "Number of samples from each developmental stage within each PAM cluster")
align(pamTable) <- "lccccc"
print(pamTable, type = "html", caption.placement = "top")
## <!-- html table generated in R 3.1.0 by xtable 1.7-3 package -->
## <!-- Mon Apr 14 00:27:17 2014 -->
## <TABLE border=1>
## <CAPTION ALIGN="top"> Number of samples from each developmental stage within each PAM cluster </CAPTION>
## <TR> <TH> </TH> <TH> 1 </TH> <TH> 2 </TH> <TH> 3 </TH> <TH> 4 </TH> <TH> 5 </TH> </TR>
## <TR> <TD> E16 </TD> <TD align="center"> 6 </TD> <TD align="center"> 1 </TD> <TD align="center"> 0 </TD> <TD align="center"> 0 </TD> <TD align="center"> 0 </TD> </TR>
## <TR> <TD> P2 </TD> <TD align="center"> 0 </TD> <TD align="center"> 1 </TD> <TD align="center"> 7 </TD> <TD align="center"> 0 </TD> <TD align="center"> 0 </TD> </TR>
## <TR> <TD> P6 </TD> <TD align="center"> 3 </TD> <TD align="center"> 2 </TD> <TD align="center"> 3 </TD> <TD align="center"> 0 </TD> <TD align="center"> 0 </TD> </TR>
## <TR> <TD> P10 </TD> <TD align="center"> 0 </TD> <TD align="center"> 2 </TD> <TD align="center"> 1 </TD> <TD align="center"> 1 </TD> <TD align="center"> 4 </TD> </TR>
## <TR> <TD> 4_weeks </TD> <TD align="center"> 1 </TD> <TD align="center"> 0 </TD> <TD align="center"> 1 </TD> <TD align="center"> 4 </TD> <TD align="center"> 2 </TD> </TR>
## </TABLE>
summary(pr.pam)
## Medoids:
## ID
## [1,] "3" "Sample_22"
## [2,] "15" "Sample_8"
## [3,] "13" "Sample_3"
## [4,] "35" "Sample_39"
## [5,] "28" "Sample_13"
## Clustering vector:
## Sample_20 Sample_21 Sample_22 Sample_23 Sample_16 Sample_17 Sample_6
## 1 1 1 1 2 1 1
## Sample_24 Sample_25 Sample_26 Sample_27 Sample_14 Sample_3 Sample_5
## 3 3 3 3 3 3 3
## Sample_8 Sample_28 Sample_29 Sample_30 Sample_31 Sample_1 Sample_10
## 2 2 1 1 3 3 1
## Sample_4 Sample_7 Sample_32 Sample_33 Sample_34 Sample_35 Sample_13
## 3 2 4 2 2 3 5
## Sample_15 Sample_18 Sample_19 Sample_36 Sample_37 Sample_38 Sample_39
## 5 5 5 4 4 4 4
## Sample_11 Sample_12 Sample_2 Sample_9
## 5 3 5 1
## Objective function:
## build swap
## 136.7 136.0
##
## Numerical information per cluster:
## size max_diss av_diss diameter separation
## [1,] 10 223.8 150.3 284.1 113.8
## [2,] 6 179.4 136.8 226.2 150.4
## [3,] 12 173.7 136.0 221.4 113.8
## [4,] 5 206.9 133.5 254.7 166.8
## [5,] 6 151.5 113.2 209.3 150.9
##
## Isolated clusters:
## L-clusters: character(0)
## L*-clusters: character(0)
##
## Silhouette plot information:
## cluster neighbor sil_width
## Sample_23 1 3 0.27682
## Sample_21 1 4 0.26316
## Sample_6 1 5 0.19196
## Sample_22 1 3 0.18419
## Sample_17 1 5 0.15715
## Sample_9 1 5 0.09849
## Sample_29 1 4 0.05244
## Sample_20 1 5 0.04262
## Sample_10 1 5 -0.06535
## Sample_30 1 3 -0.11538
## Sample_34 2 3 0.25189
## Sample_16 2 3 0.23344
## Sample_7 2 3 0.19390
## Sample_8 2 3 0.19259
## Sample_33 2 3 0.16094
## Sample_28 2 3 -0.03305
## Sample_3 3 5 0.29346
## Sample_31 3 5 0.26948
## Sample_24 3 5 0.24758
## Sample_35 3 5 0.22591
## Sample_14 3 5 0.22389
## Sample_25 3 1 0.20491
## Sample_5 3 2 0.15337
## Sample_27 3 2 0.08856
## Sample_1 3 5 0.07989
## Sample_26 3 2 0.05258
## Sample_4 3 5 0.04629
## Sample_12 3 5 0.02340
## Sample_36 4 1 0.18923
## Sample_32 4 5 0.10747
## Sample_39 4 5 0.10633
## Sample_38 4 5 0.10633
## Sample_37 4 5 -0.02457
## Sample_13 5 3 0.31248
## Sample_2 5 4 0.24355
## Sample_15 5 3 0.21337
## Sample_11 5 4 0.18197
## Sample_18 5 1 0.17144
## Sample_19 5 3 0.10056
## Average silhouette width per cluster:
## [1] 0.10861 0.16662 0.15911 0.09696 0.20390
## Average silhouette width of total data set:
## [1] 0.1462
##
## Available components:
## [1] "medoids" "id.med" "clustering" "objective" "isolation"
## [6] "clusinfo" "silinfo" "diss" "call"
Silhouette plot.
op <- par(mar = c(5, 1, 4, 4))
plot(pr.pam, main = "Silhouette Plot for 5 Clusters")
par(op)
Start with the top 972 genes that showed differential expression across the different developmental stage (BH adjusted p value < 10-5).
devDes <- model.matrix(~devStage, prDes)
fit <- lmFit(prDat, devDes)
ebFit <- eBayes(fit)
topDat <- topTable(ebFit, coef = grep("devStage", colnames(coef(ebFit))), p.value = 1e-05,
n = 972)
ttopDat <- sprDat[rownames(topDat), ]
head(ttopDat)
## Sample_20 Sample_21 Sample_22 Sample_23 Sample_16 Sample_17
## 1440645_at -0.8052 -0.8282 -0.7585 -0.8742 -0.3668 -0.6953
## 1421084_at -1.2109 -1.5209 -0.9759 -1.5029 -0.9287 -1.5552
## 1451590_at -1.1387 -1.1429 -1.1309 -1.0890 -1.2790 -1.1413
## 1428680_at -0.7774 -0.9124 -0.8370 -0.7632 -0.3976 -0.9552
## 1450215_at -1.5844 -1.5934 -0.9616 -1.3886 -1.6179 -1.5934
## 1416041_at -0.4638 -0.8668 -0.5684 -0.6407 -0.5958 -0.3712
## Sample_6 Sample_24 Sample_25 Sample_26 Sample_27 Sample_14
## 1440645_at -0.8462 -0.5961 -0.5551 -0.5163 -0.4516 -0.6248
## 1421084_at -1.4433 -0.8845 -1.0063 -0.3879 -0.9321 -1.3275
## 1451590_at -1.3654 -1.1178 -0.9241 -0.7011 -0.9963 -0.9047
## 1428680_at -0.8294 -0.8705 -0.6106 -0.4110 -0.5821 -0.5578
## 1450215_at -1.6513 -0.6457 -0.8491 -0.7587 -0.8919 -0.5134
## 1416041_at -0.6094 -0.6303 -0.3647 -0.6653 -0.5333 -0.7272
## Sample_3 Sample_5 Sample_8 Sample_28 Sample_29 Sample_30
## 1440645_at -0.5738 -0.5975 -0.2504 -0.62198 -0.7736 -0.77865
## 1421084_at -1.2186 -1.2949 -1.0938 -0.02257 0.4234 0.31620
## 1451590_at -1.0838 -0.9780 -0.8519 -0.53103 -0.0741 0.04158
## 1428680_at -0.5997 -0.5662 -0.5368 -0.32553 -0.5703 -0.68438
## 1450215_at -0.5713 -0.6071 -1.2521 -0.41268 0.2215 0.03786
## 1416041_at -0.8526 -0.6965 -0.7097 -0.50924 -0.5224 -0.31923
## Sample_31 Sample_1 Sample_10 Sample_4 Sample_7 Sample_32
## 1440645_at -0.53070 -0.4574 -0.6263 -0.4624 -0.2633 0.07733
## 1421084_at 0.27761 0.3462 0.3248 0.3977 -0.5042 0.98089
## 1451590_at -0.23949 0.1698 0.2928 0.2358 -0.4510 1.07636
## 1428680_at -0.60221 -0.6500 -0.6173 -0.7590 -0.6215 0.75773
## 1450215_at -0.08596 0.3500 0.7765 0.5505 -0.3712 1.22374
## 1416041_at -0.13798 -0.7157 -0.6554 -0.7825 -0.6253 -0.10019
## Sample_33 Sample_34 Sample_35 Sample_13 Sample_15 Sample_18
## 1440645_at -0.2418 -0.24969 -0.2073 0.01983 -0.07648 -0.1965
## 1421084_at 0.8480 0.89512 1.0495 0.81793 0.94230 0.8908
## 1451590_at 0.1159 0.06932 1.0031 0.74661 0.73091 0.8670
## 1428680_at -0.3255 -0.32385 0.2379 0.10878 -0.09412 0.3603
## 1450215_at 0.7341 0.48462 0.8895 0.87536 0.80004 0.8848
## 1416041_at -0.6554 -0.63792 -0.4172 0.21302 0.16155 0.2738
## Sample_19 Sample_36 Sample_37 Sample_38 Sample_39 Sample_11
## 1440645_at -0.24250 2.092 1.996 2.030 2.003 1.9920
## 1421084_at 0.85224 1.041 1.242 1.024 1.217 0.9552
## 1451590_at 0.66287 1.427 1.547 1.422 1.474 1.3747
## 1428680_at -0.03291 2.108 1.548 1.638 1.359 2.1831
## 1450215_at 0.66822 1.313 1.294 1.271 1.186 1.0119
## 1416041_at 0.04492 2.040 2.149 2.188 2.286 1.6838
## Sample_12 Sample_2 Sample_9
## 1440645_at 1.452 2.099 1.3070
## 1421084_at 1.054 1.105 0.8094
## 1451590_at 1.500 1.385 0.9978
## 1428680_at 2.191 1.915 1.4075
## 1450215_at 1.120 0.913 0.7435
## 1416041_at 1.049 1.788 1.4977
Hierarchical:
geneC.dis <- dist(ttopDat, method = "euclidean")
geneC.hc.a <- hclust(geneC.dis, method = "average")
plot(geneC.hc.a, labels = FALSE, main = "Hierarchical with Average Linkage",
xlab = "")
Partitioning:
set.seed(1234)
k <- 5
kmeans.genes <- kmeans(ttopDat, centers = k)
Choose desired cluster.
clusterNum <- 1
Set up axes. Plot the expression of all the genes in the selected cluster in grey. Add in the cluster center. Colour points to show the developmental stage.
plot(kmeans.genes$centers[clusterNum, ], ylim = c(-4, 4), type = "n", xlab = "Samples",
ylab = "Relative Expression")
matlines(y = t(ttopDat[kmeans.genes$cluster == clusterNum, ]), col = "grey")
points(kmeans.genes$centers[clusterNum, ], type = "l")
points(kmeans.genes$centers[clusterNum, ], col = prDes$devStage, pch = 20)
Heatmaps (hierarchical):
devStageCols <- brewer.pal(n = 11, "RdGy")[c(2, 4, 7, 9, 11)]
heatmap(as.matrix(ttopDat), col = GreyFun(256), hclustfun = function(x) hclust(x,
method = "average"), scale = "none", labCol = prDes$grp, labRow = NA, margins = c(8,
1), ColSideColor = devStageCols[unclass(prDes$devStage)])
legend("topleft", levels(prDes$devStage), col = devStageCols, lty = 1, lwd = 5,
cex = 0.5)
Define new attributes for a gene and estimate the parameters.
annoTopDat <- stack(as.data.frame(ttopDat))
annoTopDat$probeset <- rownames(ttopDat)
annoTopDat <- merge(annoTopDat, prDes, by.x = "ind", by.y = "sidChar")
devStageAvg <- ddply(annoTopDat, ~probeset, function(x) {
avgbyDevStage <- aggregate(values ~ devStage, x, mean)$values
names(avgbyDevStage) <- levels(x$devStage)
avgbyDevStage
})
rownames(devStageAvg) <- devStageAvg$probeset
devStageAvg$probeset <- NULL
str(devStageAvg)
## 'data.frame': 972 obs. of 5 variables:
## $ E16 : num -0.628 1.235 -0.419 1.401 0.855 ...
## $ P2 : num -1.041 0.7 -0.918 0.737 0.74 ...
## $ P6 : num -0.214 -0.26 -0.744 -0.66 0.34 ...
## $ P10 : num 0.722 -0.683 0.553 -0.779 -0.363 ...
## $ 4_weeks: num 1.083 -0.838 1.475 -0.523 -1.464 ...
Make a heatmap.
heatmap(as.matrix(devStageAvg), Colv = NA, col = GreyFun(256), hclustfun = function(x) hclust(x,
method = "average"), labCol = colnames(devStageAvg), labRow = NA, margin = c(8,
1))
Look at the relative expression in all clusters with respect to developmental stage as determined by kmeans.
k <- 4
geneDS.km <- kmeans(devStageAvg, centers = k, nstart = 50)
clust.centers <- geneDS.km$centers
Plot all cluster centers separately.
op <- par(mfrow = c(2, 2))
for (clusterNum in 1:4) {
plot(clust.centers[clusterNum, ], ylim = c(-4, 4), type = "n", xlab = "Developmental Stage",
ylab = "Relative Expression", axes = F, main = paste("Cluster", clusterNum,
sep = ""))
axis(2)
axis(1, 1:5, c(colnames(clust.centers)[1:4], "4W"), cex.axis = 0.9)
matlines(y = t(devStageAvg[geneDS.km$cluster == clusterNum, ]), col = "grey")
points(clust.centers[clusterNum, ], type = "l")
points(clust.centers[clusterNum, ], pch = 20)
}
par(op)
Plot to compare all clusters' centers.
plot(clust.centers[clusterNum, ], ylim = c(-4, 4), type = "n", xlab = "Developmental Stage",
ylab = "Relative Expression", axes = F, main = "Clusters' Centers")
axis(2)
axis(1, 1:5, c(colnames(clust.centers)[1:4], "4W"), cex.axis = 0.9)
for (clusterNum in 1:4) {
points(clust.centers[clusterNum, ], type = "l", col = clusterNum, lwd = 2)
points(clust.centers[clusterNum, ], col = clusterNum, pch = 20)
}
Plotting 3-dimensional clusters as determined by k-means.
cloud(devStageAvg[, "E16"] ~ devStageAvg[, "P6"] * devStageAvg[, "4_weeks"],
col = geneDS.km$clust, xlab = "E16", ylab = "P6", zlab = "4_weeks")
Qualifying cluster membership.
pvc <- pvclust(ttopDat, nboot = 100)
## Bootstrap (r = 0.5)... Done.
## Bootstrap (r = 0.6)... Done.
## Bootstrap (r = 0.7)... Done.
## Bootstrap (r = 0.8)... Done.
## Bootstrap (r = 0.9)... Done.
## Bootstrap (r = 1.0)... Done.
## Bootstrap (r = 1.1)... Done.
## Bootstrap (r = 1.2)... Done.
## Bootstrap (r = 1.3)... Done.
## Bootstrap (r = 1.4)... Done.
plot(pvc, labels = prDes$grp, cex = 0.6)
pvrect(pvc, alpha = 0.95)
# scree plot
pcs <- prcomp(sprDat, center = F, scale = F)
plot(pcs)
# scatterplot relating PCs to covariates
prinComp <- cbind(prDes, pcs$rotation[prDes$sidNum, 1:10])
plot(prinComp[, c("sidNum", "devStage", "gType", "PC1", "PC2", "PC3")], pch = 19,
cex = 0.8)
# plot data on 2 PCs, coloured by devStage
plot(prinComp[, c("PC1", "PC2")], bg = prDes$devStage, pch = 21, cex = 1.5)
legend(list(x = 0.2, y = 0.3), as.character(levels(prDes$devStage)), pch = 21,
pt.bg = c(1, 2, 3, 4, 5))