After loading the required libraries and the data, we begin our analysis.
Hierarchical clustering
# compute pairwise distances
pr.dis <- dist(t(sprDat), method = "euclidean")
# create a new factor representing the interaction of gType and devStage
prDes$grp <- with(prDes, interaction(gType, devStage))
summary(prDes$grp)
## NrlKO.E16 wt.E16 NrlKO.P2 wt.P2 NrlKO.P6
## 3 4 4 4 4
## wt.P6 NrlKO.P10 wt.P10 NrlKO.4_weeks wt.4_weeks
## 4 4 4 4 4
# compute hierarchical clustering using different linkage types
pr.hc.s <- hclust(pr.dis, method = "single")
pr.hc.c <- hclust(pr.dis, method = "complete")
pr.hc.a <- hclust(pr.dis, method = "average")
pr.hc.w <- hclust(pr.dis, method = "ward")
Plot:
op <- par(mar = c(0, 4, 4, 2), mfrow = c(2, 2))
plot(pr.hc.s, labels = FALSE, main = "Single", xlab = "")
plot(pr.hc.c, labels = FALSE, main = "Complete", xlab = "")
plot(pr.hc.a, labels = FALSE, main = "Average", xlab = "")
plot(pr.hc.w, labels = FALSE, main = "Ward", xlab = "")
par(op)
For the Ward procedure we identify 10 cluster using the dendogram:
op <- par(mar = c(1, 4, 4, 1))
plot(pr.hc.w, labels = prDes$grp, cex = 0.6, main = "Ward showing 10 clusters")
rect.hclust(pr.hc.w, k = 10)
par(op)
K-means clustering We set the number on cluster to be equal to 5:
set.seed(31)
k <- 5
pr.km <- kmeans(t(sprDat), centers = k, nstart = 50)
# within sum of squares of each cluster
pr.km$withinss
## [1] 120153 78227 110209 100197 133036
# composition of each cluster
pr.kmTable <- data.frame(devStage = prDes$devStage, cluster = pr.km$cluster)
prTable <- kable(with(pr.kmTable, table(devStage, cluster)))
## |id | 1| 2| 3| 4| 5|
## |:--------|--:|--:|--:|--:|--:|
## |E16 | 0| 0| 6| 0| 1|
## |P2 | 4| 0| 0| 0| 4|
## |P6 | 5| 1| 0| 0| 2|
## |P10 | 1| 2| 0| 3| 2|
## |4_weeks | 0| 2| 1| 5| 0|
Repeat the analysis using a different seed
set.seed(540)
k <- 5
pr.km <- kmeans(t(sprDat), centers = k, nstart = 50)
# within sum of squares of each cluster
pr.km$withinss
## [1] 133036 110209 78227 120153 100197
# composition of each cluster
pr.kmTable <- data.frame(devStage = prDes$devStage, cluster = pr.km$cluster)
prTable <- kable(with(pr.kmTable, table(devStage, cluster)))
## |id | 1| 2| 3| 4| 5|
## |:--------|--:|--:|--:|--:|--:|
## |E16 | 1| 6| 0| 0| 0|
## |P2 | 4| 0| 0| 4| 0|
## |P6 | 2| 0| 1| 5| 0|
## |P10 | 2| 0| 2| 1| 3|
## |4_weeks | 0| 1| 2| 0| 5|
PAM algorithm
set.seed(31)
pr.pam <- pam(pr.dis, k = k)
pr.pamTable <- data.frame(devStage = prDes$devStage, cluster = pr.pam$clustering)
pamTable <- xtable(with(pr.pamTable, table(devStage, cluster)), caption = "Number of samples from each develomental stage within each PAM cluster")
The five chosen medoids are:
summary(pr.pam)$medoids
## [1] "Sample_22" "Sample_8" "Sample_3" "Sample_39" "Sample_13"
Silhouette plot:
op <- par(mar = c(5, 1, 4, 4))
plot(pr.pam, main = "Silhouette Plot for 5 clusters")
par(op)
Draw a plot with number of clusters in the x-axis and the average silhouette widths in the y-axis.
av.sil.Width <- function(k) {
pr.km <- kmeans(t(sprDat), centers = k, nstart = 50)
pr.pam <- pam(pr.dis, k = k)
return(pr.pam$silinfo$avg.width)
}
k <- 2:15
avg.width <- apply(as.matrix(k), 1, FUN = av.sil.Width)
plot(k, avg.width, xlab = "number of clusters", ylab = "average silhouette",
type = "b", col = "red", main = "Average silhouette vs. number of clusters")
For a common choice of k, compare the clustering across different methods
k <- 6
# compute pairwise distances
pr.dis <- dist(t(sprDat), method = "euclidean")
# compute hierarchical clustering using different linkage types
pr.hc.s <- hclust(pr.dis, method = "single")
pr.hc.c <- hclust(pr.dis, method = "complete")
pr.hc.a <- hclust(pr.dis, method = "average")
pr.hc.w <- hclust(pr.dis, method = "ward")
# which sample is in which cluster using the 4 methods
hclustdat <- data.frame(single = cutree(pr.hc.s, k), compl = cutree(pr.hc.c,
k), average = cutree(pr.hc.a, k), ward = cutree(pr.hc.w, k))
head(hclustdat)
## single compl average ward
## Sample_20 1 1 1 1
## Sample_21 1 2 2 1
## Sample_22 1 1 1 1
## Sample_23 1 1 1 1
## Sample_16 2 3 3 2
## Sample_17 3 2 1 1
# k means
set.seed(15)
pr.km <- kmeans(t(sprDat), centers = k, nstart = 50)
# composition of each cluster
pr.kmTable <- data.frame(devStage = prDes$devStage, cluster = pr.km$cluster)
prTable <- xtable(with(pr.kmTable, table(devStage, cluster)), caption = "Number of samples from each develomental stage within each k-means cluster")
# PAM
pr.pam <- pam(pr.dis, k = k)
pr.pamTable <- data.frame(devStage = prDes$devStage, cluster = pr.pam$clustering)
pamTable <- xtable(with(pr.pamTable, table(devStage, cluster)), caption = "Number of samples from each develomental stage within each PAM cluster")
Gene clustering
DesMat <- model.matrix(~devStage, prDes)
fit <- lmFit(prDat, DesMat)
ebFit <- eBayes(fit)
top <- topTable(ebFit, n = Inf, coef = grep("devStage", colnames(ebFit)), p.value = 1e-05)
topGenes <- rownames(top)
topDat <- sprDat[topGenes, ]
geneC.dis <- dist(topDat, method = "euclidean")
geneC.hc.a <- hclust(geneC.dis, method = "average")
plot(geneC.hc.a, labels = FALSE, main = "Hierarchical with Average Linkage",
xlab = "")
Partitioning
set.seed(1234)
k <- 5
kmeans.genes <- kmeans(topDat, centers = k)
clusterNum <- 1
plot(kmeans.genes$centers[clusterNum, ], ylim = c(-4, 4), type = "n", xlab = "Samples",
ylab = "Relative expression")
matlines(y = t(topDat[kmeans.genes$cluster == clusterNum, ]), col = "grey")
points(kmeans.genes$centers[clusterNum, ], type = "l")
points(kmeans.genes$centers[clusterNum, ], col = prDes$devStage, pch = 20)
# heatmap
devStageCols <- brewer.pal(11, "RdGy")[c(2, 4, 7, 9, 11)]
heatmap(as.matrix(topDat), hclustfun = function(x) hclust(x, method = "average"),
labCol = prDes$grp, labRow = NA, margin = c(8, 1), scale = "none", ColSideColor = devStageCols[unclass(prDes$devStage)])
legend("topleft", levels(prDes$devStage), col = devStageCols, lty = 1, lwd = 5,
cex = 0.5)
Redefining the attributes
annoTopDat <- stack(as.data.frame(topDat)) # stack probe data tall and skinny
annoTopDat$probeset <- rownames(topDat) # add probeset ID as variable
## get info on gType and devStage, then average over reps within devStage
annoTopDat <- merge(annoTopDat, prDes, by.x = "ind", by.y = "sidChar")
devStageAvg <- ddply(annoTopDat, ~probeset, function(x) {
avgByDevStage <- aggregate(values ~ devStage, x, mean)$values
names(avgByDevStage) <- levels(x$devStage)
avgByDevStage
})
## put probset info back into rownames
rownames(devStageAvg) <- devStageAvg$probeset
devStageAvg$probeset <- NULL
str(devStageAvg)
## 'data.frame': 972 obs. of 5 variables:
## $ E16 : num -0.628 1.235 -0.419 1.401 0.855 ...
## $ P2 : num -1.041 0.7 -0.918 0.737 0.74 ...
## $ P6 : num -0.214 -0.26 -0.744 -0.66 0.34 ...
## $ P10 : num 0.722 -0.683 0.553 -0.779 -0.363 ...
## $ 4_weeks: num 1.083 -0.838 1.475 -0.523 -1.464 ...
heatmap(as.matrix(devStageAvg), Colv = NA, hclustfun = function(x) hclust(x,
method = "average"), labCol = colnames(devStageAvg), labRow = NA, margin = c(8,
1))
k <- 4
geneDS.km <- kmeans(devStageAvg, centers = k, nstart = 50)
clust.centers <- geneDS.km$centers
# Look at all clusters
op <- par(mfrow = c(2, 2))
for (clusterNum in 1:4) {
plot(clust.centers[clusterNum, ], ylim = c(-4, 4), type = "n", xlab = "Develomental Stage",
ylab = "Relative expression", axes = F, main = paste("Cluster", clusterNum,
sep = " "))
axis(2)
axis(1, 1:5, c(colnames(clust.centers)[1:4], "4W"), cex.axis = 0.9)
matlines(y = t(devStageAvg[geneDS.km$cluster == clusterNum, ]), col = "grey")
points(clust.centers[clusterNum, ], type = "l")
points(clust.centers[clusterNum, ], pch = 20)
}
par(op)
Statistical measures to evaluate clusters
pvc <- pvclust(topDat, nboot = 100)
## Bootstrap (r = 0.5)... Done.
## Bootstrap (r = 0.6)... Done.
## Bootstrap (r = 0.7)... Done.
## Bootstrap (r = 0.8)... Done.
## Bootstrap (r = 0.9)... Done.
## Bootstrap (r = 1.0)... Done.
## Bootstrap (r = 1.1)... Done.
## Bootstrap (r = 1.2)... Done.
## Bootstrap (r = 1.3)... Done.
## Bootstrap (r = 1.4)... Done.
plot(pvc, labels = prDes$grp, cex = 0.6)
pvrect(pvc, alpha = 0.95)
PCA (principal components analysis)
pcs <- prcomp(sprDat, center = F, scale = F)
# scree plot
plot(pcs)
# append the rotations for the first 10 PCs to the phenodata
prinComp <- cbind(prDes, pcs$rotation[prDes$sidNum, 1:10])
# scatter plot showing us how the first few PCs relate to covariates
plot(prinComp[, c("sidNum", "devStage", "gType", "PC1", "PC2", "PC3")], pch = 19,
cex = 0.8)
# plot data on first two PCs, colored by development stage
plot(prinComp[, c("PC1", "PC2")], bg = prDes$devStage, pch = 21, cex = 1.5)
legend(list(x = 0.2, y = 0.3), as.character(levels(prDes$devStage)), pch = 21,
pt.bg = c(1, 2, 3, 4, 5))
Redo the graphs using lattice
pairs(prinComp[, c("sidNum", "devStage", "gType", "PC1", "PC2", "PC3")], panel = function(...) smoothScatter(...,
add = TRUE))
## KernSmooth 2.23 loaded
## Copyright M. P. Wand 1997-2009
xyplot(prinComp[, c("PC2")] ~ prinComp[, c("PC1")], auto.key = T, groups = prDes$devStage)