yifanzhang — Apr 14, 2014, 7:02 AM
library(RColorBrewer)
library(cluster)
library(pvclust)
library(xtable)
library(limma)
library(plyr)
prDat <- read.table("GSE4051_data.tsv",
header = TRUE, row.names = 1) # the whole enchilada
prDes <- readRDS("GSE4051_design.rds")
sprDat <- t(scale(t(prDat)))
round(data.frame(avgBefore = rowMeans(head(prDat)),
avgAfter = rowMeans(head(sprDat)),
varBefore = apply(head(prDat), 1, var),
varAfter = apply(head(sprDat), 1, var)), 2)
avgBefore avgAfter varBefore varAfter
1415670_at 7.22 0 0.02 1
1415671_at 9.37 0 0.35 1
1415672_at 9.70 0 0.15 1
1415673_at 8.42 0 0.03 1
1415674_a_at 8.47 0 0.02 1
1415675_at 9.67 0 0.03 1
pr.dis <- dist(t(sprDat), method = 'euclidean')
prDes$grp <- with(prDes, interaction(gType, devStage))
pr.hc.s <- hclust(pr.dis, method = 'single')
pr.hc.c <- hclust(pr.dis, method = 'complete')
pr.hc.a <- hclust(pr.dis, method = 'average')
pr.hc.w <- hclust(pr.dis, method = 'ward')
op <- par(mar = c(0,4,4,2), mfrow = c(2,2))
plot(pr.hc.s, labels = FALSE, main = "Single", xlab = "")
plot(pr.hc.c, labels = FALSE, main = "Complete", xlab = "")
plot(pr.hc.a, labels = FALSE, main = "Average", xlab = "")
plot(pr.hc.w, labels = FALSE, main = "Ward", xlab = "")
par(op)
op <- par(mar = c(1,4,4,1))
plot(pr.hc.w, labels = prDes$grp, cex = 0.6,
main = "Ward showing 10 clusters")
rect.hclust(pr.hc.w, k = 10)
par(op)
jGraysFun <- colorRampPalette(brewer.pal(n = 9, "Greys"))
gTypeCols <- brewer.pal(11, "RdGy")[c(4,7)]
heatmap(as.matrix(sprDat), Rowv = NA, col = jGraysFun(256),
hclustfun = function(x) hclust(x, method = 'ward'),
scale = "none", labCol = prDes$grp, labRow = NA, margins = c(8,1),
ColSideColor = gTypeCols[unclass(prDes$gType)])
legend("topright", legend = levels(prDes$gType),
col = gTypeCols, lty = 1, lwd = 5, cex = 0.5)
devStageCols <- brewer.pal(11, "RdGy")[c(2,4,7,9,11)]
heatmap(as.matrix(sprDat), Rowv = NA, col = jGraysFun(256),
hclustfun = function(x) hclust(x, method = 'ward'),
scale = "none", labCol = prDes$grp, labRow = NA, margin = c(8,1),
ColSideColor = devStageCols[unclass(prDes$devStage)])
k <- 5
pr.pam <- pam(pr.dis, k = k)
op <- par(mar = c(5,1,4,4))
plot(pr.pam, main = "Silhouette Plot for 5 clusters")
par(op)
avgwidth<-c()
number<-c()
for (i in 2:10){
pr.pam<-pam(pr.dis, k=i)
avgwidth[i]<-pr.pam$silinfo$avg.width
number[i]<-i
}
plot(avgwidth~number)
#2 is the best choice for the number of clusters