Yiming Zhang
##Load data and packages
library(RColorBrewer)
library(cluster)
library(pvclust)
library(xtable)
library(limma)
library(plyr)
Load the data and look at it
prDat <- read.table("GSE4051_data.tsv", header = TRUE, row.names = 1) # the whole enchilada
str(prDat, max.level = 0)
## 'data.frame': 29949 obs. of 39 variables:
prDes <- readRDS("GSE4051_design.rds")
str(prDes)
## 'data.frame': 39 obs. of 4 variables:
## $ sidChar : chr "Sample_20" "Sample_21" "Sample_22" "Sample_23" ...
## $ sidNum : num 20 21 22 23 16 17 6 24 25 26 ...
## $ devStage: Factor w/ 5 levels "E16","P2","P6",..: 1 1 1 1 1 1 1 2 2 2 ...
## $ gType : Factor w/ 2 levels "wt","NrlKO": 1 1 1 1 2 2 2 1 1 1 ...
sprDat <- t(scale(t(prDat)))
str(sprDat, max.level = 0, give.attr = FALSE)
## num [1:29949, 1:39] 0.0838 0.1758 0.7797 -0.3196 0.8358 ...
round(data.frame(avgBefore = rowMeans(head(prDat)), avgAfter = rowMeans(head(sprDat)),
varBefore = apply(head(prDat), 1, var), varAfter = apply(head(sprDat), 1,
var)), 2)
## avgBefore avgAfter varBefore varAfter
## 1415670_at 7.22 0 0.02 1
## 1415671_at 9.37 0 0.35 1
## 1415672_at 9.70 0 0.15 1
## 1415673_at 8.42 0 0.03 1
## 1415674_a_at 8.47 0 0.02 1
## 1415675_at 9.67 0 0.03 1
photoRec data# compute pairwise distances
pr.dis <- dist(t(sprDat), method = "euclidean")
# create a new factor representing the interaction of gType and devStage
prDes$grp <- with(prDes, interaction(gType, devStage))
summary(prDes$grp)
## wt.E16 NrlKO.E16 wt.P2 NrlKO.P2 wt.P6
## 4 3 4 4 4
## NrlKO.P6 wt.P10 NrlKO.P10 wt.4_weeks NrlKO.4_weeks
## 4 4 4 4 4
# compute hierarchical clustering using different linkage types
pr.hc.s <- hclust(pr.dis, method = "single")
pr.hc.c <- hclust(pr.dis, method = "complete")
pr.hc.a <- hclust(pr.dis, method = "average")
pr.hc.w <- hclust(pr.dis, method = "ward")
# plot them
op <- par(mar = c(0, 4, 4, 2), mfrow = c(2, 2))
plot(pr.hc.s, labels = FALSE, main = "Single", xlab = "")
plot(pr.hc.c, labels = FALSE, main = "Complete", xlab = "")
plot(pr.hc.a, labels = FALSE, main = "Average", xlab = "")
plot(pr.hc.w, labels = FALSE, main = "Ward", xlab = "")
par(op)
# identify 10 clusters
op <- par(mar = c(1, 4, 4, 1))
plot(pr.hc.w, labels = prDes$grp, cex = 0.6, main = "Ward showing 10 clusters")
rect.hclust(pr.hc.w, k = 10)
par(op)
jGraysFun <- colorRampPalette(brewer.pal(n = 9, "Greys"))
gTypeCols <- brewer.pal(11, "RdGy")[c(4, 7)]
heatmap(as.matrix(sprDat), Rowv = NA, col = jGraysFun(256), hclustfun = function(x) hclust(x,
method = "ward"), scale = "none", labCol = prDes$grp, labRow = NA, margins = c(8,
1), ColSideColor = gTypeCols[unclass(prDes$gType)])
legend("topright", legend = levels(prDes$gType), col = gTypeCols, lty = 1, lwd = 5,
cex = 0.5)
photoRec data# Objects in columns
set.seed(31)
k <- 5
pr.km <- kmeans(t(sprDat), centers = k, nstart = 50)
# We can look at the within sum of squares of each cluster
pr.km$withinss
## [1] 120153 78227 110209 100197 133036
# We can look at the composition of each cluster
pr.kmTable <- data.frame(devStage = prDes$devStage, cluster = pr.km$cluster)
prTable <- xtable(with(pr.kmTable, table(devStage, cluster)), caption = "Number of samples from each develomental stage within each k-means cluster")
align(prTable) <- "lccccc"
print(prTable, type = "html", caption.placement = "top")
| 1 | 2 | 3 | 4 | 5 | |
|---|---|---|---|---|---|
| E16 | 0 | 0 | 6 | 0 | 1 |
| P2 | 4 | 0 | 0 | 0 | 4 |
| P6 | 5 | 1 | 0 | 0 | 2 |
| P10 | 1 | 2 | 0 | 3 | 2 |
| 4_weeks | 0 | 2 | 1 | 5 | 0 |
pr.pam <- pam(pr.dis, k = k)
pr.pamTable <- data.frame(devStage = prDes$devStage, cluster = pr.pam$clustering)
pamTable <- xtable(with(pr.pamTable, table(devStage, cluster)), caption = "Number of samples from each develomental stage within each PAM cluster")
align(pamTable) <- "lccccc"
print(pamTable, type = "html", caption.placement = "top")
| 1 | 2 | 3 | 4 | 5 | |
|---|---|---|---|---|---|
| E16 | 6 | 1 | 0 | 0 | 0 |
| P2 | 0 | 1 | 7 | 0 | 0 |
| P6 | 3 | 2 | 3 | 0 | 0 |
| P10 | 0 | 2 | 1 | 1 | 4 |
| 4_weeks | 1 | 0 | 1 | 4 | 2 |
op <- par(mar = c(5, 1, 4, 4))
plot(pr.pam, main = "Silhouette Plot for 5 clusters")
par(op)
##Gene Clustering
Get TopDat
model.mat <- model.matrix(~devStage, prDes)
pr.fit <- lmFit(prDat, model.mat)
ebayes.fit <- eBayes(pr.fit)
# sorted by adj.P.Val
top.genes <- topTable(ebayes.fit, number = 972, coef = grep("devStage", colnames(coef(ebayes.fit))))
head(top.genes)
## devStageP2 devStageP6 devStageP10 devStage4_weeks AveExpr F
## 1440645_at 0.3040 0.24339 0.8343 3.633 6.569 245.1
## 1421084_at 0.6697 3.49855 5.1652 5.506 10.023 168.8
## 1451590_at 0.4569 2.12916 3.5209 4.920 9.124 157.2
## 1428680_at 0.2265 0.21227 1.0350 3.072 8.066 140.9
## 1450215_at 1.5362 3.43618 4.8949 5.504 9.201 122.4
## 1416041_at -0.1085 0.09961 0.8187 4.425 9.325 111.4
## P.Value adj.P.Val
## 1440645_at 5.085e-26 1.523e-21
## 1421084_at 3.605e-23 5.398e-19
## 1451590_at 1.243e-22 1.240e-18
## 1428680_at 8.274e-22 6.195e-18
## 1450215_at 9.280e-21 5.558e-17
## 1416041_at 4.631e-20 2.311e-16
topDat <- sprDat[row.names(sprDat) %in% row.names(top.genes), ]
nrow(topDat)
## [1] 972
geneC.dis <- dist(topDat, method = "euclidean")
geneC.hc.a <- hclust(geneC.dis, method = "average")
plot(geneC.hc.a, labels = FALSE, main = "Hierarchical with Average Linkage",
xlab = "")
set.seed(1234)
k <- 5
kmeans.genes <- kmeans(topDat, centers = k)
# choose which cluster we want
clusterNum <- 1
# Set up the axes without plotting; ylim set based on trial run.
plot(kmeans.genes$centers[clusterNum, ], ylim = c(-4, 4), type = "n", xlab = "Samples",
ylab = "Relative expression")
# Plot the expression of all the genes in the selected cluster in grey.
matlines(y = t(topDat[kmeans.genes$cluster == clusterNum, ]), col = "grey")
# Add the cluster center. This is last so it isn't underneath the members
points(kmeans.genes$centers[clusterNum, ], type = "l")
# Optional: colored points to show which development stage the samples are
# from.
points(kmeans.genes$centers[clusterNum, ], col = prDes$devStage, pch = 20)
Use Heat map
devStageCols <- brewer.pal(11, "RdGy")[c(2, 4, 7, 9, 11)]
heatmap(as.matrix(topDat), col = jGraysFun(256), hclustfun = function(x) hclust(x,
method = "average"), labCol = prDes$grp, labRow = NA, margin = c(8, 1),
scale = "none", ColSideColor = devStageCols[unclass(prDes$devStage)])
legend("topleft", levels(prDes$devStage), col = devStageCols, lty = 1, lwd = 5,
cex = 0.5)
Redefine the attributes by linear model: \[X_{gi,devStage}=\mu_{g,devStage}+\epslion_{gi,devStage}\] Define \[Att_{g}=(\mu_{g,E16},\mu_{g,P2},\mu_{g,P6},\mu_{g,P10},\mu_{g,4w})\]
annoTopDat <- stack(as.data.frame(topDat)) # stack probe data tall and skinny
annoTopDat$probeset <- rownames(topDat) # add probeset ID as variable
## get info on gType and devStage, then average over reps within devStage
annoTopDat <- merge(annoTopDat, prDes, by.x = "ind", by.y = "sidChar")
devStageAvg <- ddply(annoTopDat, ~probeset, function(x) {
avgByDevStage <- aggregate(values ~ devStage, x, mean)$values
names(avgByDevStage) <- levels(x$devStage)
avgByDevStage
})
## put probset info back into rownames
rownames(devStageAvg) <- devStageAvg$probeset
devStageAvg$probeset <- NULL
str(devStageAvg)
## 'data.frame': 972 obs. of 5 variables:
## $ E16 : num -0.628 1.235 -0.419 1.401 0.855 ...
## $ P2 : num -1.041 0.7 -0.918 0.737 0.74 ...
## $ P6 : num -0.214 -0.26 -0.744 -0.66 0.34 ...
## $ P10 : num 0.722 -0.683 0.553 -0.779 -0.363 ...
## $ 4_weeks: num 1.083 -0.838 1.475 -0.523 -1.464 ...
heatmap(as.matrix(devStageAvg), Colv = NA, col = jGraysFun(256), hclustfun = function(x) hclust(x,
method = "average"), labCol = colnames(devStageAvg), labRow = NA, margin = c(8,
1))
k <- 4
geneDS.km <- kmeans(devStageAvg, centers = k, nstart = 50)
clust.centers <- geneDS.km$centers
# Look at all clusters
op <- par(mfrow = c(2, 2))
for (clusterNum in 1:4) {
# Set up the axes without plotting; ylim set based on trial run.
plot(clust.centers[clusterNum, ], ylim = c(-4, 4), type = "n", xlab = "Develomental Stage",
ylab = "Relative expression", axes = F, main = paste("Cluster", clusterNum,
sep = " "))
axis(2)
axis(1, 1:5, c(colnames(clust.centers)[1:4], "4W"), cex.axis = 0.9)
# Plot the expression of all the genes in the selected cluster in grey.
matlines(y = t(devStageAvg[geneDS.km$cluster == clusterNum, ]), col = "grey")
# Add the cluster center. This is last so it isn't underneath the members
points(clust.centers[clusterNum, ], type = "l")
# Optional: points to show development stages.
points(clust.centers[clusterNum, ], pch = 20)
}
par(op)
plot(clust.centers[clusterNum, ], ylim = c(-4, 4), type = "n", xlab = "Develomental Stage",
ylab = "Average expression", axes = FALSE, main = "Clusters centers")
axis(2)
axis(1, 1:5, c(colnames(clust.centers)[1:4], "4W"), cex.axis = 0.9)
for (clusterNum in 1:4) {
points(clust.centers[clusterNum, ], type = "l", col = clusterNum, lwd = 2)
points(clust.centers[clusterNum, ], col = clusterNum, pch = 20)
}
cloud(devStageAvg[, "E16"] ~ devStageAvg[, "P6"] * devStageAvg[, "4_weeks"],
col = geneDS.km$clust, xlab = "E16", ylab = "P6", zlab = "4_weeks")
## Error: could not find function "cloud"
pvc <- pvclust(topDat, nboot = 100)
## Bootstrap (r = 0.5)... Done.
## Bootstrap (r = 0.6)... Done.
## Bootstrap (r = 0.7)... Done.
## Bootstrap (r = 0.8)... Done.
## Bootstrap (r = 0.9)... Done.
## Bootstrap (r = 1.0)... Done.
## Bootstrap (r = 1.1)... Done.
## Bootstrap (r = 1.2)... Done.
## Bootstrap (r = 1.3)... Done.
## Bootstrap (r = 1.4)... Done.
plot(pvc, labels = prDes$grp, cex = 0.6)
pvrect(pvc, alpha = 0.95)
pcs <- prcomp(sprDat, center = F, scale = F)
# scree plot
plot(pcs)
prinComp <- cbind(prDes, pcs$rotation[prDes$sidNum, 1:10])
# scatter plot showing us how the first few PCs relate to covariates
plot(prinComp[, c("sidNum", "devStage", "gType", "PC1", "PC2", "PC3")], pch = 19,
cex = 0.8)
# plot data on first two PCs, colored by development stage
plot(prinComp[, c("PC1", "PC2")], bg = prDes$devStage, pch = 21, cex = 1.5)
legend(list(x = 0.2, y = 0.3), as.character(levels(prDes$devStage)), pch = 21,
pt.bg = c(1, 2, 3, 4, 5))