Here I create a markdown file for this task to have a better illustrate and make it easy to understand.
First read the design matrix and normalized data and have a simple check
dat <- read.table("data/GSE1710-normalized-data.tsv")
des <- readRDS("data/GSE1710-outlier-removed-design.rds")
# separate age into three groups
des$age <- as.numeric(levels(des$age))[as.integer(des$age)]
young <- which(des$age <= 30)
middleage <- which(des$age > 30 & des$age < 50)
old <- which(des$age >= 50)
des$age[young] <- "young"
des$age[middleage] <- "middleage"
des$age[old] <- "old"
des$age <- as.factor(des$age)
des$age <- recode(des$age, "", levels = c("young", "middleage", "old"))
## Error: could not find function "recode"
str(dat, max.level = 0)
## 'data.frame': 28993 obs. of 30 variables:
str(des)
## 'data.frame': 30 obs. of 4 variables:
## $ samples: Factor w/ 31 levels "GSM29595","GSM29596",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ group : Factor w/ 3 levels "NC","CD","UC": 1 1 1 1 1 1 1 1 1 1 ...
## $ sex : Factor w/ 2 levels "female","male": 2 1 2 2 1 1 1 1 1 2 ...
## $ age : Factor w/ 3 levels "middleage","old",..: 1 1 3 2 1 2 2 3 2 2 ...
Also load the essential packages,
library(RColorBrewer)
library(cluster)
library(pvclust)
library(xtable)
library(limma)
library(plyr)
library(ggplot2)
library(car)
library(lattice)
Simple way to do PCA
pcs <- prcomp(dat, center = F, scale = F)
Check the priciple components
plot(pcs)
prinComp <- cbind(des, pcs$rotation)
Then check how the first few PCs relate to covariates
prinComp <- cbind(des, pcs$rotation)
plot(prinComp[, c("group", "sex", "age", "PC1", "PC2", "PC3")], pch = 19, cex = 0.8)
In this plot, it easy to tell that from first two components, samples in three experimental groups have significant different, but for sex and age, it's not so easy to tell the difference in these groups.
Look at the principle components in experimental group
Group <- des$group
ggplot(prinComp, aes(x = PC1, y = PC2, colour = Group)) + geom_point()
Look at the two principle components in sex,
Sex <- des$sex
ggplot(prinComp, aes(x = PC1, y = PC2, colour = Sex)) + geom_point()
then in age
Age <- des$age
ggplot(prinComp, aes(x = PC1, y = PC2, colour = Age)) + geom_point()
sdat <- t(scale(t(dat)))
str(sdat, max.level = 0, give.attr = FALSE)
## num [1:28993, 1:30] 2.36 2.67 2.6 2.73 3.56 ...
round(data.frame(avgBefore = rowMeans(head(dat)), avgAfter = rowMeans(head(sdat)),
varBefore = apply(head(dat), 1, var), varAfter = apply(head(sdat), 1, var)),
2)
## avgBefore avgAfter varBefore varAfter
## 01A01 0.13 0 0 1
## 01A02 0.07 0 0 1
## 01A03 0.06 0 0 1
## 01A04 0.07 0 0 1
## 01A05 0.05 0 0 1
## 01A06 0.06 0 0 1
Data for each row now has mean 0 and variance 1.
In this part, I usse samples as objects to be clustered using gene attributes.
Compute pairwise distance
pr.dis <- dist(t(sdat), method = "euclidean")
# pr.dis <- dist(t(sdat), method = 'manhattan') manhattan have same results
# with euclidean
Create two new factors representing the interaction of group with sex and also group with age
des$grps <- with(des, interaction(group, sex))
des$grpa <- with(des, interaction(group, age))
summary(des$grps)
## NC.female CD.female UC.female NC.male CD.male UC.male
## 7 4 2 4 5 8
summary(des$grpa)
## NC.middleage CD.middleage UC.middleage NC.old CD.old
## 4 3 5 5 2
## UC.old NC.young CD.young UC.young
## 1 2 4 4
# compute hierarchical clustering using 'ward' pr.hc.w <- hclust(pr.dis,
# method = 'ward') plot(pr.hc.w, labels = FALSE, main = 'Ward', xlab = '')
pr.hc.s <- hclust(pr.dis, method = "single")
pr.hc.c <- hclust(pr.dis, method = "complete")
pr.hc.a <- hclust(pr.dis, method = "average")
pr.hc.w <- hclust(pr.dis, method = "ward")
# plot them
op <- par(mar = c(0, 4, 4, 2), mfrow = c(2, 2))
plot(pr.hc.s, labels = FALSE, main = "Single", xlab = "")
plot(pr.hc.c, labels = FALSE, main = "Complete", xlab = "")
plot(pr.hc.a, labels = FALSE, main = "Average", xlab = "")
plot(pr.hc.w, labels = FALSE, main = "Ward", xlab = "")
From the result, we can tell that “Ward” is better.
par(op)
# identify 6 clusters
op <- par(mar = c(1, 4, 4, 1))
plot(pr.hc.w, labels = des$grps, cex = 0.6, main = "Ward showing 6 clusters")
rect.hclust(pr.hc.w, k = 6)
op <- par(mar = c(1, 4, 4, 1))
plot(pr.hc.w, labels = des$grpa, cex = 0.6, main = "Ward showing 9 clusters")
rect.hclust(pr.hc.w, k = 9)
op <- par(mar = c(1, 4, 4, 1))
plot(pr.hc.w, labels = des$group, cex = 0.6, main = "Ward showing 3 clusters")
rect.hclust(pr.hc.w, k = 3)
From the above clustering results, we can not find any pattern about group with age and also group with age.
# Objects in columns
set.seed(20)
k <- 9
pr.km <- kmeans(t(sdat), centers = k, nstart = 50)
# We can look at the within sum of squares of each cluster
pr.km$withinss
## [1] 0 58022 27930 87010 29277 106532 52936 41177 0
# We can look at the composition of each cluster
pr.kmTable <- data.frame(group = des$group, cluster = pr.km$cluster)
prTable <- xtable(with(pr.kmTable, table(group, cluster)), caption = "Number of samples from each experimental group within each k-means cluster")
align(prTable) <- "lccc"
## Error: "align" must have length equal to 10 ( ncol(x) + 1 )
print(prTable, type = "html", caption.placement = "top")
| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | |
|---|---|---|---|---|---|---|---|---|---|
| NC | 1 | 0 | 2 | 0 | 2 | 0 | 3 | 3 | 0 |
| CD | 0 | 1 | 0 | 3 | 0 | 4 | 1 | 0 | 0 |
| UC | 0 | 3 | 0 | 3 | 0 | 3 | 0 | 0 | 1 |
pr.pam <- pam(pr.dis, k = k)
pr.pamTable <- data.frame(group = des$group, cluster = pr.pam$clustering)
pamTable <- xtable(with(pr.pamTable, table(group, cluster)), caption = "Number of samples from each experimental group within each PAM cluster")
align(pamTable) <- "lccc"
## Error: "align" must have length equal to 10 ( ncol(x) + 1 )
print(pamTable, type = "html", caption.placement = "top")
| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | |
|---|---|---|---|---|---|---|---|---|---|
| NC | 3 | 1 | 1 | 1 | 1 | 1 | 3 | 0 | 0 |
| CD | 1 | 0 | 0 | 4 | 0 | 0 | 2 | 2 | 0 |
| UC | 3 | 0 | 0 | 5 | 0 | 0 | 0 | 1 | 1 |
op <- par(mar = c(5, 1, 4, 4))
plot(pr.pam, main = "Silhouette Plot for 3 clusters")
Load the top hits data to do gene clustering
hits <- read.table("data/groups-hits.tsv")
topDat <- subset(dat, rownames(dat) %in% rownames(hits))
geneC.dis <- dist(topDat, method = "euclidean")
geneC.hc.a <- hclust(geneC.dis, method = "ward")
plot(geneC.hc.a, labels = FALSE, main = "Hierarchical with Ward Linkage", xlab = "")
set.seed(1234)
k <- 5
kmeans.genes <- kmeans(topDat, centers = k)
op <- par(mfrow = c(2, 2))
for (clusterNum in 1:4) {
# Set up the axes without plotting; ylim set based on trial run.
plot(kmeans.genes$centers[clusterNum, ], ylim = c(-0.1, 10), type = "n",
xlab = "Samples", ylab = "Relative expression")
# Plot the expression of all the genes in the selected cluster in grey.
matlines(y = t(topDat[kmeans.genes$cluster == clusterNum, ]), col = "grey")
# Add the cluster center. This is last so it isn't underneath the members
points(kmeans.genes$centers[clusterNum, ], type = "l")
# Optional: colored points to show which group the samples are from.
points(kmeans.genes$centers[clusterNum, ], col = des$group, pch = 20)
}
Define attributes by estimating parameters of a linear model.
For a linear model defined: \[X_{gi,sex}=\mu_{g,sex}+\epslion_{gi,sex} \] Then we can define a new attributes for each gene, \[Att_{g}=(\mu_{g,female},\mu_{g,male})\] and estimate these parameters.
annoTopDat <- stack(as.data.frame(topDat))
annoTopDat$probeset <- rownames(topDat) # add probeset ID as variable
## get info on group and sex, then average over reps within sex
annoTopDat <- merge(annoTopDat, des, by.x = "ind", by.y = "samples")
sexAvg <- ddply(annoTopDat, ~probeset, function(x) {
avgBysex <- aggregate(values ~ sex, x, mean)$values
names(avgBysex) <- levels(x$sex)
avgBysex
})
## put probset info back into rownames
rownames(sexAvg) <- sexAvg$probeset
sexAvg$probeset <- NULL
str(sexAvg)
## 'data.frame': 1283 obs. of 2 variables:
## $ female: num 0.0672 0.0686 0.0554 0.0606 0.0637 ...
## $ male : num 0.0631 0.0644 0.0518 0.0538 0.0606 ...
heatmap(as.matrix(sexAvg), Colv = NA, col = jGraysFun(256), hclustfun = function(x) hclust(x,
method = "ward"), labCol = colnames(sexAvg), labRow = NA, margin = c(8,
1))
## Error: could not find function "jGraysFun"
k <- 4
geneDS.km <- kmeans(sexAvg, centers = k, nstart = 50)
clust.centers <- geneDS.km$centers
# Look at all clusters
op <- par(mfrow = c(2, 2))
for (clusterNum in 1:4) {
# Set up the axes without plotting; ylim set based on trial run.
plot(clust.centers[clusterNum, ], ylim = c(-2, 2), type = "n", xlab = "Sex",
ylab = "Relative expression", axes = F, main = paste("Cluster", clusterNum,
sep = " "))
axis(2)
axis(1, 1:5, c(colnames(clust.centers)[1:4], "4W"), cex.axis = 0.9)
# Plot the expression of all the genes in the selected cluster in grey.
matlines(y = t(sexAvg[geneDS.km$cluster == clusterNum, ]), col = "grey")
# Add the cluster center. This is last so it isn't underneath the members
points(clust.centers[clusterNum, ], type = "l")
# Optional: points to show sex.
points(clust.centers[clusterNum, ], pch = 20)
}
pvc <- pvclust(topDat, nboot = 100)
## Bootstrap (r = 0.5)... Done.
## Bootstrap (r = 0.6)... Done.
## Bootstrap (r = 0.7)... Done.
## Bootstrap (r = 0.8)... Done.
## Bootstrap (r = 0.9)... Done.
## Bootstrap (r = 1.0)... Done.
## Bootstrap (r = 1.1)... Done.
## Bootstrap (r = 1.2)... Done.
## Bootstrap (r = 1.3)... Done.
## Bootstrap (r = 1.4)... Done.
plot(pvc, labels = des$grps, cex = 0.6)
pvrect(pvc, alpha = 0.95)
Define attributes by estimating parameters of a linear model.
For a linear model defined: \[X_{gi,group}=\mu_{g,group}+\epslion_{gi,group} \] Then we can define a new attributes for each gene, \[Att_{g}=(\mu_{g,NC},\mu_{g,CD},\mu_{g,UC})\] and estimate these parameters.
annoTopDat <- stack(as.data.frame(topDat))
annoTopDat$probeset <- rownames(topDat) # add probeset ID as variable
## get info on group and sex, then average over reps within sex
annoTopDat <- merge(annoTopDat, des, by.x = "ind", by.y = "samples")
groupAvg <- ddply(annoTopDat, ~probeset, function(x) {
avgBygroup <- aggregate(values ~ group, x, mean)$values
names(avgBygroup) <- levels(x$group)
avgBygroup
})
## put probset info back into rownames
rownames(groupAvg) <- groupAvg$probeset
groupAvg$probeset <- NULL
str(groupAvg)
## 'data.frame': 1283 obs. of 3 variables:
## $ NC: num 0.0831 0.084 0.0675 0.0693 0.0787 ...
## $ CD: num 0.0493 0.0519 0.0426 0.0468 0.048 ...
## $ UC: num 0.0588 0.0595 0.0474 0.0519 0.056 ...
heatmap(as.matrix(groupAvg), Colv = NA, col = jGraysFun(256), hclustfun = function(x) hclust(x,
method = "ward"), labCol = colnames(groupAvg), labRow = NA, margin = c(8,
1))
## Error: could not find function "jGraysFun"
k <- 4
geneDS.km <- kmeans(groupAvg, centers = k, nstart = 50)
clust.centers <- geneDS.km$centers
# Look at all clusters
op <- par(mfrow = c(2, 2))
for (clusterNum in 1:4) {
# Set up the axes without plotting; ylim set based on trial run.
plot(clust.centers[clusterNum, ], ylim = c(-2, 2), type = "n", xlab = "Experimental Group",
ylab = "Relative expression", axes = F, main = paste("Cluster", clusterNum,
sep = " "))
axis(2)
axis(1, 1:5, c(colnames(clust.centers)[1:4], "4W"), cex.axis = 0.9)
# Plot the expression of all the genes in the selected cluster in grey.
matlines(y = t(groupAvg[geneDS.km$cluster == clusterNum, ]), col = "grey")
# Add the cluster center. This is last so it isn't underneath the members
points(clust.centers[clusterNum, ], type = "l")
# Optional: points to show sex.
points(clust.centers[clusterNum, ], pch = 20)
}
plot(clust.centers[clusterNum, ], ylim = c(-4, 4), type = "n", xlab = "Experimental Group",
ylab = "Average expression", axes = FALSE, main = "Clusters centers")
axis(2)
axis(1, 1:5, c(colnames(clust.centers)[1:4], "4W"), cex.axis = 0.9)
for (clusterNum in 1:4) {
points(clust.centers[clusterNum, ], type = "l", col = clusterNum, lwd = 2)
points(clust.centers[clusterNum, ], col = clusterNum, pch = 20)
}
cloud(groupAvg[, "NC"] ~ groupAvg[, "CD"] * groupAvg[, "UC"], col = geneDS.km$clust,
xlab = "NC", ylab = "CD", zlab = "UC")
pvc <- pvclust(topDat, nboot = 100)
## Bootstrap (r = 0.5)... Done.
## Bootstrap (r = 0.6)... Done.
## Bootstrap (r = 0.7)... Done.
## Bootstrap (r = 0.8)... Done.
## Bootstrap (r = 0.9)... Done.
## Bootstrap (r = 1.0)... Done.
## Bootstrap (r = 1.1)... Done.
## Bootstrap (r = 1.2)... Done.
## Bootstrap (r = 1.3)... Done.
## Bootstrap (r = 1.4)... Done.
plot(pvc, labels = des$group, cex = 0.6)
pvrect(pvc, alpha = 0.95)