Task 4: PCA and clustering

Here I create a markdown file for this task to have a better illustrate and make it easy to understand.

First read the design matrix and normalized data and have a simple check

dat <- read.table("data/GSE1710-normalized-data.tsv")
des <- readRDS("data/GSE1710-outlier-removed-design.rds")
# separate age into three groups
des$age <- as.numeric(levels(des$age))[as.integer(des$age)]
young <- which(des$age <= 30)
middleage <- which(des$age > 30 & des$age < 50)
old <- which(des$age >= 50)
des$age[young] <- "young"
des$age[middleage] <- "middleage"
des$age[old] <- "old"
des$age <- as.factor(des$age)
des$age <- recode(des$age, "", levels = c("young", "middleage", "old"))
## Error: could not find function "recode"
str(dat, max.level = 0)
## 'data.frame':    28993 obs. of  30 variables:
str(des)
## 'data.frame':    30 obs. of  4 variables:
##  $ samples: Factor w/ 31 levels "GSM29595","GSM29596",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ group  : Factor w/ 3 levels "NC","CD","UC": 1 1 1 1 1 1 1 1 1 1 ...
##  $ sex    : Factor w/ 2 levels "female","male": 2 1 2 2 1 1 1 1 1 2 ...
##  $ age    : Factor w/ 3 levels "middleage","old",..: 1 1 3 2 1 2 2 3 2 2 ...

Also load the essential packages,

library(RColorBrewer)
library(cluster)
library(pvclust)
library(xtable)
library(limma)
library(plyr)
library(ggplot2)
library(car)
library(lattice)

PCA (principal components analysis)

Simple way to do PCA

pcs <- prcomp(dat, center = F, scale = F)

Check the priciple components

plot(pcs)

plot of chunk unnamed-chunk-4

prinComp <- cbind(des, pcs$rotation)

Then check how the first few PCs relate to covariates

prinComp <- cbind(des, pcs$rotation)
plot(prinComp[, c("group", "sex", "age", "PC1", "PC2", "PC3")], pch = 19, cex = 0.8)

plot of chunk unnamed-chunk-5

In this plot, it easy to tell that from first two components, samples in three experimental groups have significant different, but for sex and age, it's not so easy to tell the difference in these groups.

Look at the principle components in experimental group

Group <- des$group
ggplot(prinComp, aes(x = PC1, y = PC2, colour = Group)) + geom_point()

plot of chunk unnamed-chunk-6

Look at the two principle components in sex,

Sex <- des$sex
ggplot(prinComp, aes(x = PC1, y = PC2, colour = Sex)) + geom_point()

plot of chunk unnamed-chunk-7

then in age

Age <- des$age
ggplot(prinComp, aes(x = PC1, y = PC2, colour = Age)) + geom_point()

plot of chunk unnamed-chunk-8

Cluster analysis

Preprocessing

sdat <- t(scale(t(dat)))
str(sdat, max.level = 0, give.attr = FALSE)
##  num [1:28993, 1:30] 2.36 2.67 2.6 2.73 3.56 ...
round(data.frame(avgBefore = rowMeans(head(dat)), avgAfter = rowMeans(head(sdat)), 
    varBefore = apply(head(dat), 1, var), varAfter = apply(head(sdat), 1, var)), 
    2)
##       avgBefore avgAfter varBefore varAfter
## 01A01      0.13        0         0        1
## 01A02      0.07        0         0        1
## 01A03      0.06        0         0        1
## 01A04      0.07        0         0        1
## 01A05      0.05        0         0        1
## 01A06      0.06        0         0        1

Data for each row now has mean 0 and variance 1.

Sample clustering

In this part, I usse samples as objects to be clustered using gene attributes.

Hierarchical clustering

Compute pairwise distance

pr.dis <- dist(t(sdat), method = "euclidean")
# pr.dis <- dist(t(sdat), method = 'manhattan') manhattan have same results
# with euclidean

Create two new factors representing the interaction of group with sex and also group with age

des$grps <- with(des, interaction(group, sex))
des$grpa <- with(des, interaction(group, age))
summary(des$grps)
## NC.female CD.female UC.female   NC.male   CD.male   UC.male 
##         7         4         2         4         5         8
summary(des$grpa)
## NC.middleage CD.middleage UC.middleage       NC.old       CD.old 
##            4            3            5            5            2 
##       UC.old     NC.young     CD.young     UC.young 
##            1            2            4            4
# compute hierarchical clustering using 'ward' pr.hc.w <- hclust(pr.dis,
# method = 'ward') plot(pr.hc.w, labels = FALSE, main = 'Ward', xlab = '')

pr.hc.s <- hclust(pr.dis, method = "single")
pr.hc.c <- hclust(pr.dis, method = "complete")
pr.hc.a <- hclust(pr.dis, method = "average")
pr.hc.w <- hclust(pr.dis, method = "ward")

# plot them
op <- par(mar = c(0, 4, 4, 2), mfrow = c(2, 2))

plot(pr.hc.s, labels = FALSE, main = "Single", xlab = "")
plot(pr.hc.c, labels = FALSE, main = "Complete", xlab = "")
plot(pr.hc.a, labels = FALSE, main = "Average", xlab = "")
plot(pr.hc.w, labels = FALSE, main = "Ward", xlab = "")

plot of chunk unnamed-chunk-13

From the result, we can tell that “Ward” is better.

par(op)

# identify 6 clusters
op <- par(mar = c(1, 4, 4, 1))
plot(pr.hc.w, labels = des$grps, cex = 0.6, main = "Ward showing 6 clusters")
rect.hclust(pr.hc.w, k = 6)

plot of chunk unnamed-chunk-14

op <- par(mar = c(1, 4, 4, 1))
plot(pr.hc.w, labels = des$grpa, cex = 0.6, main = "Ward showing 9 clusters")
rect.hclust(pr.hc.w, k = 9)

plot of chunk unnamed-chunk-15

op <- par(mar = c(1, 4, 4, 1))
plot(pr.hc.w, labels = des$group, cex = 0.6, main = "Ward showing 3 clusters")
rect.hclust(pr.hc.w, k = 3)

plot of chunk unnamed-chunk-16

From the above clustering results, we can not find any pattern about group with age and also group with age.

K-means clustering

# Objects in columns

set.seed(20)
k <- 9
pr.km <- kmeans(t(sdat), centers = k, nstart = 50)

# We can look at the within sum of squares of each cluster
pr.km$withinss
## [1]      0  58022  27930  87010  29277 106532  52936  41177      0
# We can look at the composition of each cluster

pr.kmTable <- data.frame(group = des$group, cluster = pr.km$cluster)
prTable <- xtable(with(pr.kmTable, table(group, cluster)), caption = "Number of samples from each experimental group  within each k-means cluster")
align(prTable) <- "lccc"
## Error: "align" must have length equal to 10 ( ncol(x) + 1 )
print(prTable, type = "html", caption.placement = "top")
Number of samples from each experimental group within each k-means cluster
1 2 3 4 5 6 7 8 9
NC 1 0 2 0 2 0 3 3 0
CD 0 1 0 3 0 4 1 0 0
UC 0 3 0 3 0 3 0 0 1

PAM clustering

pr.pam <- pam(pr.dis, k = k)
pr.pamTable <- data.frame(group = des$group, cluster = pr.pam$clustering)
pamTable <- xtable(with(pr.pamTable, table(group, cluster)), caption = "Number of samples from each experimental group within each PAM cluster")
align(pamTable) <- "lccc"
## Error: "align" must have length equal to 10 ( ncol(x) + 1 )
print(pamTable, type = "html", caption.placement = "top")
Number of samples from each experimental group within each PAM cluster
1 2 3 4 5 6 7 8 9
NC 3 1 1 1 1 1 3 0 0
CD 1 0 0 4 0 0 2 2 0
UC 3 0 0 5 0 0 0 1 1
op <- par(mar = c(5, 1, 4, 4))
plot(pr.pam, main = "Silhouette Plot for 3 clusters")

plot of chunk unnamed-chunk-22

Gene clustering

Load the top hits data to do gene clustering

hits <- read.table("data/groups-hits.tsv")
topDat <- subset(dat, rownames(dat) %in% rownames(hits))

Hierarchical

geneC.dis <- dist(topDat, method = "euclidean")

geneC.hc.a <- hclust(geneC.dis, method = "ward")

plot(geneC.hc.a, labels = FALSE, main = "Hierarchical with Ward Linkage", xlab = "")

plot of chunk unnamed-chunk-24

Partitioning

set.seed(1234)
k <- 5
kmeans.genes <- kmeans(topDat, centers = k)
op <- par(mfrow = c(2, 2))
for (clusterNum in 1:4) {
    # Set up the axes without plotting; ylim set based on trial run.
    plot(kmeans.genes$centers[clusterNum, ], ylim = c(-0.1, 10), type = "n", 
        xlab = "Samples", ylab = "Relative expression")

    # Plot the expression of all the genes in the selected cluster in grey.
    matlines(y = t(topDat[kmeans.genes$cluster == clusterNum, ]), col = "grey")

    # Add the cluster center. This is last so it isn't underneath the members
    points(kmeans.genes$centers[clusterNum, ], type = "l")

    # Optional: colored points to show which group the samples are from.
    points(kmeans.genes$centers[clusterNum, ], col = des$group, pch = 20)
}

plot of chunk unnamed-chunk-25

Redefining the attributes: for sex

Define attributes by estimating parameters of a linear model.

For a linear model defined: \[X_{gi,sex}=\mu_{g,sex}+\epslion_{gi,sex} \] Then we can define a new attributes for each gene, \[Att_{g}=(\mu_{g,female},\mu_{g,male})\] and estimate these parameters.

annoTopDat <- stack(as.data.frame(topDat))
annoTopDat$probeset <- rownames(topDat)  # add probeset ID as variable
## get info on group and sex, then average over reps within sex
annoTopDat <- merge(annoTopDat, des, by.x = "ind", by.y = "samples")
sexAvg <- ddply(annoTopDat, ~probeset, function(x) {
    avgBysex <- aggregate(values ~ sex, x, mean)$values
    names(avgBysex) <- levels(x$sex)
    avgBysex
})
## put probset info back into rownames
rownames(sexAvg) <- sexAvg$probeset
sexAvg$probeset <- NULL
str(sexAvg)
## 'data.frame':    1283 obs. of  2 variables:
##  $ female: num  0.0672 0.0686 0.0554 0.0606 0.0637 ...
##  $ male  : num  0.0631 0.0644 0.0518 0.0538 0.0606 ...
heatmap(as.matrix(sexAvg), Colv = NA, col = jGraysFun(256), hclustfun = function(x) hclust(x, 
    method = "ward"), labCol = colnames(sexAvg), labRow = NA, margin = c(8, 
    1))
## Error: could not find function "jGraysFun"
k <- 4
geneDS.km <- kmeans(sexAvg, centers = k, nstart = 50)
clust.centers <- geneDS.km$centers

# Look at all clusters
op <- par(mfrow = c(2, 2))
for (clusterNum in 1:4) {
    # Set up the axes without plotting; ylim set based on trial run.
    plot(clust.centers[clusterNum, ], ylim = c(-2, 2), type = "n", xlab = "Sex", 
        ylab = "Relative expression", axes = F, main = paste("Cluster", clusterNum, 
            sep = " "))
    axis(2)
    axis(1, 1:5, c(colnames(clust.centers)[1:4], "4W"), cex.axis = 0.9)

    # Plot the expression of all the genes in the selected cluster in grey.
    matlines(y = t(sexAvg[geneDS.km$cluster == clusterNum, ]), col = "grey")

    # Add the cluster center. This is last so it isn't underneath the members
    points(clust.centers[clusterNum, ], type = "l")

    # Optional: points to show sex.
    points(clust.centers[clusterNum, ], pch = 20)
}

plot of chunk unnamed-chunk-28

pvc <- pvclust(topDat, nboot = 100)
## Bootstrap (r = 0.5)... Done.
## Bootstrap (r = 0.6)... Done.
## Bootstrap (r = 0.7)... Done.
## Bootstrap (r = 0.8)... Done.
## Bootstrap (r = 0.9)... Done.
## Bootstrap (r = 1.0)... Done.
## Bootstrap (r = 1.1)... Done.
## Bootstrap (r = 1.2)... Done.
## Bootstrap (r = 1.3)... Done.
## Bootstrap (r = 1.4)... Done.
plot(pvc, labels = des$grps, cex = 0.6)
pvrect(pvc, alpha = 0.95)

plot of chunk unnamed-chunk-30

Redefining the attributes: for experimental group

Define attributes by estimating parameters of a linear model.

For a linear model defined: \[X_{gi,group}=\mu_{g,group}+\epslion_{gi,group} \] Then we can define a new attributes for each gene, \[Att_{g}=(\mu_{g,NC},\mu_{g,CD},\mu_{g,UC})\] and estimate these parameters.

annoTopDat <- stack(as.data.frame(topDat))
annoTopDat$probeset <- rownames(topDat)  # add probeset ID as variable
## get info on group and sex, then average over reps within sex
annoTopDat <- merge(annoTopDat, des, by.x = "ind", by.y = "samples")
groupAvg <- ddply(annoTopDat, ~probeset, function(x) {
    avgBygroup <- aggregate(values ~ group, x, mean)$values
    names(avgBygroup) <- levels(x$group)
    avgBygroup
})
## put probset info back into rownames
rownames(groupAvg) <- groupAvg$probeset
groupAvg$probeset <- NULL
str(groupAvg)
## 'data.frame':    1283 obs. of  3 variables:
##  $ NC: num  0.0831 0.084 0.0675 0.0693 0.0787 ...
##  $ CD: num  0.0493 0.0519 0.0426 0.0468 0.048 ...
##  $ UC: num  0.0588 0.0595 0.0474 0.0519 0.056 ...
heatmap(as.matrix(groupAvg), Colv = NA, col = jGraysFun(256), hclustfun = function(x) hclust(x, 
    method = "ward"), labCol = colnames(groupAvg), labRow = NA, margin = c(8, 
    1))
## Error: could not find function "jGraysFun"
k <- 4
geneDS.km <- kmeans(groupAvg, centers = k, nstart = 50)
clust.centers <- geneDS.km$centers

# Look at all clusters
op <- par(mfrow = c(2, 2))
for (clusterNum in 1:4) {
    # Set up the axes without plotting; ylim set based on trial run.
    plot(clust.centers[clusterNum, ], ylim = c(-2, 2), type = "n", xlab = "Experimental Group", 
        ylab = "Relative expression", axes = F, main = paste("Cluster", clusterNum, 
            sep = " "))
    axis(2)
    axis(1, 1:5, c(colnames(clust.centers)[1:4], "4W"), cex.axis = 0.9)

    # Plot the expression of all the genes in the selected cluster in grey.
    matlines(y = t(groupAvg[geneDS.km$cluster == clusterNum, ]), col = "grey")

    # Add the cluster center. This is last so it isn't underneath the members
    points(clust.centers[clusterNum, ], type = "l")

    # Optional: points to show sex.
    points(clust.centers[clusterNum, ], pch = 20)
}

plot of chunk unnamed-chunk-33


plot(clust.centers[clusterNum, ], ylim = c(-4, 4), type = "n", xlab = "Experimental Group", 
    ylab = "Average expression", axes = FALSE, main = "Clusters centers")
axis(2)
axis(1, 1:5, c(colnames(clust.centers)[1:4], "4W"), cex.axis = 0.9)

for (clusterNum in 1:4) {
    points(clust.centers[clusterNum, ], type = "l", col = clusterNum, lwd = 2)
    points(clust.centers[clusterNum, ], col = clusterNum, pch = 20)
}

plot of chunk unnamed-chunk-34

cloud(groupAvg[, "NC"] ~ groupAvg[, "CD"] * groupAvg[, "UC"], col = geneDS.km$clust, 
    xlab = "NC", ylab = "CD", zlab = "UC")

plot of chunk unnamed-chunk-35

pvc <- pvclust(topDat, nboot = 100)
## Bootstrap (r = 0.5)... Done.
## Bootstrap (r = 0.6)... Done.
## Bootstrap (r = 0.7)... Done.
## Bootstrap (r = 0.8)... Done.
## Bootstrap (r = 0.9)... Done.
## Bootstrap (r = 1.0)... Done.
## Bootstrap (r = 1.1)... Done.
## Bootstrap (r = 1.2)... Done.
## Bootstrap (r = 1.3)... Done.
## Bootstrap (r = 1.4)... Done.
plot(pvc, labels = des$group, cex = 0.6)
pvrect(pvc, alpha = 0.95)

plot of chunk unnamed-chunk-37