PROJECT 1: CLUSTERING & PRINCIPAL COMPONENT ANALYSIS

Leukemia Dataset Description, as per DataScienceProject.html provided:

22283 observations (genes)
16 samples (patients)
Gene expression levels for patients with poor and good progonosis, measured in terms of the number of leukemic cells present in the bone marrow after a period of treatment:
- Low numbers of leukemic cells indicates a good progonosis.
- High numbers of leukemic cells indicate a poor progonosis.

Import Data & Exploratory Data Analysis:

#read data into a dataframe
df1 <- read.table('leukemia_array.txt')
str(df1)

## 'data.frame':    22283 obs. of  16 variables:
##  $ p1_good: num  1292 232 169 1589 128 ...
##  $ p2_good: num  932 259 348 1985 131 ...
##  $ p3_good: num  1172.9 236.3 363.3 2050.3 60.1 ...
##  $ p4_good: num  1212.5 365.8 205.8 1782.1 75.1 ...
##  $ p5_good: num  1280 243 255 1521 50 ...
##  $ p6_good: num  1351.3 260.9 243.2 2153.3 26.9 ...
##  $ p7_good: num  2242 192 234 2399 114 ...
##  $ p8_good: num  1411.4 195.9 122.4 1537.4 64.7 ...
##  $ p1_poor: num  1405 215 422 2956 102 ...
##  $ p2_poor: num  1070.6 83.9 253.8 4594.6 171.3 ...
##  $ p3_poor: num  1563.5 278.3 768.5 1558.7 57.5 ...
##  $ p4_poor: num  1475 526 141 3221 211 ...
##  $ p5_poor: num  1212 276 254 2044 102 ...
##  $ p6_poor: num  1418 280 147.6 1539 88.8 ...
##  $ p7_poor: num  1516 294 611 3532 196 ...
##  $ p8_poor: num  2279 238 893 3671 207 ...

#transpose df, to allow for PCA (genes are now recorded in columns and patients in rows)
df1 <- t(df1)
str(df1)

##  num [1:16, 1:22283] 1292 932 1173 1212 1280 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:16] "p1_good" "p2_good" "p3_good" "p4_good" ...
##   ..$ : chr [1:22283] "1007_s_at" "1053_at" "117_at" "121_at" ...

#Add prognosis label

prognosis <- c(rep("good", 8), rep("poor", 8))
prognosis <- as.data.frame(prognosis)

df2 <- cbind(prognosis, df1)
df2$prognosis <- as.factor(df2$prognosis)

#histogram of expression levels
hist(df1,breaks=100,freq=TRUE,main='Histogram of gene expression levels',xlab='Expression level')

#most genes have a very low expression level

Principal Component Analysis (PCA):

#pca on different transformations of data, including plots with proportion of variance explained per principal component
par(mfrow=c(1, 4))

#pca on original data:
pca.orig.dat <- prcomp(df1)
plot(pca.orig.dat, type="l")

#pca on original data, normalized and standardized
pca.orig.norm.st <- prcomp(df1, center = TRUE, scale. = TRUE)
plot(pca.orig.norm.st, type="l")

#log transform data
log.all.dat <- log(df1)

#pca on log-transformed data
pca.log.dat <- prcomp(log.all.dat)
plot(pca.log.dat, type="l")

#pca on log-transformed data, normalized and standardized
pca.log.norm.st <- prcomp(log.all.dat, center = TRUE, scale. = TRUE)
plot(pca.log.norm.st, type="l")

#PCA on the original data struggles to explain much of the variance, there is an increase in explanatory power once the data is normalized and standardized, but the greatest explanatory power is attained on the log-data (not normalized or standardized).

#pca results summarized
summary(pca.orig.dat)

## Importance of components%s:
##                              PC1       PC2       PC3       PC4       PC5
## Standard deviation     1.562e+05 1.097e+05 8.902e+04 8.095e+04 5.651e+04
## Proportion of Variance 3.663e-01 1.804e-01 1.189e-01 9.834e-02 4.791e-02
## Cumulative Proportion  3.663e-01 5.468e-01 6.657e-01 7.640e-01 8.119e-01
##                              PC6       PC7       PC8       PC9      PC10
## Standard deviation     4.615e+04 4.433e+04 4.069e+04 3.603e+04 3.507e+04
## Proportion of Variance 3.197e-02 2.949e-02 2.485e-02 1.948e-02 1.845e-02
## Cumulative Proportion  8.439e-01 8.734e-01 8.982e-01 9.177e-01 9.362e-01
##                             PC11      PC12      PC13      PC14      PC15
## Standard deviation     3.373e+04 3.198e+04 2.785e+04 2.613e+04 2.521e+04
## Proportion of Variance 1.707e-02 1.534e-02 1.164e-02 1.025e-02 9.540e-03
## Cumulative Proportion  9.532e-01 9.686e-01 9.802e-01 9.905e-01 1.000e+00
##                             PC16
## Standard deviation     3.746e-10
## Proportion of Variance 0.000e+00
## Cumulative Proportion  1.000e+00

summary(pca.orig.norm.st)

## Importance of components%s:
##                           PC1     PC2      PC3      PC4      PC5      PC6
## Standard deviation     85.226 51.3415 44.34176 37.21473 36.33934 34.33981
## Proportion of Variance  0.326  0.1183  0.08824  0.06215  0.05926  0.05292
## Cumulative Proportion   0.326  0.4443  0.53250  0.59465  0.65391  0.70683
##                             PC7      PC8      PC9    PC10     PC11
## Standard deviation     30.23176 29.39799 28.51449 27.8071 27.09373
## Proportion of Variance  0.04102  0.03878  0.03649  0.0347  0.03294
## Cumulative Proportion   0.74785  0.78663  0.82312  0.8578  0.89077
##                            PC12     PC13     PC14     PC15      PC16
## Standard deviation     26.27332 25.18051 24.22583 22.86498 1.692e-13
## Proportion of Variance  0.03098  0.02845  0.02634  0.02346 0.000e+00
## Cumulative Proportion   0.92175  0.95020  0.97654  1.00000 1.000e+00

summary(pca.log.dat)

## Importance of components%s:
##                            PC1      PC2      PC3      PC4      PC5
## Standard deviation     49.3687 27.82194 25.52599 22.75810 22.53319
## Proportion of Variance  0.2838  0.09015  0.07588  0.06032  0.05913
## Cumulative Proportion   0.2838  0.37399  0.44988  0.51019  0.56933
##                             PC6      PC7     PC8      PC9     PC10    PC11
## Standard deviation     20.90393 20.46430 20.1739 19.64844 19.49222 19.2143
## Proportion of Variance  0.05089  0.04877  0.0474  0.04496  0.04425  0.0430
## Cumulative Proportion   0.62022  0.66899  0.7164  0.76135  0.80560  0.8486
##                            PC12     PC13     PC14     PC15      PC16
## Standard deviation     18.93668 18.69074 17.77911 16.61446 9.539e-14
## Proportion of Variance  0.04176  0.04068  0.03681  0.03215 0.000e+00
## Cumulative Proportion   0.89035  0.93104  0.96785  1.00000 1.000e+00

summary(pca.log.norm.st)

## Importance of components%s:
##                           PC1     PC2      PC3      PC4      PC5      PC6
## Standard deviation     81.632 50.7948 41.81428 37.86222 36.23344 33.74602
## Proportion of Variance  0.299  0.1158  0.07846  0.06433  0.05892  0.05111
## Cumulative Proportion   0.299  0.4148  0.49331  0.55764  0.61656  0.66766
##                             PC7      PC8      PC9     PC10     PC11
## Standard deviation     31.46372 30.65406 30.11230 29.10130 28.68284
## Proportion of Variance  0.04443  0.04217  0.04069  0.03801  0.03692
## Cumulative Proportion   0.71209  0.75426  0.79495  0.83296  0.86988
##                            PC12     PC13     PC14    PC15      PC16
## Standard deviation     27.93695 27.29255 26.39643 26.0258 1.899e-13
## Proportion of Variance  0.03503  0.03343  0.03127  0.0304 0.000e+00
## Cumulative Proportion   0.90491  0.93833  0.96960  1.0000 1.000e+00

#plot pca results (PC1 v PC2 for each form of the data)
library(devtools)
#install_github("ggbiplot", "vqv")
 
library(ggbiplot)

print(ggbiplot(pca.orig.dat, var.axes = FALSE, choices = 1:2, elipse = TRUE, groups = df2$prognosis, circle = TRUE))

#the original data does not separate very well with PCA

print(ggbiplot(pca.orig.norm.st, var.axes = FALSE, choices = 1:2, elipse = TRUE, groups = df2$prognosis, circle = TRUE))

#individuals with good prognoses cluster together when data is normalized and standardized

print(ggbiplot(pca.log.dat, var.axes = FALSE, choices = 1:2, elipse = TRUE, groups = df2$prognosis, circle = TRUE))

#all individuals with a good prognosis lie within x=(-1,0), y=(-0.5,2.5), there are no individuals with bad prognoses in this area of the PC1 vs PC2 graph

print(ggbiplot(pca.log.norm.st, var.axes = FALSE, choices = 1:2, elipse = TRUE, groups = df2$prognosis, circle = TRUE))

#PCA using pcaMethods package
# source("https://bioconductor.org/biocLite.R")
# biocLite("pcaMethods")
library(pcaMethods)

par(mfrow=c(1,4))

pca2.orig.dat <- pca(df1, nPcs = 5, scale = "none", center = FALSE)
summary(pca2.orig.dat)

## svd calculated PCA
## Importance of component(s):
##                  PC1     PC2     PC3     PC4     PC5
## R2            0.9554 0.01128 0.00767 0.00616 0.00494
## Cumulative R2 0.9554 0.96669 0.97436 0.98052 0.98546

plot(pca2.orig.dat@R2cum, type = "b", ylim = c(0,1))
abline(h = 0.98546, col="blue")
abline(h = 0.5, col="purple")
abline(v=5, col=  "red")

#using five PCs on the original data using this method, results in a cumulative R-squared = 0.985

pca2.orig.norm.st <- pca(df1, nPcs = 5, scale = "uv", center = TRUE)
summary(pca2.orig.norm.st)

## svd calculated PCA
## Importance of component(s):
##                 PC1    PC2     PC3     PC4     PC5
## R2            0.326 0.1183 0.08824 0.06215 0.05926
## Cumulative R2 0.326 0.4443 0.53250 0.59465 0.65391

plot(pca2.orig.norm.st@R2cum, type = "b", ylim = c(0,1))
abline(h = 0.65391, col="blue")
abline(h = 0.5, col="purple")
abline(v=5, col=  "red")

#when this data is normalized and standardized using this method, there is a +/- 30% reduction in cumulative R^2 using 5 PCs

pca2.log.dat <- pca(log.all.dat, nPcs = 5, scale = "none", center = FALSE)
summary(pca2.log.dat)

## svd calculated PCA
## Importance of component(s):
##                  PC1     PC2     PC3     PC4    PC5
## R2            0.9905 0.00247 0.00087 0.00073 0.0006
## Cumulative R2 0.9905 0.99298 0.99385 0.99458 0.9952

plot(pca2.log.dat@R2cum, type = "b", ylim = c(0,1))
abline(h = 0.9952, col="blue")
abline(h = 0.5, col="purple")
abline(v=5, col=  "red")

#once again, log-transforming the data explains nearly all the variance (at 5 PCs, cumulative R^2 = 0.995)

pca2.log.norm.st <- pca(log.all.dat, nPcs = 5, scale = "uv", center = TRUE)
summary(pca2.log.norm.st)

## svd calculated PCA
## Importance of component(s):
##                 PC1    PC2     PC3     PC4     PC5
## R2            0.299 0.1158 0.07847 0.06433 0.05892
## Cumulative R2 0.299 0.4148 0.49331 0.55764 0.61656

plot(pca2.log.norm.st@R2cum, type = "b", ylim = c(0,1))
abline(h = 0.61656, col="blue")
abline(h = 0.5, col="purple")
abline(v=5, col=  "red")

#normalizing and standardizing the log-transformed data is not useful, and also reduces cumulative explained vriance by around 30%

#scores and loadings plot
slplot(pca2.orig.dat, scoresLoadings = c(TRUE,FALSE),sl = df2$prognosis)

#using the first 2 PCs of the original data, a line can be drawn at y=0 that separates all but 2 patients into the correct diagnostic category

slplot(pca2.orig.norm.st, scoresLoadings = c(TRUE,FALSE),sl = df2$prognosis)

slplot(pca2.log.dat, scoresLoadings = c(TRUE,FALSE),sl = df2$prognosis)

slplot(pca2.log.norm.st, scoresLoadings = c(TRUE,FALSE),sl = df2$prognosis)

#similar to what we saw earlier, the log-transformed data separates the prognoses well, but normalizing and standardizing does not contribute to clustering ability

PCA using Singular value decomposition (SVD) techniques

In an SVD analysis, an (n x p) matrix: X, is decomposed by X = U * D * V, where:

U is an m×n orthogonal matrix.
V is an n×n orthogonal matrix.
D is an n×n diagonal matrix.

sv <- svd(df1)

U = sv$u
V = sv$v
D = sv$d

#Z = U * D (U are un-scaled PCs)
# The k^th column of Z: Z(k), is the k^th PC
# The k^th column of V: V(k), is the k^th PC loading (feature weights), and encodes the k^th PC in feature space
# The k^th column of U: U(k), is the k^th PC loading (observation weights), and encodes the k^th PC in observation space
# The diagonals in D: d(k) gives the strength of the k^th PC
# The variance explained by k^th PC is given by: d(k)^2
# The total variance of the data is given by: sum(d(k1)^2 + d(k2)^2 + …..d(k)^2+….)
# The proportion of variance explained by k^th PC is given by: d(k)^2 / sum(d(k1)^2 + d(k2)^2 + …..d(k)^2+….)

#PLOT OF LOADINGS (unscaled PCs)
plot(U[,1],U[,2],type="n",xlab="PC1",ylab="PC2")
text(U[,1],U[,2],labels = df2$prognosis)

#PLOT OF Z (scaled PCs)
Z = U%*%diag(D)
plot(Z[,1], Z[,2], type ="n", xlab="PC1", ylab="PC2")
text(Z[,1], Z[,2],labels = df2$prognosis)

#The above two plots are basically identical, except for the x- and y-axis scales, good prognoses tend toward the bottom of the plot in a narrow band, whereas bad prognoses are more spread out, suggesting that there are concrete gene expression levels which define a good prognosis, but a bad prognosis is more heterogenous (anything that does not fall within the good prognosis gene expression level limits)...

#PLOT OF Z (scaled PC2 vs PC3)
Z = U%*%diag(D)
plot(Z[,2], Z[,3], type ="n", xlab="PC2", ylab="PC3")
text(Z[,2], Z[,3],labels = df2$prognosis)

#PLOT OF Z (scaled PC3 vs PC4)
Z = U%*%diag(D)
plot(Z[,3], Z[,4], type ="n", xlab="PC3", ylab="PC4")
text(Z[,3], Z[,4],labels = df2$prognosis)

#looking at PC2 - PC4, the same pattern of a tightly clustering good prognosis group vs. more diffuse bad prognosis group is evident

#PLOT OF Z (scaled PC4 vs PC5)
Z = U%*%diag(D)
plot(Z[,4], Z[,5], type ="n", xlab="PC4", ylab="PC5")
text(Z[,4], Z[,5],labels = df2$prognosis)

#PC4 and PC5 don't separate the groups very well, since most of the variance has already been explained by the first 3 PCs

#PC loadings - visualize data by limiting to top genes in magnitude in the PC loadings

#The matrix V contains the weights for the features, and we can use V to select important features (genes) that contribute to each PC.

aa<- grep("grey",colors())
bb<- grep("green",colors())
cc<-  grep("red",colors())
gcol2<- colors()[c(aa[1:30],bb[1:20],rep(cc,2))]

#Heatmap of top 100 genes driving PC1
k=1
X <- t(df1)
ord1<- order(abs(V[,k]),decreasing=TRUE)
x1<- as.matrix(X[ord1[1:100],])
heatmap(x1,col=gcol2)

#a heatmap clusters similar genes, as well as similar samples, low gene expressions are indicated by green, middle expression levels by black and high expression by red.
#since there are a lot of the 100 most informative genes that have high expression levels in patients with good prognoses (p8_good, p6_good, p5_good, p7_good, p4_good), one could invesitgate whether it is a breakdown of tumor suppressor gene/ growth inhibiting gene expression that drives leukemia, instead of an overexpression oncogenes/ growth promoting genes

#Heatmap of top 100 genes driving PC2
j<- 2
ord<- order(abs(V[,j]),decreasing=TRUE)
x<- as.matrix(X[ord[1:100],])
heatmap(x,col=gcol2)

#there is not such an obvious pattern in the genes driving the second PC

#We find the genes that drive Leukemia together.
#Variance Explained

varex = 0
cumvar = 0
denom = sum(D^2)

numPC <- length(D)

for(i in 1:numPC){
  varex[i] = D[i]^2/denom
  cumvar[i] = sum(D[1:i]^2)/denom
}

## variance explained by each PC cumulatively
cumvar

##  [1] 0.9554089 0.9666907 0.9743649 0.9805170 0.9854642 0.9884442 0.9904316
##  [8] 0.9922282 0.9937769 0.9949888 0.9961363 0.9971906 0.9980626 0.9987864
## [15] 0.9994230 1.0000000

#screeplot

par(mfrow=c(1,2))
plot(1:16,varex,type="l",lwd=2,xlab="PC",ylab="% Variance Explained")
plot(1:16,cumvar,type="l",lwd=2,xlab="PC",ylab="Cummulative Variance Explained")

#very few PCs are required to explain most of the variance, one definitely needs less than 5 PCs for analysis, as is evident in the above plots

#Sparse PCA

#When p>>n (we have way more genes than the samples), many featuers(genes) are irrelevant. PCA can perform very badly. Sparse PCA zeroes out irrelevant features from PC loadings, i.e. it does feature selection for us, retaining only the features that drive the major pattern in the data

#install.packages('nsprcomp')
library(nsprcomp)

#Sparse PCA, retaining 4 PCs, at different tolerance thresholds (components are omitted if their standard deviations are less than or equal to tol times the standard deviation of the first component

oldw <- getOption("warn")
options(warn = -1)

spc <- nsprcomp(t(x1), scale. = TRUE, ncomp = 4, tol = 0, nneg = TRUE)
spc2 <- nsprcomp(t(x1), scale. = TRUE, ncomp = 3, tol = 0, nneg = TRUE)
spc3 <- nsprcomp(t(x1), scale. = TRUE, ncomp = 4, tol = 0.0001, nneg = TRUE)
spc4 <- nsprcomp(t(x1), scale. = TRUE, ncomp = 4, tol = 0.001, nneg = TRUE)
spc5 <- nsprcomp(t(x1), scale. = TRUE, ncomp = 4, tol = 0.01, nneg = TRUE)
spc6 <- nsprcomp(t(x1), scale. = TRUE, ncomp = 4, tol = 0.1, nneg = TRUE)
spc7 <- nsprcomp(t(x1), scale. = TRUE, ncomp = 4, tol = 0.5, nneg = TRUE)
spc8 <- nsprcomp(t(x1), scale. = TRUE, ncomp = 4, tol = 0.9, nneg = TRUE)

options(warn = oldw)

#loadings (0 indicates that a gene is not retained)
V <- as.data.frame(spc$rotation)
V2 <- as.data.frame(spc2$rotation)
V3 <- as.data.frame(spc3$rotation)
V4 <- as.data.frame(spc4$rotation)
V5 <- as.data.frame(spc5$rotation)
V6 <- as.data.frame(spc6$rotation)
V7 <- as.data.frame(spc7$rotation)
V8 <- as.data.frame(spc8$rotation)

length(which(V==0))

## [1] 156

#157 genes are deemed irrelevant, using sparse PCA on scaled data

length(which(V2==0))

## [1] 106

#when only 3 PCs are used, more genes are retained

length(which(V3==0))

## [1] 157

length(which(V4==0))

## [1] 156

length(which(V5==0))

## [1] 159

length(which(V6==0))

## [1] 155

#similar numbers of genes are left out at 4 PCs, while tolerance levels remain low

length(which(V7==0))

## [1] 0

length(which(V8==0))

## [1] 0

#ll genes are retained at high tolerance levels

plot(V)

plot(V2)

plot(V3)

plot(V4)

plot(V5)

plot(V6)

plot(V7)
plot(V8)

#PC scatterplot
cols = as.numeric(as.factor(df2$prognosis))
K = 3
pclabs = c("PC1","PC2","PC3","PC4")
par(mfrow=c(1,K))
for(i in 1:K){
  j = i+1
  plot(spc$x[,i],spc$x[,j],type="n",xlab=pclabs[i],ylab=pclabs[j])
  text(spc$x[,i],spc$x[,j],colnames(df1),col=cols)
}

MSc Data Science, Department of Statistical Sciences, University of Cape Town

STA5077Z: Unsupervised Learning Module Project

Gerhard Viljoen [VLJCHR004]

26 July 2017

PROJECT 1: CLUSTERING & PRINCIPAL COMPONENT ANALYSIS

Leukemia Dataset Description, as per DataScienceProject.html provided:

Import Data & Exploratory Data Analysis:

Principal Component Analysis (PCA):

PCA using Singular value decomposition (SVD) techniques

In an SVD analysis, an (n x p) matrix: X, is decomposed by X = U * D * V, where: