#read data into a dataframe
df1 <- read.table('leukemia_array.txt')
str(df1)
## 'data.frame': 22283 obs. of 16 variables:
## $ p1_good: num 1292 232 169 1589 128 ...
## $ p2_good: num 932 259 348 1985 131 ...
## $ p3_good: num 1172.9 236.3 363.3 2050.3 60.1 ...
## $ p4_good: num 1212.5 365.8 205.8 1782.1 75.1 ...
## $ p5_good: num 1280 243 255 1521 50 ...
## $ p6_good: num 1351.3 260.9 243.2 2153.3 26.9 ...
## $ p7_good: num 2242 192 234 2399 114 ...
## $ p8_good: num 1411.4 195.9 122.4 1537.4 64.7 ...
## $ p1_poor: num 1405 215 422 2956 102 ...
## $ p2_poor: num 1070.6 83.9 253.8 4594.6 171.3 ...
## $ p3_poor: num 1563.5 278.3 768.5 1558.7 57.5 ...
## $ p4_poor: num 1475 526 141 3221 211 ...
## $ p5_poor: num 1212 276 254 2044 102 ...
## $ p6_poor: num 1418 280 147.6 1539 88.8 ...
## $ p7_poor: num 1516 294 611 3532 196 ...
## $ p8_poor: num 2279 238 893 3671 207 ...
#transpose the data frame so that patients are rows and genes are columns, as PCA expects (note that t() returns a matrix)
df1 <- t(df1)
str(df1)
## num [1:16, 1:22283] 1292 932 1173 1212 1280 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:16] "p1_good" "p2_good" "p3_good" "p4_good" ...
## ..$ : chr [1:22283] "1007_s_at" "1053_at" "117_at" "121_at" ...
#Add prognosis label
prognosis <- c(rep("good", 8), rep("poor", 8))
prognosis <- as.data.frame(prognosis)
df2 <- cbind(prognosis, df1)
df2$prognosis <- as.factor(df2$prognosis)
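#sanity check on the labels: there should be 8 good and 8 poor samples
table(df2$prognosis)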
#histogram of expression levels
hist(df1,breaks=100,freq=TRUE,main='Histogram of gene expression levels',xlab='Expression level')
#most genes have a very low expression level
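#expression levels are heavily right-skewed, so a histogram of the log-transformed values is easier to read (a quick sketch; the log transform is applied properly further below)
hist(log(df1), breaks = 100, freq = TRUE, main = 'Histogram of log gene expression levels', xlab = 'log(expression level)')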
#pca on different transformations of the data, with scree plots of the variance of each principal component
par(mfrow=c(1, 4))
#pca on original data:
pca.orig.dat <- prcomp(df1)
plot(pca.orig.dat, type="l")
#pca on original data, normalized and standardized
pca.orig.norm.st <- prcomp(df1, center = TRUE, scale. = TRUE)
plot(pca.orig.norm.st, type="l")
#log transform data
log.all.dat <- log(df1)
#pca on log-transformed data
pca.log.dat <- prcomp(log.all.dat)
plot(pca.log.dat, type="l")
#pca on log-transformed data, normalized and standardized
pca.log.norm.st <- prcomp(log.all.dat, center = TRUE, scale. = TRUE)
plot(pca.log.norm.st, type="l")
#PCA on the original data struggles to explain much of the variance. Explanatory power increases once the data are normalized and standardized, but the greatest explanatory power is attained on the log-transformed data (neither normalized nor standardized).
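#the "Proportion of Variance" rows in the summaries below can also be reproduced directly from the sdev component of each prcomp object; pve() is a small helper defined here just for that comparison
pve <- function(p) p$sdev^2 / sum(p$sdev^2)
round(pve(pca.orig.dat)[1:5], 3)
round(pve(pca.log.dat)[1:5], 3)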
#pca results summarized
summary(pca.orig.dat)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5
## Standard deviation 1.562e+05 1.097e+05 8.902e+04 8.095e+04 5.651e+04
## Proportion of Variance 3.663e-01 1.804e-01 1.189e-01 9.834e-02 4.791e-02
## Cumulative Proportion 3.663e-01 5.468e-01 6.657e-01 7.640e-01 8.119e-01
## PC6 PC7 PC8 PC9 PC10
## Standard deviation 4.615e+04 4.433e+04 4.069e+04 3.603e+04 3.507e+04
## Proportion of Variance 3.197e-02 2.949e-02 2.485e-02 1.948e-02 1.845e-02
## Cumulative Proportion 8.439e-01 8.734e-01 8.982e-01 9.177e-01 9.362e-01
## PC11 PC12 PC13 PC14 PC15
## Standard deviation 3.373e+04 3.198e+04 2.785e+04 2.613e+04 2.521e+04
## Proportion of Variance 1.707e-02 1.534e-02 1.164e-02 1.025e-02 9.540e-03
## Cumulative Proportion 9.532e-01 9.686e-01 9.802e-01 9.905e-01 1.000e+00
## PC16
## Standard deviation 3.746e-10
## Proportion of Variance 0.000e+00
## Cumulative Proportion 1.000e+00
summary(pca.orig.norm.st)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 85.226 51.3415 44.34176 37.21473 36.33934 34.33981
## Proportion of Variance 0.326 0.1183 0.08824 0.06215 0.05926 0.05292
## Cumulative Proportion 0.326 0.4443 0.53250 0.59465 0.65391 0.70683
## PC7 PC8 PC9 PC10 PC11
## Standard deviation 30.23176 29.39799 28.51449 27.8071 27.09373
## Proportion of Variance 0.04102 0.03878 0.03649 0.0347 0.03294
## Cumulative Proportion 0.74785 0.78663 0.82312 0.8578 0.89077
## PC12 PC13 PC14 PC15 PC16
## Standard deviation 26.27332 25.18051 24.22583 22.86498 1.692e-13
## Proportion of Variance 0.03098 0.02845 0.02634 0.02346 0.000e+00
## Cumulative Proportion 0.92175 0.95020 0.97654 1.00000 1.000e+00
summary(pca.log.dat)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5
## Standard deviation 49.3687 27.82194 25.52599 22.75810 22.53319
## Proportion of Variance 0.2838 0.09015 0.07588 0.06032 0.05913
## Cumulative Proportion 0.2838 0.37399 0.44988 0.51019 0.56933
## PC6 PC7 PC8 PC9 PC10 PC11
## Standard deviation 20.90393 20.46430 20.1739 19.64844 19.49222 19.2143
## Proportion of Variance 0.05089 0.04877 0.0474 0.04496 0.04425 0.0430
## Cumulative Proportion 0.62022 0.66899 0.7164 0.76135 0.80560 0.8486
## PC12 PC13 PC14 PC15 PC16
## Standard deviation 18.93668 18.69074 17.77911 16.61446 9.539e-14
## Proportion of Variance 0.04176 0.04068 0.03681 0.03215 0.000e+00
## Cumulative Proportion 0.89035 0.93104 0.96785 1.00000 1.000e+00
summary(pca.log.norm.st)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 81.632 50.7948 41.81428 37.86222 36.23344 33.74602
## Proportion of Variance 0.299 0.1158 0.07846 0.06433 0.05892 0.05111
## Cumulative Proportion 0.299 0.4148 0.49331 0.55764 0.61656 0.66766
## PC7 PC8 PC9 PC10 PC11
## Standard deviation 31.46372 30.65406 30.11230 29.10130 28.68284
## Proportion of Variance 0.04443 0.04217 0.04069 0.03801 0.03692
## Cumulative Proportion 0.71209 0.75426 0.79495 0.83296 0.86988
## PC12 PC13 PC14 PC15 PC16
## Standard deviation 27.93695 27.29255 26.39643 26.0258 1.899e-13
## Proportion of Variance 0.03503 0.03343 0.03127 0.0304 0.000e+00
## Cumulative Proportion 0.90491 0.93833 0.96960 1.0000 1.000e+00
#plot pca results (PC1 v PC2 for each form of the data)
library(devtools)
#install_github("vqv/ggbiplot")
library(ggbiplot)
print(ggbiplot(pca.orig.dat, var.axes = FALSE, choices = 1:2, ellipse = TRUE, groups = df2$prognosis, circle = TRUE))
#the original data does not separate the prognosis groups very well under PCA
print(ggbiplot(pca.orig.norm.st, var.axes = FALSE, choices = 1:2, ellipse = TRUE, groups = df2$prognosis, circle = TRUE))
#individuals with good prognoses cluster together once the data is normalized and standardized
print(ggbiplot(pca.log.dat, var.axes = FALSE, choices = 1:2, ellipse = TRUE, groups = df2$prognosis, circle = TRUE))
#all individuals with a good prognosis lie within x = (-1, 0), y = (-0.5, 2.5); no individuals with a bad prognosis fall in this region of the PC1 vs PC2 plot
print(ggbiplot(pca.log.norm.st, var.axes = FALSE, choices = 1:2, ellipse = TRUE, groups = df2$prognosis, circle = TRUE))
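#the PC1 vs PC2 view of the log-transformed data can also be drawn directly from the scores stored in the prcomp object (pca.log.dat$x), without ggbiplot; the colours below are an arbitrary choice
par(mfrow = c(1, 1))
plot(pca.log.dat$x[, 1], pca.log.dat$x[, 2], col = ifelse(df2$prognosis == "good", "blue", "red"), pch = 19, xlab = "PC1", ylab = "PC2", main = "PC1 vs PC2, log-transformed data")
legend("topright", legend = c("good", "poor"), col = c("blue", "red"), pch = 19)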
#PCA using pcaMethods package
# source("https://bioconductor.org/biocLite.R")
# biocLite("pcaMethods")
library(pcaMethods)
par(mfrow=c(1,4))
pca2.orig.dat <- pca(df1, nPcs = 5, scale = "none", center = FALSE)
summary(pca2.orig.dat)
## svd calculated PCA
## Importance of component(s):
## PC1 PC2 PC3 PC4 PC5
## R2 0.9554 0.01128 0.00767 0.00616 0.00494
## Cumulative R2 0.9554 0.96669 0.97436 0.98052 0.98546
plot(pca2.orig.dat@R2cum, type = "b", ylim = c(0,1))
abline(h = 0.98546, col="blue")
abline(h = 0.5, col="purple")
abline(v=5, col= "red")
#using this method on the original (uncentered, unscaled) data, five PCs give a cumulative R-squared of 0.985
pca2.orig.norm.st <- pca(df1, nPcs = 5, scale = "uv", center = TRUE)
summary(pca2.orig.norm.st)
## svd calculated PCA
## Importance of component(s):
## PC1 PC2 PC3 PC4 PC5
## R2 0.326 0.1183 0.08824 0.06215 0.05926
## Cumulative R2 0.326 0.4443 0.53250 0.59465 0.65391
plot(pca2.orig.norm.st@R2cum, type = "b", ylim = c(0,1))
abline(h = 0.65391, col="blue")
abline(h = 0.5, col="purple")
abline(v=5, col= "red")
#when the data is normalized and standardized, the cumulative R^2 at 5 PCs drops by roughly 30 percentage points (from 0.985 to 0.654)
pca2.log.dat <- pca(log.all.dat, nPcs = 5, scale = "none", center = FALSE)
summary(pca2.log.dat)
## svd calculated PCA
## Importance of component(s):
## PC1 PC2 PC3 PC4 PC5
## R2 0.9905 0.00247 0.00087 0.00073 0.0006
## Cumulative R2 0.9905 0.99298 0.99385 0.99458 0.9952
plot(pca2.log.dat@R2cum, type = "b", ylim = c(0,1))
abline(h = 0.9952, col="blue")
abline(h = 0.5, col="purple")
abline(v=5, col= "red")
#once again, log-transforming the data explains nearly all the variance (at 5 PCs, cumulative R^2 = 0.995)
pca2.log.norm.st <- pca(log.all.dat, nPcs = 5, scale = "uv", center = TRUE)
summary(pca2.log.norm.st)
## svd calculated PCA
## Importance of component(s):
## PC1 PC2 PC3 PC4 PC5
## R2 0.299 0.1158 0.07847 0.06433 0.05892
## Cumulative R2 0.299 0.4148 0.49331 0.55764 0.61656
plot(pca2.log.norm.st@R2cum, type = "b", ylim = c(0,1))
abline(h = 0.61656, col="blue")
abline(h = 0.5, col="purple")
abline(v=5, col= "red")
#normalizing and standardizing the log-transformed data is not useful either, and again reduces the cumulative explained variance by around 30 percentage points
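#for easier comparison, collect the cumulative R^2 at 5 PCs of all four fits in one named vector (using the R2cum slot plotted above)
sapply(list(orig = pca2.orig.dat, orig.scaled = pca2.orig.norm.st, log = pca2.log.dat, log.scaled = pca2.log.norm.st), function(p) p@R2cum[5])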
#scores and loadings plot
slplot(pca2.orig.dat, scoresLoadings = c(TRUE,FALSE),sl = df2$prognosis)
#using the first 2 PCs of the original data, a horizontal line at y = 0 separates all but 2 patients into the correct prognosis group
slplot(pca2.orig.norm.st, scoresLoadings = c(TRUE,FALSE),sl = df2$prognosis)
slplot(pca2.log.dat, scoresLoadings = c(TRUE,FALSE),sl = df2$prognosis)
slplot(pca2.log.norm.st, scoresLoadings = c(TRUE,FALSE),sl = df2$prognosis)
#similar to what we saw earlier, the log-transformed data separates the prognosis groups well, but normalizing and standardizing does not improve the clustering
sv <- svd(df1)
U = sv$u
V = sv$v
D = sv$d
#Z = U %*% diag(D): the columns of Z are the scaled principal components (the columns of U are the un-scaled PCs)
# The k-th column of Z, Z(k), is the k-th PC
# The k-th column of V, V(k), is the k-th PC loading (feature weights) and encodes the k-th PC in feature space
# The k-th column of U, U(k), is the k-th PC loading (observation weights) and encodes the k-th PC in observation space
# The k-th diagonal entry of D, d(k), gives the strength of the k-th PC
# The variance explained by the k-th PC is given by d(k)^2
# The total variance of the data is sum(d^2) = d(1)^2 + d(2)^2 + ... + d(K)^2
# The proportion of variance explained by the k-th PC is d(k)^2 / sum(d^2)
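#a quick numerical check of the relations above (a sketch): U D V' reconstructs the data matrix, and d^2 / sum(d^2) matches the R2 values reported for the uncentered fit pca2.orig.dat
max(abs(df1 - U %*% diag(D) %*% t(V)))   #should be ~0
round((D^2 / sum(D^2))[1:5], 4)          #compare with the R2 row of summary(pca2.orig.dat)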
#PLOT OF U (unscaled PCs)
plot(U[,1],U[,2],type="n",xlab="PC1",ylab="PC2")
text(U[,1],U[,2],labels = df2$prognosis)
#PLOT OF Z (scaled PCs)
Z = U%*%diag(D)
plot(Z[,1], Z[,2], type ="n", xlab="PC1", ylab="PC2")
text(Z[,1], Z[,2],labels = df2$prognosis)
#The two plots above are essentially identical apart from the axis scales. Good prognoses tend toward the bottom of the plot in a narrow band, whereas bad prognoses are more spread out. This suggests that a good prognosis is defined by fairly specific gene expression levels, while a bad prognosis is more heterogeneous (anything that does not fall within the good-prognosis expression limits).
#PLOT OF Z (scaled PC2 vs PC3)
Z = U%*%diag(D)
plot(Z[,2], Z[,3], type ="n", xlab="PC2", ylab="PC3")
text(Z[,2], Z[,3],labels = df2$prognosis)
#PLOT OF Z (scaled PC3 vs PC4)
Z = U%*%diag(D)
plot(Z[,3], Z[,4], type ="n", xlab="PC3", ylab="PC4")
text(Z[,3], Z[,4],labels = df2$prognosis)
#looking at PC2 to PC4, the same pattern is evident: a tightly clustered good-prognosis group and a more diffuse bad-prognosis group
#PLOT OF Z (scaled PC4 vs PC5)
Z = U%*%diag(D)
plot(Z[,4], Z[,5], type ="n", xlab="PC4", ylab="PC5")
text(Z[,4], Z[,5],labels = df2$prognosis)
#PC4 and PC5 don't separate the groups very well, since most of the variance has already been explained by the first 3 PCs
#PC loadings - visualize the data by restricting it to the genes with the largest loading magnitudes for each PC
#The matrix V contains the weights for the features, and we can use V to select important features (genes) that contribute to each PC.
aa<- grep("grey",colors())
bb<- grep("green",colors())
cc<- grep("red",colors())
gcol2<- colors()[c(aa[1:30],bb[1:20],rep(cc,2))]
#Heatmap of top 100 genes driving PC1
k=1
X <- t(df1)
ord1<- order(abs(V[,k]),decreasing=TRUE)
x1<- as.matrix(X[ord1[1:100],])
heatmap(x1,col=gcol2)
#a heatmap clusters similar genes as well as similar samples; low gene expression is shown in green, intermediate expression in black and high expression in red
#since many of the 100 most informative genes have high expression levels in patients with good prognoses (p8_good, p6_good, p5_good, p7_good, p4_good), one could investigate whether leukemia is driven by a breakdown of tumor suppressor / growth-inhibiting gene expression rather than by overexpression of oncogenes / growth-promoting genes
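#the probe IDs of the genes driving PC1 can be listed directly, e.g. the ten with the largest loadings (useful for looking them up in an annotation database)
head(rownames(X)[ord1], 10)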
#Heatmap of top 100 genes driving PC2
j<- 2
ord<- order(abs(V[,j]),decreasing=TRUE)
x<- as.matrix(X[ord[1:100],])
heatmap(x,col=gcol2)
#there is not such an obvious pattern in the genes driving the second PC
#In this way, we identify the genes that together drive leukemia.
#Variance Explained
denom <- sum(D^2)           #total variance (sum of squared singular values)
numPC <- length(D)
varex <- numeric(numPC)     #proportion of variance explained by each PC
cumvar <- numeric(numPC)    #cumulative proportion of variance explained
for (i in 1:numPC) {
  varex[i] <- D[i]^2 / denom
  cumvar[i] <- sum(D[1:i]^2) / denom
}
#cumulative proportion of variance explained by the PCs
cumvar
## [1] 0.9554089 0.9666907 0.9743649 0.9805170 0.9854642 0.9884442 0.9904316
## [8] 0.9922282 0.9937769 0.9949888 0.9961363 0.9971906 0.9980626 0.9987864
## [15] 0.9994230 1.0000000
#screeplot
par(mfrow=c(1,2))
plot(1:16, varex, type = "l", lwd = 2, xlab = "PC", ylab = "Proportion of Variance Explained")
plot(1:16, cumvar, type = "l", lwd = 2, xlab = "PC", ylab = "Cumulative Proportion of Variance Explained")
#very few PCs are required to explain most of the variance; as the plots above show, fewer than 5 PCs are clearly sufficient for the analysis
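#a direct check: the smallest number of PCs needed to reach 95% cumulative variance in this decomposition
which(cumvar >= 0.95)[1]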
#Sparse PCA
#When p >> n (far more genes than samples), many features (genes) are irrelevant and PCA can perform very badly. Sparse PCA zeroes out irrelevant features in the PC loadings, i.e. it performs feature selection for us, retaining only the features that drive the major patterns in the data
#install.packages('nsprcomp')
library(nsprcomp)
#Sparse PCA, retaining 4 PCs, at different tolerance thresholds (components are omitted if their standard deviation is less than or equal to tol times the standard deviation of the first component)
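#note: nsprcomp() also has a cardinality argument k (an upper bound on the number of non-zero loadings per component) that constrains sparsity directly; a hypothetical sketch, not run here:
#spc.k <- nsprcomp(t(x1), ncomp = 4, k = 20, scale. = TRUE, nneg = TRUE)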
oldw <- getOption("warn")
options(warn = -1)
spc <- nsprcomp(t(x1), scale. = TRUE, ncomp = 4, tol = 0, nneg = TRUE)
spc2 <- nsprcomp(t(x1), scale. = TRUE, ncomp = 3, tol = 0, nneg = TRUE)
spc3 <- nsprcomp(t(x1), scale. = TRUE, ncomp = 4, tol = 0.0001, nneg = TRUE)
spc4 <- nsprcomp(t(x1), scale. = TRUE, ncomp = 4, tol = 0.001, nneg = TRUE)
spc5 <- nsprcomp(t(x1), scale. = TRUE, ncomp = 4, tol = 0.01, nneg = TRUE)
spc6 <- nsprcomp(t(x1), scale. = TRUE, ncomp = 4, tol = 0.1, nneg = TRUE)
spc7 <- nsprcomp(t(x1), scale. = TRUE, ncomp = 4, tol = 0.5, nneg = TRUE)
spc8 <- nsprcomp(t(x1), scale. = TRUE, ncomp = 4, tol = 0.9, nneg = TRUE)
options(warn = oldw)
#loadings (a value of 0 indicates that a gene is not retained for that PC)
V <- as.data.frame(spc$rotation)
V2 <- as.data.frame(spc2$rotation)
V3 <- as.data.frame(spc3$rotation)
V4 <- as.data.frame(spc4$rotation)
V5 <- as.data.frame(spc5$rotation)
V6 <- as.data.frame(spc6$rotation)
V7 <- as.data.frame(spc7$rotation)
V8 <- as.data.frame(spc8$rotation)
length(which(V==0))
## [1] 156
#156 loadings (gene/PC combinations) are zeroed out by sparse PCA on the scaled data
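#how these zeroed loadings are distributed across the four components:
colSums(V == 0)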
length(which(V2==0))
## [1] 106
#when only 3 PCs are used, relatively more genes are retained (106 zeroed loadings out of 300)
length(which(V3==0))
## [1] 157
length(which(V4==0))
## [1] 156
length(which(V5==0))
## [1] 159
length(which(V6==0))
## [1] 155
#with 4 PCs, a similar number of loadings is zeroed out as long as the tolerance remains low
length(which(V7==0))
## [1] 0
length(which(V8==0))
## [1] 0
#all genes are retained at high tolerance levels
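#pairwise scatterplots of the sparse loadings for each fit (plot() on a data frame with more than two columns produces a pairs plot)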
plot(V)
plot(V2)
plot(V3)
plot(V4)
plot(V5)
plot(V6)
plot(V7)
plot(V8)
#PC scatterplot
cols = as.numeric(as.factor(df2$prognosis))
K = 3
pclabs = c("PC1","PC2","PC3","PC4")
par(mfrow=c(1,K))
for (i in 1:K) {
  j = i + 1
  plot(spc$x[, i], spc$x[, j], type = "n", xlab = pclabs[i], ylab = pclabs[j])
  text(spc$x[, i], spc$x[, j], rownames(df1), col = cols)  #label the 16 samples, colored by prognosis
}