NCI60 Data Example

NCI60 cancer cell line microarray data
Consists of 6,830 gene expressions and 64 cancer cell lines

library(ISLR)
nci.labs = NCI60$labs  #labels - for checking later
nci.data = NCI60$data

Each cell line is labeled a cancer type.
We don't use these to perform unsupervised learning.
We will use these to check how good the results are though.

dim(nci.data)

## [1]   64 6830

length(nci.labs)

## [1] 64

nci.labs[1:4]

## [1] "CNS"   "CNS"   "CNS"   "RENAL"

table(nci.labs)

## nci.labs
##      BREAST         CNS       COLON K562A-repro K562B-repro    LEUKEMIA 
##           7           5           7           1           1           6 
## MCF7A-repro MCF7D-repro    MELANOMA       NSCLC     OVARIAN    PROSTATE 
##           1           1           8           9           6           2 
##       RENAL     UNKNOWN 
##           9           1

Pricinpal Component Analysis

Although one could argue that it is better NOT to scale the genes.

pr.out = prcomp(nci.data, scale = T)

Simple color assignment function for each of the 64 cell lines.

Cols = function(vec) {
    cols = rainbow(length(unique(vec)))
    return(cols[as.numeric(as.factor(vec))])
}

Plot the pricinpal component score vectors

par(mfrow = c(1, 2), pin = c(3, 3))
plot(pr.out$x[, 1:2], col = Cols(nci.labs), pch = 19, xlab = "Z1", ylab = "Z2")  #Z1 v Z2
plot(pr.out$x[, c(1, 3)], col = Cols(nci.labs), pch = 19, xlab = "Z1", ylab = "Z3")  #Z1 v Z3

plot of chunk unnamed-chunk-5

Summary of PVE

summary(pr.out)

## Importance of components:
##                           PC1     PC2     PC3     PC4     PC5     PC6
## Standard deviation     27.853 21.4814 19.8205 17.0326 15.9718 15.7211
## Proportion of Variance  0.114  0.0676  0.0575  0.0425  0.0374  0.0362
## Cumulative Proportion   0.114  0.1812  0.2387  0.2812  0.3185  0.3547
##                            PC7     PC8     PC9    PC10    PC11    PC12
## Standard deviation     14.4715 13.5443 13.1440 12.7386 12.6867 12.1577
## Proportion of Variance  0.0307  0.0269  0.0253  0.0238  0.0236  0.0216
## Cumulative Proportion   0.3853  0.4122  0.4375  0.4613  0.4848  0.5065
##                           PC13    PC14    PC15    PC16    PC17    PC18
## Standard deviation     11.8302 11.6255 11.4378 11.0005 10.6567 10.4888
## Proportion of Variance  0.0205  0.0198  0.0192  0.0177  0.0166  0.0161
## Cumulative Proportion   0.5270  0.5467  0.5659  0.5836  0.6002  0.6163
##                           PC19    PC20    PC21    PC22   PC23   PC24
## Standard deviation     10.4352 10.3219 10.1461 10.0544 9.9027 9.6477
## Proportion of Variance  0.0159  0.0156  0.0151  0.0148 0.0144 0.0136
## Cumulative Proportion   0.6323  0.6479  0.6630  0.6778 0.6921 0.7057
##                          PC25   PC26   PC27   PC28   PC29   PC30   PC31
## Standard deviation     9.5076 9.3325 9.2732 9.0900 8.9812 8.7500 8.5996
## Proportion of Variance 0.0132 0.0127 0.0126 0.0121 0.0118 0.0112 0.0108
## Cumulative Proportion  0.7190 0.7317 0.7443 0.7564 0.7682 0.7794 0.7903
##                          PC32   PC33    PC34    PC35    PC36    PC37
## Standard deviation     8.4474 8.3730 8.21579 8.15731 7.97465 7.90446
## Proportion of Variance 0.0104 0.0103 0.00988 0.00974 0.00931 0.00915
## Cumulative Proportion  0.8007 0.8110 0.82087 0.83061 0.83992 0.84907
##                           PC38    PC39    PC40    PC41   PC42    PC43
## Standard deviation     7.82127 7.72156 7.58603 7.45619 7.3444 7.10449
## Proportion of Variance 0.00896 0.00873 0.00843 0.00814 0.0079 0.00739
## Cumulative Proportion  0.85803 0.86676 0.87518 0.88332 0.8912 0.89861
##                          PC44    PC45   PC46    PC47    PC48    PC49
## Standard deviation     7.0131 6.95839 6.8663 6.80744 6.64763 6.61607
## Proportion of Variance 0.0072 0.00709 0.0069 0.00678 0.00647 0.00641
## Cumulative Proportion  0.9058 0.91290 0.9198 0.92659 0.93306 0.93947
##                           PC50    PC51    PC52    PC53    PC54    PC55
## Standard deviation     6.40793 6.21984 6.20326 6.06706 5.91805 5.91233
## Proportion of Variance 0.00601 0.00566 0.00563 0.00539 0.00513 0.00512
## Cumulative Proportion  0.94548 0.95114 0.95678 0.96216 0.96729 0.97241
##                           PC56    PC57   PC58    PC59    PC60    PC61
## Standard deviation     5.73539 5.47261 5.2921 5.02117 4.68398 4.17567
## Proportion of Variance 0.00482 0.00438 0.0041 0.00369 0.00321 0.00255
## Cumulative Proportion  0.97723 0.98161 0.9857 0.98940 0.99262 0.99517
##                           PC62    PC63     PC64
## Standard deviation     4.08212 4.04124 2.15e-14
## Proportion of Variance 0.00244 0.00239 0.00e+00
## Cumulative Proportion  0.99761 1.00000 1.00e+00

plot(pr.out)

plot of chunk unnamed-chunk-6

Scree plot

pve = 100 * pr.out$sdev^2/sum(pr.out$sdev^2)
plot(pve, type = "o", ylab = "PVE", xlab = "Principal Component", col = "blue")

plot of chunk unnamed-chunk-7

plot(cumsum(pve), type = "o", ylab = "Cumulative PVE", xlab = "Principal Component", 
    col = "brown3")

plot of chunk unnamed-chunk-7

Clear “elbow” in the scree plot after 7th component or so.

Clustering

Hierachical cluster analysis using complete, single, and average linkage

sd.data = scale(nci.data)  #scale the data first
data.dist = dist(sd.data)  #Euclidean distance for dissimilarity measure

plot(hclust(data.dist), labels = nci.labs, main = "Complete Linkage", xlab = "", 
    sub = "", ylab = "")  #default linkage method is 'complete'

plot of chunk unnamed-chunk-8


plot(hclust(data.dist, method = "average"), labels = nci.labs, main = "Average Linkage", 
    xlab = "", sub = "", ylab = "")

plot of chunk unnamed-chunk-8


plot(hclust(data.dist, method = "single"), labels = nci.labs, main = "Single Linkage", 
    xlab = "", sub = "", ylab = "")

plot of chunk unnamed-chunk-8

Single linkage methods tend to yield a “trailing” clusters
Complete and average tend to yield more balanced clusters
Using complete linkage hierarchical clustering:

hc.out = hclust(dist(sd.data))
hc.clusters = cutree(hc.out, 4)
table(hc.clusters, nci.labs)

##            nci.labs
## hc.clusters BREAST CNS COLON K562A-repro K562B-repro LEUKEMIA MCF7A-repro
##           1      2   3     2           0           0        0           0
##           2      3   2     0           0           0        0           0
##           3      0   0     0           1           1        6           0
##           4      2   0     5           0           0        0           1
##            nci.labs
## hc.clusters MCF7D-repro MELANOMA NSCLC OVARIAN PROSTATE RENAL UNKNOWN
##           1           0        8     8       6        2     8       1
##           2           0        0     1       0        0     1       0
##           3           0        0     0       0        0     0       0
##           4           1        0     0       0        0     0       0

Clear pattern
All Leukemia cell lines fall in cluster 3
Breast cancer cell lines are spread out over 3 clusters
Plot the cut in the dendrogram that yielded the 4 clusters

plot(hc.out, labels = nci.labs)
abline(h = 139, col = "red")

plot of chunk unnamed-chunk-10

Summary of the dendrogram

hc.out

## 
## Call:
## hclust(d = dist(sd.data))
## 
## Cluster method   : complete 
## Distance         : euclidean 
## Number of objects: 64

Look at how this compares to K-means clustering with K=4

set.seed(2)
km.out = kmeans(sd.data, 4, nstart = 20)
km.clusters = km.out$cluster
table(km.clusters, hc.clusters)

##            hc.clusters
## km.clusters  1  2  3  4
##           1 11  0  0  9
##           2  0  0  8  0
##           3  9  0  0  0
##           4 20  7  0  0

The results from hierachical and k-means clustering are different
Only Cluster 2 in K-means and Cluster 3 in Hierarchical are the same
The other clusters are mixed
Hierarchical clustering on the first 5 principal components
Dissimilarity measure is now based on the first 5 principal components!
Use PCA to “de-noise” the data first!

hc.out = hclust(dist(pr.out$x[, 1:5]))  #Distance is based on principal component scores!!
plot(hc.out, labels = nci.labs, main = "Hierarchical Clustering on First Five PCA Score Vectors")

plot of chunk unnamed-chunk-13

table(cutree(hc.out, 4), nci.labs)

##    nci.labs
##     BREAST CNS COLON K562A-repro K562B-repro LEUKEMIA MCF7A-repro
##   1      0   2     7           0           0        2           0
##   2      5   3     0           0           0        0           0
##   3      0   0     0           1           1        4           0
##   4      2   0     0           0           0        0           1
##    nci.labs
##     MCF7D-repro MELANOMA NSCLC OVARIAN PROSTATE RENAL UNKNOWN
##   1           0        1     8       5        2     7       0
##   2           0        7     1       1        0     2       1
##   3           0        0     0       0        0     0       0
##   4           1        0     0       0        0     0       0