# NCI60 cancer cell line microarray data, which consists of 6,830 gene expression
# measurements on 64 cancer cell lines. Each cell line is labeled with a cancer type.
# The data has 64 rows and 6,830 columns.
library (ISLR)
## Warning: package 'ISLR' was built under R version 3.4.2
nci.labs=NCI60$labs
nci.data=NCI60$data
dim(nci.data)
## [1] 64 6830
# examining the cancer types for the cell lines
table(nci.labs)
## nci.labs
## BREAST CNS COLON K562A-repro K562B-repro LEUKEMIA
## 7 5 7 1 1 6
## MCF7A-repro MCF7D-repro MELANOMA NSCLC OVARIAN PROSTATE
## 1 1 8 9 6 2
## RENAL UNKNOWN
## 9 1
#==============================================================================================
# We first perform PCA on the data after scaling the variables (genes) to
# have standard deviation one
pr.out =prcomp (nci.data , scale=TRUE)
# # The function will be used to assign a color to each of
# the 64 cell lines, based on the cancer type to which it corresponds.
Cols=function (vec ){
cols=rainbow (length (unique (vec )));
return (cols[as.numeric (as.factor (vec))]);
}
# plot the principal component score vectors
par(mfrow =c(1,2))
plot(pr.out$x [,1:2], col =Cols(nci.labs), pch =19,
xlab ="Z1",ylab="Z2")
plot(pr.out$x[,c(1,3) ], col =Cols(nci.labs), pch =19,
xlab ="Z1",ylab="Z3")

# obtain a summary of the proportion of variance explained (PVE)
# of the first few principal components using the summary() method for a
# prcomp object
summary (pr.out)
## Importance of components%s:
## PC1 PC2 PC3 PC4 PC5
## Standard deviation 27.8535 21.48136 19.82046 17.03256 15.97181
## Proportion of Variance 0.1136 0.06756 0.05752 0.04248 0.03735
## Cumulative Proportion 0.1136 0.18115 0.23867 0.28115 0.31850
## PC6 PC7 PC8 PC9 PC10
## Standard deviation 15.72108 14.47145 13.54427 13.14400 12.73860
## Proportion of Variance 0.03619 0.03066 0.02686 0.02529 0.02376
## Cumulative Proportion 0.35468 0.38534 0.41220 0.43750 0.46126
## PC11 PC12 PC13 PC14 PC15
## Standard deviation 12.68672 12.15769 11.83019 11.62554 11.43779
## Proportion of Variance 0.02357 0.02164 0.02049 0.01979 0.01915
## Cumulative Proportion 0.48482 0.50646 0.52695 0.54674 0.56590
## PC16 PC17 PC18 PC19 PC20
## Standard deviation 11.00051 10.65666 10.48880 10.43518 10.3219
## Proportion of Variance 0.01772 0.01663 0.01611 0.01594 0.0156
## Cumulative Proportion 0.58361 0.60024 0.61635 0.63229 0.6479
## PC21 PC22 PC23 PC24 PC25 PC26
## Standard deviation 10.14608 10.0544 9.90265 9.64766 9.50764 9.33253
## Proportion of Variance 0.01507 0.0148 0.01436 0.01363 0.01324 0.01275
## Cumulative Proportion 0.66296 0.6778 0.69212 0.70575 0.71899 0.73174
## PC27 PC28 PC29 PC30 PC31 PC32
## Standard deviation 9.27320 9.0900 8.98117 8.75003 8.59962 8.44738
## Proportion of Variance 0.01259 0.0121 0.01181 0.01121 0.01083 0.01045
## Cumulative Proportion 0.74433 0.7564 0.76824 0.77945 0.79027 0.80072
## PC33 PC34 PC35 PC36 PC37 PC38
## Standard deviation 8.37305 8.21579 8.15731 7.97465 7.90446 7.82127
## Proportion of Variance 0.01026 0.00988 0.00974 0.00931 0.00915 0.00896
## Cumulative Proportion 0.81099 0.82087 0.83061 0.83992 0.84907 0.85803
## PC39 PC40 PC41 PC42 PC43 PC44
## Standard deviation 7.72156 7.58603 7.45619 7.3444 7.10449 7.0131
## Proportion of Variance 0.00873 0.00843 0.00814 0.0079 0.00739 0.0072
## Cumulative Proportion 0.86676 0.87518 0.88332 0.8912 0.89861 0.9058
## PC45 PC46 PC47 PC48 PC49 PC50
## Standard deviation 6.95839 6.8663 6.80744 6.64763 6.61607 6.40793
## Proportion of Variance 0.00709 0.0069 0.00678 0.00647 0.00641 0.00601
## Cumulative Proportion 0.91290 0.9198 0.92659 0.93306 0.93947 0.94548
## PC51 PC52 PC53 PC54 PC55 PC56
## Standard deviation 6.21984 6.20326 6.06706 5.91805 5.91233 5.73539
## Proportion of Variance 0.00566 0.00563 0.00539 0.00513 0.00512 0.00482
## Cumulative Proportion 0.95114 0.95678 0.96216 0.96729 0.97241 0.97723
## PC57 PC58 PC59 PC60 PC61 PC62
## Standard deviation 5.47261 5.2921 5.02117 4.68398 4.17567 4.08212
## Proportion of Variance 0.00438 0.0041 0.00369 0.00321 0.00255 0.00244
## Cumulative Proportion 0.98161 0.9857 0.98940 0.99262 0.99517 0.99761
## PC63 PC64
## Standard deviation 4.04124 2.148e-14
## Proportion of Variance 0.00239 0.000e+00
## Cumulative Proportion 1.00000 1.000e+00
# Using the plot() function, we can also plot the variance explained by the
# first few principal components
plot(pr.out) # Note that the height of each bar in the bar plot is given by squaring the
# corresponding element of pr.out$sdev
#===================================================================================================
# Clustering the Observations of the NCI60 Data
sd.data=scale(nci.data)
par(mfrow =c(1,3))

data.dist=dist(sd.data)
# performing hierarchical clustering
# Complete linkage
plot(hclust (data.dist), labels =nci.labs , main=" Complete Linkage ", xlab ="", sub ="", ylab ="")
#Average Linkage
plot(hclust (data.dist , method ="average"), labels =nci.labs ,
main=" Average Linkage ", xlab ="", sub ="", ylab ="")
#Single Linkage
plot(hclust (data.dist , method ="single"), labels =nci.labs ,
main=" Single Linkage ", xlab="", sub ="", ylab ="")

# NOTE: the choice of linkage certainly does affect the results obtained.
# Typically, single linkage will tend to yield trailing clusters: very large
# clusters onto which individual observations attach one-by-one. On the other hand,
# complete and average linkage tend to yield more balanced, attractive clusters.
# For this reason, complete and average linkage are generally preferred to single linkage.
# Inference: Clearly cell lines within a single cancer type do tend to cluster together,
# although the clustering is not perfect.
# We can cut the dendrogram at the height that will yield a particular number of clusters, k=4
hc.out =hclust (dist(sd.data))
hc.clusters =cutree (hc.out ,4)
table(hc.clusters ,nci.labs)
## nci.labs
## hc.clusters BREAST CNS COLON K562A-repro K562B-repro LEUKEMIA MCF7A-repro
## 1 2 3 2 0 0 0 0
## 2 3 2 0 0 0 0 0
## 3 0 0 0 1 1 6 0
## 4 2 0 5 0 0 0 1
## nci.labs
## hc.clusters MCF7D-repro MELANOMA NSCLC OVARIAN PROSTATE RENAL UNKNOWN
## 1 0 8 8 6 2 8 1
## 2 0 0 1 0 0 1 0
## 3 0 0 0 0 0 0 0
## 4 1 0 0 0 0 0 0
# There are some clear patterns. All the leukemia cell lines fall in cluster 3,
# while the breast cancer cell lines are spread out over three different clusters.
par(mfrow =c(1,1))
plot(hc.out , labels =nci.labs)
abline (h=139, col =" red ")

# How do these NCI60 hierarchical clustering results compare to what we get if we
# perform K-means clustering with K = 4??
set.seed (2)
km.out =kmeans (sd.data , 4, nstart =20)
km.clusters =km.out$cluster
table(km.clusters ,hc.clusters )
## hc.clusters
## km.clusters 1 2 3 4
## 1 11 0 0 9
## 2 0 0 8 0
## 3 9 0 0 0
## 4 20 7 0 0
# We see that the four clusters obtained using hierarchical clustering and Kmeans
# clustering are somewhat different. Cluster 2 in K-means clustering is
# identical to cluster 3 in hierarchical clustering. However, the other clusters differ.
#=================================================================================================
# Rather than performing hierarchical clustering on the entire data matrix,
# we can simply perform hierarchical clustering on the first few principal
# component score vectors, as follows:
hc.out =hclust (dist(pr.out$x [ ,1:5]) )
plot(hc.out , labels =nci.labs , main=" Hier. Clust . on First Five Score Vectors ")

table(cutree (hc.out ,4) , nci.labs)
## nci.labs
## BREAST CNS COLON K562A-repro K562B-repro LEUKEMIA MCF7A-repro
## 1 0 2 7 0 0 2 0
## 2 5 3 0 0 0 0 0
## 3 0 0 0 1 1 4 0
## 4 2 0 0 0 0 0 1
## nci.labs
## MCF7D-repro MELANOMA NSCLC OVARIAN PROSTATE RENAL UNKNOWN
## 1 0 1 8 5 2 7 0
## 2 0 7 1 1 0 2 1
## 3 0 0 0 0 0 0 0
## 4 1 0 0 0 0 0 0
#==================================================================================================