Lab 3: NCI60 Data Example

library(ISLR)
nci.labs = NCI60$labs
nci.data = NCI60$data
dim(nci.data)
[1]   64 6830
nci.labs[1:4]
[1] "CNS"   "CNS"   "CNS"   "RENAL"
table(nci.labs)
nci.labs
     BREAST         CNS       COLON K562A-repro 
          7           5           7           1 
K562B-repro    LEUKEMIA MCF7A-repro MCF7D-repro 
          1           6           1           1 
   MELANOMA       NSCLC     OVARIAN    PROSTATE 
          8           9           6           2 
      RENAL     UNKNOWN 
          9           1 

PCA on the NCI60 Data

pr.out = prcomp(nci.data, scale = TRUE)
Cols = function (vec) {
  cols = rainbow(length(unique(vec)))
  return(cols[as.numeric(as.factor(vec))])
}
par(mfrow = c(1, 2))
plot(pr.out$x[,1:2], col = Cols(nci.labs), pch = 19, xlab = "Z1", ylab = "Z2")
plot(pr.out$x[,c(1:3)], col = Cols(nci.labs), pch = 19, xlab = "Z1", ylab = "Z3")

summary(pr.out)
Importance of components:
                           PC1      PC2      PC3
Standard deviation     27.8535 21.48136 19.82046
Proportion of Variance  0.1136  0.06756  0.05752
Cumulative Proportion   0.1136  0.18115  0.23867
                            PC4      PC5      PC6
Standard deviation     17.03256 15.97181 15.72108
Proportion of Variance  0.04248  0.03735  0.03619
Cumulative Proportion   0.28115  0.31850  0.35468
                            PC7      PC8      PC9
Standard deviation     14.47145 13.54427 13.14400
Proportion of Variance  0.03066  0.02686  0.02529
Cumulative Proportion   0.38534  0.41220  0.43750
                           PC10     PC11     PC12
Standard deviation     12.73860 12.68672 12.15769
Proportion of Variance  0.02376  0.02357  0.02164
Cumulative Proportion   0.46126  0.48482  0.50646
                           PC13     PC14     PC15
Standard deviation     11.83019 11.62554 11.43779
Proportion of Variance  0.02049  0.01979  0.01915
Cumulative Proportion   0.52695  0.54674  0.56590
                           PC16     PC17     PC18
Standard deviation     11.00051 10.65666 10.48880
Proportion of Variance  0.01772  0.01663  0.01611
Cumulative Proportion   0.58361  0.60024  0.61635
                           PC19    PC20     PC21
Standard deviation     10.43518 10.3219 10.14608
Proportion of Variance  0.01594  0.0156  0.01507
Cumulative Proportion   0.63229  0.6479  0.66296
                          PC22    PC23    PC24
Standard deviation     10.0544 9.90265 9.64766
Proportion of Variance  0.0148 0.01436 0.01363
Cumulative Proportion   0.6778 0.69212 0.70575
                          PC25    PC26    PC27
Standard deviation     9.50764 9.33253 9.27320
Proportion of Variance 0.01324 0.01275 0.01259
Cumulative Proportion  0.71899 0.73174 0.74433
                         PC28    PC29    PC30
Standard deviation     9.0900 8.98117 8.75003
Proportion of Variance 0.0121 0.01181 0.01121
Cumulative Proportion  0.7564 0.76824 0.77945
                          PC31    PC32    PC33
Standard deviation     8.59962 8.44738 8.37305
Proportion of Variance 0.01083 0.01045 0.01026
Cumulative Proportion  0.79027 0.80072 0.81099
                          PC34    PC35    PC36
Standard deviation     8.21579 8.15731 7.97465
Proportion of Variance 0.00988 0.00974 0.00931
Cumulative Proportion  0.82087 0.83061 0.83992
                          PC37    PC38    PC39
Standard deviation     7.90446 7.82127 7.72156
Proportion of Variance 0.00915 0.00896 0.00873
Cumulative Proportion  0.84907 0.85803 0.86676
                          PC40    PC41   PC42
Standard deviation     7.58603 7.45619 7.3444
Proportion of Variance 0.00843 0.00814 0.0079
Cumulative Proportion  0.87518 0.88332 0.8912
                          PC43   PC44    PC45
Standard deviation     7.10449 7.0131 6.95839
Proportion of Variance 0.00739 0.0072 0.00709
Cumulative Proportion  0.89861 0.9058 0.91290
                         PC46    PC47    PC48
Standard deviation     6.8663 6.80744 6.64763
Proportion of Variance 0.0069 0.00678 0.00647
Cumulative Proportion  0.9198 0.92659 0.93306
                          PC49    PC50    PC51
Standard deviation     6.61607 6.40793 6.21984
Proportion of Variance 0.00641 0.00601 0.00566
Cumulative Proportion  0.93947 0.94548 0.95114
                          PC52    PC53    PC54
Standard deviation     6.20326 6.06706 5.91805
Proportion of Variance 0.00563 0.00539 0.00513
Cumulative Proportion  0.95678 0.96216 0.96729
                          PC55    PC56    PC57
Standard deviation     5.91233 5.73539 5.47261
Proportion of Variance 0.00512 0.00482 0.00438
Cumulative Proportion  0.97241 0.97723 0.98161
                         PC58    PC59    PC60
Standard deviation     5.2921 5.02117 4.68398
Proportion of Variance 0.0041 0.00369 0.00321
Cumulative Proportion  0.9857 0.98940 0.99262
                          PC61    PC62    PC63
Standard deviation     4.17567 4.08212 4.04124
Proportion of Variance 0.00255 0.00244 0.00239
Cumulative Proportion  0.99517 0.99761 1.00000
                            PC64
Standard deviation     2.148e-14
Proportion of Variance 0.000e+00
Cumulative Proportion  1.000e+00
plot(pr.out)

pve = 100*pr.out$sdev^2/sum(pr.out$sdev^2)
par(mfrow = c(1,2))
plot(pve, type = "o", ylab = "PVE", xlab = "Principal Component", col = "blue")
plot(cumsum(pve), type = "o", ylab = "Cumulative PVE", xlab = "Principal Component", col = "brown3")

Clustering the Observations of the NCI60 Data

sd.data = scale(nci.data)
par(mfrow = c(1,3))
data.dist = dist(sd.data)
plot(hclust(data.dist), labels = nci.labs, main = "Complete Linkage", xlab = "", sub = "", ylab = "")
plot(hclust(data.dist, method = "average"), labels = nci.labs, main = "Average Linkage", xlab = "", sub = "", ylab = "")
plot(hclust(data.dist, method = "single"), labels = nci.labs, main = "Single Linkage", xlab = "", sub = "", ylab = "")

hc.out = hclust(dist(sd.data))
hc.clusters = cutree(hc.out, 4)
table(hc.clusters, nci.labs)
           nci.labs
hc.clusters BREAST CNS COLON K562A-repro
          1      2   3     2           0
          2      3   2     0           0
          3      0   0     0           1
          4      2   0     5           0
           nci.labs
hc.clusters K562B-repro LEUKEMIA MCF7A-repro
          1           0        0           0
          2           0        0           0
          3           1        6           0
          4           0        0           1
           nci.labs
hc.clusters MCF7D-repro MELANOMA NSCLC OVARIAN
          1           0        8     8       6
          2           0        0     1       0
          3           0        0     0       0
          4           1        0     0       0
           nci.labs
hc.clusters PROSTATE RENAL UNKNOWN
          1        2     8       1
          2        0     1       0
          3        0     0       0
          4        0     0       0
par(mfrow = c(1,1))
plot(hc.out, labels = nci.labs)
abline(h = 139, col = "red")

hc.out

Call:
hclust(d = dist(sd.data))

Cluster method   : complete 
Distance         : euclidean 
Number of objects: 64 
set.seed(2)
km.out = kmeans(sd.data, 4, nstart = 20)
km.clusters = km.out$cluster
table(km.clusters, hc.clusters)
           hc.clusters
km.clusters  1  2  3  4
          1 11  0  0  9
          2  0  0  8  0
          3  9  0  0  0
          4 20  7  0  0
hc.out = hclust(dist(pr.out$x[,1:5]))
plot(hc.out, labels = nci.labs, main = "Hier. Clust. on First Five Score Vectors")

table(cutree(hc.out, 4), nci.labs)
   nci.labs
    BREAST CNS COLON K562A-repro K562B-repro
  1      0   2     7           0           0
  2      5   3     0           0           0
  3      0   0     0           1           1
  4      2   0     0           0           0
   nci.labs
    LEUKEMIA MCF7A-repro MCF7D-repro MELANOMA
  1        2           0           0        1
  2        0           0           0        7
  3        4           0           0        0
  4        0           1           1        0
   nci.labs
    NSCLC OVARIAN PROSTATE RENAL UNKNOWN
  1     8       5        2     7       0
  2     1       1        0     2       1
  3     0       0        0     0       0
  4     0       0        0     0       0
LS0tDQp0aXRsZTogIkNhcO10dWxvIDEwLCBMYWIgMyINCm91dHB1dDogaHRtbF9ub3RlYm9vaw0KYXV0aG9yOiAiTHVpcyBKaW1lbmV6Ig0KLS0tDQoNCiMjIyBMYWIgMzogTkNJNjAgRGF0YSBFeGFtcGxlDQoNCg0KYGBge3J9DQpsaWJyYXJ5KElTTFIpDQpuY2kubGFicyA9IE5DSTYwJGxhYnMNCm5jaS5kYXRhID0gTkNJNjAkZGF0YQ0KYGBgDQoNCmBgYHtyfQ0KZGltKG5jaS5kYXRhKQ0KYGBgDQoNCmBgYHtyfQ0KbmNpLmxhYnNbMTo0XQ0KdGFibGUobmNpLmxhYnMpDQpgYGANCg0KIyMjIyBQQ0Egb24gdGhlIE5DSTYwIERhdGENCg0KYGBge3J9DQpwci5vdXQgPSBwcmNvbXAobmNpLmRhdGEsIHNjYWxlID0gVFJVRSkNCmBgYA0KDQpgYGB7cn0NCkNvbHMgPSBmdW5jdGlvbiAodmVjKSB7DQogIGNvbHMgPSByYWluYm93KGxlbmd0aCh1bmlxdWUodmVjKSkpDQogIHJldHVybihjb2xzW2FzLm51bWVyaWMoYXMuZmFjdG9yKHZlYykpXSkNCn0NCmBgYA0KDQpgYGB7cn0NCnBhcihtZnJvdyA9IGMoMSwgMikpDQpwbG90KHByLm91dCR4WywxOjJdLCBjb2wgPSBDb2xzKG5jaS5sYWJzKSwgcGNoID0gMTksIHhsYWIgPSAiWjEiLCB5bGFiID0gIloyIikNCnBsb3QocHIub3V0JHhbLGMoMTozKV0sIGNvbCA9IENvbHMobmNpLmxhYnMpLCBwY2ggPSAxOSwgeGxhYiA9ICJaMSIsIHlsYWIgPSAiWjMiKQ0KYGBgDQoNCmBgYHtyfQ0Kc3VtbWFyeShwci5vdXQpDQpgYGANCg0KYGBge3J9DQpwbG90KHByLm91dCkNCmBgYA0KDQpgYGB7cn0NCnB2ZSA9IDEwMCAqIHByLm91dCRzZGV2XjIgLyBzdW0ocHIub3V0JHNkZXZeMikNCnBhcihtZnJvdyA9IGMoMSwyKSkNCnBsb3QocHZlLCB0eXBlID0gIm8iLCB5bGFiID0gIlBWRSIsIHhsYWIgPSAiUHJpbmNpcGFsIENvbXBvbmVudCIsIGNvbCA9ICJibHVlIikNCnBsb3QoY3Vtc3VtKHB2ZSksIHR5cGUgPSAibyIsIHlsYWIgPSAiQ3VtdWxhdGl2ZSBQVkUiLCB4bGFiID0gIlByaW5jaXBhbCBDb21wb25lbnQiLCBjb2wgPSAiYnJvd24zIikNCmBgYA0KDQojIyMjIENsdXN0ZXJpbmcgdGhlIE9ic2VydmF0aW9ucyBvZiB0aGUgTkNJNjAgRGF0YQ0KDQpgYGB7cn0NCnNkLmRhdGEgPSBzY2FsZShuY2kuZGF0YSkNCmBgYA0KDQpgYGB7cn0NCnBhcihtZnJvdyA9IGMoMSwzKSkNCmRhdGEuZGlzdCA9IGRpc3Qoc2QuZGF0YSkNCnBsb3QoaGNsdXN0KGRhdGEuZGlzdCksIGxhYmVscyA9IG5jaS5sYWJzLCBtYWluID0gIkNvbXBsZXRlIExpbmthZ2UiLCB4bGFiID0gIiIsIHN1YiA9ICIiLCB5bGFiID0gIiIpDQpwbG90KGhjbHVzdChkYXRhLmRpc3QsIG1ldGhvZCA9ICJhdmVyYWdlIiksIGxhYmVscyA9IG5jaS5sYWJzLCBtYWluID0gIkF2ZXJhZ2UgTGlua2FnZSIsIHhsYWIgPSAiIiwgc3ViID0gIiIsIHlsYWIgPSAiIikNCnBsb3QoaGNsdXN0KGRhdGEuZGlzdCwgbWV0aG9kID0gInNpbmdsZSIpLCBsYWJlbHMgPSBuY2kubGFicywgbWFpbiA9ICJTaW5nbGUgTGlua2FnZSIsIHhsYWIgPSAiIiwgc3ViID0gIiIsIHlsYWIgPSAiIikNCmBgYA0KDQpgYGB7cn0NCmhjLm91dCA9IGhjbHVzdChkaXN0KHNkLmRhdGEpKQ0KaGMuY2x1c3RlcnMgPSBjdXRyZWUoaGMub3V0LCA0KQ0KdGFibGUoaGMuY2x1c3RlcnMsIG5jaS5sYWJzKQ0KYGBgDQoNCmBgYHtyfQ0KcGFyKG1mcm93ID0gYygxLDEpKQ0KcGxvdChoYy5vdXQsIGxhYmVscyA9IG5jaS5sYWJzKQ0KYWJsaW5lKGggPSAxMzksIGNvbCA9ICJyZWQiKQ0KYGBgDQoNCmBgYHtyfQ0KaGMub3V0DQpgYGANCg0KYGBge3J9DQpzZXQuc2VlZCgyKQ0Ka20ub3V0ID0ga21lYW5zKHNkLmRhdGEsIDQsIG5zdGFydCA9IDIwKQ0Ka20uY2x1c3RlcnMgPSBrbS5vdXQkY2x1c3Rlcg0KdGFibGUoa20uY2x1c3RlcnMsIGhjLmNsdXN0ZXJzKQ0KYGBgDQoNCmBgYHtyfQ0KaGMub3V0ID0gaGNsdXN0KGRpc3QocHIub3V0JHhbLDE6NV0pKQ0KcGxvdChoYy5vdXQsIGxhYmVscyA9IG5jaS5sYWJzLCBtYWluID0gIkhpZXIuIENsdXN0LiBvbiBGaXJzdCBGaXZlIFNjb3JlIFZlY3RvcnMiKQ0KdGFibGUoY3V0cmVlKGhjLm91dCwgNCksIG5jaS5sYWJzKQ0KYGBgDQoNCmBgYHtyfQ0KDQpgYGANCg0KYGBge3J9DQoNCmBgYA0KDQpgYGB7cn0NCg0KYGBgDQoNCmBgYHtyfQ0KDQpgYGANCg==