NCI60 Data Example

library (ISLR)
nci.labs=NCI60$labs
nci.data=NCI60$data
dim(nci.data)
[1]   64 6830
nci.labs [1:4]
[1] "CNS"   "CNS"   "CNS"   "RENAL"

PCA on the NCI60 Data

pr.out =prcomp (nci.data , scale=TRUE)
Cols=function (vec ){
  cols=rainbow (length (unique (vec )))
  return (cols[as.numeric (as.factor (vec))])
}
par(mfrow =c(1,2))
plot(pr.out$x [,1:2], col =Cols(nci.labs), pch =19,
xlab ="Z1",ylab="Z2")
plot(pr.out$x[,c(1,3) ], col =Cols(nci.labs), pch =19,
xlab ="Z1",ylab="Z3")

summary (pr.out)
Importance of components:
                           PC1      PC2      PC3      PC4
Standard deviation     27.8535 21.48136 19.82046 17.03256
Proportion of Variance  0.1136  0.06756  0.05752  0.04248
Cumulative Proportion   0.1136  0.18115  0.23867  0.28115
                            PC5      PC6      PC7
Standard deviation     15.97181 15.72108 14.47145
Proportion of Variance  0.03735  0.03619  0.03066
Cumulative Proportion   0.31850  0.35468  0.38534
                            PC8      PC9     PC10
Standard deviation     13.54427 13.14400 12.73860
Proportion of Variance  0.02686  0.02529  0.02376
Cumulative Proportion   0.41220  0.43750  0.46126
                           PC11     PC12     PC13
Standard deviation     12.68672 12.15769 11.83019
Proportion of Variance  0.02357  0.02164  0.02049
Cumulative Proportion   0.48482  0.50646  0.52695
                           PC14     PC15     PC16
Standard deviation     11.62554 11.43779 11.00051
Proportion of Variance  0.01979  0.01915  0.01772
Cumulative Proportion   0.54674  0.56590  0.58361
                           PC17     PC18     PC19    PC20
Standard deviation     10.65666 10.48880 10.43518 10.3219
Proportion of Variance  0.01663  0.01611  0.01594  0.0156
Cumulative Proportion   0.60024  0.61635  0.63229  0.6479
                           PC21    PC22    PC23    PC24
Standard deviation     10.14608 10.0544 9.90265 9.64766
Proportion of Variance  0.01507  0.0148 0.01436 0.01363
Cumulative Proportion   0.66296  0.6778 0.69212 0.70575
                          PC25    PC26    PC27   PC28
Standard deviation     9.50764 9.33253 9.27320 9.0900
Proportion of Variance 0.01324 0.01275 0.01259 0.0121
Cumulative Proportion  0.71899 0.73174 0.74433 0.7564
                          PC29    PC30    PC31    PC32
Standard deviation     8.98117 8.75003 8.59962 8.44738
Proportion of Variance 0.01181 0.01121 0.01083 0.01045
Cumulative Proportion  0.76824 0.77945 0.79027 0.80072
                          PC33    PC34    PC35    PC36
Standard deviation     8.37305 8.21579 8.15731 7.97465
Proportion of Variance 0.01026 0.00988 0.00974 0.00931
Cumulative Proportion  0.81099 0.82087 0.83061 0.83992
                          PC37    PC38    PC39    PC40
Standard deviation     7.90446 7.82127 7.72156 7.58603
Proportion of Variance 0.00915 0.00896 0.00873 0.00843
Cumulative Proportion  0.84907 0.85803 0.86676 0.87518
                          PC41   PC42    PC43   PC44
Standard deviation     7.45619 7.3444 7.10449 7.0131
Proportion of Variance 0.00814 0.0079 0.00739 0.0072
Cumulative Proportion  0.88332 0.8912 0.89861 0.9058
                          PC45   PC46    PC47    PC48
Standard deviation     6.95839 6.8663 6.80744 6.64763
Proportion of Variance 0.00709 0.0069 0.00678 0.00647
Cumulative Proportion  0.91290 0.9198 0.92659 0.93306
                          PC49    PC50    PC51    PC52
Standard deviation     6.61607 6.40793 6.21984 6.20326
Proportion of Variance 0.00641 0.00601 0.00566 0.00563
Cumulative Proportion  0.93947 0.94548 0.95114 0.95678
                          PC53    PC54    PC55    PC56
Standard deviation     6.06706 5.91805 5.91233 5.73539
Proportion of Variance 0.00539 0.00513 0.00512 0.00482
Cumulative Proportion  0.96216 0.96729 0.97241 0.97723
                          PC57   PC58    PC59    PC60
Standard deviation     5.47261 5.2921 5.02117 4.68398
Proportion of Variance 0.00438 0.0041 0.00369 0.00321
Cumulative Proportion  0.98161 0.9857 0.98940 0.99262
                          PC61    PC62    PC63      PC64
Standard deviation     4.17567 4.08212 4.04124 2.148e-14
Proportion of Variance 0.00255 0.00244 0.00239 0.000e+00
Cumulative Proportion  0.99517 0.99761 1.00000 1.000e+00
plot(pr.out)

pve =100* pr.out$sdev ^2/ sum(pr.out$sdev ^2)
par(mfrow =c(1,2))
plot(pve , type ="o", ylab="PVE ", xlab="Principal Component",
col ="blue")
plot(cumsum (pve), type="o", ylab ="Cumulative PVE", xlab="Principal Component", col ="brown3")

Clustering the Observations of the NCI60 Data

sd.data=scale(nci.data)
par(mfrow =c(1,3))
data.dist=dist(sd.data)
plot(hclust (data.dist), labels =nci.labs , main="Complete
Linkage", xlab ="", sub ="", ylab ="")
plot(hclust (data.dist, method ="average"), labels =nci.labs ,
main="Average Linkage", xlab ="", sub ="", ylab ="")
plot(hclust(data.dist , method ="single"), labels =nci.labs ,
main="Single Linkage", xlab="", sub ="", ylab ="")

hc.out =hclust (dist(sd.data))
hc.clusters =cutree (hc.out ,4)
table(hc.clusters ,nci.labs)
           nci.labs
hc.clusters BREAST CNS COLON K562A-repro K562B-repro
          1      2   3     2           0           0
          2      3   2     0           0           0
          3      0   0     0           1           1
          4      2   0     5           0           0
           nci.labs
hc.clusters LEUKEMIA MCF7A-repro MCF7D-repro MELANOMA
          1        0           0           0        8
          2        0           0           0        0
          3        6           0           0        0
          4        0           1           1        0
           nci.labs
hc.clusters NSCLC OVARIAN PROSTATE RENAL UNKNOWN
          1     8       6        2     8       1
          2     1       0        0     1       0
          3     0       0        0     0       0
          4     0       0        0     0       0
par(mfrow =c(1,1))
plot(hc.out,labels =nci.labs)
abline (h=139, col ="red")

hc.out

Call:
hclust(d = dist(sd.data))

Cluster method   : complete 
Distance         : euclidean 
Number of objects: 64 
set.seed (2)
km.out =kmeans (sd.data , 4, nstart =20)
km.clusters =km.out$cluster
table(km.clusters ,hc.clusters )
           hc.clusters
km.clusters  1  2  3  4
          1 11  0  0  9
          2  0  0  8  0
          3  9  0  0  0
          4 20  7  0  0
hc.out =hclust (dist(pr.out$x [ ,1:5]) )
plot(hc.out , labels =nci.labs , main="Hier. Clust . on First
Five Score Vectors")

table(cutree (hc.out ,4) , nci.labs)
   nci.labs
    BREAST CNS COLON K562A-repro K562B-repro LEUKEMIA
  1      0   2     7           0           0        2
  2      5   3     0           0           0        0
  3      0   0     0           1           1        4
  4      2   0     0           0           0        0
   nci.labs
    MCF7A-repro MCF7D-repro MELANOMA NSCLC OVARIAN
  1           0           0        1     8       5
  2           0           0        7     1       1
  3           0           0        0     0       0
  4           1           1        0     0       0
   nci.labs
    PROSTATE RENAL UNKNOWN
  1        2     7       0
  2        0     2       1
  3        0     0       0
  4        0     0       0
LS0tDQp0aXRsZTogIkxhYm9yYXRvcmlvIENhcO10dWxvIDEwLjYiDQpzdWJ0aXRsZTogIkFsZWphbmRyYSBDZXJtZfFvIg0Kb3V0cHV0OiBodG1sX25vdGVib29rDQotLS0NCg0KIyMjTkNJNjAgRGF0YSBFeGFtcGxlDQoNCmBgYHtyLCB3YXJuaW5nPSBGQUxTRX0NCmxpYnJhcnkgKElTTFIpDQpuY2kubGFicz1OQ0k2MCRsYWJzDQpuY2kuZGF0YT1OQ0k2MCRkYXRhDQpgYGANCg0KYGBge3J9DQpkaW0obmNpLmRhdGEpDQpgYGANCg0KYGBge3J9DQpuY2kubGFicyBbMTo0XQ0KYGBgDQoNCiMjIyNQQ0Egb24gdGhlIE5DSTYwIERhdGENCg0KYGBge3J9DQpwci5vdXQgPXByY29tcCAobmNpLmRhdGEgLCBzY2FsZT1UUlVFKQ0KYGBgDQoNCmBgYHtyfQ0KQ29scz1mdW5jdGlvbiAodmVjICl7DQogIGNvbHM9cmFpbmJvdyAobGVuZ3RoICh1bmlxdWUgKHZlYyApKSkNCiAgcmV0dXJuIChjb2xzW2FzLm51bWVyaWMgKGFzLmZhY3RvciAodmVjKSldKQ0KfQ0KYGBgDQoNCmBgYHtyfQ0KcGFyKG1mcm93ID1jKDEsMikpDQpwbG90KHByLm91dCR4IFssMToyXSwgY29sID1Db2xzKG5jaS5sYWJzKSwgcGNoID0xOSwNCnhsYWIgPSJaMSIseWxhYj0iWjIiKQ0KcGxvdChwci5vdXQkeFssYygxLDMpIF0sIGNvbCA9Q29scyhuY2kubGFicyksIHBjaCA9MTksDQp4bGFiID0iWjEiLHlsYWI9IlozIikNCmBgYA0KDQpgYGB7cn0NCnN1bW1hcnkgKHByLm91dCkNCmBgYA0KDQpgYGB7cn0NCnBsb3QocHIub3V0KQ0KYGBgDQoNCmBgYHtyfQ0KcHZlID0xMDAqIHByLm91dCRzZGV2IF4yLyBzdW0ocHIub3V0JHNkZXYgXjIpDQpwYXIobWZyb3cgPWMoMSwyKSkNCnBsb3QocHZlICwgdHlwZSA9Im8iLCB5bGFiPSJQVkUgIiwgeGxhYj0iUHJpbmNpcGFsIENvbXBvbmVudCIsDQpjb2wgPSJibHVlIikNCnBsb3QoY3Vtc3VtIChwdmUpLCB0eXBlPSJvIiwgeWxhYiA9IkN1bXVsYXRpdmUgUFZFIiwgeGxhYj0iUHJpbmNpcGFsIENvbXBvbmVudCIsIGNvbCA9ImJyb3duMyIpDQpgYGANCg0KIyMjI0NsdXN0ZXJpbmcgdGhlIE9ic2VydmF0aW9ucyBvZiB0aGUgTkNJNjAgRGF0YQ0KDQpgYGB7cn0NCnNkLmRhdGE9c2NhbGUobmNpLmRhdGEpDQpgYGANCg0KYGBge3J9DQpwYXIobWZyb3cgPWMoMSwzKSkNCmRhdGEuZGlzdD1kaXN0KHNkLmRhdGEpDQpwbG90KGhjbHVzdCAoZGF0YS5kaXN0KSwgbGFiZWxzID1uY2kubGFicyAsIG1haW49IkNvbXBsZXRlDQpMaW5rYWdlIiwgeGxhYiA9IiIsIHN1YiA9IiIsIHlsYWIgPSIiKQ0KcGxvdChoY2x1c3QgKGRhdGEuZGlzdCwgbWV0aG9kID0iYXZlcmFnZSIpLCBsYWJlbHMgPW5jaS5sYWJzICwNCm1haW49IkF2ZXJhZ2UgTGlua2FnZSIsIHhsYWIgPSIiLCBzdWIgPSIiLCB5bGFiID0iIikNCnBsb3QoaGNsdXN0KGRhdGEuZGlzdCAsIG1ldGhvZCA9InNpbmdsZSIpLCBsYWJlbHMgPW5jaS5sYWJzICwNCm1haW49IlNpbmdsZSBMaW5rYWdlIiwgeGxhYj0iIiwgc3ViID0iIiwgeWxhYiA9IiIpDQpgYGANCg0KYGBge3J9DQpoYy5vdXQgPWhjbHVzdCAoZGlzdChzZC5kYXRhKSkNCmhjLmNsdXN0ZXJzID1jdXRyZWUgKGhjLm91dCAsNCkNCnRhYmxlKGhjLmNsdXN0ZXJzICxuY2kubGFicykNCmBgYA0KDQpgYGB7cn0NCnBhcihtZnJvdyA9YygxLDEpKQ0KcGxvdChoYy5vdXQsbGFiZWxzID1uY2kubGFicykNCmFibGluZSAoaD0xMzksIGNvbCA9InJlZCIpDQpgYGANCg0KYGBge3J9DQpoYy5vdXQNCmBgYA0KDQpgYGB7cn0NCnNldC5zZWVkICgyKQ0Ka20ub3V0ID1rbWVhbnMgKHNkLmRhdGEgLCA0LCBuc3RhcnQgPTIwKQ0Ka20uY2x1c3RlcnMgPWttLm91dCRjbHVzdGVyDQp0YWJsZShrbS5jbHVzdGVycyAsaGMuY2x1c3RlcnMgKQ0KYGBgDQoNCmBgYHtyfQ0KaGMub3V0ID1oY2x1c3QgKGRpc3QocHIub3V0JHggWyAsMTo1XSkgKQ0KcGxvdChoYy5vdXQgLCBsYWJlbHMgPW5jaS5sYWJzICwgbWFpbj0iSGllci4gQ2x1c3QgLiBvbiBGaXJzdA0KRml2ZSBTY29yZSBWZWN0b3JzIikNCnRhYmxlKGN1dHJlZSAoaGMub3V0ICw0KSAsIG5jaS5sYWJzKQ0KYGBgDQoNCg0KDQoNCg0KDQoNCg==