---
title: "Clustering"
output:
  pdf_document: default
  html_document: default
---

Hierarchical clustering is performed on the cell lines in the "NCI60" data, with the goal of finding out whether or not the observations cluster into the distinct types of cancer.

# Attach ISLR so that the NCI60 object is available.
library(ISLR)

# Split NCI60 into its label vector and its data matrix.
nci.labs <- NCI60[["labs"]]
nci.data <- NCI60[["data"]]

# Size of the data: rows x columns.
dim(nci.data)
## [1]   64 6830
# Peek at the first four labels.
head(nci.labs, n = 4)
## [1] "CNS"   "CNS"   "CNS"   "RENAL"
# Count how many observations carry each label.
table(nci.labs)
## nci.labs
##      BREAST         CNS       COLON K562A-repro K562B-repro    LEUKEMIA 
##           7           5           7           1           1           6 
## MCF7A-repro MCF7D-repro    MELANOMA       NSCLC     OVARIAN    PROSTATE 
##           1           1           8           9           6           2 
##       RENAL     UNKNOWN 
##           9           1
# Print the complete label vector (the output below lists all 64 entries).
nci.labs
##  [1] "CNS"         "CNS"         "CNS"         "RENAL"       "BREAST"     
##  [6] "CNS"         "CNS"         "BREAST"      "NSCLC"       "NSCLC"      
## [11] "RENAL"       "RENAL"       "RENAL"       "RENAL"       "RENAL"      
## [16] "RENAL"       "RENAL"       "BREAST"      "NSCLC"       "RENAL"      
## [21] "UNKNOWN"     "OVARIAN"     "MELANOMA"    "PROSTATE"    "OVARIAN"    
## [26] "OVARIAN"     "OVARIAN"     "OVARIAN"     "OVARIAN"     "PROSTATE"   
## [31] "NSCLC"       "NSCLC"       "NSCLC"       "LEUKEMIA"    "K562B-repro"
## [36] "K562A-repro" "LEUKEMIA"    "LEUKEMIA"    "LEUKEMIA"    "LEUKEMIA"   
## [41] "LEUKEMIA"    "COLON"       "COLON"       "COLON"       "COLON"      
## [46] "COLON"       "COLON"       "COLON"       "MCF7A-repro" "BREAST"     
## [51] "MCF7D-repro" "BREAST"      "NSCLC"       "NSCLC"       "NSCLC"      
## [56] "MELANOMA"    "BREAST"      "BREAST"      "MELANOMA"    "MELANOMA"   
## [61] "MELANOMA"    "MELANOMA"    "MELANOMA"    "MELANOMA"
# Standardize the variables: scale() centres each column of nci.data to mean
# zero and rescales it to standard deviation one.
sd.data <- scale(nci.data)

# Use a single plotting panel.
par(mfrow = c(1, 1))

# Pairwise distances between the rows of the standardized data.
data.dist <- dist(sd.data)

# Dendrogram from hclust() with its default linkage, labelled by cancer type.
# cex is passed as a number (0.5) rather than the string ".5": graphical
# parameters such as cex are documented as numeric, and a string only works
# through implicit coercion.
plot(hclust(data.dist), labels = nci.labs, main = "complete linkage",
     xlab = "", sub = "", ylab = "", cex = 0.5)

# Hierarchical clustering of the observations using Complete linkage.
# Use a single plotting panel.
par(mfrow = c(1, 1))

# Dendrogram of the same distance matrix using average linkage.
# cex is passed as a number (0.5) rather than the string ".5" so the call does
# not depend on implicit string-to-number coercion of a graphical parameter.
plot(hclust(data.dist, method = "average"), labels = nci.labs,
     main = "average linkage", xlab = "", sub = "", ylab = "", cex = 0.5)

# Hierarchical clustering of the observations using Average linkage.
# Dendrogram of the same distance matrix using single linkage.
# cex and col are passed as numbers instead of the strings ".5" and "25".
# NOTE(review): col = 25 is a palette index; with the default palette it is
# presumably recycled to an existing colour -- confirm this is the intended
# colour rather than a leftover from another parameter (e.g. pch).
plot(hclust(data.dist, method = "single"), labels = nci.labs,
     main = "Single linkage", xlab = "", sub = "", ylab = "",
     cex = 0.5, col = 25)

# Hierarchical clustering of the observations using Single linkage.

Single linkage tends to yield trailing clusters: very large clusters onto which individual observations are attached one by one.

On the other hand, complete and average linkage tend to yield more balanced and attractive clusters.

For this reason, complete and average linkage are preferred to single linkage. Cell lines within a single cancer type do tend to cluster together, although the clustering is not perfect.

# Build the hierarchical clustering tree from the distances between the rows
# of the standardized data (hclust's default linkage is used).
hc.out <- hclust(d = dist(sd.data))

# Cut the tree so that exactly four clusters remain.
hc.clusters <- cutree(hc.out, k = 4)

# Cross-tabulate cluster membership against the cancer-type labels.
table(hc.clusters, nci.labs)
##            nci.labs
## hc.clusters BREAST CNS COLON K562A-repro K562B-repro LEUKEMIA MCF7A-repro
##           1      2   3     2           0           0        0           0
##           2      3   2     0           0           0        0           0
##           3      0   0     0           1           1        6           0
##           4      2   0     5           0           0        0           1
##            nci.labs
## hc.clusters MCF7D-repro MELANOMA NSCLC OVARIAN PROSTATE RENAL UNKNOWN
##           1           0        8     8       6        2     8       1
##           2           0        0     1       0        0     1       0
##           3           0        0     0       0        0     0       0
##           4           1        0     0       0        0     0       0

Here we can see a clear pattern: all the leukemia cell lines fall into cluster 3, whereas the breast cancer cell lines are spread out over three different clusters.

# Redraw the dendrogram stored in hc.out, labelled by cancer type.
# The original call supplied cex twice and passed cex/col as strings; cex is
# now given once, and both values are numeric.
# NOTE(review): col = 25 is a palette index -- confirm the intended colour.
plot(hc.out, labels = nci.labs, main = "Cluster Dendrogram",
     xlab = "", sub = "", ylab = "", cex = 0.5, col = 25)

# Horizontal red reference line at height 139 on the dendrogram.
abline(h = 139, col = "red")

The argument h=139 plots a horizontal line at height 139. Cutting the dendrogram at this height results in 4 distinct clusters.

# Print the hclust object; the output below shows its call, cluster method,
# distance and number of objects.
hc.out 
## 
## Call:
## hclust(d = dist(sd.data))
## 
## Cluster method   : complete 
## Distance         : euclidean 
## Number of objects: 64
# Seed the random number generator before kmeans() so the run is repeatable.
set.seed(2)

# K-means on the standardized data: four centres, 20 random starts.
km.out <- kmeans(sd.data, centers = 4, nstart = 20)

# Cluster assignment of each observation.
km.clusters <- km.out[["cluster"]]

# Compare the K-means assignment with the four hierarchical clusters.
table(km.clusters, hc.clusters)
##            hc.clusters
## km.clusters  1  2  3  4
##           1 11  0  0  9
##           2 20  7  0  0
##           3  9  0  0  0
##           4  0  0  8  0

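Here we can see the difference between K-means and hierarchical clustering. Cluster 4 in K-means and cluster 3 in hierarchical clustering are identical: the table shows that the same 8 observations make up both, while the other clusters do not correspond one to one.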