We will use the USArrests
data (which is in R)
dimnames(USArrests)
## [[1]]
## [1] "Alabama" "Alaska" "Arizona" "Arkansas"
## [5] "California" "Colorado" "Connecticut" "Delaware"
## [9] "Florida" "Georgia" "Hawaii" "Idaho"
## [13] "Illinois" "Indiana" "Iowa" "Kansas"
## [17] "Kentucky" "Louisiana" "Maine" "Maryland"
## [21] "Massachusetts" "Michigan" "Minnesota" "Mississippi"
## [25] "Missouri" "Montana" "Nebraska" "Nevada"
## [29] "New Hampshire" "New Jersey" "New Mexico" "New York"
## [33] "North Carolina" "North Dakota" "Ohio" "Oklahoma"
## [37] "Oregon" "Pennsylvania" "Rhode Island" "South Carolina"
## [41] "South Dakota" "Tennessee" "Texas" "Utah"
## [45] "Vermont" "Virginia" "Washington" "West Virginia"
## [49] "Wisconsin" "Wyoming"
##
## [[2]]
## [1] "Murder" "Assault" "UrbanPop" "Rape"
apply(USArrests,2,mean)
## Murder Assault UrbanPop Rape
## 7.788 170.760 65.540 21.232
apply(USArrests,2, var)
## Murder Assault UrbanPop Rape
## 18.97 6945.17 209.52 87.73
We see that Assault
has a much larger variance than the other variables. It would dominate the principal components, so we choose to standardize the variables when we perform PCA
pca.out=prcomp(USArrests, scale=TRUE)
pca.out
## Standard deviations:
## [1] 1.5749 0.9949 0.5971 0.4164
##
## Rotation:
## PC1 PC2 PC3 PC4
## Murder -0.5359 0.4182 -0.3412 0.64923
## Assault -0.5832 0.1880 -0.2681 -0.74341
## UrbanPop -0.2782 -0.8728 -0.3780 0.13388
## Rape -0.5434 -0.1673 0.8178 0.08902
names(pca.out)
## [1] "sdev" "rotation" "center" "scale" "x"
biplot(pca.out, scale=0)
K-means works in any dimension, but is most fun to demonstrate in two, because we can plot pictures. Lets make some data with clusters. We do this by shifting the means of the points around.
set.seed(101)
x=matrix(rnorm(100*2),100,2)
xmean=matrix(rnorm(8,sd=4),4,2)
which=sample(1:4,100,replace=TRUE)
x=x+xmean[which,]
plot(x,col=which,pch=19)
We know the “true” cluster IDs, but we wont tell that to the
kmeans
algorithm.
km.out=kmeans(x,4,nstart=15)
km.out
## K-means clustering with 4 clusters of sizes 21, 30, 32, 17
##
## Cluster means:
## [,1] [,2]
## 1 -3.1069 1.1213
## 2 1.7226 -0.2585
## 3 -5.5818 3.3685
## 4 -0.6148 4.8861
##
## Clustering vector:
## [1] 2 3 3 4 1 1 4 3 2 3 2 1 1 3 1 1 2 3 3 2 2 3 1 3 1 1 2 2 3 1 1 4 3 1 3
## [36] 3 1 2 2 3 2 2 3 3 1 3 1 3 4 2 1 2 2 4 3 3 2 2 3 2 1 2 3 4 2 4 3 4 4 2
## [71] 2 4 3 2 3 4 4 2 2 1 2 4 4 3 3 2 3 3 1 2 3 2 4 4 4 2 3 3 1 1
##
## Within cluster sum of squares by cluster:
## [1] 30.83 54.48 71.98 21.05
## (between_SS / total_SS = 87.6 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
plot(x,col=km.out$cluster,cex=2,pch=1,lwd=2)
points(x,col=which,pch=19)
points(x,col=c(4,3,2,1)[which],pch=19)
We will use these same data and use hierarchical clustering
hc.complete=hclust(dist(x),method="complete")
plot(hc.complete)
hc.single=hclust(dist(x),method="single")
plot(hc.single)
hc.average=hclust(dist(x),method="average")
plot(hc.average)
Lets compare this with the actualy clusters in the data. We will use the function
cutree
to cut the tree at level 4. This will produce a vector of numbers from 1 to 4, saying which branch each observation is on. You will sometimes see pretty plots where the leaves of the dendrogram are colored. I searched a bit on the web for how to do this, and its a little too complicated for this demonstration.
We can use table
to see how well they match:
hc.cut=cutree(hc.complete,4)
table(hc.cut,which)
## which
## hc.cut 1 2 3 4
## 1 0 0 30 0
## 2 1 31 0 2
## 3 17 0 0 0
## 4 0 0 0 19
table(hc.cut,km.out$cluster)
##
## hc.cut 1 2 3 4
## 1 0 30 0 0
## 2 2 0 32 0
## 3 0 0 0 17
## 4 19 0 0 0
or we can use our group membership as labels for the leaves of the dendrogram:
plot(hc.complete,labels=which)
library(ISLR)
nci.labs=NCI60$labs
nci.data=NCI60$data
dim(nci.data)
## [1] 64 6830
nci.labs[1:4]
## [1] "CNS" "CNS" "CNS" "RENAL"
table(nci.labs)
## nci.labs
## BREAST CNS COLON K562A-repro K562B-repro LEUKEMIA
## 7 5 7 1 1 6
## MCF7A-repro MCF7D-repro MELANOMA NSCLC OVARIAN PROSTATE
## 1 1 8 9 6 2
## RENAL UNKNOWN
## 9 1
PCA on the NCI60 Data
pr.out=prcomp(nci.data, scale=TRUE)
Cols=function(vec){
cols=rainbow(length(unique(vec)))
return(cols[as.numeric(as.factor(vec))])
}
par(mfrow=c(1,2))
plot(pr.out$x[,1:2], col=Cols(nci.labs), pch=19,xlab="Z1",ylab="Z2")
plot(pr.out$x[,c(1,3)], col=Cols(nci.labs), pch=19,xlab="Z1",ylab="Z3")
summary(pr.out)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 27.853 21.4814 19.8205 17.0326 15.9718 15.7211
## Proportion of Variance 0.114 0.0676 0.0575 0.0425 0.0374 0.0362
## Cumulative Proportion 0.114 0.1812 0.2387 0.2812 0.3185 0.3547
## PC7 PC8 PC9 PC10 PC11 PC12
## Standard deviation 14.4715 13.5443 13.1440 12.7386 12.6867 12.1577
## Proportion of Variance 0.0307 0.0269 0.0253 0.0238 0.0236 0.0216
## Cumulative Proportion 0.3853 0.4122 0.4375 0.4613 0.4848 0.5065
## PC13 PC14 PC15 PC16 PC17 PC18
## Standard deviation 11.8302 11.6255 11.4378 11.0005 10.6567 10.4888
## Proportion of Variance 0.0205 0.0198 0.0192 0.0177 0.0166 0.0161
## Cumulative Proportion 0.5270 0.5467 0.5659 0.5836 0.6002 0.6163
## PC19 PC20 PC21 PC22 PC23 PC24
## Standard deviation 10.4352 10.3219 10.1461 10.0544 9.9027 9.6477
## Proportion of Variance 0.0159 0.0156 0.0151 0.0148 0.0144 0.0136
## Cumulative Proportion 0.6323 0.6479 0.6630 0.6778 0.6921 0.7057
## PC25 PC26 PC27 PC28 PC29 PC30 PC31
## Standard deviation 9.5076 9.3325 9.2732 9.0900 8.9812 8.7500 8.5996
## Proportion of Variance 0.0132 0.0127 0.0126 0.0121 0.0118 0.0112 0.0108
## Cumulative Proportion 0.7190 0.7317 0.7443 0.7564 0.7682 0.7794 0.7903
## PC32 PC33 PC34 PC35 PC36 PC37
## Standard deviation 8.4474 8.3730 8.21579 8.15731 7.97465 7.90446
## Proportion of Variance 0.0104 0.0103 0.00988 0.00974 0.00931 0.00915
## Cumulative Proportion 0.8007 0.8110 0.82087 0.83061 0.83992 0.84907
## PC38 PC39 PC40 PC41 PC42 PC43
## Standard deviation 7.82127 7.72156 7.58603 7.45619 7.3444 7.10449
## Proportion of Variance 0.00896 0.00873 0.00843 0.00814 0.0079 0.00739
## Cumulative Proportion 0.85803 0.86676 0.87518 0.88332 0.8912 0.89861
## PC44 PC45 PC46 PC47 PC48 PC49
## Standard deviation 7.0131 6.95839 6.8663 6.80744 6.64763 6.61607
## Proportion of Variance 0.0072 0.00709 0.0069 0.00678 0.00647 0.00641
## Cumulative Proportion 0.9058 0.91290 0.9198 0.92659 0.93306 0.93947
## PC50 PC51 PC52 PC53 PC54 PC55
## Standard deviation 6.40793 6.21984 6.20326 6.06706 5.91805 5.91233
## Proportion of Variance 0.00601 0.00566 0.00563 0.00539 0.00513 0.00512
## Cumulative Proportion 0.94548 0.95114 0.95678 0.96216 0.96729 0.97241
## PC56 PC57 PC58 PC59 PC60 PC61
## Standard deviation 5.73539 5.47261 5.2921 5.02117 4.68398 4.17567
## Proportion of Variance 0.00482 0.00438 0.0041 0.00369 0.00321 0.00255
## Cumulative Proportion 0.97723 0.98161 0.9857 0.98940 0.99262 0.99517
## PC62 PC63 PC64
## Standard deviation 4.08212 4.04124 1.35e-14
## Proportion of Variance 0.00244 0.00239 0.00e+00
## Cumulative Proportion 0.99761 1.00000 1.00e+00
plot(pr.out)
pve=100*pr.out$sdev^2/sum(pr.out$sdev^2)
par(mfrow=c(1,2))
plot(pve, type="o", ylab="PVE", xlab="Principal Component", col="blue")
plot(cumsum(pve), type="o", ylab="Cumulative PVE", xlab="Principal Component", col="brown3")
sd.data=scale(nci.data)
par(mfrow=c(1,3))
data.dist=dist(sd.data)
plot(hclust(data.dist), labels=nci.labs, main="Complete Linkage", xlab="", sub="",ylab="")
plot(hclust(data.dist, method="average"), labels=nci.labs, main="Average Linkage", xlab="", sub="",ylab="")
plot(hclust(data.dist, method="single"), labels=nci.labs, main="Single Linkage", xlab="", sub="",ylab="")
hc.out=hclust(dist(sd.data))
hc.clusters=cutree(hc.out,4)
table(hc.clusters,nci.labs)
## nci.labs
## hc.clusters BREAST CNS COLON K562A-repro K562B-repro LEUKEMIA MCF7A-repro
## 1 2 3 2 0 0 0 0
## 2 3 2 0 0 0 0 0
## 3 0 0 0 1 1 6 0
## 4 2 0 5 0 0 0 1
## nci.labs
## hc.clusters MCF7D-repro MELANOMA NSCLC OVARIAN PROSTATE RENAL UNKNOWN
## 1 0 8 8 6 2 8 1
## 2 0 0 1 0 0 1 0
## 3 0 0 0 0 0 0 0
## 4 1 0 0 0 0 0 0
par(mfrow=c(1,1))
plot(hc.out, labels=nci.labs)
abline(h=139, col="red")
hc.out
##
## Call:
## hclust(d = dist(sd.data))
##
## Cluster method : complete
## Distance : euclidean
## Number of objects: 64
set.seed(2)
km.out=kmeans(sd.data, 4, nstart=20)
km.clusters=km.out$cluster
table(km.clusters,hc.clusters)
## hc.clusters
## km.clusters 1 2 3 4
## 1 11 0 0 9
## 2 0 0 8 0
## 3 9 0 0 0
## 4 20 7 0 0
hc.out=hclust(dist(pr.out$x[,1:5]))
plot(hc.out, labels=nci.labs, main="Hier. Clust. on First Five Score Vectors")
table(cutree(hc.out,4), nci.labs)
## nci.labs
## BREAST CNS COLON K562A-repro K562B-repro LEUKEMIA MCF7A-repro
## 1 0 2 7 0 0 2 0
## 2 5 3 0 0 0 0 0
## 3 0 0 0 1 1 4 0
## 4 2 0 0 0 0 0 1
## nci.labs
## MCF7D-repro MELANOMA NSCLC OVARIAN PROSTATE RENAL UNKNOWN
## 1 0 1 8 5 2 7 0
## 2 0 7 1 1 0 2 1
## 3 0 0 0 0 0 0 0
## 4 1 0 0 0 0 0 0