This data set contains statistics, in arrests per 100,000 residents for assault, murder, and rape in each of the 50 US states in 1973. Also given is the percent of the population living in urban areas. The data frame includes 50 observations (States) on 4 variables.
[,1] Murder numeric Murder arrests (per 100,000) [,2] Assault numeric Assault arrests (per 100,000) [,3] UrbanPop numeric Percent urban populationt [,4] Rape numeric Rape arrests (per 100,000)
# k means clustering Understanding the output
km <- kmeans(USArrests, 2)
km
## K-means clustering with 2 clusters of sizes 29, 21
##
## Cluster means:
## Murder Assault UrbanPop Rape
## 1 4.841379 109.7586 64.03448 16.24828
## 2 11.857143 255.0000 67.61905 28.11429
##
## Clustering vector:
## Alabama Alaska Arizona Arkansas California
## 2 2 2 2 2
## Colorado Connecticut Delaware Florida Georgia
## 2 1 2 2 2
## Hawaii Idaho Illinois Indiana Iowa
## 1 1 2 1 1
## Kansas Kentucky Louisiana Maine Maryland
## 1 1 2 1 2
## Massachusetts Michigan Minnesota Mississippi Missouri
## 1 2 1 2 1
## Montana Nebraska Nevada New Hampshire New Jersey
## 1 1 2 1 1
## New Mexico New York North Carolina North Dakota Ohio
## 2 2 2 1 1
## Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## 1 1 1 1 2
## South Dakota Tennessee Texas Utah Vermont
## 1 2 2 1 1
## Virginia Washington West Virginia Wisconsin Wyoming
## 1 1 1 1 1
##
## Within cluster sum of squares by cluster:
## [1] 54762.30 41636.73
## (between_SS / total_SS = 72.9 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
# standardize variable
mu <- apply(USArrests,2,mean)
sigma <- sqrt(apply(USArrests,2,var))
mu
## Murder Assault UrbanPop Rape
## 7.788 170.760 65.540 21.232
sigma
## Murder Assault UrbanPop Rape
## 4.355510 83.337661 14.474763 9.366385
data <- c()
for(i in 1:ncol(USArrests)){
data <- cbind( data, (USArrests[,i]-mu[i])/sigma[i])
}
USA.data <- as.data.frame(data)
names(USA.data) <- names(USArrests)
row.names(USA.data) <- row.names(USArrests)
Wss <- c()
for(i in 1:20){
km <- kmeans(USA.data, i)
Wss <- c(Wss, km$tot.withinss )}
plot(c(1:20), Wss, type="l")
# visualization using PCA
km <- kmeans(USA.data, 3)
km
## K-means clustering with 3 clusters of sizes 20, 13, 17
##
## Cluster means:
## Murder Assault UrbanPop Rape
## 1 1.0049340 1.0138274 0.1975853 0.8469650
## 2 -0.9615407 -1.1066010 -0.9301069 -0.9667633
## 3 -0.4469795 -0.3465138 0.4788049 -0.2571398
##
## Clustering vector:
## Alabama Alaska Arizona Arkansas California
## 1 1 1 3 1
## Colorado Connecticut Delaware Florida Georgia
## 1 3 3 1 1
## Hawaii Idaho Illinois Indiana Iowa
## 3 2 1 3 2
## Kansas Kentucky Louisiana Maine Maryland
## 3 2 1 2 1
## Massachusetts Michigan Minnesota Mississippi Missouri
## 3 1 2 1 1
## Montana Nebraska Nevada New Hampshire New Jersey
## 2 2 1 2 3
## New Mexico New York North Carolina North Dakota Ohio
## 1 1 1 2 3
## Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## 3 3 3 3 1
## South Dakota Tennessee Texas Utah Vermont
## 2 1 1 3 2
## Virginia Washington West Virginia Wisconsin Wyoming
## 3 3 2 2 3
##
## Within cluster sum of squares by cluster:
## [1] 46.74796 11.95246 19.62285
## (between_SS / total_SS = 60.0 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
pca1 <- prcomp(USA.data, scale=T)
plot(pca1$x[,1], pca1$x[, 2])
for(i in 1:3){
points(pca1$x[which(km$cluster == i),1],pca1$x[which(km$cluster == i),2],col=i)}
# Cytokine data set
cyto.data <- read.csv("cytokine.csv", header=T)
dim(cyto.data)
## [1] 146 28
cyto.data[1,]
## Description metformin group IL.17F..19. GM.CSF..20.
## 1 CAAN04A CD3/CD28-40 0 CD3/CD28-40 9.278473 9.432661
## IFN.G..25. IL.10..27. CCL.20.MIP.3A..28. IL.12.P70..33. IL.13..35.
## 1 10.84997 9.019513 10.48754 5.42873 9.517088
## IL.15..37. IL.17A..39. IL.22..43. IL.9..45. IL.1B..46. IL.33..47.
## 1 3.481122 8.571371 5.078289 8.636722 8.009439 3.682115
## IL.2..48. IL.21..52. IL.4..53. IL.23..54. IL.5..55. IL.6..57.
## 1 10.68993 7.097955 6.672004 3.921532 9.289968 8.491547
## IL.17E.IL.25..66. IL.27..72. IL.31..73. TNF.A..75. TNF.B..76.
## 1 3.53238 4.343046 5.757042 10.97487 8.802923
## IL.28A..77.
## 1 3.812798
row.names(cyto.data) <- cyto.data$Description
c.data <- log(cyto.data[,4:28]) #transform
mu <- apply(c.data,2,mean)
sigma <- sqrt(apply(c.data,2,var))
z.data <- c()#standardized
for(i in 1:ncol(c.data)){
z.data <- cbind(z.data, (c.data[,i]-mu[i])/sigma[i])
}
Wss <- c()
for(i in 1:20){
km <- kmeans(z.data, i)
Wss <- c(Wss, km$tot.withinss )}
plot(c(1:20), Wss, type="l")
km <- kmeans(z.data,4)
pca1 <- prcomp(z.data, scale=T)
plot(pca1$x[,1], pca1$x[, 2])
for(i in 1:4){
points(pca1$x[which(km$cluster == i),1],pca1$x[which(km$cluster == i),2],col=i)}
table(cyto.data$metformin,km$cluster)
##
## 1 2 3 4
## 0 20 32 13 8
## 1 20 32 11 10
table(cyto.data$group,km$cluster)
##
## 1 2 3 4
## CD3/CD28-24 15 0 0 1
## CD3/CD28-40 25 1 0 0
## CPG-24 0 21 0 5
## LPS-24 0 2 24 0
## NS-24 0 21 0 5
## NS-40 0 19 0 7
USA arrests standardized data
hc <- hclust(dist(USA.data) )
plot(hc)
rect.hclust(hc,k=2)
clu.lab <- cutree(hc, k=2)
km <- kmeans(USA.data, 2)
table( km$cluster, clu.lab)
## clu.lab
## 1 2
## 1 0 30
## 2 19 1
pca1 <- prcomp(USA.data, scale=T)
plot(pca1$x[,1], pca1$x[, 2])
for(i in 1:3){
points(pca1$x[which(clu.lab == i),1],pca1$x[which(clu.lab == i),2],col=i)
points(pca1$x[which(km$cluster == i),1],pca1$x[which(km$cluster== i),2],col=i, pch="*")}
Resampling Method to Detect Significant Clusters
set.seed(1234)
p.length <- c(); rand.tree.h <- c()
data <- USA.data;
real.tree <- hclust(dist((data)))
for(i in 1:10){
print(i)
test <- sample(data[,1])
for(ind.c in 2:ncol(data)){
test <- cbind( test, sample((data[,ind.c])))}
rand.tree <- hclust(dist(test))
rand.tree.h <- cbind(rand.tree.h, rand.tree[[2]])
p.length <- rbind(p.length,
quantile(rand.tree[[2]],probs=c(0.925, 0.95, 0.975, 0.99 )))}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
rand.tree.ref <- apply(rand.tree.h,1,mean)
apply(p.length,2,mean)
## 92.5% 95% 97.5% 99%
## 3.980006 4.336291 4.712787 5.198270
plot( (rand.tree.ref), (real.tree[[2]]), xlab="Expected", ylab="Observed");
abline(0,1)
Cut dendrogram to find clusters with
hc <- hclust(dist(USA.data) )
plot(hc)
rect.hclust(hc,h=4.3); title(sub="5% significance", line=2)
plot(hc)
rect.hclust(hc,h=5.2); title(sub="1% significance", line=2)
USA arrests data - average linkage
hc <- hclust(dist(USA.data), method="average" )
plot(hc)
rect.hclust(hc,k=2)
rect.hclust(hc,k=2)
clu.lab <- cutree(hc, k=2)
table( km$cluster, clu.lab)
## clu.lab
## 1 2
## 1 0 30
## 2 20 0
pca1 <- prcomp(USA.data, scale=T)
plot(pca1$x[,1], pca1$x[, 2])
for(i in 1:2){
points(pca1$x[which(clu.lab == i),1],pca1$x[which(clu.lab == i),2],col=i)
points(pca1$x[which(km$cluster == i),1],pca1$x[which(km$cluster== i),2],col=i, pch="*")}
hc <- hclust(dist(USA.data))
clu.lab = cutree(hc, h=4.3)
cluster.summary <-
as.data.frame(
rbind(tapply( USA.data$Murder, clu.lab,mean),
tapply( USA.data$Assault, clu.lab,mean),
tapply( USA.data$Rape, clu.lab,mean),
tapply( USA.data$UrbanPop, clu.lab,mean)))
row.names(cluster.summary) <- c("Murder", "Assault","Rape","UrbPop")
cluster.summary
## 1 2 3 4
## Murder 1.4463290 0.7499801 -0.4400338 -1.057970
## Assault 0.9838289 1.1199128 -0.4353831 -1.104663
## Rape 0.3529110 1.2156432 -0.2830385 -1.025155
## UrbPop -0.8317925 0.9361748 0.3607592 -1.121953
cyto.data <- read.csv("cytokine.csv", header=T)
dim(cyto.data)
## [1] 146 28
cyto.data[1,]
## Description metformin group IL.17F..19. GM.CSF..20.
## 1 CAAN04A CD3/CD28-40 0 CD3/CD28-40 9.278473 9.432661
## IFN.G..25. IL.10..27. CCL.20.MIP.3A..28. IL.12.P70..33. IL.13..35.
## 1 10.84997 9.019513 10.48754 5.42873 9.517088
## IL.15..37. IL.17A..39. IL.22..43. IL.9..45. IL.1B..46. IL.33..47.
## 1 3.481122 8.571371 5.078289 8.636722 8.009439 3.682115
## IL.2..48. IL.21..52. IL.4..53. IL.23..54. IL.5..55. IL.6..57.
## 1 10.68993 7.097955 6.672004 3.921532 9.289968 8.491547
## IL.17E.IL.25..66. IL.27..72. IL.31..73. TNF.A..75. TNF.B..76.
## 1 3.53238 4.343046 5.757042 10.97487 8.802923
## IL.28A..77.
## 1 3.812798
row.names(cyto.data) <- cyto.data$Description
c.data <- log(cyto.data[,4:28])
mu <- apply(c.data,2,mean)
sigma <- sqrt(apply(c.data,2,var))
z.data <- c()
for(i in 1:ncol(c.data)){
z.data <- cbind(z.data, (c.data[,i]-mu[i])/sigma[i])
}
z.data <- as.data.frame(z.data)
names(z.data) <- names(cyto.data[4:28])
row.names(z.data) <- row.names(cyto.data)
hc <- hclust(dist(z.data) )
plot(hc)
p.length <- c(); rand.tree.h <- c()
data <- z.data; real.tree <- hclust(dist((data)))
for(i in 1:10){
print(i)
test <- sample(data[,1])
for(ind.c in 2:ncol(data)){
test <- cbind( test, sample((data[,ind.c])))}
rand.tree <- hclust(dist(test))
rand.tree.h <- cbind(rand.tree.h, rand.tree[[2]])
p.length <- rbind(p.length,
quantile(rand.tree[[2]],probs=c(0.925, 0.95, 0.975, 0.99 )))}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
rand.tree.ref <- apply(rand.tree.h,1,mean)
plot( (rand.tree.ref), (real.tree[[2]]), xlab="Expected", ylab="Observed"); abline(0,1)
abline(0,1)
apply(p.length,2,mean)
## 92.5% 95% 97.5% 99%
## 8.502671 8.873187 9.495468 10.023159
Compare to k-means clustering
km <- kmeans(z.data,4)
clu.lab <- cutree(hc, h=8.9)
table( km$cluster, clu.lab)
## clu.lab
## 1 2 3 4
## 1 0 8 9 1
## 2 19 0 0 21
## 3 0 24 0 0
## 4 0 0 64 0
pca1 <- prcomp(z.data, scale=T)
plot(pca1$x[,1], pca1$x[, 2])
for(i in 1:4){
points(pca1$x[which(km$cluster == i),1],pca1$x[which(km$cluster == i),2],col=i)}
pca1 <- prcomp(z.data, scale=T)
plot(pca1$x[,1], pca1$x[, 2])
for(i in 1:4){
points(pca1$x[which(clu.lab == i),1],pca1$x[which(clu.lab == i),2],col=i)}
table(cyto.data$metformin,clu.lab)
## clu.lab
## 1 2 3 4
## 0 10 16 36 11
## 1 9 16 37 11
table(cyto.data$group,clu.lab)
## clu.lab
## 1 2 3 4
## CD3/CD28-24 2 0 0 14
## CD3/CD28-40 17 0 1 8
## CPG-24 0 1 25 0
## LPS-24 0 24 2 0
## NS-24 0 1 25 0
## NS-40 0 6 20 0
table(cyto.data$group,km$cluster)
##
## 1 2 3 4
## CD3/CD28-24 1 15 0 0
## CD3/CD28-40 0 25 0 1
## CPG-24 5 0 0 21
## LPS-24 0 0 24 2
## NS-24 5 0 0 21
## NS-40 7 0 0 19