Example of USA arrest data

This data set contains statistics, in arrests per 100,000 residents for assault, murder, and rape in each of the 50 US states in 1973. Also given is the percent of the population living in urban areas. The data frame includes 50 observations (States) on 4 variables.

[,1] Murder numeric Murder arrests (per 100,000) [,2] Assault numeric Assault arrests (per 100,000) [,3] UrbanPop numeric Percent urban populationt [,4] Rape numeric Rape arrests (per 100,000)

# k means clustering Understanding the output

  km <- kmeans(USArrests, 2)
  km

## K-means clustering with 2 clusters of sizes 29, 21
## 
## Cluster means:
##      Murder  Assault UrbanPop     Rape
## 1  4.841379 109.7586 64.03448 16.24828
## 2 11.857143 255.0000 67.61905 28.11429
## 
## Clustering vector:
##        Alabama         Alaska        Arizona       Arkansas     California 
##              2              2              2              2              2 
##       Colorado    Connecticut       Delaware        Florida        Georgia 
##              2              1              2              2              2 
##         Hawaii          Idaho       Illinois        Indiana           Iowa 
##              1              1              2              1              1 
##         Kansas       Kentucky      Louisiana          Maine       Maryland 
##              1              1              2              1              2 
##  Massachusetts       Michigan      Minnesota    Mississippi       Missouri 
##              1              2              1              2              1 
##        Montana       Nebraska         Nevada  New Hampshire     New Jersey 
##              1              1              2              1              1 
##     New Mexico       New York North Carolina   North Dakota           Ohio 
##              2              2              2              1              1 
##       Oklahoma         Oregon   Pennsylvania   Rhode Island South Carolina 
##              1              1              1              1              2 
##   South Dakota      Tennessee          Texas           Utah        Vermont 
##              1              2              2              1              1 
##       Virginia     Washington  West Virginia      Wisconsin        Wyoming 
##              1              1              1              1              1 
## 
## Within cluster sum of squares by cluster:
## [1] 54762.30 41636.73
##  (between_SS / total_SS =  72.9 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"

 # standardize variable
      mu <- apply(USArrests,2,mean)
   sigma <- sqrt(apply(USArrests,2,var))
   mu

##   Murder  Assault UrbanPop     Rape 
##    7.788  170.760   65.540   21.232

   sigma

##    Murder   Assault  UrbanPop      Rape 
##  4.355510 83.337661 14.474763  9.366385

    data <- c()
     for(i in 1:ncol(USArrests)){
       data <- cbind( data, (USArrests[,i]-mu[i])/sigma[i])
     }
  USA.data <- as.data.frame(data)
  names(USA.data) <- names(USArrests)
  row.names(USA.data) <- row.names(USArrests)
  
 Wss <- c()
   for(i in 1:20){
     km <- kmeans(USA.data, i)
    Wss <- c(Wss, km$tot.withinss )}
  plot(c(1:20), Wss, type="l")

   # visualization using PCA
    km <- kmeans(USA.data, 3)
    km

## K-means clustering with 3 clusters of sizes 20, 13, 17
## 
## Cluster means:
##       Murder    Assault   UrbanPop       Rape
## 1  1.0049340  1.0138274  0.1975853  0.8469650
## 2 -0.9615407 -1.1066010 -0.9301069 -0.9667633
## 3 -0.4469795 -0.3465138  0.4788049 -0.2571398
## 
## Clustering vector:
##        Alabama         Alaska        Arizona       Arkansas     California 
##              1              1              1              3              1 
##       Colorado    Connecticut       Delaware        Florida        Georgia 
##              1              3              3              1              1 
##         Hawaii          Idaho       Illinois        Indiana           Iowa 
##              3              2              1              3              2 
##         Kansas       Kentucky      Louisiana          Maine       Maryland 
##              3              2              1              2              1 
##  Massachusetts       Michigan      Minnesota    Mississippi       Missouri 
##              3              1              2              1              1 
##        Montana       Nebraska         Nevada  New Hampshire     New Jersey 
##              2              2              1              2              3 
##     New Mexico       New York North Carolina   North Dakota           Ohio 
##              1              1              1              2              3 
##       Oklahoma         Oregon   Pennsylvania   Rhode Island South Carolina 
##              3              3              3              3              1 
##   South Dakota      Tennessee          Texas           Utah        Vermont 
##              2              1              1              3              2 
##       Virginia     Washington  West Virginia      Wisconsin        Wyoming 
##              3              3              2              2              3 
## 
## Within cluster sum of squares by cluster:
## [1] 46.74796 11.95246 19.62285
##  (between_SS / total_SS =  60.0 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"

    pca1 <- prcomp(USA.data, scale=T)
 plot(pca1$x[,1], pca1$x[, 2])
   for(i in 1:3){
    points(pca1$x[which(km$cluster == i),1],pca1$x[which(km$cluster == i),2],col=i)}

# Cytokine data set

 cyto.data <- read.csv("cytokine.csv", header=T)
   dim(cyto.data)

## [1] 146  28

   cyto.data[1,]

##           Description metformin       group IL.17F..19. GM.CSF..20.
## 1 CAAN04A CD3/CD28-40         0 CD3/CD28-40    9.278473    9.432661
##   IFN.G..25. IL.10..27. CCL.20.MIP.3A..28. IL.12.P70..33. IL.13..35.
## 1   10.84997   9.019513           10.48754        5.42873   9.517088
##   IL.15..37. IL.17A..39. IL.22..43. IL.9..45. IL.1B..46. IL.33..47.
## 1   3.481122    8.571371   5.078289  8.636722   8.009439   3.682115
##   IL.2..48. IL.21..52. IL.4..53. IL.23..54. IL.5..55. IL.6..57.
## 1  10.68993   7.097955  6.672004   3.921532  9.289968  8.491547
##   IL.17E.IL.25..66. IL.27..72. IL.31..73. TNF.A..75. TNF.B..76.
## 1           3.53238   4.343046   5.757042   10.97487   8.802923
##   IL.28A..77.
## 1    3.812798

   row.names(cyto.data) <- cyto.data$Description 

   c.data <- log(cyto.data[,4:28]) #transform
   mu <- apply(c.data,2,mean)
   sigma <- sqrt(apply(c.data,2,var))
   
   z.data <- c()#standardized
     for(i in 1:ncol(c.data)){
       z.data <- cbind(z.data, (c.data[,i]-mu[i])/sigma[i])
     }
   
    Wss <- c()
   for(i in 1:20){
     km <- kmeans(z.data, i)
    Wss <- c(Wss, km$tot.withinss )}
  plot(c(1:20), Wss, type="l")

   km <- kmeans(z.data,4)
   pca1 <- prcomp(z.data, scale=T)
    plot(pca1$x[,1], pca1$x[, 2])
   for(i in 1:4){
    points(pca1$x[which(km$cluster == i),1],pca1$x[which(km$cluster == i),2],col=i)}

    table(cyto.data$metformin,km$cluster)

##    
##      1  2  3  4
##   0 20 32 13  8
##   1 20 32 11 10

    table(cyto.data$group,km$cluster)

##              
##                1  2  3  4
##   CD3/CD28-24 15  0  0  1
##   CD3/CD28-40 25  1  0  0
##   CPG-24       0 21  0  5
##   LPS-24       0  2 24  0
##   NS-24        0 21  0  5
##   NS-40        0 19  0  7

Hierarhical Clustering - complete linkage

USA arrests standardized data

  hc <- hclust(dist(USA.data) )
    plot(hc)
   rect.hclust(hc,k=2)

    clu.lab <- cutree(hc, k=2)
   
       km <- kmeans(USA.data, 2)
      table(  km$cluster, clu.lab)

##    clu.lab
##      1  2
##   1  0 30
##   2 19  1

    pca1 <- prcomp(USA.data, scale=T)
 plot(pca1$x[,1], pca1$x[, 2])
   for(i in 1:3){
    points(pca1$x[which(clu.lab == i),1],pca1$x[which(clu.lab == i),2],col=i)
     points(pca1$x[which(km$cluster == i),1],pca1$x[which(km$cluster== i),2],col=i, pch="*")}

Resampling Method to Detect Significant Clusters

set.seed(1234)
        p.length <- c(); rand.tree.h <- c()
  data <- USA.data;  
  real.tree <- hclust(dist((data)))
  
   for(i in 1:10){
     print(i)
       test <-  sample(data[,1])
          for(ind.c in 2:ncol(data)){
              test <- cbind( test,  sample((data[,ind.c])))}
        rand.tree <- hclust(dist(test))
      rand.tree.h <- cbind(rand.tree.h, rand.tree[[2]])
         p.length <- rbind(p.length,
         quantile(rand.tree[[2]],probs=c(0.925, 0.95, 0.975, 0.99 )))}

## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10

            rand.tree.ref <- apply(rand.tree.h,1,mean)
     apply(p.length,2,mean)

##    92.5%      95%    97.5%      99% 
## 3.980006 4.336291 4.712787 5.198270

    plot(  (rand.tree.ref),  (real.tree[[2]]), xlab="Expected", ylab="Observed"); 
    abline(0,1)

Cut dendrogram to find clusters with

hc <- hclust(dist(USA.data) )
    plot(hc)
   rect.hclust(hc,h=4.3); title(sub="5% significance", line=2)

     plot(hc)
   rect.hclust(hc,h=5.2); title(sub="1% significance", line=2)

USA arrests data - average linkage

      hc <- hclust(dist(USA.data), method="average" )
    plot(hc)
    rect.hclust(hc,k=2)
   rect.hclust(hc,k=2)

    clu.lab <- cutree(hc, k=2)
   table(  km$cluster, clu.lab)

##    clu.lab
##      1  2
##   1  0 30
##   2 20  0

    pca1 <- prcomp(USA.data, scale=T)
 plot(pca1$x[,1], pca1$x[, 2])
   for(i in 1:2){
    points(pca1$x[which(clu.lab == i),1],pca1$x[which(clu.lab == i),2],col=i)
     points(pca1$x[which(km$cluster == i),1],pca1$x[which(km$cluster== i),2],col=i, pch="*")}

summary of 4 clusters

hc <- hclust(dist(USA.data))

    clu.lab = cutree(hc, h=4.3)

cluster.summary <- 
   as.data.frame(
     rbind(tapply( USA.data$Murder, clu.lab,mean),
          tapply( USA.data$Assault, clu.lab,mean),
          tapply( USA.data$Rape, clu.lab,mean),
          tapply( USA.data$UrbanPop, clu.lab,mean)))
  row.names(cluster.summary) <- c("Murder", "Assault","Rape","UrbPop")
    
  cluster.summary

##                  1         2          3         4
## Murder   1.4463290 0.7499801 -0.4400338 -1.057970
## Assault  0.9838289 1.1199128 -0.4353831 -1.104663
## Rape     0.3529110 1.2156432 -0.2830385 -1.025155
## UrbPop  -0.8317925 0.9361748  0.3607592 -1.121953

Cytokine data set

 cyto.data <- read.csv("cytokine.csv", header=T)
   dim(cyto.data)

## [1] 146  28

   cyto.data[1,]

##           Description metformin       group IL.17F..19. GM.CSF..20.
## 1 CAAN04A CD3/CD28-40         0 CD3/CD28-40    9.278473    9.432661
##   IFN.G..25. IL.10..27. CCL.20.MIP.3A..28. IL.12.P70..33. IL.13..35.
## 1   10.84997   9.019513           10.48754        5.42873   9.517088
##   IL.15..37. IL.17A..39. IL.22..43. IL.9..45. IL.1B..46. IL.33..47.
## 1   3.481122    8.571371   5.078289  8.636722   8.009439   3.682115
##   IL.2..48. IL.21..52. IL.4..53. IL.23..54. IL.5..55. IL.6..57.
## 1  10.68993   7.097955  6.672004   3.921532  9.289968  8.491547
##   IL.17E.IL.25..66. IL.27..72. IL.31..73. TNF.A..75. TNF.B..76.
## 1           3.53238   4.343046   5.757042   10.97487   8.802923
##   IL.28A..77.
## 1    3.812798

   row.names(cyto.data) <- cyto.data$Description

   c.data <- log(cyto.data[,4:28])
   mu <- apply(c.data,2,mean)
   sigma <- sqrt(apply(c.data,2,var))
   
   z.data <- c()
     for(i in 1:ncol(c.data)){
       z.data <- cbind(z.data, (c.data[,i]-mu[i])/sigma[i])
     }
   z.data <- as.data.frame(z.data)
   names(z.data) <- names(cyto.data[4:28])
   row.names(z.data) <- row.names(cyto.data)
   
    hc <- hclust(dist(z.data) )
    plot(hc)

            p.length <- c(); rand.tree.h <- c()
  data <- z.data;  real.tree <- hclust(dist((data)))
   for(i in 1:10){
     print(i)
       test <-  sample(data[,1])
          for(ind.c in 2:ncol(data)){
              test <- cbind( test,  sample((data[,ind.c])))}
        rand.tree <- hclust(dist(test))
      rand.tree.h <- cbind(rand.tree.h, rand.tree[[2]])
         p.length <- rbind(p.length,
         quantile(rand.tree[[2]],probs=c(0.925, 0.95, 0.975, 0.99 )))}

## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10

            rand.tree.ref <- apply(rand.tree.h,1,mean)
     plot(  (rand.tree.ref),  (real.tree[[2]]), xlab="Expected", ylab="Observed"); abline(0,1)
    abline(0,1)

    apply(p.length,2,mean)

##     92.5%       95%     97.5%       99% 
##  8.502671  8.873187  9.495468 10.023159

Compare to k-means clustering

     km <- kmeans(z.data,4)
        clu.lab <- cutree(hc, h=8.9)
   table(  km$cluster, clu.lab)

##    clu.lab
##      1  2  3  4
##   1  0  8  9  1
##   2 19  0  0 21
##   3  0 24  0  0
##   4  0  0 64  0

   pca1 <- prcomp(z.data, scale=T)
    plot(pca1$x[,1], pca1$x[, 2])
   for(i in 1:4){
    points(pca1$x[which(km$cluster == i),1],pca1$x[which(km$cluster == i),2],col=i)}

       pca1 <- prcomp(z.data, scale=T)
    plot(pca1$x[,1], pca1$x[, 2])
   for(i in 1:4){
    points(pca1$x[which(clu.lab == i),1],pca1$x[which(clu.lab == i),2],col=i)}

    table(cyto.data$metformin,clu.lab)

##    clu.lab
##      1  2  3  4
##   0 10 16 36 11
##   1  9 16 37 11

    table(cyto.data$group,clu.lab)

##              clu.lab
##                1  2  3  4
##   CD3/CD28-24  2  0  0 14
##   CD3/CD28-40 17  0  1  8
##   CPG-24       0  1 25  0
##   LPS-24       0 24  2  0
##   NS-24        0  1 25  0
##   NS-40        0  6 20  0

     table(cyto.data$group,km$cluster)

##              
##                1  2  3  4
##   CD3/CD28-24  1 15  0  0
##   CD3/CD28-40  0 25  0  1
##   CPG-24       5  0  0 21
##   LPS-24       0  0 24  2
##   NS-24        5  0  0 21
##   NS-40        7  0  0 19

Review on Clustering Methods

Michelle C. Hsu

November 3, 2017

Example of USA arrest data

Hierarhical Clustering - complete linkage

summary of 4 clusters

Cytokine data set