Clustering

With hierarchical clustering, I have tried clustering communities in MA by factoring 3 variables- Robbery per 1000 people, Median household size and median Income. The dataset had over 100 communities in MA. However, to have more a presentable dendogram, I took a simple random sample of 30 communities in MA and then plotted a dendogram using Agglomerative clustering technique (a type of hierarchical clustering).

The dendogram reveals communities that are similar or dissimilar when it comes to the 3 variables. It gives us a pair of communities that are quite similar and the extent to which they are similar. For example, Amesburytown and Milfordtown are quite similar or how Boston is different from all other communities.

We could cut the tree at a height around 3.5 units that would give us 4 appropriate clusters.

df <- cd3
df <- na.omit(df)
df <- scale(df)
head(df)

##                robbbPerPop householdsize  medIncome
## Dracuttown       0.2307237    0.16755006  0.2405226
## Westfordtown    -0.5199446    0.83197271  1.7984081
## Melrosecity     -0.1197858   -0.64131230  0.1337031
## Cantontown      -0.2124152    0.05199829  1.0828389
## Pittsfieldcity   0.3134232   -1.01685554 -1.2948054
## Plymouthtown    -0.2709569    0.22532594 -0.2934738

# Dissimilarity matrix
d <- dist(df, method = "euclidean")

# Hierarchical clustering using Complete Linkage
hc1 <- hclust(d, method = "complete" )

# Plot the obtained dendrogram
plot(hc1, cex = 0.6, hang = -1)

# Cut tree into 4 groups
sub_grp <- cutree(hc1, k = 4)

# Number of members in each cluster
table(sub_grp)

## sub_grp
##  1  2  3  4 
## 20  8  1  1

cd3 %>%
  mutate(cluster = sub_grp) %>%
  head

##   robbbPerPop householdsize medIncome cluster
## 1       56.53          2.85     45165       1
## 2        5.88          3.08     60566       2
## 3       32.88          2.57     44109       1
## 4       26.63          2.81     53492       2
## 5       62.11          2.44     29987       1
## 6       22.68          2.87     39886       1

cd4 <-cd3 %>%
  mutate(cluster = sub_grp)




# Compute with agnes
hc2 <- agnes(df, method = "complete")

# Agglomerative coefficient
hc2$ac

## [1] 0.898467

## [1] 0.8531583


# methods to assess
#m <- c( "average", "single", "complete", "ward")
#names(m) <- c( "average", "single", "complete", "ward")

# function to compute coefficient
#ac <- function(x) {
 # agnes(df, method = x)$ac
#}

#map_dbl(m, ac)

#hc3 <- agnes(df, method = "ward")
#pltree(hc3, cex = 0.6, hang = -1, main = "Dendrogram of agnes") 


# compute divisive hierarchical clustering
#hc4 <- diana(df)

# Divise coefficient; amount of clustering structure found
#hc4$dc
## [1] 0.8514345

# plot dendrogram
#pltree(hc4, cex = 0.6, hang = -1, main = "Dendrogram of diana")

# Ward's method
#hc5 <- hclust(d, method = "ward.D2" )

# Cut tree into 4 groups
#sub_grp <- cutree(hc5, k = 4)

# Number of members in each cluster
#table(sub_grp)
## sub_grp
##  1  2  3  4 
##  7 12 19 12


#cd3 %>%
 # mutate(cluster = sub_grp) %>%
#  head

#cd4 <- cd3 %>%
 # mutate(cluster = sub_grp)





##It's also possible to draw the dendrogram with a border around 
##the 4 clusters. The argument border is used to specify the border 

plot(hc1, cex = 0.6)
rect.hclust(hc1, k = 4, border = 2:5)

##
fviz_cluster(list(data = df, cluster = sub_grp))

Form hierarchical clustering, 4 clusters are ideal to group the communities in MA.

library(tidyverse)  
library(cluster)    # clustering algorithms
library(factoextra) # clustering algorithms & visualization

df <- cd3

df <- na.omit(df)

df <- scale(df)
head(df)

##                robbbPerPop householdsize  medIncome
## Dracuttown       0.2307237    0.16755006  0.2405226
## Westfordtown    -0.5199446    0.83197271  1.7984081
## Melrosecity     -0.1197858   -0.64131230  0.1337031
## Cantontown      -0.2124152    0.05199829  1.0828389
## Pittsfieldcity   0.3134232   -1.01685554 -1.2948054
## Plymouthtown    -0.2709569    0.22532594 -0.2934738

k4 <- kmeans(df, centers = 4, nstart = 25)

str(k4)

## List of 9
##  $ cluster     : Named int [1:30] 3 2 3 2 4 3 1 2 3 2 ...
##   ..- attr(*, "names")= chr [1:30] "Dracuttown" "Westfordtown" "Melrosecity" "Cantontown" ...
##  $ centers     : num [1:4, 1:3] 4.9899 -0.406 -0.0497 -0.1181 -0.3813 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:4] "1" "2" "3" "4"
##   .. ..$ : chr [1:3] "robbbPerPop" "householdsize" "medIncome"
##  $ totss       : num 87
##  $ withinss    : num [1:4] 0 13.42 5.57 1.36
##  $ tot.withinss: num 20.3
##  $ betweenss   : num 66.7
##  $ size        : int [1:4] 1 9 15 5
##  $ iter        : int 2
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"

k4

## K-means clustering with 4 clusters of sizes 1, 9, 15, 5
## 
## Cluster means:
##   robbbPerPop householdsize   medIncome
## 1  4.98994598    -0.3813208 -1.39919722
## 2 -0.40595701     0.9603636  1.10685751
## 3 -0.04973332    -0.1694759 -0.06085812
## 4 -0.11806661    -1.1439625 -1.52992973
## 
## Clustering vector:
##       Dracuttown     Westfordtown      Melrosecity       Cantontown 
##                3                2                3                2 
##   Pittsfieldcity     Plymouthtown    Worcestercity Northboroughtown 
##                4                3                1                2 
##  SouthHadleytown   Longmeadowtown     Pembroketown      Graftontown 
##                3                2                2                3 
##     Franklintown       Dennistown    Watertowntown      Walpoletown 
##                2                4                3                2 
##      Harwichtown    Braintreetown      Harvardtown      Maynardtown 
##                4                3                2                3 
##     Westporttown       Dedhamtown      Danverstown       Oxfordtown 
##                3                3                3                3 
##    Wakefieldtown   Wilmingtontown       Saugustown   NorthAdamscity 
##                3                2                3                4 
##      Webstertown    Stoughtontown 
##                4                3 
## 
## Within cluster sum of squares by cluster:
## [1]  0.000000 13.416035  5.566915  1.364373
##  (between_SS / total_SS =  76.6 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"

fviz_cluster(k4, data = df)

k2 <- kmeans(df, centers = 2, nstart = 25)
k3 <- kmeans(df, centers = 3, nstart = 25)
k5 <- kmeans(df, centers = 5, nstart = 25)


p1 <- fviz_cluster(k2, geom = "point", data = df) + ggtitle("k = 2")
p2 <- fviz_cluster(k3, geom = "point",  data = df) + ggtitle("k = 3")
p3 <- fviz_cluster(k4, geom = "point",  data = df) + ggtitle("k = 4")
p4 <- fviz_cluster(k5, geom = "point",  data = df) + ggtitle("k = 5")

library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

grid.arrange(p1, p2, p3, p4, nrow = 2)

final <- kmeans(df, 4, nstart = 25)

cd4 <- cd3 %>%
  mutate(Cluster = final$cluster) %>%
  group_by(Cluster) %>%
  summarise_all("mean")

#Quantitative output by cluster 
cd4

## # A tibble: 4 x 4
##   Cluster robbbPerPop householdsize medIncome
##     <int>       <dbl>         <dbl>     <dbl>
## 1       1        13.6          3.12     53729
## 2       2       378            2.66     28955
## 3       3        33.0          2.40     27663
## 4       4        37.6          2.73     42186

Quantitative output by cluster shows how different the clusters with respect to Robbery. Household sizes and median Income for the clusters don’t have a high variance. Cluster