Hierarchical Cluster Analysis

# Load the raw crime data from the local project checkout
crime <- read.csv(file = "~/GitHub/R-Pubs-Projects/Data/crimes.csv")
library(stats)

We will cluster the communities by the following variables: murders, rapes, robberies, assaults, burglaries, larcenies, auto thefts, arsons, and non-violent crimes (each measured per 100,000 population).

Create a new data set with the clustering variables

# Keep every column except the first one.
# NOTE(review): presumably column 1 is the community name and the remaining
# columns are exactly the clustering variables -- verify against the CSV.
crime_work <- crime[, -1]

Add row names

# Label each row with its community name so that later output
# (distance matrix, dendrogram, membership vector) carries those labels
row.names(crime_work) <- crime$communityname

Compute the distance matrix

# Pairwise Euclidean distances between the rows of crime_work.
# NOTE(review): no scaling is applied before this call, so if the columns are
# on different scales the larger ones will dominate -- confirm this is intended.
dm <- dist(x = crime_work, method = "euclidean")

Create the clustering model using the hclust function

# Agglomerative hierarchical clustering on the distance matrix.
# NOTE(review): per the hclust documentation, "ward.D" matches Ward's
# criterion only when given squared distances; for plain Euclidean distances
# "ward.D2" is the variant that does -- confirm which one is intended.
model <- hclust(d = dm, method = "ward.D")

Plot the model (as a dendrogram)

# Draw the dendrogram, labelling each leaf with its row name
plot(x = model, labels = row.names(crime_work))

Get the cluster membership of each case (community) for a 3-cluster solution

# Cut the tree into 3 groups: one integer cluster id per clustered row
member <- cutree(tree = model, k = 3)

# Show the membership vector
member
##       Aberdeen            Ada    AgouraHills          Aiken        Alameda 
##              1              2              3              2              2 
##     Alamogordo         Albany      Albemarle    Albuquerque      Alexander 
##              2              1              1              2              3 
##     Alexandria       Alhambra          Alice      Allentown       Alliance 
##              1              3              2              2              1 
##     Alpharetta        Altoona          Altus          Alvin       Amarillo 
##              3              3              2              2              2 
##   AmericanFork       Americus       Amesbury        Amherst      Anacortes 
##              3              2              3              3              3 
##      Anchorage       Anderson        Andrews       Angleton      Annapolis 
##              2              2              3              2              2 
##       Anniston        Ansonia        Antioch ApacheJunction       Appleton 
##              1              3              2              2              3 
##        Arcadia         Arcata        Ardmore    Arkadelphia      Arlington 
##              2              2              1              3              2 
##         Arnold   ArroyoGrande        Artesia         Arvada     AsburyPark 
##              3              3              2              3              2 
##       Asheboro      Asheville        Ashland Ashwaubenonvil      Astontown 
##              2              2              3              2              3 
##        Astoria     Atascadero         Athens          Athol        Atlanta 
##              2              3              2              3              1 
##   AtlanticCity        Atwater         Auburn        Augusta         Aurora 
##              1              2              3              1              2 
##         Austin       Avondale           Avon          Azusa     Bainbridge 
##              1              2              3              3              2 
##    Bakersfield   BalchSprings    BaldwinPark        Ballwin      Baltimore 
##              2              2              3              3              1 
##         Bangor        Banning      Barberton   Barnegattown     Barnstable 
##              2              2              2              3              3 
##     Barrington        Barstow   Bartlesville         Bartow     BatonRouge 
##              3              1              2              1              1 
##        Bayonne 
##              3

Visualize clusters on the dendrogram

# Redraw the dendrogram and outline the clusters on it.
# k must match the cutree() call that produced `member` (3 clusters), so the
# rectangles show the same partition that is summarised afterwards.
plot(model, labels = rownames(crime_work))
rect.hclust(model, k = 3, border = "red")

Compute some summary data for each cluster

Add a cluster membership variable to the data frame

# Append the membership vector as a new column named cluster3.
# NOTE(review): this relies on `member` being in the same row order as
# `crime`; nothing here re-aligns the two -- confirm no rows were dropped.
crime2 <- cbind(crime, cluster3 = member)

Compute the mean of each clustering variable in each cluster

require(dplyr)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Per-cluster mean of each clustering variable (one row per cluster3 value)
crimesumm <- summarise(
  group_by(crime2, cluster3),
  across(
    c(
      murdPerPop, rapesPerPop, robbbPerPop, assaultPerPop, burglPerPop,
      larcPerPop, autoTheftPerPop, arsonsPerPop, nonViolPerPop
    ),
    .fns = mean
  )
)

Get the list of communities in each cluster

# Community names grouped by cluster id (one list element per cluster)
with(crime2, split(communityname, factor(cluster3)))
## $`1`
##  [1] "Aberdeen"     "Albany"       "Albemarle"    "Alexandria"   "Alliance"    
##  [6] "Anniston"     "Ardmore"      "Atlanta"      "AtlanticCity" "Augusta"     
## [11] "Austin"       "Baltimore"    "Barstow"      "Bartow"       "BatonRouge"  
## 
## $`2`
##  [1] "Ada"            "Aiken"          "Alameda"        "Alamogordo"    
##  [5] "Albuquerque"    "Alice"          "Allentown"      "Altus"         
##  [9] "Alvin"          "Amarillo"       "Americus"       "Anchorage"     
## [13] "Anderson"       "Angleton"       "Annapolis"      "Antioch"       
## [17] "ApacheJunction" "Arcadia"        "Arcata"         "Arlington"     
## [21] "Artesia"        "AsburyPark"     "Asheboro"       "Asheville"     
## [25] "Ashwaubenonvil" "Astoria"        "Athens"         "Atwater"       
## [29] "Aurora"         "Avondale"       "Bainbridge"     "Bakersfield"   
## [33] "BalchSprings"   "Bangor"         "Banning"        "Barberton"     
## [37] "Bartlesville"  
## 
## $`3`
##  [1] "AgouraHills"  "Alexander"    "Alhambra"     "Alpharetta"   "Altoona"     
##  [6] "AmericanFork" "Amesbury"     "Amherst"      "Anacortes"    "Andrews"     
## [11] "Ansonia"      "Appleton"     "Arkadelphia"  "Arnold"       "ArroyoGrande"
## [16] "Arvada"       "Ashland"      "Astontown"    "Atascadero"   "Athol"       
## [21] "Auburn"       "Avon"         "Azusa"        "BaldwinPark"  "Ballwin"     
## [26] "Barnegattown" "Barnstable"   "Barrington"   "Bayonne"