# Load the community crime data from the local CSV file.
crime <- read.csv(file = "~/GitHub/R-Pubs-Projects/Data/crimes.csv")
library(stats)
We will cluster the communities using the following variables: murders, rapes, robberies, assaults, burglaries, larcenies, auto thefts, arsons, and nonviolent crimes (each measured per 100,000 population).
Create a new data set with the clustering variables
# Keep every column except the first one as the clustering input.
crime_work <- crime[, -1L]
Add row names
# Label each row with its community name so the names carry through to the
# distance matrix and the dendrogram.
row.names(crime_work) <- crime$communityname
Compute the distance matrix
# Pairwise Euclidean distances between communities.
# NOTE(review): no scaling is applied here, so variables with larger
# magnitudes weigh more in the distance - confirm this is intended.
dm <- stats::dist(x = crime_work, method = "euclidean")
Create the clustering model using the hclust function
# Agglomerative hierarchical clustering on the distance matrix.
# NOTE(review): per the hclust documentation, "ward.D" on unsquared distances
# does not implement Ward's criterion, whereas "ward.D2" does. The method is
# kept unchanged so the recorded results stay valid - confirm which is intended.
model <- stats::hclust(d = dm, method = "ward.D")
Plot the model (as a dendrogram)
# Draw the dendrogram, labelling the leaves with the community names.
plot(x = model, labels = row.names(crime_work))
Get the cluster membership of each case (community) for a 3-cluster solution
# Cut the tree into 3 groups; the result is an integer vector of cluster ids
# named by community.
member <- cutree(tree = model, k = 3)
print(member)
## Aberdeen Ada AgouraHills Aiken Alameda
## 1 2 3 2 2
## Alamogordo Albany Albemarle Albuquerque Alexander
## 2 1 1 2 3
## Alexandria Alhambra Alice Allentown Alliance
## 1 3 2 2 1
## Alpharetta Altoona Altus Alvin Amarillo
## 3 3 2 2 2
## AmericanFork Americus Amesbury Amherst Anacortes
## 3 2 3 3 3
## Anchorage Anderson Andrews Angleton Annapolis
## 2 2 3 2 2
## Anniston Ansonia Antioch ApacheJunction Appleton
## 1 3 2 2 3
## Arcadia Arcata Ardmore Arkadelphia Arlington
## 2 2 1 3 2
## Arnold ArroyoGrande Artesia Arvada AsburyPark
## 3 3 2 3 2
## Asheboro Asheville Ashland Ashwaubenonvil Astontown
## 2 2 3 2 3
## Astoria Atascadero Athens Athol Atlanta
## 2 3 2 3 1
## AtlanticCity Atwater Auburn Augusta Aurora
## 1 2 3 1 2
## Austin Avondale Avon Azusa Bainbridge
## 1 2 3 3 2
## Bakersfield BalchSprings BaldwinPark Ballwin Baltimore
## 2 2 3 3 1
## Bangor Banning Barberton Barnegattown Barnstable
## 2 2 2 3 3
## Barrington Barstow Bartlesville Bartow BatonRouge
## 3 1 2 1 1
## Bayonne
## 3
Visualize clusters on the dendrogram
# Redraw the dendrogram and outline the clusters on it. The rectangles use
# k = 3 so they show the same 3-cluster solution that cutree() stored in
# `member`; the original k = 2 outlined a different partition.
plot(model, labels = rownames(crime_work))
rect.hclust(model, k = 3, border = "red")
Compute some summary data for each cluster
Add a cluster membership variable to the data frame
# Append the 3-cluster labels in `member` to the original data as a new column
# named `cluster3`. cbind() pairs rows by position, not by community name.
crime2 <- cbind(crime, cluster3 = member)
Compute the mean of each clustering variable in each cluster
# Attach dplyr for the grouped summary that follows. library() stops with an
# error if the package is missing, whereas require() only warns and returns
# FALSE, which would defer the failure to the first dplyr call.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Per-cluster mean of each of the nine crime-rate variables: one row per
# value of `cluster3`, one column per variable.
crimesumm <- summarise(
  group_by(crime2, cluster3),
  across(
    c(
      murdPerPop, rapesPerPop, robbbPerPop, assaultPerPop, burglPerPop,
      larcPerPop, autoTheftPerPop, arsonsPerPop, nonViolPerPop
    ),
    mean
  )
)
Get the list of communities in each cluster
# List the community names belonging to each cluster (one list element per
# cluster id).
split(x = crime2$communityname, f = factor(crime2$cluster3))
## $`1`
## [1] "Aberdeen" "Albany" "Albemarle" "Alexandria" "Alliance"
## [6] "Anniston" "Ardmore" "Atlanta" "AtlanticCity" "Augusta"
## [11] "Austin" "Baltimore" "Barstow" "Bartow" "BatonRouge"
##
## $`2`
## [1] "Ada" "Aiken" "Alameda" "Alamogordo"
## [5] "Albuquerque" "Alice" "Allentown" "Altus"
## [9] "Alvin" "Amarillo" "Americus" "Anchorage"
## [13] "Anderson" "Angleton" "Annapolis" "Antioch"
## [17] "ApacheJunction" "Arcadia" "Arcata" "Arlington"
## [21] "Artesia" "AsburyPark" "Asheboro" "Asheville"
## [25] "Ashwaubenonvil" "Astoria" "Athens" "Atwater"
## [29] "Aurora" "Avondale" "Bainbridge" "Bakersfield"
## [33] "BalchSprings" "Bangor" "Banning" "Barberton"
## [37] "Bartlesville"
##
## $`3`
## [1] "AgouraHills" "Alexander" "Alhambra" "Alpharetta" "Altoona"
## [6] "AmericanFork" "Amesbury" "Amherst" "Anacortes" "Andrews"
## [11] "Ansonia" "Appleton" "Arkadelphia" "Arnold" "ArroyoGrande"
## [16] "Arvada" "Ashland" "Astontown" "Atascadero" "Athol"
## [21] "Auburn" "Avon" "Azusa" "BaldwinPark" "Ballwin"
## [26] "Barnegattown" "Barnstable" "Barrington" "Bayonne"