With hierarchical clustering, I have tried clustering communities in MA by factoring 3 variables- Robbery per 1000 people, Median household size and median Income. The dataset had over 100 communities in MA. However, to have more a presentable dendogram, I took a simple random sample of 30 communities in MA and then plotted a dendogram using Agglomerative clustering technique (a type of hierarchical clustering).
The dendogram reveals communities that are similar or dissimilar when it comes to the 3 variables. It gives us a pair of communities that are quite similar and the extent to which they are similar. For example, Amesburytown and Milfordtown are quite similar or how Boston is different from all other communities.
We could cut the tree at a height around 3.5 units that would give us 4 appropriate clusters.
library(tibble)
library(psych)
library(tidyverse) # data manipulation
library(cluster) # clustering algorithms
library(factoextra) # clustering visualization
library(dendextend) # for comparing two dendrograms
crimedata <- read_csv("C:/Users/shreejit/Downloads/crimedata.csv")
cd <- crimedata
str(cd)
cd[cd=="?"] <- NA
# SUBSET state of MA only
cd2 <- subset(cd,state=="MA")
cd2 = cd2[sample(nrow(cd2),30),]
myvars <- c("Ecommunityname","robbbPerPop", "householdsize", "medIncome")
newdata <- cd2[myvars]
newdata <- na.omit(newdata)
# Naming the rows with community names
cd3 <- newdata %>% remove_rownames %>% tibble::column_to_rownames(var="Ecommunityname") %>%
as.data.frame()
# Converting char data to numeric type
cd3$robbbPerPop=as.numeric(cd3$robbbPerPop)
df <- cd3
df <- na.omit(df)
df <- scale(df)
head(df)
## robbbPerPop householdsize medIncome
## Dracuttown 0.2307237 0.16755006 0.2405226
## Westfordtown -0.5199446 0.83197271 1.7984081
## Melrosecity -0.1197858 -0.64131230 0.1337031
## Cantontown -0.2124152 0.05199829 1.0828389
## Pittsfieldcity 0.3134232 -1.01685554 -1.2948054
## Plymouthtown -0.2709569 0.22532594 -0.2934738
# Dissimilarity matrix
d <- dist(df, method = "euclidean")
# Hierarchical clustering using Complete Linkage
hc1 <- hclust(d, method = "complete" )
# Plot the obtained dendrogram
plot(hc1, cex = 0.6, hang = -1)
# Cut tree into 4 groups
sub_grp <- cutree(hc1, k = 4)
# Number of members in each cluster
table(sub_grp)
## sub_grp
## 1 2 3 4
## 20 8 1 1
cd3 %>%
mutate(cluster = sub_grp) %>%
head
## robbbPerPop householdsize medIncome cluster
## 1 56.53 2.85 45165 1
## 2 5.88 3.08 60566 2
## 3 32.88 2.57 44109 1
## 4 26.63 2.81 53492 2
## 5 62.11 2.44 29987 1
## 6 22.68 2.87 39886 1
cd4 <-cd3 %>%
mutate(cluster = sub_grp)
# Compute with agnes
hc2 <- agnes(df, method = "complete")
# Agglomerative coefficient
hc2$ac
## [1] 0.898467
## [1] 0.8531583
# methods to assess
#m <- c( "average", "single", "complete", "ward")
#names(m) <- c( "average", "single", "complete", "ward")
# function to compute coefficient
#ac <- function(x) {
# agnes(df, method = x)$ac
#}
#map_dbl(m, ac)
#hc3 <- agnes(df, method = "ward")
#pltree(hc3, cex = 0.6, hang = -1, main = "Dendrogram of agnes")
# compute divisive hierarchical clustering
#hc4 <- diana(df)
# Divise coefficient; amount of clustering structure found
#hc4$dc
## [1] 0.8514345
# plot dendrogram
#pltree(hc4, cex = 0.6, hang = -1, main = "Dendrogram of diana")
# Ward's method
#hc5 <- hclust(d, method = "ward.D2" )
# Cut tree into 4 groups
#sub_grp <- cutree(hc5, k = 4)
# Number of members in each cluster
#table(sub_grp)
## sub_grp
## 1 2 3 4
## 7 12 19 12
#cd3 %>%
# mutate(cluster = sub_grp) %>%
# head
#cd4 <- cd3 %>%
# mutate(cluster = sub_grp)
##It's also possible to draw the dendrogram with a border around
##the 4 clusters. The argument border is used to specify the border
plot(hc1, cex = 0.6)
rect.hclust(hc1, k = 4, border = 2:5)
##
fviz_cluster(list(data = df, cluster = sub_grp))
Form hierarchical clustering, 4 clusters are ideal to group the communities in MA.
library(tidyverse)
library(cluster) # clustering algorithms
library(factoextra) # clustering algorithms & visualization
df <- cd3
df <- na.omit(df)
df <- scale(df)
head(df)
## robbbPerPop householdsize medIncome
## Dracuttown 0.2307237 0.16755006 0.2405226
## Westfordtown -0.5199446 0.83197271 1.7984081
## Melrosecity -0.1197858 -0.64131230 0.1337031
## Cantontown -0.2124152 0.05199829 1.0828389
## Pittsfieldcity 0.3134232 -1.01685554 -1.2948054
## Plymouthtown -0.2709569 0.22532594 -0.2934738
k4 <- kmeans(df, centers = 4, nstart = 25)
str(k4)
## List of 9
## $ cluster : Named int [1:30] 3 2 3 2 4 3 1 2 3 2 ...
## ..- attr(*, "names")= chr [1:30] "Dracuttown" "Westfordtown" "Melrosecity" "Cantontown" ...
## $ centers : num [1:4, 1:3] 4.9899 -0.406 -0.0497 -0.1181 -0.3813 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:4] "1" "2" "3" "4"
## .. ..$ : chr [1:3] "robbbPerPop" "householdsize" "medIncome"
## $ totss : num 87
## $ withinss : num [1:4] 0 13.42 5.57 1.36
## $ tot.withinss: num 20.3
## $ betweenss : num 66.7
## $ size : int [1:4] 1 9 15 5
## $ iter : int 2
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
k4
## K-means clustering with 4 clusters of sizes 1, 9, 15, 5
##
## Cluster means:
## robbbPerPop householdsize medIncome
## 1 4.98994598 -0.3813208 -1.39919722
## 2 -0.40595701 0.9603636 1.10685751
## 3 -0.04973332 -0.1694759 -0.06085812
## 4 -0.11806661 -1.1439625 -1.52992973
##
## Clustering vector:
## Dracuttown Westfordtown Melrosecity Cantontown
## 3 2 3 2
## Pittsfieldcity Plymouthtown Worcestercity Northboroughtown
## 4 3 1 2
## SouthHadleytown Longmeadowtown Pembroketown Graftontown
## 3 2 2 3
## Franklintown Dennistown Watertowntown Walpoletown
## 2 4 3 2
## Harwichtown Braintreetown Harvardtown Maynardtown
## 4 3 2 3
## Westporttown Dedhamtown Danverstown Oxfordtown
## 3 3 3 3
## Wakefieldtown Wilmingtontown Saugustown NorthAdamscity
## 3 2 3 4
## Webstertown Stoughtontown
## 4 3
##
## Within cluster sum of squares by cluster:
## [1] 0.000000 13.416035 5.566915 1.364373
## (between_SS / total_SS = 76.6 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
fviz_cluster(k4, data = df)
k2 <- kmeans(df, centers = 2, nstart = 25)
k3 <- kmeans(df, centers = 3, nstart = 25)
k5 <- kmeans(df, centers = 5, nstart = 25)
p1 <- fviz_cluster(k2, geom = "point", data = df) + ggtitle("k = 2")
p2 <- fviz_cluster(k3, geom = "point", data = df) + ggtitle("k = 3")
p3 <- fviz_cluster(k4, geom = "point", data = df) + ggtitle("k = 4")
p4 <- fviz_cluster(k5, geom = "point", data = df) + ggtitle("k = 5")
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
grid.arrange(p1, p2, p3, p4, nrow = 2)
final <- kmeans(df, 4, nstart = 25)
cd4 <- cd3 %>%
mutate(Cluster = final$cluster) %>%
group_by(Cluster) %>%
summarise_all("mean")
#Quantitative output by cluster
cd4
## # A tibble: 4 x 4
## Cluster robbbPerPop householdsize medIncome
## <int> <dbl> <dbl> <dbl>
## 1 1 13.6 3.12 53729
## 2 2 378 2.66 28955
## 3 3 33.0 2.40 27663
## 4 4 37.6 2.73 42186
Quantitative output by cluster shows how different the clusters with respect to Robbery. Household sizes and median Income for the clusters don’t have a high variance. Cluster