Step 1. Find the optimal number of clusters (elbow, gap or silhouette methods)

The elbow plot suggests 6 as the optimal number of clusters, since that is where the bend (the "elbow") appears. The average silhouette plot also supports 6, although the fviz_nbclust() version below points to 3. The gap statistic method suggests 1, which I ignore. I choose 6 as the optimal number of clusters.
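As a compact cross-check (not part of the original workflow), the three criteria can also be drawn side by side with fviz_nbclust() from factoextra; this sketch assumes df is the prepared numeric data used throughout, and the p_* objects are just names introduced here.

library(factoextra)
library(gridExtra)
set.seed(123)
p_elbow <- fviz_nbclust(df, kmeans, method = "wss") + ggtitle("Elbow")
p_sil   <- fviz_nbclust(df, kmeans, method = "silhouette") + ggtitle("Silhouette")
p_gap   <- fviz_nbclust(df, kmeans, method = "gap_stat", nboot = 50) + ggtitle("Gap statistic")
grid.arrange(p_elbow, p_sil, p_gap, nrow = 1)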

##https://uc-r.github.io/kmeans_clustering#elbow
#packages used throughout this analysis
library(tidyverse)   # data manipulation; map_dbl(), mutate(), arrange()
library(cluster)     # silhouette(), clusGap(), agnes(), diana(), pltree()
library(factoextra)  # get_dist(), fviz_nbclust(), fviz_cluster(), fviz_gap_stat()
library(gridExtra)   # grid.arrange()
library(dendextend)  # dendlist(), tanglegram(), entanglement()

#method 1: elbow method
set.seed(123)

# function to compute total within-cluster sum of square 
wss <- function(k) {
  kmeans(df, k, nstart = 10 )$tot.withinss
}

# Compute and plot wss for k = 1 to k = 15
k.values <- 1:15

# extract wss for 1-15 clusters
wss_values <- map_dbl(k.values, wss)

plot(k.values, wss_values,
       type="b", pch = 19, frame = FALSE, 
       xlab="Number of clusters K",
       ylab="Total within-clusters sum of squares")

#6 is the optimal number of clusters
set.seed(123)
fviz_nbclust(df, kmeans, method = "wss")

#method 2: average silhouette method
avg_sil <- function(k) {
  km.res <- kmeans(df, centers = k, nstart = 25)
  ss <- silhouette(km.res$cluster, dist(df))
  mean(ss[, 3])
}

# Compute and plot average silhouette for k = 2 to k = 15
k.values <- 2:15

# extract avg silhouette for 2-15 clusters
avg_sil_values <- map_dbl(k.values, avg_sil)

plot(k.values, avg_sil_values,
       type = "b", pch = 19, frame = FALSE, 
       xlab = "Number of clusters K",
       ylab = "Average Silhouettes")

fviz_nbclust(df, kmeans, method = "silhouette")

#the silhouette plot suggests 3 as the optimal number, but I still choose 6 (consistent with the elbow method)
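As a further check on choosing 6 despite the silhouette suggestion, the per-observation silhouette widths of a k = 6 solution can be inspected directly with fviz_silhouette() from factoextra; a sketch, with km6 and sil6 being names introduced here.

km6  <- kmeans(df, centers = 6, nstart = 25)
sil6 <- silhouette(km6$cluster, dist(df))
fviz_silhouette(sil6)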

#method 3: gap statistic
set.seed(123)
gap_stat <- clusGap(df, FUN = kmeans, nstart = 25,
                    K.max = 10, B = 50)
print(gap_stat, method = "firstmax")
## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = df, FUNcluster = kmeans, K.max = 10, B = 50, nstart = 25)
## B=50 simulated reference sets, k = 1..10; spaceH0="scaledPCA"
##  --> Number of clusters (method 'firstmax'): 1
##           logW   E.logW       gap     SE.sim
##  [1,] 2.853226 3.319462 0.4662363 0.05856958
##  [2,] 2.536996 2.990989 0.4539934 0.05926627
##  [3,] 2.272295 2.734368 0.4620727 0.06141266
##  [4,] 2.038943 2.530756 0.4918135 0.06476126
##  [5,] 1.900035 2.373808 0.4737725 0.07111306
##  [6,] 1.748845 2.238805 0.4899595 0.07314937
##  [7,] 1.609755 2.113089 0.5033344 0.07281482
##  [8,] 1.513977 2.001064 0.4870869 0.06957714
##  [9,] 1.374078 1.893535 0.5194576 0.07168146
## [10,] 1.257140 1.796631 0.5394912 0.07193585
fviz_gap_stat(gap_stat)
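The default "firstmax" rule lands on k = 1 here. As a cross-check, print.clusGap() also supports other maxSE rules, for example Tibshirani's 1-SE rule, applied to the same gap_stat object (a sketch):

print(gap_stat, method = "Tibs2001SEmax")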

Step 2. Perform the K-Means cluster analysis

# optional preprocessing (kept commented out): test data, remove missing values, standardize
#dftest <- USArrests
#df <- na.omit(df)
#df <- scale(df)

distance <- get_dist(df)
#note: for a large df, the pairwise distance heatmap below can be slow, so the data may need to be downsized first
fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))

k6 <- kmeans(df, centers = 6, nstart = 25)
str(k6)
## List of 9
##  $ cluster     : Named int [1:44] 3 3 3 4 6 2 5 6 3 1 ...
##   ..- attr(*, "names")= chr [1:44] "AK" "AL" "AR" "AZ" ...
##  $ centers     : num [1:6, 1:2] -0.295 0.717 -0.416 -0.348 -0.669 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:6] "1" "2" "3" "4" ...
##   .. ..$ : chr [1:2] "mean_7msp" "mean_8tnl"
##  $ totss       : num 86
##  $ withinss    : num [1:6] 1.02 1.47 2.59 0 2.15 ...
##  $ tot.withinss: num 9.01
##  $ betweenss   : num 77
##  $ size        : int [1:6] 4 8 15 1 12 4
##  $ iter        : int 3
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"
cluster_plot <- fviz_cluster(k6, data = df)
cluster_plot

#try when number of clusters (k) is 3, 4, and 5
k3 <- kmeans(df, centers = 3, nstart = 25)
k4 <- kmeans(df, centers = 4, nstart = 25)
k5 <- kmeans(df, centers = 5, nstart = 25)
#plot to compare
p1 <- fviz_cluster(k6, geom = "point", data = df) + ggtitle("k = 6")
p2 <- fviz_cluster(k3, geom = "point",  data = df) + ggtitle("k = 3")
p3 <- fviz_cluster(k4, geom = "point",  data = df) + ggtitle("k = 4")
p4 <- fviz_cluster(k5, geom = "point",  data = df) + ggtitle("k = 5")

#gridExtra (loaded above) provides grid.arrange()
grid.arrange(p1, p2, p3, p4, nrow = 2)
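To describe what the k = 6 clusters look like, the cluster labels can be attached back to the data and summarised per cluster. A sketch, assuming df still carries the state abbreviations as row names and the (standardized) columns mean_7msp and mean_8tnl seen in str(k6):

as.data.frame(df) %>%
  mutate(cluster = k6$cluster) %>%
  group_by(cluster) %>%
  summarise(across(c(mean_7msp, mean_8tnl), mean), n_states = n())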

Step 3. Perform the hierarchical analysis

##https://uc-r.github.io/hc_clustering
# Dissimilarity matrix
d <- dist(df, method = "euclidean")

# Hierarchical clustering using Complete Linkage
hc1 <- hclust(d, method = "complete" )

# Plot the obtained dendrogram
plot(hc1, cex = 0.6, hang = -1)

# small simulated example: k-means on toy 2-D data with three underlying groups
set.seed(1234)
x <- rnorm(12, mean = rep(1:3, each = 4), sd = 0.2)
y <- rnorm(12, mean = rep(c(1, 2, 1), each = 4), sd = 0.2)
plot(x, y, col = "blue", pch = 19, cex = 2)

#text(x + 0.05, y + 0.05, labels = as.character(1:12))

dataFrame <- data.frame(x, y)
kmeansObj <- kmeans(dataFrame, centers = 3)
names(kmeansObj)
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
kmeansObj$cluster
##  [1] 3 1 1 3 2 2 2 2 2 2 2 2
# plot the simulated points coloured by cluster assignment, with cluster centers marked
plot(x, y, col = kmeansObj$cluster, pch = 19, cex = 2)
points(kmeansObj$centers, col = 1:3, pch = 3, cex = 3, lwd = 3)

##Alternatively, we can use the agnes function.
# Compute with agnes
hc2 <- agnes(df, method = "complete")
# Agglomerative coefficient
hc2$ac
## [1] 0.9276342
# methods to assess
m <- c( "average", "single", "complete", "ward")
names(m) <- c( "average", "single", "complete", "ward")

# function to compute coefficient
ac <- function(x) {
  agnes(df, method = x)$ac
}

map_dbl(m, ac)
##   average    single  complete      ward 
## 0.9114313 0.8612660 0.9276342 0.9509801
hc3 <- agnes(df, method = "ward")
pltree(hc3, cex = 0.6, hang = -1, main = "Dendrogram of agnes")

# compute divisive hierarchical clustering
hc4 <- diana(df)

# Divisive coefficient; amount of clustering structure found
hc4$dc
## [1] 0.9182127
# plot dendrogram
pltree(hc4, cex = 0.6, hang = -1, main = "Dendrogram of diana")

# Ward's method
hc5 <- hclust(d, method = "ward.D2" )

# Cut tree into 4 groups
sub_grp <- cutree(hc5, k = 4)

# Number of members in each cluster
table(sub_grp)
## sub_grp
##  1  2  3  4 
## 26  5  4  9
plot(hc5, cex = 0.6)

#rect.hclust(hc5, k = 4, border = 2:5)
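The hierarchical groups can also be shown in the same scatter-plot style as the k-means results: fviz_cluster() accepts a list containing the data and a cluster vector. A sketch using the 4-group cut from above:

fviz_cluster(list(data = df, cluster = sub_grp))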

# Cut agnes() tree into 6 groups
hc_a <- agnes(df, method = "ward")
cutree(as.hclust(hc_a), k = 6)
## AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO NC 
##  1  1  1  2  3  4  5  3  1  6  5  3  5  4  6  5  1  5  1  4  4  1  5  5  5  5 
## NE NH NJ NM NV NY OH OK OR PA RI SC TN TX UT VA WA WI 
##  1  1  4  1  6  3  5  5  4  5  1  1  1  6  4  4  4  1
# Cut diana() tree into 6 groups
hc_d <- diana(df)
cutree(as.hclust(hc_d), k = 6)
## AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO NC 
##  1  2  2  3  4  1  2  5  2  6  2  5  2  1  6  2  2  2  2  5  1  1  2  2  2  2 
## NE NH NJ NM NV NY OH OK OR PA RI SC TN TX UT VA WA WI 
##  2  1  1  2  6  5  2  2  1  2  2  2  2  6  1  1  1  2
# Compute distance matrix
res.dist <- dist(df, method = "euclidean")

# Compute 2 hierarchical clusterings
hc1 <- hclust(res.dist, method = "complete")
hc2 <- hclust(res.dist, method = "ward.D2")

# Create two dendrograms
dend1 <- as.dendrogram(hc1)
dend2 <- as.dendrogram(hc2)

tanglegram(dend1, dend2)

#compute the entanglement
dend_list <- dendlist(dend1, dend2)

tanglegram(dend1, dend2,
  highlight_distinct_edges = FALSE, # Turn-off dashed lines
  common_subtrees_color_lines = FALSE, # Turn-off line colors
  common_subtrees_color_branches = TRUE, # Color common branches 
  main = paste("entanglement =", round(entanglement(dend_list), 2))
  )
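Entanglement only measures how well the two layouts can be untangled; the agreement between the trees themselves can also be quantified with the cophenetic correlation from dendextend (a sketch):

cor_cophenetic(dend1, dend2)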

Step 4. Describe the results

Most of the descriptions are embedded in the separate steps above. The column/axis name “mean_7msp” is the mean of the median sale price for each state; “mean_8tnl” is the mean of the total new listings for each state. The k-means clustering results show that NY, DC, CA, and HI have the highest average sale prices, and the hierarchical analysis suggests the same.
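As a quick check of this claim, the states can be ranked by mean_7msp alongside their k-means cluster labels (a sketch; values are on the standardized scale used for clustering):

data.frame(state = rownames(df),
           mean_7msp = df[, "mean_7msp"],
           cluster = k6$cluster) %>%
  arrange(desc(mean_7msp)) %>%
  head(6)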