The elbow plot suggests that 6 is the optimal number of clusters, as that is where the bend (the "knee"/"elbow") appears. The plot from the average silhouette method also suggests 6. The plot from the gap statistic method suggests 1, which I disregard as uninformative here. I choose 6 as the optimal number of clusters.
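For context, the code below assumes the data frame df built earlier in the analysis (one row per state, 44 states, with two scaled numeric columns, mean_7msp and mean_8tnl) and several packages. The lines below are only a minimal reminder of that assumed setup, not part of the original pipeline; the construction of df itself is not repeated here.
# Packages assumed to be attached for the code below
library(purrr)       # map_dbl()
library(cluster)     # silhouette(), clusGap(), agnes(), diana(), pltree()
library(factoextra)  # get_dist(), fviz_dist(), fviz_nbclust(), fviz_gap_stat(), fviz_cluster(); attaches ggplot2
library(gridExtra)   # grid.arrange()
library(dendextend)  # dendlist(), tanglegram(), entanglement()
# df: one row per state, columns mean_7msp and mean_8tnl,
# already NA-filtered and scaled (see the commented na.omit()/scale() lines further down)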
##https://uc-r.github.io/kmeans_clustering#elbow
#method 1: elbow method
set.seed(123)
# function to compute total within-cluster sum of squares for a given k
wss <- function(k) {
  kmeans(df, k, nstart = 10)$tot.withinss
}
# Compute and plot wss for k = 1 to k = 15
k.values <- 1:15
# extract wss for 1-15 clusters
wss_values <- map_dbl(k.values, wss)
plot(k.values, wss_values,
     type = "b", pch = 19, frame = FALSE,
     xlab = "Number of clusters K",
     ylab = "Total within-cluster sum of squares")
#6 is the optimal number of clusters
set.seed(123)
fviz_nbclust(df, kmeans, method = "wss")
#method 2: average silhouette method
# function to compute the average silhouette width for a given k
avg_sil <- function(k) {
  km.res <- kmeans(df, centers = k, nstart = 25)
  ss <- silhouette(km.res$cluster, dist(df))
  mean(ss[, 3])
}
# Compute and plot the average silhouette width for k = 2 to k = 15
k.values <- 2:15
# extract avg silhouette for 2-15 clusters
avg_sil_values <- map_dbl(k.values, avg_sil)
plot(k.values, avg_sil_values,
     type = "b", pch = 19, frame = FALSE,
     xlab = "Number of clusters K",
     ylab = "Average silhouette width")
fviz_nbclust(df, kmeans, method = "silhouette")
#the silhouette plot suggests 3 as the optimal number, but I still choose 6
#method 3: gap statistic
set.seed(123)
gap_stat <- clusGap(df, FUN = kmeans, nstart = 25,
                    K.max = 10, B = 50)
print(gap_stat, method = "firstmax")
## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = df, FUNcluster = kmeans, K.max = 10, B = 50, nstart = 25)
## B=50 simulated reference sets, k = 1..10; spaceH0="scaledPCA"
## --> Number of clusters (method 'firstmax'): 1
## logW E.logW gap SE.sim
## [1,] 2.853226 3.319462 0.4662363 0.05856958
## [2,] 2.536996 2.990989 0.4539934 0.05926627
## [3,] 2.272295 2.734368 0.4620727 0.06141266
## [4,] 2.038943 2.530756 0.4918135 0.06476126
## [5,] 1.900035 2.373808 0.4737725 0.07111306
## [6,] 1.748845 2.238805 0.4899595 0.07314937
## [7,] 1.609755 2.113089 0.5033344 0.07281482
## [8,] 1.513977 2.001064 0.4870869 0.06957714
## [9,] 1.374078 1.893535 0.5194576 0.07168146
## [10,] 1.257140 1.796631 0.5394912 0.07193585
fviz_gap_stat(gap_stat)
#df <- na.omit(df)
#df <- scale(df)
distance <- get_dist(df)
#since the dataset df is too big, I am downsizing it to
fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))
k6 <- kmeans(df, centers = 6, nstart = 25)
str(k6)
## List of 9
## $ cluster : Named int [1:44] 3 3 3 4 6 2 5 6 3 1 ...
## ..- attr(*, "names")= chr [1:44] "AK" "AL" "AR" "AZ" ...
## $ centers : num [1:6, 1:2] -0.295 0.717 -0.416 -0.348 -0.669 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:6] "1" "2" "3" "4" ...
## .. ..$ : chr [1:2] "mean_7msp" "mean_8tnl"
## $ totss : num 86
## $ withinss : num [1:6] 1.02 1.47 2.59 0 2.15 ...
## $ tot.withinss: num 9.01
## $ betweenss : num 77
## $ size : int [1:6] 4 8 15 1 12 4
## $ iter : int 3
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
cluster_plot <- fviz_cluster(k6, data = df)
cluster_plot
#try when number of clusters (k) is 3, 4, and 5
k3 <- kmeans(df, centers = 3, nstart = 25)
k4 <- kmeans(df, centers = 4, nstart = 25)
k5 <- kmeans(df, centers = 5, nstart = 25)
#plot to compare
p1 <- fviz_cluster(k6, geom = "point", data = df) + ggtitle("k = 6")
p2 <- fviz_cluster(k3, geom = "point", data = df) + ggtitle("k = 3")
p3 <- fviz_cluster(k4, geom = "point", data = df) + ggtitle("k = 4")
p4 <- fviz_cluster(k5, geom = "point", data = df) + ggtitle("k = 5")
library(gridExtra)
grid.arrange(p1, p2, p3, p4, nrow = 2)
##https://uc-r.github.io/hc_clustering
# Dissimilarity matrix
d <- dist(df, method = "euclidean")
# Hierarchical clustering using Complete Linkage
hc1 <- hclust(d, method = "complete" )
# Plot the obtained dendrogram
plot(hc1, cex = 0.6, hang = -1)
# Side example: k-means on a small simulated 2-D dataset with three obvious groups
set.seed(1234)
x <- rnorm(12, mean = rep(1:3, each = 4), sd = 0.2)
y <- rnorm(12, mean = rep(c(1, 2, 1), each = 4), sd = 0.2)
plot(x, y, col = "blue", pch = 19, cex = 2)
#text(x + 0.05, y + 0.05, labels = as.character(1:12))
dataFrame <- data.frame(x, y)
kmeansObj <- kmeans(dataFrame, centers = 3)
names(kmeansObj)
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
kmeansObj$cluster
## [1] 3 1 1 3 2 2 2 2 2 2 2 2
# plot the cluster assignment (1-3) for each of the 12 simulated points
plot(kmeansObj$cluster)
# Alternatively, we can use the agnes() function.
# Compute with agnes
hc2 <- agnes(df, method = "complete")
# Agglomerative coefficient
hc2$ac
## [1] 0.9276342
# linkage methods to assess
m <- c("average", "single", "complete", "ward")
names(m) <- c("average", "single", "complete", "ward")
# function to compute the agglomerative coefficient for a given linkage method
ac <- function(x) {
  agnes(df, method = x)$ac
}
map_dbl(m, ac)
## average single complete ward
## 0.9114313 0.8612660 0.9276342 0.9509801
hc3 <- agnes(df, method = "ward")
pltree(hc3, cex = 0.6, hang = -1, main = "Dendrogram of agnes")
# compute divisive hierarchical clustering
hc4 <- diana(df)
# Divisive coefficient; amount of clustering structure found
hc4$dc
## [1] 0.9182127
# plot dendrogram
pltree(hc4, cex = 0.6, hang = -1, main = "Dendrogram of diana")
# Ward's method
hc5 <- hclust(d, method = "ward.D2" )
# Cut tree into 4 groups
sub_grp <- cutree(hc5, k = 4)
# Number of members in each cluster
table(sub_grp)
## sub_grp
## 1 2 3 4
## 26 5 4 9
plot(hc5, cex = 0.6)
#rect.hclust(hc5, k = 6, border = 2:5)
# Cut agnes() tree into 6 groups
hc_a <- agnes(df, method = "ward")
cutree(as.hclust(hc_a), k = 6)
## AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO NC
## 1 1 1 2 3 4 5 3 1 6 5 3 5 4 6 5 1 5 1 4 4 1 5 5 5 5
## NE NH NJ NM NV NY OH OK OR PA RI SC TN TX UT VA WA WI
## 1 1 4 1 6 3 5 5 4 5 1 1 1 6 4 4 4 1
# Cut diana() tree into 6 groups
hc_d <- diana(df)
cutree(as.hclust(hc_d), k = 6)
## AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO NC
## 1 2 2 3 4 1 2 5 2 6 2 5 2 1 6 2 2 2 2 5 1 1 2 2 2 2
## NE NH NJ NM NV NY OH OK OR PA RI SC TN TX UT VA WA WI
## 2 1 1 2 6 5 2 2 1 2 2 2 2 6 1 1 1 2
# Compute distance matrix
res.dist <- dist(df, method = "euclidean")
# Compute 2 hierarchical clusterings
hc1 <- hclust(res.dist, method = "complete")
hc2 <- hclust(res.dist, method = "ward.D2")
# Create two dendrograms
dend1 <- as.dendrogram (hc1)
dend2 <- as.dendrogram (hc2)
tanglegram(dend1, dend2)
#compute the entanglement
dend_list <- dendlist(dend1, dend2)
tanglegram(dend1, dend2,
           highlight_distinct_edges = FALSE,       # Turn off dashed lines
           common_subtrees_color_lines = FALSE,    # Turn off line colors
           common_subtrees_color_branches = TRUE,  # Color common branches
           main = paste("entanglement =", round(entanglement(dend_list), 2))
)
Most of the descriptions are embedded in the individual steps above. The column/axis name "mean_7msp" is the mean of the median sale price for each state; "mean_8tnl" is the mean of the total new listings for each state. From the k-means clustering result, we see that NY, DC, CA, and HI have the highest average sale prices. The hierarchical clustering results suggest the same.
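As a quick check of that interpretation, one could tabulate the per-cluster means and list the states in the cluster whose centre has the largest mean_7msp. A minimal sketch, assuming df and k6 as defined above; top_cluster is just an illustrative helper name, not part of the original analysis.
# mean of each (scaled) variable within each k-means cluster
aggregate(as.data.frame(df), by = list(cluster = k6$cluster), FUN = mean)
# states assigned to the cluster whose centre has the highest mean_7msp
top_cluster <- which.max(k6$centers[, "mean_7msp"])
names(which(k6$cluster == top_cluster))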