load libraries needed
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster)
load and prepare dataset
# Build the clustering input from USArrests in a single step:
#   1. na.omit() removes rows with missing values;
#   2. scale() standardises each variable to mean 0 and standard deviation 1.
df <- scale(na.omit(USArrests))
# Preview the first rows of the standardised data.
head(df)
## Murder Assault UrbanPop Rape
## Alabama 1.24256408 0.7828393 -0.5209066 -0.003416473
## Alaska 0.50786248 1.1068225 -1.2117642 2.484202941
## Arizona 0.07163341 1.4788032 0.9989801 1.042878388
## Arkansas 0.23234938 0.2308680 -1.0735927 -0.184916602
## California 0.27826823 1.2628144 1.7589234 2.067820292
## Colorado 0.02571456 0.3988593 0.8608085 1.864967207
Start the clustering
Since we don’t know beforehand how many clusters are optimal, we’ll create two different plots that can help us decide
# Plot the total within-cluster sum of squares ("wss") against the number of
# clusters, using pam as the partitioning function.
fviz_nbclust(x = df, FUNcluster = pam, method = "wss")
The gap statistic compares the intra-cluster variation for different values of k with the values expected under a reference distribution that has no clustering, in order to find the optimal number of clusters.
calculate gap statistic based on number of clusters
# Seed the RNG before clusGap(): it simulates B Monte Carlo reference data
# sets, so without a fixed seed the gap curve can differ between runs.
set.seed(1)
gap_stat <- clusGap(df,
                    FUN = pam,
                    K.max = 10, # max clusters to consider
                    B = 50) # total bootstrapped iterations
# plot the gap statistic against the number of clusters
fviz_gap_stat(gap_stat)
Perform K-Medoids clustering with optimal K
perform k-medoids clustering with k = 4 clusters
# Fix the RNG state, then partition the standardised data around 4 medoids.
set.seed(1)
kmed <- pam(x = df, k = 4)
# Auto-print the fitted object (medoids, clustering vector, objective function).
kmed
## Medoids:
## ID Murder Assault UrbanPop Rape
## Alabama 1 1.2425641 0.7828393 -0.5209066 -0.003416473
## Michigan 22 0.9900104 1.0108275 0.5844655 1.480613993
## Oklahoma 36 -0.2727580 -0.2371077 0.1699510 -0.131534211
## New Hampshire 29 -1.3059321 -1.3650491 -0.6590781 -1.252564419
## Clustering vector:
## Alabama Alaska Arizona Arkansas California
## 1 2 2 1 2
## Colorado Connecticut Delaware Florida Georgia
## 2 3 3 2 1
## Hawaii Idaho Illinois Indiana Iowa
## 3 4 2 3 4
## Kansas Kentucky Louisiana Maine Maryland
## 3 3 1 4 2
## Massachusetts Michigan Minnesota Mississippi Missouri
## 3 2 4 1 3
## Montana Nebraska Nevada New Hampshire New Jersey
## 3 3 2 4 3
## New Mexico New York North Carolina North Dakota Ohio
## 2 2 1 4 3
## Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## 3 3 3 3 1
## South Dakota Tennessee Texas Utah Vermont
## 4 1 2 3 4
## Virginia Washington West Virginia Wisconsin Wyoming
## 3 3 4 4 3
## Objective function:
## build swap
## 1.035116 1.027102
##
## Available components:
## [1] "medoids" "id.med" "clustering" "objective" "isolation"
## [6] "clusinfo" "silinfo" "diss" "call" "data"
Plot the results of the final k-medoids model
# Draw the fitted partition, passing the standardised data used to fit it.
fviz_cluster(object = kmed, data = df)
add cluster assignment to original data
# Attach each observation's cluster label to its original (unscaled) values.
# The clusters were fitted on df, which had incomplete rows removed, so pick
# the matching USArrests rows by name rather than assuming that every original
# row is still present and in the same position.
final_data <- cbind(USArrests[rownames(df), , drop = FALSE],
                    cluster = kmed$clustering)
# Preview the first rows of the labelled data.
head(final_data)