load libraries needed

library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster)

load and prepare dataset

# Start from the built-in USArrests data set (one row per US state; columns
# Murder, Assault, UrbanPop and Rape).
df <- datasets::USArrests

remove rows with missing values

# Keep only the rows in which every variable is observed.
df <- stats::na.omit(df)

scale each variable to have mean of 0 and standard deviation of 1

# Standardise every column: subtract its mean, then divide by its standard
# deviation (both are the defaults of scale(), spelled out here).
df <- scale(df, center = TRUE, scale = TRUE)
# Preview the first six standardised rows.
head(df, n = 6L)
##                Murder   Assault   UrbanPop         Rape
## Alabama    1.24256408 0.7828393 -0.5209066 -0.003416473
## Alaska     0.50786248 1.1068225 -1.2117642  2.484202941
## Arizona    0.07163341 1.4788032  0.9989801  1.042878388
## Arkansas   0.23234938 0.2308680 -1.0735927 -0.184916602
## California 0.27826823 1.2628144  1.7589234  2.067820292
## Colorado   0.02571456 0.3988593  0.8608085  1.864967207

Start the clustering

Since we don’t know beforehand what number of clusters is optimal, we’ll create two different plots that can help us decide. The first plots the total within-cluster sum of squares against the number of clusters.

# Elbow plot: total within-cluster sum of squares ("wss") for each candidate
# number of clusters, using PAM as the clustering function.
fviz_nbclust(x = df, FUNcluster = pam, method = "wss")

The gap statistic compares the total intra-cluster variation for different values of k with its expected value under a reference distribution with no clustering, which helps identify the optimal number of clusters.

calculate gap statistic based on number of clusters

# clusGap() simulates B random reference data sets, so seed the RNG first;
# without an explicit seed this chunk is not reproducible on its own.
set.seed(1)
gap_stat <- clusGap(df,
                    FUN = pam,
                    K.max = 10, # largest number of clusters to consider
                    B = 50)     # number of simulated reference data sets
# Plot the gap statistic against the number of clusters.
fviz_gap_stat(gap_stat)

Perform K-Medoids clustering with optimal K

perform k-medoids clustering with k = 4 clusters

# Fix the RNG state before fitting, as in the rest of this walkthrough.
set.seed(1)
# Partition the standardised data around 4 medoids.
kmed <- pam(x = df, k = 4)
# Show the medoids, the cluster assignment per state and the objective values.
print(kmed)
## Medoids:
##               ID     Murder    Assault   UrbanPop         Rape
## Alabama        1  1.2425641  0.7828393 -0.5209066 -0.003416473
## Michigan      22  0.9900104  1.0108275  0.5844655  1.480613993
## Oklahoma      36 -0.2727580 -0.2371077  0.1699510 -0.131534211
## New Hampshire 29 -1.3059321 -1.3650491 -0.6590781 -1.252564419
## Clustering vector:
##        Alabama         Alaska        Arizona       Arkansas     California 
##              1              2              2              1              2 
##       Colorado    Connecticut       Delaware        Florida        Georgia 
##              2              3              3              2              1 
##         Hawaii          Idaho       Illinois        Indiana           Iowa 
##              3              4              2              3              4 
##         Kansas       Kentucky      Louisiana          Maine       Maryland 
##              3              3              1              4              2 
##  Massachusetts       Michigan      Minnesota    Mississippi       Missouri 
##              3              2              4              1              3 
##        Montana       Nebraska         Nevada  New Hampshire     New Jersey 
##              3              3              2              4              3 
##     New Mexico       New York North Carolina   North Dakota           Ohio 
##              2              2              1              4              3 
##       Oklahoma         Oregon   Pennsylvania   Rhode Island South Carolina 
##              3              3              3              3              1 
##   South Dakota      Tennessee          Texas           Utah        Vermont 
##              4              1              2              3              4 
##       Virginia     Washington  West Virginia      Wisconsin        Wyoming 
##              3              3              4              4              3 
## Objective function:
##    build     swap 
## 1.035116 1.027102 
## 
## Available components:
##  [1] "medoids"    "id.med"     "clustering" "objective"  "isolation" 
##  [6] "clusinfo"   "silinfo"    "diss"       "call"       "data"

Plot the results of the final k-medoids model

# Draw the fitted clusters; `data` supplies the standardised observations that
# the model was fitted on.
fviz_cluster(object = kmed, data = df)

add cluster assignment to original data

# Attach the labels to the same rows that were clustered: the model was fitted
# on na.omit(USArrests), so bind against that subset rather than the raw table
# (binding to the raw table misaligns or fails once any row has been dropped).
clustered_rows <- na.omit(USArrests)
# Guard against the labels and the rows getting out of step.
stopifnot(nrow(clustered_rows) == length(kmed$clustering))
final_data <- cbind(clustered_rows, cluster = kmed$clustering)
head(final_data)