Data
data("USArrests") # Load the data set
df <- scale(USArrests) # Scale the data
head(df, n = 3) # View the firt 3 rows of the data
## Murder Assault UrbanPop Rape
## Alabama 1.24256408 0.7828393 -0.5209066 -0.003416473
## Alaska 0.50786248 1.1068225 -1.2117642 2.484202941
## Arizona 0.07163341 1.4788032 0.9989801 1.042878388
Loading Packages
library(cluster)
## Warning: package 'cluster' was built under R version 3.6.2
library(factoextra)
## Warning: package 'factoextra' was built under R version 3.6.2
## Loading required package: ggplot2
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
Determining the Optimum number of Clusters
fviz_nbclust(df,pam,method="silhouette")+
theme_classic()
From the plot the suggested number of clusters is 2. Lets compute PAM with k=2
pam.res <- pam(df,2)
print(pam.res)
## Medoids:
## ID Murder Assault UrbanPop Rape
## New Mexico 31 0.8292944 1.3708088 0.3081225 1.1603196
## Nebraska 27 -0.8008247 -0.8250772 -0.2445636 -0.5052109
## Clustering vector:
## Alabama Alaska Arizona Arkansas California
## 1 1 1 2 1
## Colorado Connecticut Delaware Florida Georgia
## 1 2 2 1 1
## Hawaii Idaho Illinois Indiana Iowa
## 2 2 1 2 2
## Kansas Kentucky Louisiana Maine Maryland
## 2 2 1 2 1
## Massachusetts Michigan Minnesota Mississippi Missouri
## 2 1 2 1 1
## Montana Nebraska Nevada New Hampshire New Jersey
## 2 2 1 2 2
## New Mexico New York North Carolina North Dakota Ohio
## 1 1 1 2 2
## Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## 2 2 2 2 1
## South Dakota Tennessee Texas Utah Vermont
## 2 1 1 2 2
## Virginia Washington West Virginia Wisconsin Wyoming
## 2 2 2 2 2
## Objective function:
## build swap
## 1.441358 1.368969
##
## Available components:
## [1] "medoids" "id.med" "clustering" "objective" "isolation"
## [6] "clusinfo" "silinfo" "diss" "call" "data"
If you want to add this classification in your original data then
dd <- cbind(USArrests,cluster=pam.res$cluster)
head(dd,n=3)
## Murder Assault UrbanPop Rape cluster
## Alabama 13.2 236 58 21.2 1
## Alaska 10.0 263 48 44.5 1
## Arizona 8.1 294 80 31.0 1
*** Accessing to the result of the pam() Function ***
# cluster medioids: New Mexico, Nebraska
pam.res$medoids
## Murder Assault UrbanPop Rape
## New Mexico 0.8292944 1.3708088 0.3081225 1.1603196
## Nebraska -0.8008247 -0.8250772 -0.2445636 -0.5052109
#cluster numbers
head(pam.res$clustering)
## Alabama Alaska Arizona Arkansas California Colorado
## 1 1 1 2 1 1
*** Visualising PAM clusters ***
fviz_cluster(pam.res,
palette = c("#00AFBB", "#FC4E07"), # color palette
ellipse.type = "t", # Concentration ellipse
repel = TRUE, # Avoid label overplotting (slow)
ggtheme = theme_classic()
)