library("cluster")
data("USArrests")
head(USArrests, 3)
## Murder Assault UrbanPop Rape
## Alabama 13.2 236 58 21.2
## Alaska 10.0 263 48 44.5
## Arizona 8.1 294 80 31.0
head(USArrests$Murder)
## [1] 13.2 10.0 8.1 8.8 9.0 7.9
USArrests[,"Murder"]
## [1] 13.2 10.0 8.1 8.8 9.0 7.9 3.3 5.9 15.4 17.4 5.3 2.6 10.4 7.2
## [15] 2.2 6.0 9.7 15.4 2.1 11.3 4.4 12.1 2.7 16.1 9.0 6.0 4.3 12.2
## [29] 2.1 7.4 11.4 11.1 13.0 0.8 7.3 6.6 4.9 6.3 3.4 14.4 3.8 13.2
## [43] 12.7 3.2 2.2 8.5 4.0 5.7 2.6 6.8
df <- USArrests
df <- na.omit(df)
df <- scale(df)
head(df, n = 3)
## Murder Assault UrbanPop Rape
## Alabama 1.24256408 0.7828393 -0.5209066 -0.003416473
## Alaska 0.50786248 1.1068225 -1.2117642 2.484202941
## Arizona 0.07163341 1.4788032 0.9989801 1.042878388
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Subset of the data
set.seed(123)
ss <- sample(1:50, 15) # Take 15 random rows
df <- USArrests[ss, ] # Subset the 15 rows
df.scaled <- scale(df) # Standardize the variables
dist.eucl <- dist(df.scaled, method = "euclidean")
# Reformat as a matrix
# Subset the first 3 columns and rows and Round the values
round(as.matrix(dist.eucl)[1:3, 1:3], 1)
## Iowa Rhode Island Maryland
## Iowa 0.0 2.8 4.1
## Rhode Island 2.8 0.0 3.6
## Maryland 4.1 3.6 0.0
dist.cor <- get_dist(df.scaled, method = "pearson")
round(as.matrix(dist.cor)[1:3, 1:3], 1)
## Iowa Rhode Island Maryland
## Iowa 0.0 0.4 1.9
## Rhode Island 0.4 0.0 1.5
## Maryland 1.9 1.5 0.0
data(flower)
head(flower, 3)
## V1 V2 V3 V4 V5 V6 V7 V8
## 1 0 1 1 4 3 15 25 15
## 2 1 0 0 2 1 3 150 50
## 3 0 1 0 3 3 1 150 50
str(flower)
## 'data.frame': 18 obs. of 8 variables:
## $ V1: Factor w/ 2 levels "0","1": 1 2 1 1 1 1 1 1 2 2 ...
## $ V2: Factor w/ 2 levels "0","1": 2 1 2 1 2 2 1 1 2 2 ...
## $ V3: Factor w/ 2 levels "0","1": 2 1 1 2 1 1 1 2 1 1 ...
## $ V4: Factor w/ 5 levels "1","2","3","4",..: 4 2 3 4 5 4 4 2 3 5 ...
## $ V5: Ord.factor w/ 3 levels "1"<"2"<"3": 3 1 3 2 2 3 3 2 1 2 ...
## $ V6: Ord.factor w/ 18 levels "1"<"2"<"3"<"4"<..: 15 3 1 16 2 12 13 7 4 14 ...
## $ V7: num 25 150 150 125 20 50 40 100 25 100 ...
## $ V8: num 15 50 50 50 15 40 20 15 15 60 ...
# Distance matrix
dd <- daisy(flower)
round(as.matrix(dd)[1:3, 1:3], 2)
## 1 2 3
## 1 0.00 0.89 0.53
## 2 0.89 0.00 0.51
## 3 0.53 0.51 0.00
library(factoextra)
fviz_dist(dist.eucl)

data("USArrests")
df <- scale(USArrests)
head(df, n = 3)
## Murder Assault UrbanPop Rape
## Alabama 1.24256408 0.7828393 -0.5209066 -0.003416473
## Alaska 0.50786248 1.1068225 -1.2117642 2.484202941
## Arizona 0.07163341 1.4788032 0.9989801 1.042878388
library(factoextra)
fviz_nbclust(df, kmeans, method = "wss") +
geom_vline(xintercept = 4, linetype = 2)

# Compute k-means with k = 4
set.seed(123)
km.res <- kmeans(df, 4, nstart = 25)
print(km.res)
## K-means clustering with 4 clusters of sizes 13, 16, 13, 8
##
## Cluster means:
## Murder Assault UrbanPop Rape
## 1 -0.9615407 -1.1066010 -0.9301069 -0.96676331
## 2 -0.4894375 -0.3826001 0.5758298 -0.26165379
## 3 0.6950701 1.0394414 0.7226370 1.27693964
## 4 1.4118898 0.8743346 -0.8145211 0.01927104
##
## Clustering vector:
## Alabama Alaska Arizona Arkansas California
## 4 3 3 4 3
## Colorado Connecticut Delaware Florida Georgia
## 3 2 2 3 4
## Hawaii Idaho Illinois Indiana Iowa
## 2 1 3 2 1
## Kansas Kentucky Louisiana Maine Maryland
## 2 1 4 1 3
## Massachusetts Michigan Minnesota Mississippi Missouri
## 2 3 1 4 3
## Montana Nebraska Nevada New Hampshire New Jersey
## 1 1 3 1 2
## New Mexico New York North Carolina North Dakota Ohio
## 3 3 4 1 2
## Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## 2 2 2 2 4
## South Dakota Tennessee Texas Utah Vermont
## 1 4 3 2 1
## Virginia Washington West Virginia Wisconsin Wyoming
## 2 2 1 1 2
##
## Within cluster sum of squares by cluster:
## [1] 11.952463 16.212213 19.922437 8.316061
## (between_SS / total_SS = 71.2 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
aggregate(USArrests, by=list(cluster=km.res$cluster), mean)
## cluster Murder Assault UrbanPop Rape
## 1 1 3.60000 78.53846 52.07692 12.17692
## 2 2 5.65625 138.87500 73.87500 18.78125
## 3 3 10.81538 257.38462 76.00000 33.19231
## 4 4 13.93750 243.62500 53.75000 21.41250
dd <- cbind(USArrests, cluster = km.res$cluster)
head(dd)
## Murder Assault UrbanPop Rape cluster
## Alabama 13.2 236 58 21.2 4
## Alaska 10.0 263 48 44.5 3
## Arizona 8.1 294 80 31.0 3
## Arkansas 8.8 190 50 19.5 4
## California 9.0 276 91 40.6 3
## Colorado 7.9 204 78 38.7 3
# Cluster number for each of the observations
km.res$cluster
## Alabama Alaska Arizona Arkansas California
## 4 3 3 4 3
## Colorado Connecticut Delaware Florida Georgia
## 3 2 2 3 4
## Hawaii Idaho Illinois Indiana Iowa
## 2 1 3 2 1
## Kansas Kentucky Louisiana Maine Maryland
## 2 1 4 1 3
## Massachusetts Michigan Minnesota Mississippi Missouri
## 2 3 1 4 3
## Montana Nebraska Nevada New Hampshire New Jersey
## 1 1 3 1 2
## New Mexico New York North Carolina North Dakota Ohio
## 3 3 4 1 2
## Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## 2 2 2 2 4
## South Dakota Tennessee Texas Utah Vermont
## 1 4 3 2 1
## Virginia Washington West Virginia Wisconsin Wyoming
## 2 2 1 1 2
head(km.res$cluster, 4)
## Alabama Alaska Arizona Arkansas
## 4 3 3 4
# Cluster size
km.res$size
## [1] 13 16 13 8
# Cluster means
km.res$centers
## Murder Assault UrbanPop Rape
## 1 -0.9615407 -1.1066010 -0.9301069 -0.96676331
## 2 -0.4894375 -0.3826001 0.5758298 -0.26165379
## 3 0.6950701 1.0394414 0.7226370 1.27693964
## 4 1.4118898 0.8743346 -0.8145211 0.01927104
fviz_cluster(km.res, data = df,
palette = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07"),
ellipse.type = "euclid", # Concentration ellipse
star.plot = TRUE, # Add segments from centroids to items
repel = TRUE, # Avoid label overplotting (slow)
ggtheme = theme_minimal()
)

fviz_nbclust(df, pam, method = "silhouette")+
theme_classic()

pam.res <- pam(df, 2)
print(pam.res)
## Medoids:
## ID Murder Assault UrbanPop Rape
## New Mexico 31 0.8292944 1.3708088 0.3081225 1.1603196
## Nebraska 27 -0.8008247 -0.8250772 -0.2445636 -0.5052109
## Clustering vector:
## Alabama Alaska Arizona Arkansas California
## 1 1 1 2 1
## Colorado Connecticut Delaware Florida Georgia
## 1 2 2 1 1
## Hawaii Idaho Illinois Indiana Iowa
## 2 2 1 2 2
## Kansas Kentucky Louisiana Maine Maryland
## 2 2 1 2 1
## Massachusetts Michigan Minnesota Mississippi Missouri
## 2 1 2 1 1
## Montana Nebraska Nevada New Hampshire New Jersey
## 2 2 1 2 2
## New Mexico New York North Carolina North Dakota Ohio
## 1 1 1 2 2
## Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## 2 2 2 2 1
## South Dakota Tennessee Texas Utah Vermont
## 2 1 1 2 2
## Virginia Washington West Virginia Wisconsin Wyoming
## 2 2 2 2 2
## Objective function:
## build swap
## 1.441358 1.368969
##
## Available components:
## [1] "medoids" "id.med" "clustering" "objective" "isolation"
## [6] "clusinfo" "silinfo" "diss" "call" "data"
dd <- cbind(USArrests, cluster = pam.res$cluster)
head(dd, n = 3)
## Murder Assault UrbanPop Rape cluster
## Alabama 13.2 236 58 21.2 1
## Alaska 10.0 263 48 44.5 1
## Arizona 8.1 294 80 31.0 1
# Cluster medoids: New Mexico, Nebraska
pam.res$medoids
## Murder Assault UrbanPop Rape
## New Mexico 0.8292944 1.3708088 0.3081225 1.1603196
## Nebraska -0.8008247 -0.8250772 -0.2445636 -0.5052109
# Cluster numbers
head(pam.res$clustering)
## Alabama Alaska Arizona Arkansas California Colorado
## 1 1 1 2 1 1
fviz_cluster(pam.res,
palette = c("#00AFBB", "#FC4E07"), # color palette
ellipse.type = "t", # Concentration ellipse
repel = TRUE, # Avoid label overplotting (slow)
ggtheme = theme_classic()
)
