#
#K-Means Clustering in R
#PREPARED BY Alyssa Marie F. Roullo
# Project directory on the author's machine.
# setwd() raises an error when the target directory does not exist, which
# would stop the whole script on any other computer. Only switch into the
# directory when it is actually present; otherwise keep the current one.
homedir <- "C:/Users/Lenovo/Documents/4th Year - 2nd Semester/STT158 - M426"
if (dir.exists(homedir)) {
  setwd(homedir)
}
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster)
# Build the clustering input from the built-in USArrests data set:
#   1. drop any rows that contain missing values,
#   2. standardise every column to mean 0 and standard deviation 1
#      (scale() returns a numeric matrix).
df <- scale(na.omit(USArrests))
# Preview the first six standardised rows.
head(df)
## Murder Assault UrbanPop Rape
## Alabama 1.24256408 0.7828393 -0.5209066 -0.003416473
## Alaska 0.50786248 1.1068225 -1.2117642 2.484202941
## Arizona 0.07163341 1.4788032 0.9989801 1.042878388
## Arkansas 0.23234938 0.2308680 -1.0735927 -0.184916602
## California 0.27826823 1.2628144 1.7589234 2.067820292
## Colorado 0.02571456 0.3988593 0.8608085 1.864967207
##Since we do not currently have any prior knowledge regarding
##the ideal number of clusters, we will generate two distinct
##plots in order to assist us in making our decision.
# Elbow plot: number of clusters vs. total within-cluster sum of squares.
# kmeans() starts from randomly chosen centres, so fix the RNG seed first;
# otherwise the curve can change slightly from one run to the next.
set.seed(123)
fviz_nbclust(df, kmeans, method = "wss")

##For this plot, it appears that there is a bit of an elbow or "bend" at k = 4.
##Another method for determining the optimal number of clusters
##is to use a metric known as the gap statistic.
# Compute the gap statistic for k = 1..K.max clusters.
# clusGap() draws B bootstrap reference data sets and runs kmeans() with
# random starts on each, so seed the RNG to make the result repeatable.
set.seed(123)
gap_stat <- clusGap(
  df,
  FUN = kmeans,  # clustering function applied for every k
  nstart = 25,   # forwarded to kmeans(): random starts per fit
  K.max = 10,    # largest number of clusters to evaluate
  B = 50         # number of bootstrap reference samples
)
# plot number of clusters vs. gap statistic
fviz_gap_stat(gap_stat)

##The plot reveals that the gap statistic is greatest at k = 4 clusters,
##which corresponds to the elbow technique we previously employed.
# Fix the RNG seed so the random starting centres, and therefore the
# cluster numbering shown below, are the same on every run.
set.seed(1)
# Fit k-means with 4 clusters, keeping the best of 25 random starts.
km <- kmeans(
  x = df,
  centers = 4,
  nstart = 25
)
# Print the fitted model (sizes, centres, assignments, sums of squares).
km
## K-means clustering with 4 clusters of sizes 13, 13, 16, 8
##
## Cluster means:
## Murder Assault UrbanPop Rape
## 1 -0.9615407 -1.1066010 -0.9301069 -0.96676331
## 2 0.6950701 1.0394414 0.7226370 1.27693964
## 3 -0.4894375 -0.3826001 0.5758298 -0.26165379
## 4 1.4118898 0.8743346 -0.8145211 0.01927104
##
## Clustering vector:
## Alabama Alaska Arizona Arkansas California
## 4 2 2 4 2
## Colorado Connecticut Delaware Florida Georgia
## 2 3 3 2 4
## Hawaii Idaho Illinois Indiana Iowa
## 3 1 2 3 1
## Kansas Kentucky Louisiana Maine Maryland
## 3 1 4 1 2
## Massachusetts Michigan Minnesota Mississippi Missouri
## 3 2 1 4 2
## Montana Nebraska Nevada New Hampshire New Jersey
## 1 1 2 1 3
## New Mexico New York North Carolina North Dakota Ohio
## 2 2 4 1 3
## Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## 3 3 3 3 4
## South Dakota Tennessee Texas Utah Vermont
## 1 4 2 3 1
## Virginia Washington West Virginia Wisconsin Wyoming
## 3 3 1 1 3
##
## Within cluster sum of squares by cluster:
## [1] 11.952463 19.922437 16.212213 8.316061
## (between_SS / total_SS = 71.2 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Visualise the final k-means solution: the fitted model supplies the
# cluster labels and the scaled matrix supplies the observations.
fviz_cluster(object = km, data = df)

# Mean of each original (unscaled) variable within every cluster.
# km$cluster is named by the rows that were actually clustered, so select
# those rows from USArrests by name. This keeps data and labels aligned
# even if na.omit() removed rows before clustering.
aggregate(
  USArrests[names(km$cluster), ],
  by = list(cluster = km$cluster),
  FUN = mean
)
## cluster Murder Assault UrbanPop Rape
## 1 1 3.60000 78.53846 52.07692 12.17692
## 2 2 10.81538 257.38462 76.00000 33.19231
## 3 3 5.65625 138.87500 73.87500 18.78125
## 4 4 13.93750 243.62500 53.75000 21.41250
##We interpret this output as follows:
##. The mean number of murders per 100,000 citizens among the states in cluster 1 is 3.6.
##. The mean number of assaults per 100,000 citizens among the states in cluster 1 is 78.5.
##. The mean percentage of residents living in an urban area among the states in cluster 1 is 52.1%.
##. The mean number of rapes per 100,000 citizens among the states in cluster 1 is 12.2.
##. The mean number of murders per 100,000 citizens among the states in cluster 2 is 10.8.
##. The mean number of assaults per 100,000 citizens among the states in cluster 2 is 257.4.
##. The mean percentage of residents living in an urban area among the states in cluster 2 is 76%.
##. The mean number of rapes per 100,000 citizens among the states in cluster 2 is 33.2.
##. The mean number of murders per 100,000 citizens among the states in cluster 3 is 5.7.
##. The mean number of assaults per 100,000 citizens among the states in cluster 3 is 138.9.
##. The mean percentage of residents living in an urban area among the states in cluster 3 is 73.9%.
##. The mean number of rapes per 100,000 citizens among the states in cluster 3 is 18.8.
##. The mean number of murders per 100,000 citizens among the states in cluster 4 is 13.9.
##. The mean number of assaults per 100,000 citizens among the states in cluster 4 is 243.6.
##. The mean percentage of residents living in an urban area among the states in cluster 4 is 53.8%.
##. The mean number of rapes per 100,000 citizens among the states in cluster 4 is 21.4.
# Add the cluster assignment to the original (unscaled) data.
# Rows are taken by the names on km$cluster so every state is paired with
# its own label, even if na.omit() dropped rows before clustering.
final_data <- cbind(USArrests[names(km$cluster), ], cluster = km$cluster)
# view final data
head(final_data)
## Murder Assault UrbanPop Rape cluster
## Alabama 13.2 236 58 21.2 4
## Alaska 10.0 263 48 44.5 2
## Arizona 8.1 294 80 31.0 2
## Arkansas 8.8 190 50 19.5 4
## California 9.0 276 91 40.6 2
## Colorado 7.9 204 78 38.7 2