#
#K-Means Clustering in R
#PREPARED BY NEIL JOHN P. GUTIERREZ
homedir <- "C:/Users/NP Gutierrez/OneDrive/Desktop/R-Studio Only"
setwd(homedir)
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.1.3
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster)
## Warning: package 'cluster' was built under R version 4.1.3
#load data
df <- USArrests

#remove rows with missing values
df <- na.omit(df)

#scale each variable to have a mean of 0 and sd of 1
df <- scale(df)

#view first six rows of dataset
head(df)
##                Murder   Assault   UrbanPop         Rape
## Alabama    1.24256408 0.7828393 -0.5209066 -0.003416473
## Alaska     0.50786248 1.1068225 -1.2117642  2.484202941
## Arizona    0.07163341 1.4788032  0.9989801  1.042878388
## Arkansas   0.23234938 0.2308680 -1.0735927 -0.184916602
## California 0.27826823 1.2628144  1.7589234  2.067820292
## Colorado   0.02571456 0.3988593  0.8608085  1.864967207
fviz_nbclust(df, kmeans, method = "wss")

#calculate gap statistic based on number of clusters
gap_stat <- clusGap(df,
                    FUN = kmeans,
                    nstart = 25,
                    K.max = 10,
                    B = 50)

#plot number of clusters vs. gap statistic
fviz_gap_stat(gap_stat)

#make this example reproducible
set.seed(1)

#perform k-means clustering with k = 4 clusters
km <- kmeans(df, centers = 4, nstart = 25)

#view results
km
## K-means clustering with 4 clusters of sizes 13, 13, 16, 8
## 
## Cluster means:
##       Murder    Assault   UrbanPop        Rape
## 1 -0.9615407 -1.1066010 -0.9301069 -0.96676331
## 2  0.6950701  1.0394414  0.7226370  1.27693964
## 3 -0.4894375 -0.3826001  0.5758298 -0.26165379
## 4  1.4118898  0.8743346 -0.8145211  0.01927104
## 
## Clustering vector:
##        Alabama         Alaska        Arizona       Arkansas     California 
##              4              2              2              4              2 
##       Colorado    Connecticut       Delaware        Florida        Georgia 
##              2              3              3              2              4 
##         Hawaii          Idaho       Illinois        Indiana           Iowa 
##              3              1              2              3              1 
##         Kansas       Kentucky      Louisiana          Maine       Maryland 
##              3              1              4              1              2 
##  Massachusetts       Michigan      Minnesota    Mississippi       Missouri 
##              3              2              1              4              2 
##        Montana       Nebraska         Nevada  New Hampshire     New Jersey 
##              1              1              2              1              3 
##     New Mexico       New York North Carolina   North Dakota           Ohio 
##              2              2              4              1              3 
##       Oklahoma         Oregon   Pennsylvania   Rhode Island South Carolina 
##              3              3              3              3              4 
##   South Dakota      Tennessee          Texas           Utah        Vermont 
##              1              4              2              3              1 
##       Virginia     Washington  West Virginia      Wisconsin        Wyoming 
##              3              3              1              1              3 
## 
## Within cluster sum of squares by cluster:
## [1] 11.952463 19.922437 16.212213  8.316061
##  (between_SS / total_SS =  71.2 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
#plot results of final k-means model
fviz_cluster(km, data = df)

#find means of each cluster
aggregate(USArrests, by=list(cluster=km$cluster), mean)
##   cluster   Murder   Assault UrbanPop     Rape
## 1       1  3.60000  78.53846 52.07692 12.17692
## 2       2 10.81538 257.38462 76.00000 33.19231
## 3       3  5.65625 138.87500 73.87500 18.78125
## 4       4 13.93750 243.62500 53.75000 21.41250
#add cluster assigment to original data
final_data <- cbind(USArrests, cluster = km$cluster)

#view final data
head(final_data)
##            Murder Assault UrbanPop Rape cluster
## Alabama      13.2     236       58 21.2       4
## Alaska       10.0     263       48 44.5       2
## Arizona       8.1     294       80 31.0       2
## Arkansas      8.8     190       50 19.5       4
## California    9.0     276       91 40.6       2
## Colorado      7.9     204       78 38.7       2