dataset <- USArrests
str(dataset)
## 'data.frame': 50 obs. of 4 variables:
## $ Murder : num 13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
## $ Assault : int 236 263 294 190 276 204 110 238 335 211 ...
## $ UrbanPop: int 58 48 80 50 91 78 77 72 80 60 ...
## $ Rape : num 21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...
row.names(dataset)
## [1] "Alabama" "Alaska" "Arizona" "Arkansas"
## [5] "California" "Colorado" "Connecticut" "Delaware"
## [9] "Florida" "Georgia" "Hawaii" "Idaho"
## [13] "Illinois" "Indiana" "Iowa" "Kansas"
## [17] "Kentucky" "Louisiana" "Maine" "Maryland"
## [21] "Massachusetts" "Michigan" "Minnesota" "Mississippi"
## [25] "Missouri" "Montana" "Nebraska" "Nevada"
## [29] "New Hampshire" "New Jersey" "New Mexico" "New York"
## [33] "North Carolina" "North Dakota" "Ohio" "Oklahoma"
## [37] "Oregon" "Pennsylvania" "Rhode Island" "South Carolina"
## [41] "South Dakota" "Tennessee" "Texas" "Utah"
## [45] "Vermont" "Virginia" "Washington" "West Virginia"
## [49] "Wisconsin" "Wyoming"
summary(dataset)
## Murder Assault UrbanPop Rape
## Min. : 0.800 Min. : 45.0 Min. :32.00 Min. : 7.30
## 1st Qu.: 4.075 1st Qu.:109.0 1st Qu.:54.50 1st Qu.:15.07
## Median : 7.250 Median :159.0 Median :66.00 Median :20.10
## Mean : 7.788 Mean :170.8 Mean :65.54 Mean :21.23
## 3rd Qu.:11.250 3rd Qu.:249.0 3rd Qu.:77.75 3rd Qu.:26.18
## Max. :17.400 Max. :337.0 Max. :91.00 Max. :46.00
## Check for incomplete cases
print(paste("The number of incomplete cases is",
sum(!complete.cases(dataset))
))
## [1] "The number of incomplete cases is 0"
## Remove or impute missing objects
df <- na.omit(dataset)
## Rescale (or normalization, etc.)
df <- scale(df, center = T, scale = T)
summary(df)
## Murder Assault UrbanPop Rape
## Min. :-1.6044 Min. :-1.5090 Min. :-2.31714 Min. :-1.4874
## 1st Qu.:-0.8525 1st Qu.:-0.7411 1st Qu.:-0.76271 1st Qu.:-0.6574
## Median :-0.1235 Median :-0.1411 Median : 0.03178 Median :-0.1209
## Mean : 0.0000 Mean : 0.0000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.7949 3rd Qu.: 0.9388 3rd Qu.: 0.84354 3rd Qu.: 0.5277
## Max. : 2.2069 Max. : 1.9948 Max. : 1.75892 Max. : 2.6444
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.4.2
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
distance <- get_dist(df)
fviz_dist(distance,
gradient = list(low="#00AFBB",
mid = "white",
high = "#FC4E07"))
km_output <- kmeans(df, centers = 2, nstart = 25, iter.max = 100,
algorithm = "Hartigan-Wong")
str(km_output)
## List of 9
## $ cluster : Named int [1:50] 1 1 1 2 1 1 2 2 1 1 ...
## ..- attr(*, "names")= chr [1:50] "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ centers : num [1:2, 1:4] 1.005 -0.67 1.014 -0.676 0.198 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "1" "2"
## .. ..$ : chr [1:4] "Murder" "Assault" "UrbanPop" "Rape"
## $ totss : num 196
## $ withinss : num [1:2] 46.7 56.1
## $ tot.withinss: num 103
## $ betweenss : num 93.1
## $ size : int [1:2] 20 30
## $ iter : int 1
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
names(km_output)
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
typeof(km_output)
## [1] "list"
length(km_output)
## [1] 9
km_output$cluster
## Alabama Alaska Arizona Arkansas California
## 1 1 1 2 1
## Colorado Connecticut Delaware Florida Georgia
## 1 2 2 1 1
## Hawaii Idaho Illinois Indiana Iowa
## 2 2 1 2 2
## Kansas Kentucky Louisiana Maine Maryland
## 2 2 1 2 1
## Massachusetts Michigan Minnesota Mississippi Missouri
## 2 1 2 1 1
## Montana Nebraska Nevada New Hampshire New Jersey
## 2 2 1 2 2
## New Mexico New York North Carolina North Dakota Ohio
## 1 1 1 2 2
## Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## 2 2 2 2 1
## South Dakota Tennessee Texas Utah Vermont
## 2 1 1 2 2
## Virginia Washington West Virginia Wisconsin Wyoming
## 2 2 2 2 2
## Sum of Squares
print(paste("SST:", km_output$totss))
## [1] "SST: 196"
print("SSwithin:")
## [1] "SSwithin:"
km_output$withinss
## [1] 46.74796 56.11445
print(paste("SSbetween:", km_output$betweenss))
## [1] "SSbetween: 93.1375995055827"
print(paste("SSwithin + SSbetween:",
sum(c(km_output$withinss, km_output$betweenss))))
## [1] "SSwithin + SSbetween: 196"
cohesion <- sum(km_output$withinss)/ km_output$totss
print(paste("cohesion:", cohesion))
## [1] "cohesion: 0.524808165787843"
## Visualizing Clusters
fviz_cluster(km_output, data = df)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
df %>%
as.data.frame( ) %>%
mutate(cluster = km_output$cluster,
objects_name = row.names(dataset)) %>%
ggplot(aes(x = UrbanPop, y = Murder,
color = factor(cluster),
label = objects_name)) + geom_text( )
cluster_df <-
data.frame(objects_names = tolower(row.names(dataset)),
cluster = unname(km_output$cluster))
library(maps)
## Warning: package 'maps' was built under R version 4.4.2
objects_names <- map_data("state")
objects_names %>%
left_join(cluster_df,
by = c("region"="objects_names")) %>%
ggplot( ) +
geom_polygon(aes(x = long, y = lat,
fill = as.factor(cluster), group =group),
color = "white") +
coord_fixed(1.3) +
guides(fill = "none") +
theme_bw( ) +
theme(panel.grid.major = element_blank( ),
panel.grid.minor = element_blank( ),
panel.border = element_blank( ),
axis.line = element_blank( ),
axis.text = element_blank( ),
axis.ticks = element_blank( ),
axis.title = element_blank( ))
set.seed(8)
wss <- function(k) {
return(kmeans(df, k, nstart = 25)$tot.withinss)
}
k_values <- 1:15
wss_values <- purrr::map_dbl(k_values, wss)
plot(x = k_values, y = wss_values,
type = "b", frame = F,
xlab = "Number of clusters K",
ylab = "Total within-clusters sum of square")
We see an elbow at \(k=2\), which corroborates are initial choice. Perhaps an alternate choice is \(k=4\).
We use Euclidean distance and complete linkage for the
hclus()
function.
hac_output <- hclust(dist(dataset, method = "euclidean"), method = "complete")
plot(hac_output)
hac_cut <- cutree(hac_output, 2)
for ( i in 1:length(hac_cut)) {
if( hac_cut[i] != km_output$cluster[i])
print(names(hac_cut)[i])
}
## [1] "Colorado"
## [1] "Delaware"
## [1] "Georgia"
## [1] "Missouri"
## [1] "Tennessee"
## [1] "Texas"
The hierarchical method differs from the k-means method for the states shown in the output above.