Code

rm(list = ls())
library(datasets)
str(USArrests )

## 'data.frame':    50 obs. of  4 variables:
##  $ Murder  : num  13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
##  $ Assault : int  236 263 294 190 276 204 110 238 335 211 ...
##  $ UrbanPop: int  58 48 80 50 91 78 77 72 80 60 ...
##  $ Rape    : num  21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...

row.names(USArrests)

##  [1] "Alabama"        "Alaska"         "Arizona"        "Arkansas"      
##  [5] "California"     "Colorado"       "Connecticut"    "Delaware"      
##  [9] "Florida"        "Georgia"        "Hawaii"         "Idaho"         
## [13] "Illinois"       "Indiana"        "Iowa"           "Kansas"        
## [17] "Kentucky"       "Louisiana"      "Maine"          "Maryland"      
## [21] "Massachusetts"  "Michigan"       "Minnesota"      "Mississippi"   
## [25] "Missouri"       "Montana"        "Nebraska"       "Nevada"        
## [29] "New Hampshire"  "New Jersey"     "New Mexico"     "New York"      
## [33] "North Carolina" "North Dakota"   "Ohio"           "Oklahoma"      
## [37] "Oregon"         "Pennsylvania"   "Rhode Island"   "South Carolina"
## [41] "South Dakota"   "Tennessee"      "Texas"          "Utah"          
## [45] "Vermont"        "Virginia"       "Washington"     "West Virginia" 
## [49] "Wisconsin"      "Wyoming"

## Data Preprocess
sum(!complete.cases(USArrests))

## [1] 0

summary(USArrests)

##      Murder          Assault         UrbanPop          Rape      
##  Min.   : 0.800   Min.   : 45.0   Min.   :32.00   Min.   : 7.30  
##  1st Qu.: 4.075   1st Qu.:109.0   1st Qu.:54.50   1st Qu.:15.07  
##  Median : 7.250   Median :159.0   Median :66.00   Median :20.10  
##  Mean   : 7.788   Mean   :170.8   Mean   :65.54   Mean   :21.23  
##  3rd Qu.:11.250   3rd Qu.:249.0   3rd Qu.:77.75   3rd Qu.:26.18  
##  Max.   :17.400   Max.   :337.0   Max.   :91.00   Max.   :46.00

## Remove or impute missing objects
df <-  na.omit(USArrests)

## Rescale (or normalization, etc.)
df <-  scale(df[,-5], center = T, scale = T)

summary(df)

##      Murder           Assault           UrbanPop             Rape        
##  Min.   :-1.6044   Min.   :-1.5090   Min.   :-2.31714   Min.   :-1.4874  
##  1st Qu.:-0.8525   1st Qu.:-0.7411   1st Qu.:-0.76271   1st Qu.:-0.6574  
##  Median :-0.1235   Median :-0.1411   Median : 0.03178   Median :-0.1209  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.0000  
##  3rd Qu.: 0.7949   3rd Qu.: 0.9388   3rd Qu.: 0.84354   3rd Qu.: 0.5277  
##  Max.   : 2.2069   Max.   : 1.9948   Max.   : 1.75892   Max.   : 2.6444

## Standardization
apply(USArrests[,-5], 2, sd)

##    Murder   Assault  UrbanPop      Rape 
##  4.355510 83.337661 14.474763  9.366385

apply(USArrests[,-5], 2, mean)

##   Murder  Assault UrbanPop     Rape 
##    7.788  170.760   65.540   21.232

apply(df, 2, mean)

##        Murder       Assault      UrbanPop          Rape 
## -7.663087e-17  1.112408e-16 -4.332808e-16  8.942391e-17

apply(df, 2, sd)

##   Murder  Assault UrbanPop     Rape 
##        1        1        1        1

## Distance function and visualization
library(factoextra)

## Warning: package 'factoextra' was built under R version 4.4.2

## Loading required package: ggplot2

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

distance <- get_dist(df)
fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))

## K means
km_output <- kmeans(df, centers = 2, nstart = 25, iter.max = 100, algorithm = "Hartigan-Wong")
str(km_output)

## List of 9
##  $ cluster     : Named int [1:50] 2 2 2 1 2 2 1 1 2 2 ...
##   ..- attr(*, "names")= chr [1:50] "Alabama" "Alaska" "Arizona" "Arkansas" ...
##  $ centers     : num [1:2, 1:4] -0.67 1.005 -0.676 1.014 -0.132 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "1" "2"
##   .. ..$ : chr [1:4] "Murder" "Assault" "UrbanPop" "Rape"
##  $ totss       : num 196
##  $ withinss    : num [1:2] 56.1 46.7
##  $ tot.withinss: num 103
##  $ betweenss   : num 93.1
##  $ size        : int [1:2] 30 20
##  $ iter        : int 1
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"

names(km_output)

## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

typeof(km_output)

## [1] "list"

length(km_output)

## [1] 9

km_output$cluster

##        Alabama         Alaska        Arizona       Arkansas     California 
##              2              2              2              1              2 
##       Colorado    Connecticut       Delaware        Florida        Georgia 
##              2              1              1              2              2 
##         Hawaii          Idaho       Illinois        Indiana           Iowa 
##              1              1              2              1              1 
##         Kansas       Kentucky      Louisiana          Maine       Maryland 
##              1              1              2              1              2 
##  Massachusetts       Michigan      Minnesota    Mississippi       Missouri 
##              1              2              1              2              2 
##        Montana       Nebraska         Nevada  New Hampshire     New Jersey 
##              1              1              2              1              1 
##     New Mexico       New York North Carolina   North Dakota           Ohio 
##              2              2              2              1              1 
##       Oklahoma         Oregon   Pennsylvania   Rhode Island South Carolina 
##              1              1              1              1              2 
##   South Dakota      Tennessee          Texas           Utah        Vermont 
##              1              2              2              1              1 
##       Virginia     Washington  West Virginia      Wisconsin        Wyoming 
##              1              1              1              1              1

## Cluster Validation Evaluation  -  
## Objective function:  Sum of Square Error (SSE)
### SSE

#### Cluster cohesion
#### SSE can be used to compare cluster performance only for a similar number of clusters

km_output$totss

## [1] 196

km_output$withinss      # distance without and within clusters

## [1] 56.11445 46.74796

km_output$betweenss

## [1] 93.1376

sum(c(km_output$withinss, km_output$betweenss))

## [1] 196

cohesion <- sum(km_output$withinss)/ km_output$totss
cohesion

## [1] 0.5248082

### Visualize Clusters
fviz_cluster(km_output, data = df)

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)

df    %>%
      as.data.frame(  )  %>%
      mutate(cluster = km_output$cluster, objects_names = row.names(USArrests))    %>% 
      ggplot(aes(x = UrbanPop, y = Murder, color = factor(cluster), label = objects_names)) + geom_text(  )

### Put Cluster Output on the Map(1)  
cluster_df <-  data.frame(objects_names = tolower(row.names(USArrests)), cluster = unname(km_output$cluster))

library(maps)

## Warning: package 'maps' was built under R version 4.4.2

objects_names <- map_data("state")

objects_names  %>%
    left_join(cluster_df, by = c("region" = "objects_names"))  %>%
    ggplot( ) +
    geom_polygon(aes(x = long, y = lat, fill = as.factor(cluster), group =group), color = "white") +
    coord_fixed(1.3) +
    guides(fill = F) +
    theme_bw( ) +
    theme(panel.grid.major = element_blank( ), panel.grid.minor = element_blank( ),
               panel.border = element_blank( ),
               axis.line = element_blank( ),
               axis.text = element_blank( ),
               axis.ticks = element_blank( ),
               axis.title = element_blank( ))

## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

###  Elbow method to decide Optimal Number of Clusters(1)

set.seed(8)
wss <- function(k) {
        return(kmeans(df, k,  nstart = 25)$tot.withinss)
}

k_values <- 1:15
wss_values <- purrr::map_dbl(k_values, wss)

plot(x = k_values, y = wss_values,
       type = "b", frame = F,
       xlab = "Number of clusters K",
       ylab = "Total within-clusters sum of square")

###  Hierarchical Clustering
hac_output <- hclust(dist(USArrests, method = "euclidean"), method = "complete")
plot(hac_output)       # Calculating distance using hierarchical clustering, using Euclidean distance

                       # and using complete linkage for hierarchical clustering

### Output Desirable Number of Clusters after Modeling
hac_cut <- cutree(hac_output, 2)
for ( i in 1:length(hac_cut)) {
    if( hac_cut[i]   != km_output$cluster[i])   print(names(hac_cut) [i])
}

## [1] "Alabama"
## [1] "Alaska"
## [1] "Arizona"
## [1] "Arkansas"
## [1] "California"
## [1] "Connecticut"
## [1] "Florida"
## [1] "Hawaii"
## [1] "Idaho"
## [1] "Illinois"
## [1] "Indiana"
## [1] "Iowa"
## [1] "Kansas"
## [1] "Kentucky"
## [1] "Louisiana"
## [1] "Maine"
## [1] "Maryland"
## [1] "Massachusetts"
## [1] "Michigan"
## [1] "Minnesota"
## [1] "Mississippi"
## [1] "Montana"
## [1] "Nebraska"
## [1] "Nevada"
## [1] "New Hampshire"
## [1] "New Jersey"
## [1] "New Mexico"
## [1] "New York"
## [1] "North Carolina"
## [1] "North Dakota"
## [1] "Ohio"
## [1] "Oklahoma"
## [1] "Oregon"
## [1] "Pennsylvania"
## [1] "Rhode Island"
## [1] "South Carolina"
## [1] "South Dakota"
## [1] "Utah"
## [1] "Vermont"
## [1] "Virginia"
## [1] "Washington"
## [1] "West Virginia"
## [1] "Wisconsin"
## [1] "Wyoming"

DataMining2_Assignment3_USArrest

Soudeh Khoubrouy

2025-01-31

Code