USAArrests-Clustering

library(mlbench)
library(ggplot2)
library(factoextra)

## Warning: package 'factoextra' was built under R version 4.2.1

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

data("USArrests")
rownames(USArrests)

##  [1] "Alabama"        "Alaska"         "Arizona"        "Arkansas"      
##  [5] "California"     "Colorado"       "Connecticut"    "Delaware"      
##  [9] "Florida"        "Georgia"        "Hawaii"         "Idaho"         
## [13] "Illinois"       "Indiana"        "Iowa"           "Kansas"        
## [17] "Kentucky"       "Louisiana"      "Maine"          "Maryland"      
## [21] "Massachusetts"  "Michigan"       "Minnesota"      "Mississippi"   
## [25] "Missouri"       "Montana"        "Nebraska"       "Nevada"        
## [29] "New Hampshire"  "New Jersey"     "New Mexico"     "New York"      
## [33] "North Carolina" "North Dakota"   "Ohio"           "Oklahoma"      
## [37] "Oregon"         "Pennsylvania"   "Rhode Island"   "South Carolina"
## [41] "South Dakota"   "Tennessee"      "Texas"          "Utah"          
## [45] "Vermont"        "Virginia"       "Washington"     "West Virginia" 
## [49] "Wisconsin"      "Wyoming"

str(USArrests)

## 'data.frame':    50 obs. of  4 variables:
##  $ Murder  : num  13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
##  $ Assault : int  236 263 294 190 276 204 110 238 335 211 ...
##  $ UrbanPop: int  58 48 80 50 91 78 77 72 80 60 ...
##  $ Rape    : num  21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...

USAR<-USArrests
USAR<-na.omit(USAR)
USAR<-scale(USAR, center = T, scale = T)
summary(USAR)

##      Murder           Assault           UrbanPop             Rape        
##  Min.   :-1.6044   Min.   :-1.5090   Min.   :-2.31714   Min.   :-1.4874  
##  1st Qu.:-0.8525   1st Qu.:-0.7411   1st Qu.:-0.76271   1st Qu.:-0.6574  
##  Median :-0.1235   Median :-0.1411   Median : 0.03178   Median :-0.1209  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.0000  
##  3rd Qu.: 0.7949   3rd Qu.: 0.9388   3rd Qu.: 0.84354   3rd Qu.: 0.5277  
##  Max.   : 2.2069   Max.   : 1.9948   Max.   : 1.75892   Max.   : 2.6444

apply(USAR, 2, sd)

##   Murder  Assault UrbanPop     Rape 
##        1        1        1        1

apply(USAR, 2, mean)

##        Murder       Assault      UrbanPop          Rape 
## -7.663087e-17  1.112408e-16 -4.332808e-16  8.942391e-17

Adistance<-get_dist(USAR)
fviz_dist(Adistance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))

km_out<-kmeans(USAR, centers = 3, nstart = 25, iter.max = 100, algorithm ="Hartigan-Wong")
str(km_out)

## List of 9
##  $ cluster     : Named int [1:50] 2 2 2 1 2 2 1 1 2 2 ...
##   ..- attr(*, "names")= chr [1:50] "Alabama" "Alaska" "Arizona" "Arkansas" ...
##  $ centers     : num [1:3, 1:4] -0.447 1.005 -0.962 -0.347 1.014 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:3] "1" "2" "3"
##   .. ..$ : chr [1:4] "Murder" "Assault" "UrbanPop" "Rape"
##  $ totss       : num 196
##  $ withinss    : num [1:3] 19.6 46.7 12
##  $ tot.withinss: num 78.3
##  $ betweenss   : num 118
##  $ size        : int [1:3] 17 20 13
##  $ iter        : int 2
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"

names(km_out)

## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

typeof(km_out)

## [1] "list"

length(km_out)

## [1] 9

km_out$cluster

##        Alabama         Alaska        Arizona       Arkansas     California 
##              2              2              2              1              2 
##       Colorado    Connecticut       Delaware        Florida        Georgia 
##              2              1              1              2              2 
##         Hawaii          Idaho       Illinois        Indiana           Iowa 
##              1              3              2              1              3 
##         Kansas       Kentucky      Louisiana          Maine       Maryland 
##              1              3              2              3              2 
##  Massachusetts       Michigan      Minnesota    Mississippi       Missouri 
##              1              2              3              2              2 
##        Montana       Nebraska         Nevada  New Hampshire     New Jersey 
##              3              3              2              3              1 
##     New Mexico       New York North Carolina   North Dakota           Ohio 
##              2              2              2              3              1 
##       Oklahoma         Oregon   Pennsylvania   Rhode Island South Carolina 
##              1              1              1              1              2 
##   South Dakota      Tennessee          Texas           Utah        Vermont 
##              3              2              2              1              3 
##       Virginia     Washington  West Virginia      Wisconsin        Wyoming 
##              1              1              3              3              1

## Cluster Validation Evaluation  -  
## Objective function:  Sum of Square Error (SSE)
km_out$totss

## [1] 196

km_out$withinss

## [1] 19.62285 46.74796 11.95246

km_out$betweenss

## [1] 117.6767

sum(c(km_out$withinss, km_out$betweenss))

## [1] 196

cohesion<-sum(km_out$withinss)/km_out$totss
cohesion

## [1] 0.3996085

fviz_cluster(km_out, data = USAR)

USAR<-as.data.frame(USAR)
USAR<- mutate(USAR, cluster=km_out$cluster, objects_names= row.names(USAR))
str(USAR)

## 'data.frame':    50 obs. of  6 variables:
##  $ Murder       : num  1.2426 0.5079 0.0716 0.2323 0.2783 ...
##  $ Assault      : num  0.783 1.107 1.479 0.231 1.263 ...
##  $ UrbanPop     : num  -0.521 -1.212 0.999 -1.074 1.759 ...
##  $ Rape         : num  -0.00342 2.4842 1.04288 -0.18492 2.06782 ...
##  $ cluster      : Named int  2 2 2 1 2 2 1 1 2 2 ...
##   ..- attr(*, "names")= chr [1:50] "Alabama" "Alaska" "Arizona" "Arkansas" ...
##  $ objects_names: chr  "Alabama" "Alaska" "Arizona" "Arkansas" ...

ggplot(USAR, aes(x = UrbanPop, y = Murder, color = factor(cluster), label = objects_names)) + geom_text(  )

USAArrests-Clustering

Lila Ghemri

2022-10-14