USAArrests-Clustering

library(mlbench)
library(ggplot2)
library(factoextra)

## Warning: package 'factoextra' was built under R version 4.2.1

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(maps)

## Warning: package 'maps' was built under R version 4.2.1

library(mapdata)

## Warning: package 'mapdata' was built under R version 4.2.1

library(ggmap)

## Warning: package 'ggmap' was built under R version 4.2.1

## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.

## Please cite ggmap if you use it! See citation("ggmap") for details.

data("USArrests")
rownames(USArrests)

##  [1] "Alabama"        "Alaska"         "Arizona"        "Arkansas"      
##  [5] "California"     "Colorado"       "Connecticut"    "Delaware"      
##  [9] "Florida"        "Georgia"        "Hawaii"         "Idaho"         
## [13] "Illinois"       "Indiana"        "Iowa"           "Kansas"        
## [17] "Kentucky"       "Louisiana"      "Maine"          "Maryland"      
## [21] "Massachusetts"  "Michigan"       "Minnesota"      "Mississippi"   
## [25] "Missouri"       "Montana"        "Nebraska"       "Nevada"        
## [29] "New Hampshire"  "New Jersey"     "New Mexico"     "New York"      
## [33] "North Carolina" "North Dakota"   "Ohio"           "Oklahoma"      
## [37] "Oregon"         "Pennsylvania"   "Rhode Island"   "South Carolina"
## [41] "South Dakota"   "Tennessee"      "Texas"          "Utah"          
## [45] "Vermont"        "Virginia"       "Washington"     "West Virginia" 
## [49] "Wisconsin"      "Wyoming"

str(USArrests)

## 'data.frame':    50 obs. of  4 variables:
##  $ Murder  : num  13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
##  $ Assault : int  236 263 294 190 276 204 110 238 335 211 ...
##  $ UrbanPop: int  58 48 80 50 91 78 77 72 80 60 ...
##  $ Rape    : num  21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...

USAR<-USArrests
USAR<-na.omit(USAR)
USAR<-scale(USAR, center = T, scale = T)
summary(USAR)

##      Murder           Assault           UrbanPop             Rape        
##  Min.   :-1.6044   Min.   :-1.5090   Min.   :-2.31714   Min.   :-1.4874  
##  1st Qu.:-0.8525   1st Qu.:-0.7411   1st Qu.:-0.76271   1st Qu.:-0.6574  
##  Median :-0.1235   Median :-0.1411   Median : 0.03178   Median :-0.1209  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.0000  
##  3rd Qu.: 0.7949   3rd Qu.: 0.9388   3rd Qu.: 0.84354   3rd Qu.: 0.5277  
##  Max.   : 2.2069   Max.   : 1.9948   Max.   : 1.75892   Max.   : 2.6444

apply(USAR, 2, sd)

##   Murder  Assault UrbanPop     Rape 
##        1        1        1        1

apply(USAR, 2, mean)

##        Murder       Assault      UrbanPop          Rape 
## -7.663087e-17  1.112408e-16 -4.332808e-16  8.942391e-17

Adistance<-get_dist(USAR)
fviz_dist(Adistance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))

km_out<-kmeans(USAR, centers = 3, nstart = 25, iter.max = 100, algorithm ="Hartigan-Wong")
str(km_out)

## List of 9
##  $ cluster     : Named int [1:50] 2 2 2 1 2 2 1 1 2 2 ...
##   ..- attr(*, "names")= chr [1:50] "Alabama" "Alaska" "Arizona" "Arkansas" ...
##  $ centers     : num [1:3, 1:4] -0.447 1.005 -0.962 -0.347 1.014 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:3] "1" "2" "3"
##   .. ..$ : chr [1:4] "Murder" "Assault" "UrbanPop" "Rape"
##  $ totss       : num 196
##  $ withinss    : num [1:3] 19.6 46.7 12
##  $ tot.withinss: num 78.3
##  $ betweenss   : num 118
##  $ size        : int [1:3] 17 20 13
##  $ iter        : int 1
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"

names(km_out)

## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

typeof(km_out)

## [1] "list"

length(km_out)

## [1] 9

km_out$cluster

##        Alabama         Alaska        Arizona       Arkansas     California 
##              2              2              2              1              2 
##       Colorado    Connecticut       Delaware        Florida        Georgia 
##              2              1              1              2              2 
##         Hawaii          Idaho       Illinois        Indiana           Iowa 
##              1              3              2              1              3 
##         Kansas       Kentucky      Louisiana          Maine       Maryland 
##              1              3              2              3              2 
##  Massachusetts       Michigan      Minnesota    Mississippi       Missouri 
##              1              2              3              2              2 
##        Montana       Nebraska         Nevada  New Hampshire     New Jersey 
##              3              3              2              3              1 
##     New Mexico       New York North Carolina   North Dakota           Ohio 
##              2              2              2              3              1 
##       Oklahoma         Oregon   Pennsylvania   Rhode Island South Carolina 
##              1              1              1              1              2 
##   South Dakota      Tennessee          Texas           Utah        Vermont 
##              3              2              2              1              3 
##       Virginia     Washington  West Virginia      Wisconsin        Wyoming 
##              1              1              3              3              1

##Code6:
### Put Cluster Output on the Map(1)  
cluster_AR<-data.frame(objects_names = tolower(row.names(USAR)), cluster = unname(km_out$cluster))



str(cluster_AR)

## 'data.frame':    50 obs. of  2 variables:
##  $ objects_names: chr  "alabama" "alaska" "arizona" "arkansas" ...
##  $ cluster      : int  2 2 2 1 2 2 1 1 2 2 ...

## Cluster Validation Evaluation  -  
## Objective function:  Sum of Square Error (SSE)
km_out$totss

## [1] 196

km_out$withinss

## [1] 19.62285 46.74796 11.95246

km_out$betweenss

## [1] 117.6767

sum(c(km_out$withinss, km_out$betweenss))

## [1] 196

cohesion<-sum(km_out$withinss)/km_out$totss
cohesion

## [1] 0.3996085

fviz_cluster(km_out, data = USAR)

USAR<-as.data.frame(USAR)
USAR<- mutate(USAR, cluster=km_out$cluster, objects_names= row.names(USAR))
str(USAR)

## 'data.frame':    50 obs. of  6 variables:
##  $ Murder       : num  1.2426 0.5079 0.0716 0.2323 0.2783 ...
##  $ Assault      : num  0.783 1.107 1.479 0.231 1.263 ...
##  $ UrbanPop     : num  -0.521 -1.212 0.999 -1.074 1.759 ...
##  $ Rape         : num  -0.00342 2.4842 1.04288 -0.18492 2.06782 ...
##  $ cluster      : Named int  2 2 2 1 2 2 1 1 2 2 ...
##   ..- attr(*, "names")= chr [1:50] "Alabama" "Alaska" "Arizona" "Arkansas" ...
##  $ objects_names: chr  "Alabama" "Alaska" "Arizona" "Arkansas" ...

ggplot(USAR, aes(x = UrbanPop, y = Murder, color = factor(cluster), label =objects_names)) + geom_text(  )

##Putting clusters on the map
##Code 7
USAR$objects_names<-tolower(USAR$objects_names)
library(maps)
data("stateMapEnv")
states<-map_data("state")
ST<-states
ST$cluster<-c(0)
ST1<-ST[, -6]
ST1$region<-replace(ST1$region, ST1$region=='district of columbia', 'virginia')
ST<-ST1
addCluster<-function(x){
   for (j in 1:length(USAR$objects_names)){
        if(x==USAR$objects_names[j])
          return (USAR$cluster[j])
   }
}

for (i in 1:length(ST$region)){
  ST$cluster[i]<-addCluster(ST$region[i])
} 
    



ggplot( ST) +
geom_polygon(aes(x = long, y = lat, fill =cluster, group =group ), color = "white") +
coord_fixed(1.3)

#Code7:
###  Elbow method to decide Optimal Number of Clusters(1)
set.seed(9)
elbow<-function(k) {
    return(kmeans(USArrests, k,  nstart = 25)$tot.withinss)
}



k_values <- 1:15
  wss_values<-purrr::map_dbl(k_values, elbow)
  plot(x = k_values, y = wss_values,
       type = "b", frame = F,
      xlab = "Number of clusters K",
      ylab = "Total within-clusters sum of square")

##code 8 Hierarchical clustering
# Calculating distance using hierarchical clustering, using Euclidean distance
# and using complete linkage for hierarchical clustering
hacR_output <-hclust(dist(USAR[, -c(5,6)], method = "euclidean"), method = "complete")
plot(hacR_output)

### Output Desirable Number of Clusters after Modeling
hacR_cut <- cutree(hacR_output, 3)
##Visualizing the hierarchical clusters 

hR_clust1<-data.frame(index=which(hacR_cut==1))
hR_clust1<-mutate(hR_clust1, cluster=1)
hR_clust2<-data.frame(index=which(hacR_cut==2))
hR_clust2<- mutate(hR_clust2, cluster=2)
hR_clust3<-data.frame(index=which(hacR_cut==3))
hR_clust3<- mutate(hR_clust3, cluster=3)
colnames(hR_clust2)<-colnames(hR_clust1)
colnames(hR_clust3)<-colnames(hR_clust1)
hR_clust<-rbind(hR_clust1, hR_clust2, hR_clust3)
colnames(hR_clust)<-c('index', 'cluster')
ggplot(USAR, aes(x =UrbanPop , y = Murder, color = factor(hR_clust$cluster),label =objects_names)) + geom_text()

##Printing the number of  mismatched clusters and indexes of objects mismatched

length(which(hacR_cut!= USAR$cluster))

## [1] 26

#Comparing performance of each clustering method with original dataset


##Comparing the two approaches 
print(" Mismatched table")

## [1] " Mismatched table"

table(hacR_cut, USAR$cluster)

##         
## hacR_cut  1  2  3
##        1  0  8  0
##        2  0 11  0
##        3 17  1 13

print("Proportion of Mismatched clusters")

## [1] "Proportion of Mismatched clusters"

cm<-table(hacR_cut, USAR$cluster)
print(1-diag(cm)/sum(cm))

##    1    2    3 
## 1.00 0.78 0.74

USAArrests-Clustering

Lila Ghemri

2022-10-14