library(mlbench)
library(ggplot2)
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.2.1
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(maps)
## Warning: package 'maps' was built under R version 4.2.1
library(mapdata)
## Warning: package 'mapdata' was built under R version 4.2.1
library(ggmap)
## Warning: package 'ggmap' was built under R version 4.2.1
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
data("USArrests")
rownames(USArrests)
## [1] "Alabama" "Alaska" "Arizona" "Arkansas"
## [5] "California" "Colorado" "Connecticut" "Delaware"
## [9] "Florida" "Georgia" "Hawaii" "Idaho"
## [13] "Illinois" "Indiana" "Iowa" "Kansas"
## [17] "Kentucky" "Louisiana" "Maine" "Maryland"
## [21] "Massachusetts" "Michigan" "Minnesota" "Mississippi"
## [25] "Missouri" "Montana" "Nebraska" "Nevada"
## [29] "New Hampshire" "New Jersey" "New Mexico" "New York"
## [33] "North Carolina" "North Dakota" "Ohio" "Oklahoma"
## [37] "Oregon" "Pennsylvania" "Rhode Island" "South Carolina"
## [41] "South Dakota" "Tennessee" "Texas" "Utah"
## [45] "Vermont" "Virginia" "Washington" "West Virginia"
## [49] "Wisconsin" "Wyoming"
str(USArrests)
## 'data.frame': 50 obs. of 4 variables:
## $ Murder : num 13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
## $ Assault : int 236 263 294 190 276 204 110 238 335 211 ...
## $ UrbanPop: int 58 48 80 50 91 78 77 72 80 60 ...
## $ Rape : num 21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...
USAR<-USArrests
USAR<-na.omit(USAR)
USAR<-scale(USAR, center = T, scale = T)
summary(USAR)
## Murder Assault UrbanPop Rape
## Min. :-1.6044 Min. :-1.5090 Min. :-2.31714 Min. :-1.4874
## 1st Qu.:-0.8525 1st Qu.:-0.7411 1st Qu.:-0.76271 1st Qu.:-0.6574
## Median :-0.1235 Median :-0.1411 Median : 0.03178 Median :-0.1209
## Mean : 0.0000 Mean : 0.0000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.7949 3rd Qu.: 0.9388 3rd Qu.: 0.84354 3rd Qu.: 0.5277
## Max. : 2.2069 Max. : 1.9948 Max. : 1.75892 Max. : 2.6444
apply(USAR, 2, sd)
## Murder Assault UrbanPop Rape
## 1 1 1 1
apply(USAR, 2, mean)
## Murder Assault UrbanPop Rape
## -7.663087e-17 1.112408e-16 -4.332808e-16 8.942391e-17
Adistance<-get_dist(USAR)
fviz_dist(Adistance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))

km_out<-kmeans(USAR, centers = 3, nstart = 25, iter.max = 100, algorithm ="Hartigan-Wong")
str(km_out)
## List of 9
## $ cluster : Named int [1:50] 2 2 2 1 2 2 1 1 2 2 ...
## ..- attr(*, "names")= chr [1:50] "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ centers : num [1:3, 1:4] -0.447 1.005 -0.962 -0.347 1.014 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:3] "1" "2" "3"
## .. ..$ : chr [1:4] "Murder" "Assault" "UrbanPop" "Rape"
## $ totss : num 196
## $ withinss : num [1:3] 19.6 46.7 12
## $ tot.withinss: num 78.3
## $ betweenss : num 118
## $ size : int [1:3] 17 20 13
## $ iter : int 1
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
names(km_out)
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
typeof(km_out)
## [1] "list"
length(km_out)
## [1] 9
km_out$cluster
## Alabama Alaska Arizona Arkansas California
## 2 2 2 1 2
## Colorado Connecticut Delaware Florida Georgia
## 2 1 1 2 2
## Hawaii Idaho Illinois Indiana Iowa
## 1 3 2 1 3
## Kansas Kentucky Louisiana Maine Maryland
## 1 3 2 3 2
## Massachusetts Michigan Minnesota Mississippi Missouri
## 1 2 3 2 2
## Montana Nebraska Nevada New Hampshire New Jersey
## 3 3 2 3 1
## New Mexico New York North Carolina North Dakota Ohio
## 2 2 2 3 1
## Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## 1 1 1 1 2
## South Dakota Tennessee Texas Utah Vermont
## 3 2 2 1 3
## Virginia Washington West Virginia Wisconsin Wyoming
## 1 1 3 3 1
##Code6:
### Put Cluster Output on the Map(1)
cluster_AR<-data.frame(objects_names = tolower(row.names(USAR)), cluster = unname(km_out$cluster))
str(cluster_AR)
## 'data.frame': 50 obs. of 2 variables:
## $ objects_names: chr "alabama" "alaska" "arizona" "arkansas" ...
## $ cluster : int 2 2 2 1 2 2 1 1 2 2 ...
## Cluster Validation Evaluation -
## Objective function: Sum of Square Error (SSE)
km_out$totss
## [1] 196
km_out$withinss
## [1] 19.62285 46.74796 11.95246
km_out$betweenss
## [1] 117.6767
sum(c(km_out$withinss, km_out$betweenss))
## [1] 196
cohesion<-sum(km_out$withinss)/km_out$totss
cohesion
## [1] 0.3996085
fviz_cluster(km_out, data = USAR)

USAR<-as.data.frame(USAR)
USAR<- mutate(USAR, cluster=km_out$cluster, objects_names= row.names(USAR))
str(USAR)
## 'data.frame': 50 obs. of 6 variables:
## $ Murder : num 1.2426 0.5079 0.0716 0.2323 0.2783 ...
## $ Assault : num 0.783 1.107 1.479 0.231 1.263 ...
## $ UrbanPop : num -0.521 -1.212 0.999 -1.074 1.759 ...
## $ Rape : num -0.00342 2.4842 1.04288 -0.18492 2.06782 ...
## $ cluster : Named int 2 2 2 1 2 2 1 1 2 2 ...
## ..- attr(*, "names")= chr [1:50] "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ objects_names: chr "Alabama" "Alaska" "Arizona" "Arkansas" ...
ggplot(USAR, aes(x = UrbanPop, y = Murder, color = factor(cluster), label =objects_names)) + geom_text( )

##Putting clusters on the map
##Code 7
USAR$objects_names<-tolower(USAR$objects_names)
library(maps)
data("stateMapEnv")
states<-map_data("state")
ST<-states
ST$cluster<-c(0)
ST1<-ST[, -6]
ST1$region<-replace(ST1$region, ST1$region=='district of columbia', 'virginia')
ST<-ST1
addCluster<-function(x){
for (j in 1:length(USAR$objects_names)){
if(x==USAR$objects_names[j])
return (USAR$cluster[j])
}
}
for (i in 1:length(ST$region)){
ST$cluster[i]<-addCluster(ST$region[i])
}
ggplot( ST) +
geom_polygon(aes(x = long, y = lat, fill =cluster, group =group ), color = "white") +
coord_fixed(1.3)

#Code7:
### Elbow method to decide Optimal Number of Clusters(1)
set.seed(9)
elbow<-function(k) {
return(kmeans(USArrests, k, nstart = 25)$tot.withinss)
}
k_values <- 1:15
wss_values<-purrr::map_dbl(k_values, elbow)
plot(x = k_values, y = wss_values,
type = "b", frame = F,
xlab = "Number of clusters K",
ylab = "Total within-clusters sum of square")

##code 8 Hierarchical clustering
# Calculating distance using hierarchical clustering, using Euclidean distance
# and using complete linkage for hierarchical clustering
hacR_output <-hclust(dist(USAR[, -c(5,6)], method = "euclidean"), method = "complete")
plot(hacR_output)

### Output Desirable Number of Clusters after Modeling
hacR_cut <- cutree(hacR_output, 3)
##Visualizing the hierarchical clusters
hR_clust1<-data.frame(index=which(hacR_cut==1))
hR_clust1<-mutate(hR_clust1, cluster=1)
hR_clust2<-data.frame(index=which(hacR_cut==2))
hR_clust2<- mutate(hR_clust2, cluster=2)
hR_clust3<-data.frame(index=which(hacR_cut==3))
hR_clust3<- mutate(hR_clust3, cluster=3)
colnames(hR_clust2)<-colnames(hR_clust1)
colnames(hR_clust3)<-colnames(hR_clust1)
hR_clust<-rbind(hR_clust1, hR_clust2, hR_clust3)
colnames(hR_clust)<-c('index', 'cluster')
ggplot(USAR, aes(x =UrbanPop , y = Murder, color = factor(hR_clust$cluster),label =objects_names)) + geom_text()

##Printing the number of mismatched clusters and indexes of objects mismatched
length(which(hacR_cut!= USAR$cluster))
## [1] 26
#Comparing performance of each clustering method with original dataset
##Comparing the two approaches
print(" Mismatched table")
## [1] " Mismatched table"
table(hacR_cut, USAR$cluster)
##
## hacR_cut 1 2 3
## 1 0 8 0
## 2 0 11 0
## 3 17 1 13
print("Proportion of Mismatched clusters")
## [1] "Proportion of Mismatched clusters"
cm<-table(hacR_cut, USAR$cluster)
print(1-diag(cm)/sum(cm))
## 1 2 3
## 1.00 0.78 0.74