##28
library(cluster)
data("iris")
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
row.names(iris)
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12"
## [13] "13" "14" "15" "16" "17" "18" "19" "20" "21" "22" "23" "24"
## [25] "25" "26" "27" "28" "29" "30" "31" "32" "33" "34" "35" "36"
## [37] "37" "38" "39" "40" "41" "42" "43" "44" "45" "46" "47" "48"
## [49] "49" "50" "51" "52" "53" "54" "55" "56" "57" "58" "59" "60"
## [61] "61" "62" "63" "64" "65" "66" "67" "68" "69" "70" "71" "72"
## [73] "73" "74" "75" "76" "77" "78" "79" "80" "81" "82" "83" "84"
## [85] "85" "86" "87" "88" "89" "90" "91" "92" "93" "94" "95" "96"
## [97] "97" "98" "99" "100" "101" "102" "103" "104" "105" "106" "107" "108"
## [109] "109" "110" "111" "112" "113" "114" "115" "116" "117" "118" "119" "120"
## [121] "121" "122" "123" "124" "125" "126" "127" "128" "129" "130" "131" "132"
## [133] "133" "134" "135" "136" "137" "138" "139" "140" "141" "142" "143" "144"
## [145] "145" "146" "147" "148" "149" "150"
## Data Preprocess
sum(!complete.cases(iris))
## [1] 0
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
##Remove or impute missing objects
df <- na.omit(iris)
## Rescale(or normalization, etc)
df[, -5] <- scale(df[, -5], center = TRUE, scale = TRUE)
head(df)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 -0.8976739 1.01560199 -1.335752 -1.311052 setosa
## 2 -1.1392005 -0.13153881 -1.335752 -1.311052 setosa
## 3 -1.3807271 0.32731751 -1.392399 -1.311052 setosa
## 4 -1.5014904 0.09788935 -1.279104 -1.311052 setosa
## 5 -1.0184372 1.24503015 -1.335752 -1.311052 setosa
## 6 -0.5353840 1.93331463 -1.165809 -1.048667 setosa
summary(df)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :-1.86378 Min. :-2.4258 Min. :-1.5623 Min. :-1.4422
## 1st Qu.:-0.89767 1st Qu.:-0.5904 1st Qu.:-1.2225 1st Qu.:-1.1799
## Median :-0.05233 Median :-0.1315 Median : 0.3354 Median : 0.1321
## Mean : 0.00000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.67225 3rd Qu.: 0.5567 3rd Qu.: 0.7602 3rd Qu.: 0.7880
## Max. : 2.48370 Max. : 3.0805 Max. : 1.7799 Max. : 1.7064
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
## Standardization
apply(df[,-5], 2, sd)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 1 1 1
apply(df[,-5], 2, mean)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## -4.484318e-16 2.034094e-16 -2.895326e-17 -3.663049e-17
apply(df, 2, sd)
## Warning in var(if (is.vector(x) || is.factor(x)) x else as.double(x), na.rm =
## na.rm): NAs introduced by coercion
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 1 1 1 NA
## Distance function and visualization
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.5.1
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
distance <- get_dist(df)
## Warning in stats::dist(x, method = method, ...): NAs introduced by coercion
fviz_dist(distance, gradient = list(low = "#00afbb", mid= "white", high ="#FC4E07"))
#FC4E07
##K means
km_output <- kmeans(df[,-5], centers = 2, nstart = 25, iter.max = 100, algorithm = "Hartigan-Wong")
str(km_output)
## List of 9
## $ cluster : Named int [1:150] 2 2 2 2 2 2 2 2 2 2 ...
## ..- attr(*, "names")= chr [1:150] "1" "2" "3" "4" ...
## $ centers : num [1:2, 1:4] 0.506 -1.011 -0.425 0.85 0.65 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "1" "2"
## .. ..$ : chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## $ totss : num 596
## $ withinss : num [1:2] 173.5 47.4
## $ tot.withinss: num 221
## $ betweenss : num 375
## $ size : int [1:2] 100 50
## $ iter : int 1
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
names(km_output)
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
typeof(km_output)
## [1] "list"
length(km_output)
## [1] 9
km_output$cluster
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 141 142 143 144 145 146 147 148 149 150
## 1 1 1 1 1 1 1 1 1 1
##Cluster Validation Evaluation-
##Objective function: Sum of Square Error (SSE)
## SSE
##Cluster Cohesion
## SSE can be used to compare cluster performance only for similar number of clusters
km_output$totss
## [1] 596
km_output$withinss
## [1] 173.52867 47.35062
km_output$betweenss
## [1] 375.1207
sum(c(km_output$withinss, km_output$betweenss))
## [1] 596
cohesion <- sum(km_output$withinss)/km_output$totss
cohesion
## [1] 0.3706028
## Visualize Clusters
fviz_cluster(km_output,data = df[,-5])
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
#df %>% # as.data.frame( )%>% # mutate(cluster=km_output$cluster, Species = #row.names(iris))%>% # ggplot(aes(x = UrbanPop, y = Murder, colour = #factor(cluster), label = Species)) + geom_text()
## Arrest Data
rm(list=ls())
#library(cluster)
#library(tidyverse)
data("USArrests")
str(USArrests)
## 'data.frame': 50 obs. of 4 variables:
## $ Murder : num 13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
## $ Assault : int 236 263 294 190 276 204 110 238 335 211 ...
## $ UrbanPop: int 58 48 80 50 91 78 77 72 80 60 ...
## $ Rape : num 21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...
row.names(USArrests)
## [1] "Alabama" "Alaska" "Arizona" "Arkansas"
## [5] "California" "Colorado" "Connecticut" "Delaware"
## [9] "Florida" "Georgia" "Hawaii" "Idaho"
## [13] "Illinois" "Indiana" "Iowa" "Kansas"
## [17] "Kentucky" "Louisiana" "Maine" "Maryland"
## [21] "Massachusetts" "Michigan" "Minnesota" "Mississippi"
## [25] "Missouri" "Montana" "Nebraska" "Nevada"
## [29] "New Hampshire" "New Jersey" "New Mexico" "New York"
## [33] "North Carolina" "North Dakota" "Ohio" "Oklahoma"
## [37] "Oregon" "Pennsylvania" "Rhode Island" "South Carolina"
## [41] "South Dakota" "Tennessee" "Texas" "Utah"
## [45] "Vermont" "Virginia" "Washington" "West Virginia"
## [49] "Wisconsin" "Wyoming"
dataset <-USArrests
#Data Preprocess
sum(!complete.cases(dataset))
## [1] 0
summary(dataset)
## Murder Assault UrbanPop Rape
## Min. : 0.800 Min. : 45.0 Min. :32.00 Min. : 7.30
## 1st Qu.: 4.075 1st Qu.:109.0 1st Qu.:54.50 1st Qu.:15.07
## Median : 7.250 Median :159.0 Median :66.00 Median :20.10
## Mean : 7.788 Mean :170.8 Mean :65.54 Mean :21.23
## 3rd Qu.:11.250 3rd Qu.:249.0 3rd Qu.:77.75 3rd Qu.:26.18
## Max. :17.400 Max. :337.0 Max. :91.00 Max. :46.00
#Remove or impute missing objects
df <- na.omit(dataset)
#Rescale or Normalization, etc
df[, -5] <- scale(df[, -5], center = T, scale = T)
head(df)
## Murder Assault UrbanPop Rape
## Alabama 1.24256408 0.7828393 -0.5209066 -0.003416473
## Alaska 0.50786248 1.1068225 -1.2117642 2.484202941
## Arizona 0.07163341 1.4788032 0.9989801 1.042878388
## Arkansas 0.23234938 0.2308680 -1.0735927 -0.184916602
## California 0.27826823 1.2628144 1.7589234 2.067820292
## Colorado 0.02571456 0.3988593 0.8608085 1.864967207
dataset <- df[,-5]
df <- dataset
summary(df)
## Murder Assault UrbanPop Rape
## Min. :-1.6044 Min. :-1.5090 Min. :-2.31714 Min. :-1.4874
## 1st Qu.:-0.8525 1st Qu.:-0.7411 1st Qu.:-0.76271 1st Qu.:-0.6574
## Median :-0.1235 Median :-0.1411 Median : 0.03178 Median :-0.1209
## Mean : 0.0000 Mean : 0.0000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.7949 3rd Qu.: 0.9388 3rd Qu.: 0.84354 3rd Qu.: 0.5277
## Max. : 2.2069 Max. : 1.9948 Max. : 1.75892 Max. : 2.6444
apply(dataset,2, sd)
## Murder Assault UrbanPop Rape
## 1 1 1 1
apply(dataset, 2, mean)
## Murder Assault UrbanPop Rape
## -7.663087e-17 1.112408e-16 -4.332808e-16 8.942391e-17
apply(df,2,sd)
## Murder Assault UrbanPop Rape
## 1 1 1 1
# Distance function and visualization
library(factoextra)
distance <- get_dist(df, stand = TRUE, method = "pearson")
fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))
## Code3:
## K means
km_output <- kmeans(df[,-5], centers = 2, nstart = 25, iter.max = 100, algorithm = "Hartigan-Wong")
str(km_output)
## List of 9
## $ cluster : Named int [1:50] 2 2 2 1 2 2 1 1 2 2 ...
## ..- attr(*, "names")= chr [1:50] "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ centers : num [1:2, 1:4] -0.67 1.005 -0.676 1.014 -0.132 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "1" "2"
## .. ..$ : chr [1:4] "Murder" "Assault" "UrbanPop" "Rape"
## $ totss : num 196
## $ withinss : num [1:2] 56.1 46.7
## $ tot.withinss: num 103
## $ betweenss : num 93.1
## $ size : int [1:2] 30 20
## $ iter : int 1
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
names(km_output)
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
typeof(km_output)
## [1] "list"
length(km_output)
## [1] 9
km_output$cluster
## Alabama Alaska Arizona Arkansas California
## 2 2 2 1 2
## Colorado Connecticut Delaware Florida Georgia
## 2 1 1 2 2
## Hawaii Idaho Illinois Indiana Iowa
## 1 1 2 1 1
## Kansas Kentucky Louisiana Maine Maryland
## 1 1 2 1 2
## Massachusetts Michigan Minnesota Mississippi Missouri
## 1 2 1 2 2
## Montana Nebraska Nevada New Hampshire New Jersey
## 1 1 2 1 1
## New Mexico New York North Carolina North Dakota Ohio
## 2 2 2 1 1
## Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## 1 1 1 1 2
## South Dakota Tennessee Texas Utah Vermont
## 1 2 2 1 1
## Virginia Washington West Virginia Wisconsin Wyoming
## 1 1 1 1 1
## Cluster Validation Evaluation -
## Objective function: Sum of Square Error (SSE)
### SSE
## Code4:
#### Cluster cohesion
#### SSE can be used to compare cluster performance only for a similar number of clusters
km_output$totss
## [1] 196
km_output$withinss # distance without and within clusters
## [1] 56.11445 46.74796
km_output$betweenss
## [1] 93.1376
sum(c(km_output$withinss, km_output$betweenss) )
## [1] 196
cohesion <- sum(km_output$withinss)/ km_output$totss
cohesion
## [1] 0.5248082
### Visualize Clusters
# library(factoextra)
fviz_cluster(km_output, data = df)
library(dplyr)
library(ggplot2)
## Code5:
# df %>%
# as.data.frame( df ) %>%
df %>% mutate(cluster = km_output$cluster, objects_name = row.names(dataset)) %>%
ggplot(aes(x = UrbanPop, y = Murder, color = factor(km_output$cluster), label = rownames(df) )) + geom_text( )
### Put Cluster Output on the Map(1)
cluster_df <- data.frame(objects_names = tolower(row.names(dataset)), cluster = unname(km_output$cluster))
head(cluster_df)
## objects_names cluster
## 1 alabama 2
## 2 alaska 2
## 3 arizona 2
## 4 arkansas 1
## 5 california 2
## 6 colorado 2
#install.packages("maps")
cluster_df <- cluster_df %>% rename(state = "objects_names")
library(maps)
## Warning: package 'maps' was built under R version 4.5.1
##
## Attaching package: 'maps'
## The following object is masked from 'package:cluster':
##
## votes.repub
states <- map_data("state")
objects_names <- map_data("state")
objects_names %>%
left_join(cluster_df, by = c("region" = "state")) %>%
ggplot( ) +
geom_polygon(aes(x = long, y = lat, fill = as.factor(cluster)), color = "white") +
coord_fixed(1.3) +
guides(fill = F) +
theme_bw( ) +
theme(panel.grid.major = element_blank( ), panel.grid.minor = element_blank( ),
panel.border = element_blank( ),
axis.line = element_blank( ),
axis.text = element_blank( ),
axis.ticks = element_blank( ),
axis.title = element_blank( ))
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
### Elbow method to decide Optimal Number of Clusters(1)
set.seed(8)
wss <- function(k) {
return(kmeans(df, k, nstart = 25)$tot.withinss)
}
k_values <- 1:15
wss_values <- purrr::map_dbl(k_values, wss)
plot(x = k_values, y = wss_values,
type = "b", frame = F,
xlab = "Number of clusters K",
ylab = "Total within-clusters sum of square")
### Hierarchical Clustering
hac_output <- hclust( dist(dataset, method = "euclidean"), method = "complete")
plot(hac_output) # Calculating distance using hierarchical clustering, using Euclidean distance
# and using complete linkage for hierarchical clustering
### Output Desirable Number of Clusters after Modeling
hac_cut <- cutree(hac_output, 2)
for ( i in 1:length(hac_cut)) {
if( hac_cut[i] != km_output$cluster[i]) print(names(hac_cut) [i])
}
## [1] "Alabama"
## [1] "Alaska"
## [1] "Arizona"
## [1] "Arkansas"
## [1] "California"
## [1] "Colorado"
## [1] "Connecticut"
## [1] "Delaware"
## [1] "Florida"
## [1] "Georgia"
## [1] "Hawaii"
## [1] "Idaho"
## [1] "Illinois"
## [1] "Indiana"
## [1] "Iowa"
## [1] "Kansas"
## [1] "Kentucky"
## [1] "Louisiana"
## [1] "Maine"
## [1] "Maryland"
## [1] "Massachusetts"
## [1] "Michigan"
## [1] "Minnesota"
## [1] "Mississippi"
## [1] "Montana"
## [1] "Nebraska"
## [1] "Nevada"
## [1] "New Hampshire"
## [1] "New Jersey"
## [1] "New Mexico"
## [1] "New York"
## [1] "North Carolina"
## [1] "North Dakota"
## [1] "Ohio"
## [1] "Oklahoma"
## [1] "Oregon"
## [1] "Pennsylvania"
## [1] "Rhode Island"
## [1] "South Carolina"
## [1] "South Dakota"
## [1] "Tennessee"
## [1] "Texas"
## [1] "Utah"
## [1] "Vermont"
## [1] "Virginia"
## [1] "Washington"
## [1] "West Virginia"
## [1] "Wisconsin"
## [1] "Wyoming"