Code
rm(list = ls())
str(iris )
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
row.names(iris)
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12"
## [13] "13" "14" "15" "16" "17" "18" "19" "20" "21" "22" "23" "24"
## [25] "25" "26" "27" "28" "29" "30" "31" "32" "33" "34" "35" "36"
## [37] "37" "38" "39" "40" "41" "42" "43" "44" "45" "46" "47" "48"
## [49] "49" "50" "51" "52" "53" "54" "55" "56" "57" "58" "59" "60"
## [61] "61" "62" "63" "64" "65" "66" "67" "68" "69" "70" "71" "72"
## [73] "73" "74" "75" "76" "77" "78" "79" "80" "81" "82" "83" "84"
## [85] "85" "86" "87" "88" "89" "90" "91" "92" "93" "94" "95" "96"
## [97] "97" "98" "99" "100" "101" "102" "103" "104" "105" "106" "107" "108"
## [109] "109" "110" "111" "112" "113" "114" "115" "116" "117" "118" "119" "120"
## [121] "121" "122" "123" "124" "125" "126" "127" "128" "129" "130" "131" "132"
## [133] "133" "134" "135" "136" "137" "138" "139" "140" "141" "142" "143" "144"
## [145] "145" "146" "147" "148" "149" "150"
## Data Preprocess
sum(!complete.cases(iris))
## [1] 0
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
## Remove or impute missing objects
df <- na.omit(iris)
## Rescale (or normalization, etc.)
df <- scale(df[,-5], center = T, scale = T)
summary(df)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :-1.86378 Min. :-2.4258 Min. :-1.5623 Min. :-1.4422
## 1st Qu.:-0.89767 1st Qu.:-0.5904 1st Qu.:-1.2225 1st Qu.:-1.1799
## Median :-0.05233 Median :-0.1315 Median : 0.3354 Median : 0.1321
## Mean : 0.00000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.67225 3rd Qu.: 0.5567 3rd Qu.: 0.7602 3rd Qu.: 0.7880
## Max. : 2.48370 Max. : 3.0805 Max. : 1.7799 Max. : 1.7064
## Standardization
apply(iris[,-5], 2, sd)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 0.8280661 0.4358663 1.7652982 0.7622377
apply(iris[,-5], 2, mean)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 5.843333 3.057333 3.758000 1.199333
apply(df, 2, mean)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## -4.484318e-16 2.034094e-16 -2.895326e-17 -3.663049e-17
apply(df, 2, sd)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 1 1 1
## Distance function and visualization
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.4.2
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
distance <- get_dist(df)
fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))

## K means
# When I ran the code, the elbow plot shows 3 clusters as the best option. That is why I modified the code and put 3 instead of 2 centers.
km_output <- kmeans(df, centers = 3, nstart = 25, iter.max = 100, algorithm = "Hartigan-Wong")
str(km_output)
## List of 9
## $ cluster : Named int [1:150] 3 3 3 3 3 3 3 3 3 3 ...
## ..- attr(*, "names")= chr [1:150] "1" "2" "3" "4" ...
## $ centers : num [1:3, 1:4] -0.0501 1.1322 -1.0112 -0.8804 0.0881 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:3] "1" "2" "3"
## .. ..$ : chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## $ totss : num 596
## $ withinss : num [1:3] 44.1 47.5 47.4
## $ tot.withinss: num 139
## $ betweenss : num 457
## $ size : int [1:3] 53 47 50
## $ iter : int 3
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
names(km_output)
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
typeof(km_output)
## [1] "list"
length(km_output)
## [1] 9
km_output$cluster
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 3 3 3 3 3 3 3 3 3 3 2 2 2 1 1 1 2 1 1 1
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 1 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 2 2 1 1
## 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
## 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
## 2 1 2 2 2 2 1 2 2 2 2 2 2 1 1 2 2 2 2 1
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
## 2 1 2 1 2 2 1 2 2 2 2 2 2 1 1 2 2 2 1 2
## 141 142 143 144 145 146 147 148 149 150
## 2 2 1 2 2 2 1 2 2 1
## Cluster Validation Evaluation -
## Objective function: Sum of Square Error (SSE)
### SSE
#### Cluster cohesion
#### SSE can be used to compare cluster performance only for a similar number of clusters
km_output$totss
## [1] 596
km_output$withinss # distance without and within clusters
## [1] 44.08754 47.45019 47.35062
km_output$betweenss
## [1] 457.1116
sum(c(km_output$withinss, km_output$betweenss))
## [1] 596
cohesion <- sum(km_output$withinss)/ km_output$totss
cohesion
## [1] 0.2330342
### Visualize Clusters
fviz_cluster(km_output, data = df)

library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
df %>%
as.data.frame( ) %>%
mutate(cluster = km_output$cluster, Species = row.names(iris)) %>%
ggplot(aes(x = Sepal.Length, y = Sepal.Width, color = factor(cluster), label = Species)) + geom_text( )

### Put Cluster Output on the Map(1)
## Goegraphical coordinations are not available for the iris dataset. So I commented this section.
# cluster_df <- data.frame(Species = tolower(row.names(iris)), cluster = unname(km_output$cluster))
#
# library(maps)
# Species <- map_data("Species")
#
# Species %>%
# left_join(cluster_df, by = c("region" = "Species")) %>%
# ggplot( ) +
# geom_polygon(aes(x = long, y = lat, fill = as.factor(cluster), group =group), color = "white") +
# coord_fixed(1.3) +
# guides(fill = F) +
# theme_bw( ) +
# theme(panel.grid.major = element_blank( ), panel.grid.minor = element_blank( ),
# panel.border = element_blank( ),
# axis.line = element_blank( ),
# axis.text = element_blank( ),
# axis.ticks = element_blank( ),
# axis.title = element_blank( ))
### Elbow method to decide Optimal Number of Clusters(1)
set.seed(8)
wss <- function(k) {
return(kmeans(df, k, nstart = 25)$tot.withinss)
}
k_values <- 1:15
wss_values <- purrr::map_dbl(k_values, wss)
plot(x = k_values, y = wss_values,
type = "b", frame = F,
xlab = "Number of clusters K",
ylab = "Total within-clusters sum of square")

### Hierarchical Clustering
hac_output <- hclust(dist(iris, method = "euclidean"), method = "complete")
## Warning in dist(iris, method = "euclidean"): NAs introduced by coercion
plot(hac_output) # Calculating distance using hierarchical clustering, using Euclidean distance

# and using complete linkage for hierarchical clustering
### Output Desirable Number of Clusters after Modeling
hac_cut <- cutree(hac_output, 3)
for ( i in 1:length(hac_cut)) {
if( hac_cut[i] != km_output$cluster[i]) print(names(hac_cut) [i])
}
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL