dataset <- iris
label_col = 5
str(dataset)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
row.names(dataset)
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12"
## [13] "13" "14" "15" "16" "17" "18" "19" "20" "21" "22" "23" "24"
## [25] "25" "26" "27" "28" "29" "30" "31" "32" "33" "34" "35" "36"
## [37] "37" "38" "39" "40" "41" "42" "43" "44" "45" "46" "47" "48"
## [49] "49" "50" "51" "52" "53" "54" "55" "56" "57" "58" "59" "60"
## [61] "61" "62" "63" "64" "65" "66" "67" "68" "69" "70" "71" "72"
## [73] "73" "74" "75" "76" "77" "78" "79" "80" "81" "82" "83" "84"
## [85] "85" "86" "87" "88" "89" "90" "91" "92" "93" "94" "95" "96"
## [97] "97" "98" "99" "100" "101" "102" "103" "104" "105" "106" "107" "108"
## [109] "109" "110" "111" "112" "113" "114" "115" "116" "117" "118" "119" "120"
## [121] "121" "122" "123" "124" "125" "126" "127" "128" "129" "130" "131" "132"
## [133] "133" "134" "135" "136" "137" "138" "139" "140" "141" "142" "143" "144"
## [145] "145" "146" "147" "148" "149" "150"
summary(dataset)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
## Check for incomplete cases
print(paste("The number of incomplete cases is",
sum(!complete.cases(dataset))
))
## [1] "The number of incomplete cases is 0"
## Remove or impute missing objects
df <- na.omit(dataset)
## Rescale (or normalization, etc.)
df <- df[,-label_col]
df <- scale(df, center = T, scale = T)
summary(df)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :-1.86378 Min. :-2.4258 Min. :-1.5623 Min. :-1.4422
## 1st Qu.:-0.89767 1st Qu.:-0.5904 1st Qu.:-1.2225 1st Qu.:-1.1799
## Median :-0.05233 Median :-0.1315 Median : 0.3354 Median : 0.1321
## Mean : 0.00000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.67225 3rd Qu.: 0.5567 3rd Qu.: 0.7602 3rd Qu.: 0.7880
## Max. : 2.48370 Max. : 3.0805 Max. : 1.7799 Max. : 1.7064
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.4.2
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
distance <- get_dist(df)
fviz_dist(distance,
gradient = list(low="#00AFBB",
mid = "white",
high = "#FC4E07"))
km_output <- kmeans(df, centers = 3, nstart = 25, iter.max = 100,
algorithm = "Hartigan-Wong")
str(km_output)
## List of 9
## $ cluster : Named int [1:150] 2 2 2 2 2 2 2 2 2 2 ...
## ..- attr(*, "names")= chr [1:150] "1" "2" "3" "4" ...
## $ centers : num [1:3, 1:4] -0.0501 -1.0112 1.1322 -0.8804 0.8504 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:3] "1" "2" "3"
## .. ..$ : chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## $ totss : num 596
## $ withinss : num [1:3] 44.1 47.4 47.5
## $ tot.withinss: num 139
## $ betweenss : num 457
## $ size : int [1:3] 53 50 47
## $ iter : int 2
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
names(km_output)
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
typeof(km_output)
## [1] "list"
length(km_output)
## [1] 9
km_output$cluster
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 2 2 2 2 2 2 2 2 2 2 3 3 3 1 1 1 3 1 1 1
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 1 1 1 1 1 3 1 1 1 1 3 1 1 1 1 3 3 3 1 1
## 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
## 1 1 1 1 1 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
## 3 1 3 3 3 3 1 3 3 3 3 3 3 1 1 3 3 3 3 1
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
## 3 1 3 1 3 3 1 3 3 3 3 3 3 1 1 3 3 3 1 3
## 141 142 143 144 145 146 147 148 149 150
## 3 3 1 3 3 3 1 3 3 1
## Sum of Squares
print(paste("SST:", km_output$totss))
## [1] "SST: 596"
print("SSwithin:")
## [1] "SSwithin:"
km_output$withinss
## [1] 44.08754 47.35062 47.45019
print(paste("SSbetween:", km_output$betweenss))
## [1] "SSbetween: 457.111640282649"
print(paste("SSwithin + SSbetween:",
sum(c(km_output$withinss, km_output$betweenss))))
## [1] "SSwithin + SSbetween: 596"
cohesion <- sum(km_output$withinss)/ km_output$totss
print(paste("cohesion:", cohesion))
## [1] "cohesion: 0.233034160599583"
## Visualizing Clusters
fviz_cluster(km_output, data = df)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
dataset %>%
# as.data.frame( ) %>%
mutate(cluster = km_output$cluster,
objects_name = row.names(dataset)) %>%
ggplot(aes(x = Petal.Width, y = Sepal.Length,
label = factor(cluster),
color = Species)) + geom_text( )
set.seed(8)
wss <- function(k) {
return(kmeans(df, k, nstart = 25)$tot.withinss)
}
k_values <- 1:15
wss_values <- purrr::map_dbl(k_values, wss)
plot(x = k_values, y = wss_values,
type = "b", frame = F,
xlab = "Number of clusters K",
ylab = "Total within-clusters sum of square")
We see an elbow at \(k=2\), which
matches the two clear clusters in the scatterplot of
Sepal.Length
vs Petal.Width
earlier. However,
we know that there are three species present in the data, and indeed we
see a less sharp “elbow” at \(k=3\).
We use Euclidean distance and complete linkage for the
hclus()
function.
hac_output <- hclust(dist(dataset, method = "euclidean"), method = "complete")
plot(hac_output)
hac_cut <- cutree(hac_output, 3)
table(hac_cut)
## hac_cut
## 1 2 3
## 50 72 28
km_cluster <- km_output$cluster
table(km_cluster)
## km_cluster
## 1 2 3
## 53 50 47
The hierarchical method differs from the k-means method for the three clusters each identifies, as shown in the tables above.