# clean the environment and load the data
rm(list=ls())
data(iris)
iris <- iris[, c(1, 3, 5)]
summary(iris)
## Sepal.Length Petal.Length Species
## Min. :4.300 Min. :1.000 setosa :50
## 1st Qu.:5.100 1st Qu.:1.600 versicolor:50
## Median :5.800 Median :4.350 virginica :50
## Mean :5.843 Mean :3.758
## 3rd Qu.:6.400 3rd Qu.:5.100
## Max. :7.900 Max. :6.900
# scale the first 2 variables
iris$Sepal.Length <- scale(iris$Sepal.Length)
iris$Petal.Length <- scale(iris$Petal.Length)
apply(iris[, -3], 2, sd)
## Sepal.Length Petal.Length
## 1 1
# plot the data to decide how many clusters do we need
# if there are no directional classes.
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.0 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.0
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
iris %>%
ggplot(aes(x = Sepal.Length, y = Petal.Length)) +
geom_point(aes(color = Species))

# conduct kmeans clustering
# kmeans uses ramdom initial cluster centers
set.seed(2023)
cluster_vars <- iris[, c('Sepal.Length', 'Petal.Length')]
km3 <- kmeans(cluster_vars, centers = 3, nstart = 20)
km3
## K-means clustering with 3 clusters of sizes 43, 54, 53
##
## Cluster means:
## Sepal.Length Petal.Length
## 1 1.2255135 1.0250063
## 2 -1.0117281 -1.2245544
## 3 0.0365328 0.4160503
##
## Clustering vector:
## [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [38] 2 2 2 2 2 2 2 2 2 2 2 2 2 1 3 1 3 1 3 3 2 1 3 2 3 3 3 3 1 3 3 3 3 3 3 3 3
## [75] 3 1 1 1 3 3 3 3 3 3 3 3 1 3 3 3 3 3 3 2 3 3 3 3 2 3 1 3 1 1 1 1 3 1 1 1 1
## [112] 1 1 3 3 1 1 1 1 3 1 3 1 3 1 1 3 3 1 1 1 1 1 3 3 1 1 1 3 1 1 1 3 1 1 1 3 1
## [149] 3 3
##
## Within cluster sum of squares by cluster:
## [1] 17.54315 13.32996 11.87468
## (between_SS / total_SS = 85.7 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# plot the clustered groups
iris$group <- as.factor(km3$cluster)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
p1 <- iris %>%
ggplot(aes(x = Sepal.Length, y = Petal.Length, col = group)) +
geom_point(size = 2)
p2 <- iris %>%
ggplot(aes(x = Sepal.Length, y = Petal.Length, col = Species)) +
geom_point(size = 2)
grid.arrange(p1, p2, ncol = 2)

# using elbow method to determine optimal k
km1 <- kmeans(cluster_vars, centers = 1, nstart = 20)
km2 <- kmeans(cluster_vars, centers = 2, nstart = 20)
km3 <- kmeans(cluster_vars, centers = 3, nstart = 20)
km4 <- kmeans(cluster_vars, centers = 4, nstart = 20)
km5 <- kmeans(cluster_vars, centers = 5, nstart = 20)
km6 <- kmeans(cluster_vars, centers = 6, nstart = 20)
km7 <- kmeans(cluster_vars, centers = 7, nstart = 20)
km8 <- kmeans(cluster_vars, centers = 8, nstart = 20)
km9 <- kmeans(cluster_vars, centers = 9, nstart = 20)
km10 <- kmeans(cluster_vars, centers = 10, nstart = 20)
var.exp <- data.frame(k = 1:10,
bss_tss = c(
km1$betweenss/km1$totss,
km2$betweenss/km2$totss,
km3$betweenss/km3$totss,
km4$betweenss/km4$totss,
km5$betweenss/km5$totss,
km6$betweenss/km6$totss,
km7$betweenss/km7$totss,
km8$betweenss/km8$totss,
km9$betweenss/km9$totss,
km10$betweenss/km10$totss
))
var.exp %>%
ggplot(aes(x = k, y = bss_tss)) +
geom_point() +
geom_line() +
labs(title = 'Elbow plot')

# doing PCA before using the kmeans clustering
# clean the working memeory and load the data
rm(list = ls())
data(iris)
# examine if the scaling is needed
apply(iris[, -5], 2, sd)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 0.8280661 0.4358663 1.7652982 0.7622377
# conduct PCA
pcs <- prcomp(~.-Species, data = iris, center = T, scale. = T)
pcs
## Standard deviations (1, .., p=4):
## [1] 1.7083611 0.9560494 0.3830886 0.1439265
##
## Rotation (n x k) = (4 x 4):
## PC1 PC2 PC3 PC4
## Sepal.Length 0.5210659 -0.37741762 0.7195664 0.2612863
## Sepal.Width -0.2693474 -0.92329566 -0.2443818 -0.1235096
## Petal.Length 0.5804131 -0.02449161 -0.1421264 -0.8014492
## Petal.Width 0.5648565 -0.06694199 -0.6342727 0.5235971
# generating the first 2 PCs
iris$pc1 <- pcs$rotation[1, 1] * scale(iris[, 1]) +
pcs$rotation[2, 1] * scale(iris[, 2]) +
pcs$rotation[3, 1] * scale(iris[, 3]) +
pcs$rotation[4, 1] * scale(iris[, 4])
iris$pc2 <- pcs$rotation[1, 2] * scale(iris[, 1]) +
pcs$rotation[2, 2] * scale(iris[, 2]) +
pcs$rotation[3, 2] * scale(iris[, 3]) +
pcs$rotation[4, 2] * scale(iris[, 4])
iris[, c(1, 2, 3, 4)] <- NULL
summary(iris)
## Species pc1.V1 pc2.V1
## setosa :50 Min. :-2.765081 Min. :-2.6773152
## versicolor:50 1st Qu.:-2.095701 1st Qu.:-0.5920508
## virginica :50 Median : 0.416914 Median :-0.0174436
## Mean : 0.000000 Mean : 0.0000000
## 3rd Qu.: 1.338543 3rd Qu.: 0.5964892
## Max. : 3.299641 Max. : 2.6452111
# plot the pcs
iris %>%
ggplot(aes(x = pc1, y = pc2, col = Species)) +
geom_point(size = 2)

# conduct the kmeans with the new pcs
km.new <- kmeans(iris[, -1], centers = 3, nstart = 20)
km.new
## K-means clustering with 3 clusters of sizes 53, 50, 47
##
## Cluster means:
## pc1 pc2
## 1 0.5707095 0.8045137
## 2 -2.2173249 -0.2879627
## 3 1.7152903 -0.6008742
##
## Clustering vector:
## [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [38] 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 1 1 1 3 1 1 1 1 1 1 1 1 3 1 1 1 1 3 1 1 1
## [75] 1 3 3 3 1 1 1 1 1 1 1 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 3 3 3 3 1 3 3 3 3
## [112] 3 3 1 1 3 3 3 3 1 3 1 3 1 3 3 1 3 3 3 3 3 3 1 1 3 3 3 1 3 3 3 1 3 3 3 1 3
## [149] 3 1
##
## Within cluster sum of squares by cluster:
## [1] 34.85204 44.58037 34.82154
## (between_SS / total_SS = 80.0 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# plot the new cluster
iris$group <- as.factor(km.new$cluster)
iris %>%
ggplot(aes(x = pc1, y = pc2, col = group)) +
geom_point(size = 2)

# determint the optimal k value
set.seed(2023)
km1 <- kmeans(iris[, c(2, 3)], centers = 1, nstart = 20)
km2 <- kmeans(iris[, c(2, 3)], centers = 2, nstart = 20)
km3 <- kmeans(iris[, c(2, 3)], centers = 3, nstart = 20)
km4 <- kmeans(iris[, c(2, 3)], centers = 4, nstart = 20)
km5 <- kmeans(iris[, c(2, 3)], centers = 5, nstart = 20)
km6 <- kmeans(iris[, c(2, 3)], centers = 6, nstart = 20)
km7 <- kmeans(iris[, c(2, 3)], centers = 7, nstart = 20)
km8 <- kmeans(iris[, c(2, 3)], centers = 8, nstart = 20)
km9 <- kmeans(iris[, c(2, 3)], centers = 9, nstart = 20)
km10 <- kmeans(iris[, c(2, 3)], centers = 10, nstart = 20)
var.exp <- data.frame(k = 1:10,
bss_tss = c(
km1$betweenss/km1$totss,
km2$betweenss/km2$totss,
km3$betweenss/km3$totss,
km4$betweenss/km4$totss,
km5$betweenss/km5$totss,
km6$betweenss/km6$totss,
km7$betweenss/km7$totss,
km8$betweenss/km8$totss,
km9$betweenss/km9$totss,
km10$betweenss/km10$totss
))
var.exp %>%
ggplot(aes(x = k, y = bss_tss)) +
geom_point(size = 2) +
geom_line() +
labs(title = 'Elbow plot')

# conduct the decision tree by using the pc1 and pc2
library(rpart)
library(rpart.plot)
rt <- rpart(Species ~ pc1 + pc2, data = iris, control = rpart.control(max_height = 10,
minbucket = 5,
cp = 0.01816))
rpart.plot(rt)

rt$cptable
## CP nsplit rel error xerror xstd
## 1 0.50000 0 1.0 1.21 0.04836666
## 2 0.40000 1 0.5 0.65 0.06069047
## 3 0.01816 2 0.1 0.18 0.03979950
# conduct the hierarchical clustering
# clean work memory and load the data
rm(list = ls())
data(iris)
iris <- iris[, c(1, 3, 5)]
iris$Sepal.Length <- scale(iris$Sepal.Length)
iris$Petal.Length <- scale(iris$Petal.Length)
# calculate the distance between each observations
dist_matrix <- dist(iris[, c(1, 2)])
hc.complete <- hclust(dist_matrix)
hc.single <- hclust(dist_matrix, method = 'single')
hc.average <- hclust(dist_matrix, method = 'average')
plot(hc.complete, cex = 0.3)

plot(hc.single, cex = 0.3)

plot(hc.average, cex = 0.3)

# visualize the clusters
plot_cluster <- function(df, hc_obj, n_clusters) {
df$clusters <- as.factor(cutree(hc_obj, k = n_clusters))
df %>%
ggplot(aes(x = Sepal.Length, y = Petal.Length, col = clusters)) +
geom_point(size = 2)
}
plot_cluster(iris, hc.complete, 2)

plot_cluster(iris, hc.complete, 3)

plot_cluster(iris, hc.complete, 4)

plot_cluster(iris, hc.complete, 5)

plot_cluster(iris, hc.complete, 6)
