# clean the environment and load the data
rm(list=ls())

data(iris)
iris <- iris[, c(1, 3, 5)]
summary(iris)
##   Sepal.Length    Petal.Length         Species  
##  Min.   :4.300   Min.   :1.000   setosa    :50  
##  1st Qu.:5.100   1st Qu.:1.600   versicolor:50  
##  Median :5.800   Median :4.350   virginica :50  
##  Mean   :5.843   Mean   :3.758                  
##  3rd Qu.:6.400   3rd Qu.:5.100                  
##  Max.   :7.900   Max.   :6.900
# scale the first 2 variables
iris$Sepal.Length <- scale(iris$Sepal.Length)
iris$Petal.Length <- scale(iris$Petal.Length)

apply(iris[, -3], 2, sd)
## Sepal.Length Petal.Length 
##            1            1
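# both variables now have sd = 1, so neither dominates the Euclidean
# distances that k-means is based on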
# plot the data to get a sense of how many clusters we would need
# if there were no predefined class labels.
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.0     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.0
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
iris %>% 
  ggplot(aes(x = Sepal.Length, y = Petal.Length)) +
  geom_point(aes(color = Species))

# conduct kmeans clustering

# kmeans uses random initial cluster centers, so fix the seed for reproducibility
set.seed(2023)

cluster_vars <- iris[, c('Sepal.Length', 'Petal.Length')]
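# nstart = 20 runs the algorithm from 20 random sets of initial centers and
# keeps the solution with the lowest total within-cluster sum of squares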
km3 <- kmeans(cluster_vars, centers = 3, nstart = 20)
km3
## K-means clustering with 3 clusters of sizes 43, 54, 53
## 
## Cluster means:
##   Sepal.Length Petal.Length
## 1    1.2255135    1.0250063
## 2   -1.0117281   -1.2245544
## 3    0.0365328    0.4160503
## 
## Clustering vector:
##   [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [38] 2 2 2 2 2 2 2 2 2 2 2 2 2 1 3 1 3 1 3 3 2 1 3 2 3 3 3 3 1 3 3 3 3 3 3 3 3
##  [75] 3 1 1 1 3 3 3 3 3 3 3 3 1 3 3 3 3 3 3 2 3 3 3 3 2 3 1 3 1 1 1 1 3 1 1 1 1
## [112] 1 1 3 3 1 1 1 1 3 1 3 1 3 1 1 3 3 1 1 1 1 1 3 3 1 1 1 3 1 1 1 3 1 1 1 3 1
## [149] 3 3
## 
## Within cluster sum of squares by cluster:
## [1] 17.54315 13.32996 11.87468
##  (between_SS / total_SS =  85.7 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# plot the clustered groups
iris$group <- as.factor(km3$cluster)

library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
p1 <- iris %>% 
  ggplot(aes(x = Sepal.Length, y = Petal.Length, col = group)) +
  geom_point(size = 2)

p2 <- iris %>% 
  ggplot(aes(x = Sepal.Length, y = Petal.Length, col = Species)) +
  geom_point(size = 2)

grid.arrange(p1, p2, ncol = 2)
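
# a quick check: cross-tabulate the k-means groups against the known species;
# cluster labels are arbitrary, so only the pattern of agreement matters
table(km3$cluster, iris$Species)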

# use the elbow method to determine the optimal k
km1 <- kmeans(cluster_vars, centers = 1, nstart = 20)
km2 <- kmeans(cluster_vars, centers = 2, nstart = 20)
km3 <- kmeans(cluster_vars, centers = 3, nstart = 20)
km4 <- kmeans(cluster_vars, centers = 4, nstart = 20)
km5 <- kmeans(cluster_vars, centers = 5, nstart = 20)
km6 <- kmeans(cluster_vars, centers = 6, nstart = 20)
km7 <- kmeans(cluster_vars, centers = 7, nstart = 20)
km8 <- kmeans(cluster_vars, centers = 8, nstart = 20)
km9 <- kmeans(cluster_vars, centers = 9, nstart = 20)
km10 <- kmeans(cluster_vars, centers = 10, nstart = 20)

var.exp <- data.frame(k = 1:10,
                      bss_tss = c(
                        km1$betweenss/km1$totss,
                        km2$betweenss/km2$totss,
                        km3$betweenss/km3$totss,
                        km4$betweenss/km4$totss,
                        km5$betweenss/km5$totss,
                        km6$betweenss/km6$totss,
                        km7$betweenss/km7$totss,
                        km8$betweenss/km8$totss,
                        km9$betweenss/km9$totss,
                        km10$betweenss/km10$totss
                      ))

var.exp %>% 
  ggplot(aes(x = k, y = bss_tss)) +
  geom_point() +
  geom_line() +
  labs(title = 'Elbow plot')
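
# the ten separate kmeans() calls above can be collapsed into one loop;
# a compact equivalent sketch that rebuilds the same data frame:
var.exp <- data.frame(
  k = 1:10,
  bss_tss = sapply(1:10, function(k) {
    km <- kmeans(cluster_vars, centers = k, nstart = 20)
    km$betweenss / km$totss
  })
)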

# perform PCA before k-means clustering
# clean the working memory and load the data
rm(list = ls())

data(iris)
# examine whether scaling is needed
apply(iris[, -5], 2, sd)
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##    0.8280661    0.4358663    1.7652982    0.7622377
# conduct PCA
pcs <- prcomp(~.-Species, data = iris, center = T, scale. = T)
pcs
## Standard deviations (1, .., p=4):
## [1] 1.7083611 0.9560494 0.3830886 0.1439265
## 
## Rotation (n x k) = (4 x 4):
##                     PC1         PC2        PC3        PC4
## Sepal.Length  0.5210659 -0.37741762  0.7195664  0.2612863
## Sepal.Width  -0.2693474 -0.92329566 -0.2443818 -0.1235096
## Petal.Length  0.5804131 -0.02449161 -0.1421264 -0.8014492
## Petal.Width   0.5648565 -0.06694199 -0.6342727  0.5235971
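# proportion of variance explained by each PC (the first two account for
# roughly 96% here), computed from the standard deviations above
pcs$sdev^2 / sum(pcs$sdev^2)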
# compute the first 2 PC scores manually from the rotation (loadings) above

iris$pc1 <- pcs$rotation[1, 1] * scale(iris[, 1]) +
  pcs$rotation[2, 1] * scale(iris[, 2]) +
  pcs$rotation[3, 1] * scale(iris[, 3]) +
  pcs$rotation[4, 1] * scale(iris[, 4])

iris$pc2 <- pcs$rotation[1, 2] * scale(iris[, 1]) +
  pcs$rotation[2, 2] * scale(iris[, 2]) +
  pcs$rotation[3, 2] * scale(iris[, 3]) +
  pcs$rotation[4, 2] * scale(iris[, 4])
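
# note: prcomp already stores these scores in pcs$x, so an equivalent and
# simpler alternative would be:
# iris$pc1 <- pcs$x[, 1]
# iris$pc2 <- pcs$x[, 2]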

iris[, c(1, 2, 3, 4)] <- NULL
summary(iris)
##        Species         pc1.V1               pc2.V1       
##  setosa    :50   Min.   :-2.765081   Min.   :-2.6773152  
##  versicolor:50   1st Qu.:-2.095701   1st Qu.:-0.5920508  
##  virginica :50   Median : 0.416914   Median :-0.0174436  
##                  Mean   : 0.000000   Mean   : 0.0000000  
##                  3rd Qu.: 1.338543   3rd Qu.: 0.5964892  
##                  Max.   : 3.299641   Max.   : 2.6452111
# plot the PCs
iris %>% 
  ggplot(aes(x = pc1, y = pc2, col = Species)) +
  geom_point(size = 2)

# run k-means on the new PCs
km.new <- kmeans(iris[, -1], centers = 3, nstart = 20)
km.new
## K-means clustering with 3 clusters of sizes 53, 50, 47
## 
## Cluster means:
##          pc1        pc2
## 1  0.5707095  0.8045137
## 2 -2.2173249 -0.2879627
## 3  1.7152903 -0.6008742
## 
## Clustering vector:
##   [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [38] 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 1 1 1 3 1 1 1 1 1 1 1 1 3 1 1 1 1 3 1 1 1
##  [75] 1 3 3 3 1 1 1 1 1 1 1 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 3 3 3 3 1 3 3 3 3
## [112] 3 3 1 1 3 3 3 3 1 3 1 3 1 3 3 1 3 3 3 3 3 3 1 1 3 3 3 1 3 3 3 1 3 3 3 1 3
## [149] 3 1
## 
## Within cluster sum of squares by cluster:
## [1] 34.85204 44.58037 34.82154
##  (between_SS / total_SS =  80.0 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# plot the new clusters
iris$group <- as.factor(km.new$cluster)

iris %>% 
  ggplot(aes(x = pc1, y = pc2, col = group)) +
  geom_point(size = 2)

# determine the optimal k value
set.seed(2023)
km1 <- kmeans(iris[, c(2, 3)], centers = 1, nstart = 20)
km2 <- kmeans(iris[, c(2, 3)], centers = 2, nstart = 20)
km3 <- kmeans(iris[, c(2, 3)], centers = 3, nstart = 20)
km4 <- kmeans(iris[, c(2, 3)], centers = 4, nstart = 20)
km5 <- kmeans(iris[, c(2, 3)], centers = 5, nstart = 20)
km6 <- kmeans(iris[, c(2, 3)], centers = 6, nstart = 20)
km7 <- kmeans(iris[, c(2, 3)], centers = 7, nstart = 20)
km8 <- kmeans(iris[, c(2, 3)], centers = 8, nstart = 20)
km9 <- kmeans(iris[, c(2, 3)], centers = 9, nstart = 20)
km10 <- kmeans(iris[, c(2, 3)], centers = 10, nstart = 20)

var.exp <- data.frame(k = 1:10,
                      bss_tss = c(
                        km1$betweenss/km1$totss,
                        km2$betweenss/km2$totss,
                        km3$betweenss/km3$totss,
                        km4$betweenss/km4$totss,
                        km5$betweenss/km5$totss,
                        km6$betweenss/km6$totss,
                        km7$betweenss/km7$totss,
                        km8$betweenss/km8$totss,
                        km9$betweenss/km9$totss,
                        km10$betweenss/km10$totss
                      ))

var.exp %>% 
  ggplot(aes(x = k, y = bss_tss)) +
  geom_point(size = 2) +
  geom_line() +
  labs(title = 'Elbow plot')

# fit a decision tree using pc1 and pc2
library(rpart)
library(rpart.plot)

rt <- rpart(Species ~ pc1 + pc2, data = iris,
            control = rpart.control(maxdepth = 10,
                                    minbucket = 5,
                                    cp = 0.01816))
rpart.plot(rt)

rt$cptable
##        CP nsplit rel error xerror       xstd
## 1 0.50000      0       1.0   1.21 0.04836666
## 2 0.40000      1       0.5   0.65 0.06069047
## 3 0.01816      2       0.1   0.18 0.03979950
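# a quick in-sample check of the fitted tree: predicted class vs. actual species
table(predict(rt, type = 'class'), iris$Species)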
# conduct the hierarchical clustering
# clean the working memory and load the data
rm(list = ls())

data(iris)
iris <- iris[, c(1, 3, 5)]

iris$Sepal.Length <- scale(iris$Sepal.Length)
iris$Petal.Length <- scale(iris$Petal.Length)
# calculate the distances between all pairs of observations
dist_matrix <- dist(iris[, c(1, 2)])
hc.complete <- hclust(dist_matrix, method = 'complete')
hc.single <- hclust(dist_matrix, method = 'single')
hc.average <- hclust(dist_matrix, method = 'average')

plot(hc.complete, cex = 0.3)

plot(hc.single, cex = 0.3)

plot(hc.average, cex = 0.3)
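
# cut the complete-linkage tree into 3 groups and compare with the species
# (a quick check; cluster labels are arbitrary)
table(cutree(hc.complete, k = 3), iris$Species)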

# visualize the clusters
plot_cluster <- function(df, hc_obj, n_clusters) {
  df$clusters <- as.factor(cutree(hc_obj, k = n_clusters))
  
  df %>%
    ggplot(aes(x = Sepal.Length, y = Petal.Length, col = clusters)) +
    geom_point(size = 2)
}

plot_cluster(iris, hc.complete, 2)

plot_cluster(iris, hc.complete, 3)

plot_cluster(iris, hc.complete, 4)

plot_cluster(iris, hc.complete, 5)

plot_cluster(iris, hc.complete, 6)