Important packages

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(ISLR)

K means

Performing the K-lustering from 2 to 10 clusters.

kmean2 <- College %>%
select(-Private) %>%
kmeans(centers = 2, iter.max = 100, nstart = 100)
kmean3 <- College %>%
select(-Private) %>%
kmeans(centers = 3, iter.max = 100, nstart = 100)
kmean4 <- College %>%
select(-Private) %>%
kmeans(centers = 4, iter.max = 100, nstart = 100)
kmean5 <- College %>%
select(-Private) %>%
kmeans(centers = 5, iter.max = 100, nstart = 100)
kmean6 <- College %>%
select(-Private) %>%
kmeans(centers = 6, iter.max = 100, nstart = 100)
kmean7 <- College %>%
select(-Private) %>%
kmeans(centers = 7, iter.max = 100, nstart = 100)
kmean8 <- College %>%
select(-Private) %>%
kmeans(centers = 8, iter.max = 100, nstart = 100)
kmean9 <- College %>%
select(-Private) %>%
kmeans(centers = 9, iter.max = 100, nstart = 100)
kmean10 <- College %>%
select(-Private) %>%
kmeans(centers = 10, iter.max = 100, nstart = 100)

#graph the cluster variation

tibble(K = as.factor(seq(2, 10)),
Tvar = c(kmean2$tot.withinss, kmean3$tot.withinss,
kmean4$tot.withinss, kmean5$tot.withinss, kmean6$tot.withinss, kmean7$tot.withinss, kmean8$tot.withinss, kmean9$tot.withinss, kmean10$tot.withinss)) %>%
ggplot(mapping = aes(x = K, y = Tvar)) + geom_col()

#performing a pca, find projections and add he clusters.

pca3 <- College %>% select(-Private) %>% prcomp(scale = TRUE)
Z.withClust <- as_tibble(pca3$x) %>%
mutate(Private = College$Private,
C2 = as.factor(kmean2$cluster),
C3 = as.factor(kmean3$cluster),
C4 = as.factor(kmean4$cluster),
C5 = as.factor(kmean5$cluster),
C6 = as.factor(kmean6$cluster),
C7 = as.factor(kmean7$cluster),
C8 = as.factor(kmean8$cluster),
C9 = as.factor(kmean9$cluster) ,
C10 = as.factor(kmean10$cluster)) %>%
select(Private, C2, C3, C4, C5, C6, C7, C8, C9, C10, everything())

##PLOTING cluster

Z.withClust %>%
ggplot(mapping = aes(x = PC1, y = PC2, color = C2)) + geom_point() +
facet_wrap(~ Private, nrow = 1)

Z.withClust %>%
ggplot(mapping = aes(x = PC1, y = PC2, color = C3)) + geom_point() +
facet_wrap(~ Private, nrow = 1)

 Z.withClust %>%
ggplot(mapping = aes(x = PC1, y = PC2, color = C4)) + geom_point() +
facet_wrap(~ Private, nrow = 1)

 Z.withClust %>%
ggplot(mapping = aes(x = PC1, y = PC2, color = C5)) + geom_point() +
facet_wrap(~ Private, nrow = 1)

 Z.withClust %>%
ggplot(mapping = aes(x = PC1, y = PC2, color = C6)) + geom_point() +
facet_wrap(~ Private, nrow = 1)

 Z.withClust %>%
ggplot(mapping = aes(x = PC1, y = PC2, color = C7)) + geom_point() +
facet_wrap(~ Private, nrow = 1)

 Z.withClust %>%
ggplot(mapping = aes(x = PC1, y = PC2, color = C8)) + geom_point() +
facet_wrap(~ Private, nrow = 1)

 Z.withClust %>%
ggplot(mapping = aes(x = PC1, y = PC2, color = C9)) + geom_point() +
facet_wrap(~ Private, nrow = 1)

 Z.withClust %>%
ggplot(mapping = aes(x = PC1, y = PC2, color = C10)) + geom_point() +
facet_wrap(~ Private, nrow = 1)

Hierarchical clustering

dist() to calculate the default Euclidean distance between each pair of the 𝑁 observations, and then apply complete, average, and single linkage clustering to building a hierarchical clustering trees

hc.complete <- College %>% select(-Private) %>%
dist() %>%
hclust(method = "complete")

hc.average <- College %>% select(-Private) %>% 
dist() %>%
hclust(method = "average")

hc.single <- College %>% select(-Private) %>% 
dist() %>%
hclust(method = "single")

Plot

plot(hc.complete, labels = FALSE)

plot(hc.average, labels = FALSE)

plot(hc.single, labels = FALSE)

#add the PCA, In an aveage linkage

pca4 <- College %>% select(-Private) %>% prcomp(scale = TRUE)

Z.withClust <- as_tibble(pca4$x) %>%
mutate(Private = College$Private,
C2 = as.factor(cutree(hc.average, k = 2)),
C3 = as.factor(cutree(hc.average, k = 3)),
C4 = as.factor(cutree(hc.average, k = 4)),
C5 = as.factor(cutree(hc.average, k = 5)),
C6 = as.factor(cutree(hc.average, k = 6)),
C7 = as.factor(cutree(hc.average, k = 7)),
C8 = as.factor(cutree(hc.average, k = 8)),
C9 = as.factor(cutree(hc.average, k = 9)) ,
C10 = as.factor(cutree(hc.average, k = 10))) %>%
select(Private, C2, C3, C4, C5, C6, C7, C8, C9, C10, everything())

#Visualization

Z.withClust %>%
ggplot(mapping = aes(x = PC1, y = PC2, color = C2)) + geom_point() +
facet_wrap(~ Private, nrow = 1)

Z.withClust %>%
ggplot(mapping = aes(x = PC1, y = PC2, color = C3)) + geom_point() +
facet_wrap(~ Private, nrow = 1)

Z.withClust %>%
ggplot(mapping = aes(x = PC1, y = PC2, color = C4)) + geom_point() +
facet_wrap(~ Private, nrow = 1)

Z.withClust %>%
ggplot(mapping = aes(x = PC1, y = PC2, color = C5)) + geom_point() +
facet_wrap(~ Private, nrow = 1)

Z.withClust %>%
ggplot(mapping = aes(x = PC1, y = PC2, color = C6)) + geom_point() +
facet_wrap(~ Private, nrow = 1)

Z.withClust %>%
ggplot(mapping = aes(x = PC1, y = PC2, color = C7)) + geom_point() +
facet_wrap(~ Private, nrow = 1)

Z.withClust %>%
ggplot(mapping = aes(x = PC1, y = PC2, color = C8)) + geom_point() +
facet_wrap(~ Private, nrow = 1)

Z.withClust %>%
ggplot(mapping = aes(x = PC1, y = PC2, color = C9)) + geom_point() +
facet_wrap(~ Private, nrow = 1)

Z.withClust %>%
ggplot(mapping = aes(x = PC1, y = PC2, color = C10)) + geom_point() +
facet_wrap(~ Private, nrow = 1)

Z.withClust %>%
ggplot(mapping = aes(x = PC1, y = PC2, color = C10)) + geom_point()