Unsupervised Learning Notes

PCA

  • removes dimensions of dataset
  • finds sequence of linear combinations of variables with maximal variance that are uncorrelated with each other
  • first principal component has largest variance and on and on
  • sum of squares of loadings (linear combo coefficients) constrained to be equal to one

Clustering

  • identifies groups within the dataset

K-Means Clustering

  • partition data into k groups so that sum of squares from points to the cluster center is minimized for each group
  • k must be selected by researcher and can be a source of inaccuracy if wrong choice is made

Hierarchical Clustering

  • doesn’t require pre-specified k selection
  • produces tree-based representation of observations (Dendrogram)
  • starts from leaves and combines clusters based on proximity up to the trunk
  • group subdivision done by cutting dendrogram at desired similarity level

Coding Examples

PCA

get data and load libraries

## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
## [1] "Murder"   "Assault"  "UrbanPop" "Rape"

get means and variances of variables

apply(USArrests, 2, mean)
##   Murder  Assault UrbanPop     Rape 
##    7.788  170.760   65.540   21.232
apply(USArrests, 2, var)
##     Murder    Assault   UrbanPop       Rape 
##   18.97047 6945.16571  209.51878   87.72916

PCA with scaling

pr.out <- prcomp(USArrests, scale = TRUE)
names(pr.out)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
pr.out$center
##   Murder  Assault UrbanPop     Rape 
##    7.788  170.760   65.540   21.232
pr.out$scale
##    Murder   Assault  UrbanPop      Rape 
##  4.355510 83.337661 14.474763  9.366385
pr.out$rotation
##                 PC1        PC2        PC3         PC4
## Murder   -0.5358995  0.4181809 -0.3412327  0.64922780
## Assault  -0.5831836  0.1879856 -0.2681484 -0.74340748
## UrbanPop -0.2781909 -0.8728062 -0.3780158  0.13387773
## Rape     -0.5434321 -0.1673186  0.8177779  0.08902432
dim(pr.out$x)
## [1] 50  4

manipulation of output PVE = proportion of variance explained (standard deviation^2/sum of standard deviation^2)

pr.out$rotation <- -pr.out$rotation
pr.out$x <- -pr.out$x
pr.out$sdev
## [1] 1.5748783 0.9948694 0.5971291 0.4164494
pr.var <- pr.out$sdev^2
pr.var
## [1] 2.4802416 0.9897652 0.3565632 0.1734301
pve <- pr.var/sum(pr.var)
pve
## [1] 0.62006039 0.24744129 0.08914080 0.04335752

PCA plot

biplot(pr.out, scale = 0)

Proportion Variance Explained plot

plot(pve, xlab="Principal Component", ylab="Proportion of Variance Explained", ylim=c(0,1),type='b')

Cumulative PVE plot

plot(cumsum(pve), xlab="Principal Component", ylab="Cumulative Proportion of Variance Explained", ylim=c(0,1),type='b')

Factoextra multivariate plot

fviz(pr.out, "ind", geom = "auto", mean.point = TRUE, font.family = "Georgia")

fviz_pca_biplot(pr.out, font.family = "Georgia", col.var="firebrick1")

K-means clustering

load libraries and data- computer example

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(RColorBrewer)
library(animation)
computers <- read.csv("https://raw.githubusercontent.com/guru99-edu/R-Programming/master/computers.csv")
rescaled_comp <- computers[4:5] %>%
  mutate(hd_scal = scale(hd),
         ram_scal = scale(ram)) %>%
  select(c(hd_scal, ram_scal))

visualize variables

ggplot(data = rescaled_comp, aes(x = hd_scal, y = ram_scal)) +
  geom_point(pch=20, col = "blue") + theme_bw() +
  labs(x = "Hard drive size (Scaled)", y ="RAM size (Scaled)" ) +
  theme(text = element_text(family="Georgia"))

animate k-means clustering process

set.seed(2345)
kmeans.ani(rescaled_comp[1:2], centers = 4, pch = 15:18, col = 1:4) 

iris example not grouped by species

ggplot(iris, aes(Petal.Length, Petal.Width)) + geom_point() + 
  theme_bw() +
  scale_color_manual(values=c("firebrick1","forestgreen","darkblue"))

grouped by species

ggplot(iris, aes(Petal.Length, Petal.Width, color = Species)) + geom_point() + 
  theme_bw() +
  scale_color_manual(values=c("firebrick1","forestgreen","darkblue"))

k-means clusters 3 clusters

set.seed(20)
irisCluster <- kmeans(iris[, 3:4], 3, nstart = 20)
irisCluster
## K-means clustering with 3 clusters of sizes 52, 48, 50
## 
## Cluster means:
##   Petal.Length Petal.Width
## 1     4.269231    1.342308
## 2     5.595833    2.037500
## 3     1.462000    0.246000
## 
## Clustering vector:
##   [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##  [38] 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [75] 1 1 1 2 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 1 2 2 2 2
## [112] 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2
## [149] 2 2
## 
## Within cluster sum of squares by cluster:
## [1] 13.05769 16.29167  2.02200
##  (between_SS / total_SS =  94.3 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
class(irisCluster$cluster)
## [1] "integer"
table(irisCluster$cluster, iris$Species)
##    
##     setosa versicolor virginica
##   1      0         48         4
##   2      0          2        46
##   3     50          0         0

comparison of charts

irisCluster$cluster <- as.factor(irisCluster$cluster)
actual <- ggplot(iris, aes(Petal.Length, Petal.Width, color = Species)) + geom_point() + 
  theme_bw() +
  scale_color_manual(values=c("firebrick1","forestgreen","darkblue")) +
  theme(legend.position="bottom") +
  theme(text = element_text(family="Georgia")) 
kmc <- ggplot(iris, aes(Petal.Length, Petal.Width, color = irisCluster$cluster)) + geom_point() +
  theme_bw() +
  scale_color_manual(values=c("firebrick1", "darkblue", "forestgreen")) +
  theme(legend.position="bottom") +
  theme(text = element_text(family="Georgia")) 
library(grid)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
grid.arrange(arrangeGrob(actual, kmc, ncol=2, widths=c(1,1)), nrow=1)

wine example

load("~/Desktop/Spring 2022/Knowledge Mining/Homework/rattle.data/data/wine.RData")
wine_subset <- scale(wine[ , c(2:4)])

create clusters

wine_cluster <- kmeans(wine_subset, centers = 3,
                       iter.max = 10,
                       nstart = 25)
wine_cluster
## K-means clustering with 3 clusters of sizes 48, 60, 70
## 
## Cluster means:
##      Alcohol      Malic        Ash
## 1  0.1470536  1.3907328  0.2534220
## 2  0.8914655 -0.4522073  0.5406223
## 3 -0.8649501 -0.5660390 -0.6371656
## 
## Clustering vector:
##   [1] 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 1 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2
##  [38] 2 3 1 2 1 2 1 3 1 1 2 2 2 3 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 2 3 3 2 2 2
##  [75] 3 3 3 3 3 1 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [112] 3 1 3 3 3 3 3 1 3 3 2 1 1 1 3 3 3 3 1 3 1 3 1 3 3 1 1 1 1 1 2 1 1 1 1 1 1
## [149] 1 1 1 1 2 1 3 1 1 1 2 2 1 1 1 1 2 1 1 1 2 1 3 3 2 1 1 1 2 1
## 
## Within cluster sum of squares by cluster:
## [1]  73.71460  67.98619 111.63512
##  (between_SS / total_SS =  52.3 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

create function to compute sums of squares, plot

wssplot <- function(data, nc=15, seed=1234){
  wss <- (nrow(data)-1)*sum(apply(data,2,var))
  for (i in 2:nc){
    set.seed(seed)
    wss[i] <- sum(kmeans(data, centers=i)$withinss)}
  plot(1:nc, wss, type="b", xlab="Number of Clusters",
       ylab="Within groups sum of squares")
}
wssplot(wine_subset, nc = 9)

plotted by dimension

wine_cluster$cluster <- as.factor(wine_cluster$cluster)
pairs(wine[2:4],
      col = c("firebrick1", "darkblue", "forestgreen")[wine_cluster$cluster],
      pch = c(15:17)[wine_cluster$cluster],
      main = "K-Means Clusters: Wine data")

table(wine_cluster$cluster)
## 
##  1  2  3 
## 48 60 70

Factoextra chart

fviz_nbclust(wine_subset, kmeans, method = "wss")

eclust() procedure for k means

wine.km <- eclust(wine_subset, "kmeans", nboot = 2)

wine.km
## K-means clustering with 3 clusters of sizes 60, 70, 48
## 
## Cluster means:
##      Alcohol      Malic        Ash
## 1  0.8914655 -0.4522073  0.5406223
## 2 -0.8649501 -0.5660390 -0.6371656
## 3  0.1470536  1.3907328  0.2534220
## 
## Clustering vector:
##   [1] 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 3 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1
##  [38] 1 2 3 1 3 1 3 2 3 3 1 1 1 2 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 1 2 2 1 1 1
##  [75] 2 2 2 2 2 3 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [112] 2 3 2 2 2 2 2 3 2 2 1 3 3 3 2 2 2 2 3 2 3 2 3 2 2 3 3 3 3 3 1 3 3 3 3 3 3
## [149] 3 3 3 3 1 3 2 3 3 3 1 1 3 3 3 3 1 3 3 3 1 3 2 2 1 3 3 3 1 3
## 
## Within cluster sum of squares by cluster:
## [1]  67.98619 111.63512  73.71460
##  (between_SS / total_SS =  52.3 %)
## 
## Available components:
## 
##  [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
##  [6] "betweenss"    "size"         "iter"         "ifault"       "clust_plot"  
## [11] "silinfo"      "nbclust"      "data"         "gap_stat"

determine optimal number of clusters gap statistic silhouette plot

wine.km$nbclust
## [1] 3
fviz_nbclust(wine_subset, kmeans, method = "gap_stat")

fviz_silhouette(wine.km)
##   cluster size ave.sil.width
## 1       1   60          0.44
## 2       2   70          0.33
## 3       3   48          0.30

other cluster plots

fviz_cluster(wine_cluster, data = wine_subset) + 
  theme_bw() +
  theme(text = element_text(family="Georgia")) 

fviz_cluster(wine_cluster, data = wine_subset, ellipse.type = "norm") + 
  theme_bw() +
  theme(text = element_text(family="Georgia")) 

## Hierarchical clustering

arrest.hc <- USArrests %>%
  scale() %>%                    
  dist(method = "euclidean") %>% 
  hclust(method = "ward.D2")

fviz_dend(arrest.hc, k = 4, # Four groups
          cex = 0.5, 
          k_colors = c("firebrick1","forestgreen","blue", "purple"),
          color_labels_by_k = TRUE,
          rect = TRUE,
          main = "Cluster Dendrogram: USA Arrest data"
) + theme(text = element_text(family="Georgia")) 
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.