K-Means Clustering

In this exercise we apply K-means clustering, via the kmeans() function, to randomly generated data.

set.seed(2)
x <- matrix(rnorm(50 * 2), ncol = 2)  # 50 observations in 2 dimensions
x[1:25, 1] <- x[1:25, 1] + 3  # shift the first 25 observations so that
x[1:25, 2] <- x[1:25, 2] - 4  # the data contain two true clusters

Let’s start by clustering the data with K = 2.

km.out <- kmeans(x, 2, nstart = 20)

The kmeans() function returns the cluster assignments in the cluster component.

km.out$cluster
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2
## [39] 2 2 2 2 2 2 2 2 2 2 2 2
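
The fitted centers and cluster sizes are available in the centers and size components:

km.out$centers  # one row of center coordinates per cluster
km.out$size  # number of observations assigned to each cluster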

Now let’s plot the clusters.

plot(x, col = (km.out$cluster + 1), main = "K-Means Clustering Results with K=2", xlab = "", ylab = "", pch = 20, cex = 2)
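
We can also mark the fitted centers on this plot; a small addition, run immediately after the plot() call above:

points(km.out$centers, pch = 4, cex = 3, lwd = 2)  # draw an X at each center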

We can run K-means with a different number of clusters, such as K = 3, and plot the results.

set.seed(4)
km.out <- kmeans(x, 3, nstart = 20)
km.out
## K-means clustering with 3 clusters of sizes 17, 23, 10
## 
## Cluster means:
##         [,1]        [,2]
## 1  3.7789567 -4.56200798
## 2 -0.3820397 -0.08740753
## 3  2.3001545 -2.69622023
## 
## Clustering vector:
##  [1] 1 3 1 3 1 1 1 3 1 3 1 3 1 3 1 3 1 1 1 1 1 3 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2
## [39] 2 2 2 2 2 3 2 3 2 2 2 2
## 
## Within cluster sum of squares by cluster:
## [1] 25.74089 52.67700 19.56137
##  (between_SS / total_SS =  79.3 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
plot(x, col = (km.out$cluster + 1), main = "K-Means Clustering Results with K=3", xlab = "", ylab = "", pch = 20, cex = 2)

The nstart argument controls how many random initial assignments kmeans() tries; the function keeps only the best run, i.e. the one with the lowest total within-cluster sum of squares. With nstart = 1 the algorithm can get stuck in a poor local optimum, which is why multiple starts are recommended. Here both runs happen to reach the same solution:

set.seed(3)
km.out <- kmeans(x, 3, nstart = 1)
km.out$tot.withinss
## [1] 97.97927
km.out <- kmeans(x, 3, nstart = 20)
km.out$tot.withinss
## [1] 97.97927
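
To see the effect of random restarts more directly, here is a minimal sketch that reruns the single-start version under a few arbitrarily chosen seeds; any values above the nstart = 20 result indicate runs caught in a local optimum.

# tot.withinss from a single random start, across several seeds
sapply(1:6, function(s) {
  set.seed(s)
  kmeans(x, 3, nstart = 1)$tot.withinss
})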

Hierarchical Clustering

We can apply hierarchical clustering to the dataset we generated in the previous exercise using the hclust() function.

hc.complete <- hclust(dist(x), method = "complete")

The hclust() function supports various agglomeration methods including “single”, “complete”, and “average” linkages.

hc.average <- hclust(dist(x), method = "average")
hc.single <- hclust(dist(x), method = "single")

We can compare the linkages by plotting the dendrograms obtained with each method.

par(mfrow = c(1, 3))
plot(hc.complete, main = "Complete Linkage", xlab = "", sub = "", cex = 0.9)
plot(hc.average, main = "Average Linkage", xlab = "", sub = "", cex = 0.9)
plot(hc.single, main = "Single Linkage", xlab = "", sub = "", cex = 0.9)

We can cut the tree into different groups using the cutree() function.

cutree(hc.complete, 2)
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2
## [39] 2 2 2 2 2 2 2 2 2 2 2 2
cutree(hc.average, 2)
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 1 2 2 2 2 2
## [39] 2 2 2 2 2 1 2 1 2 2 2 2
cutree(hc.single, 2)
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [39] 1 1 1 1 1 1 1 1 1 1 1 1
cutree(hc.single, 4)
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3
## [39] 3 3 3 4 3 3 3 3 3 3 3 3
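
Since we know the simulation design (observations 1 through 25 were shifted), we can compare each cut against the true grouping; a small sketch:

true.groups <- rep(1:2, each = 25)  # true cluster memberships
table(cutree(hc.complete, 2), true.groups)  # complete linkage recovers both groups
table(cutree(hc.single, 2), true.groups)  # single linkage splits off one point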

We can scale the dataset before passing it to the clustering algorithm by first calling scale().

xsc <- scale(x)
plot(hclust(dist(xsc), method = "complete"), main = "Hierarchical Clustering with Scaled Features")

Correlation-based distance requires at least three features to be meaningful, so we generate a new three-dimensional dataset and use as.dist(1 - cor(t(x))) as the dissimilarity:

x <- matrix(rnorm(30 * 3), ncol = 3)
dd <- as.dist(1 - cor(t(x)))
plot(hclust(dd, method = "complete"), main = "Complete Linkage with Correlation-Based Distance", xlab = "", sub = "")
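
As a quick sanity check on this dissimilarity: an observation is perfectly correlated with itself, so the diagonal of the dissimilarity matrix is 0, and all values lie between 0 and 2.

range(as.matrix(dd))  # minimum is the 0 diagonal; maximum is at most 2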

Clustering the Observations of the NCI60 Data

In this final exercise we apply hierarchical and K-means clustering to the NCI60 cancer cell line microarray dataset. We first scale the data so that each gene has mean zero and standard deviation one.

library(ISLR)
nci.labs <- NCI60$labs
nci.data <- NCI60$data
sd.data <- scale(nci.data)
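
As a quick check that the scaling behaved as intended:

summary(apply(sd.data, 2, sd))  # every gene should now have standard deviation 1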

We run hierarchical clustering with different linkages and plot the resulting dendrograms.

par(mfrow = c(1, 3))
data.dist <- dist(sd.data)
plot(hclust(data.dist), labels = nci.labs, main = "Complete Linkage", xlab = "", sub = "", ylab = "")
plot(hclust(data.dist, method = "average"), labels = nci.labs, main = "Average Linkage", xlab = "", sub = "", ylab = "")
plot(hclust(data.dist, method = "single"), labels = nci.labs, main = "Single Linkage", xlab = "", sub = "", ylab = "")

We cut the tree into four clusters using the cutree() function.

hc.out <- hclust(dist(sd.data))
hc.clusters <- cutree(hc.out, 4)
table(hc.clusters, nci.labs)
##            nci.labs
## hc.clusters BREAST CNS COLON K562A-repro K562B-repro LEUKEMIA MCF7A-repro
##           1      2   3     2           0           0        0           0
##           2      3   2     0           0           0        0           0
##           3      0   0     0           1           1        6           0
##           4      2   0     5           0           0        0           1
##            nci.labs
## hc.clusters MCF7D-repro MELANOMA NSCLC OVARIAN PROSTATE RENAL UNKNOWN
##           1           0        8     8       6        2     8       1
##           2           0        0     1       0        0     1       0
##           3           0        0     0       0        0     0       0
##           4           1        0     0       0        0     0       0
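
We can also list the cell lines that fall in a given cluster; for instance, cluster 3 consists of the six leukemia lines plus the two K562 repro lines:

nci.labs[hc.clusters == 3]  # labels of the observations in cluster 3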

We then plot the dendrogram, marking the height at which cutting it yields these four clusters.

par(mfrow = c(1, 1))
plot(hc.out, labels = nci.labs)
abline(h = 139, col = "red")
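
Cutting the tree at the height of the red line should reproduce the same four clusters; a quick check using the h argument of cutree():

table(cutree(hc.out, h = 139), hc.clusters)  # should agree along the diagonal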

Printing the object returned by hclust() gives a concise summary of the fit.

hc.out
## 
## Call:
## hclust(d = dist(sd.data))
## 
## Cluster method   : complete 
## Distance         : euclidean 
## Number of objects: 64

To cluster the cancer cell lines into four groups with K-means, we simply run kmeans() with K = 4.

set.seed(2)
km.out <- kmeans(sd.data, 4, nstart = 20)
km.clusters <- km.out$cluster
table(km.clusters, hc.clusters)
##            hc.clusters
## km.clusters  1  2  3  4
##           1 11  0  0  9
##           2 20  7  0  0
##           3  9  0  0  0
##           4  0  0  8  0

We can also combine the different algorithms by first running principal component analysis and then performing hierarchical clustering on the first few principal components.

# PCA
pr.out <- prcomp(nci.data, scale = TRUE)
pve <- pr.out$sdev^2/sum(pr.out$sdev^2)
plot(pve, xlab = "Principal Component", ylab = "Proportion of Variance Explained", type = "b", pch = 16)
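
It is often easier to judge how many components to keep from the cumulative proportion of variance explained:

plot(cumsum(pve), xlab = "Principal Component", ylab = "Cumulative PVE", type = "b", pch = 16)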

# Hierarchical clustering
hc.out <- hclust(dist(pr.out$x[, 1:8]))
plot(hc.out, labels = nci.labs, main = "Hier. Clust. on First Eight PCs", xlab='')

table(cutree(hc.out, 4), nci.labs)
##    nci.labs
##     BREAST CNS COLON K562A-repro K562B-repro LEUKEMIA MCF7A-repro MCF7D-repro
##   1      3   5     0           0           0        0           0           0
##   2      0   0     0           1           1        6           0           0
##   3      2   0     7           0           0        0           0           0
##   4      2   0     0           0           0        0           1           1
##    nci.labs
##     MELANOMA NSCLC OVARIAN PROSTATE RENAL UNKNOWN
##   1        1     8       6        2     9       1
##   2        0     0       0        0     0       0
##   3        7     1       0        0     0       0
##   4        0     0       0        0     0       0
# Now let's try K-means; note that kmeans() coerces the dist object to a
# full distance matrix, so we are clustering the rows of that 64 x 64 matrix
km.out <- kmeans(dist(pr.out$x[, 1:8]), 6, nstart = 20)
table(km.out$cluster, nci.labs)
##    nci.labs
##     BREAST CNS COLON K562A-repro K562B-repro LEUKEMIA MCF7A-repro MCF7D-repro
##   1      0   0     6           0           0        2           0           0
##   2      0   0     0           1           1        4           0           0
##   3      2   0     1           0           0        0           1           1
##   4      2   2     0           0           0        0           0           0
##   5      2   0     0           0           0        0           0           0
##   6      1   3     0           0           0        0           0           0
##    nci.labs
##     MELANOMA NSCLC OVARIAN PROSTATE RENAL UNKNOWN
##   1        0     2       0        0     0       0
##   2        0     0       0        0     0       0
##   3        0     0       0        0     0       0
##   4        0     1       1        0     2       0
##   5        7     0       0        0     0       0
##   6        1     6       5        2     7       1

We can compute various metrics to assess each clustering method’s performance, such as the adjusted Rand index (ARI) against the true labels and the average silhouette width.

library(aricode)
ARI(km.out$cluster, nci.labs)
## [1] 0.2052548
ARI(cutree(hc.out, 4), nci.labs)
## [1] 0.2040697
ARI(cutree(hc.out, 10), nci.labs)
## [1] 0.3048968
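
The aricode package also implements normalized mutual information (NMI), another external agreement index; assuming the installed version exports NMI(), a one-line sketch:

NMI(km.out$cluster, nci.labs)  # NMI between the K-means clusters and the true labels
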
As an internal measure, we can compute the average silhouette width over a range of K values:

library(cluster)
silhouette_score <- function(k){
  km <- kmeans(nci.data, centers = k, nstart = 25)
  ss <- silhouette(km$cluster, dist(nci.data))
  mean(ss[, 3])  # column 3 holds each observation's silhouette width
}
k <- 2:10
avg_sil <- sapply(k, silhouette_score)
plot(k, avg_sil, type = 'b', xlab = 'Number of clusters', ylab = 'Average Silhouette Score', frame = FALSE)
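
A common heuristic is to pick the K with the largest average silhouette width:

k[which.max(avg_sil)]  # K that maximizes the average silhouette width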

Spectral Clustering

Finally, we demonstrate spectral clustering on the spirals dataset using the specc() function from the kernlab package; the two intertwined spirals are not linearly separable, so centroid-based methods struggle here.

library(kernlab)
data(spirals)  # two intertwined spirals in 2-D
sc <- specc(spirals, centers = 2)  # spectral clustering into two groups
plot(spirals, col = sc)  # color each point by its spectral cluster
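
For contrast, a small sketch running plain K-means on the same data: K-means partitions the plane with linear boundaries between centroids, so it cannot recover the intertwined spirals.

km.sp <- kmeans(spirals, centers = 2, nstart = 20)  # centroid-based baseline
plot(spirals, col = km.sp$cluster)  # the split cuts across both spirals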