In this lesson students will …

* caret package for cross validation

While there are variants on the k-means clustering algorithm, there is agreement that it is an unsupervised algorithm used to identify \(k\) homogeneous groups.
We will use the following algorithm from ISLR:

* Step 1: Randomly assign a number, from 1 to \(K\), to each of the observations. These serve as initial cluster assignments for the observations.
* Step 2: Iterate until the cluster assignments stop changing:
  * (A) For each of the \(K\) clusters, compute the cluster centroid. The \(k^{th}\) cluster centroid is the vector of the \(p\) feature means for the observations in the \(k^{th}\) cluster.
  * (B) Assign each observation to the cluster whose centroid is closest (where closest is defined using Euclidean distance).
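As a compact preview of those two steps, here is a minimal sketch in R. The function name simple_kmeans and its arguments are our own illustrative choices (not from ISLR or any package), and the sketch assumes no cluster ever becomes empty.

simple_kmeans <- function(X, K, seed = 123){
  ## Step 1: randomly assign each observation to one of K clusters
  set.seed(seed)
  X <- as.matrix(X)
  labels <- sample(rep(1:K, length.out = nrow(X)))
  repeat{
    ## Step 2(A): each centroid is the vector of feature means for its cluster
    centroids <- apply(X, 2, function(col) tapply(col, labels, mean))
    ## Step 2(B): reassign each observation to the closest centroid (Euclidean distance)
    dists <- as.matrix(dist(rbind(centroids, X)))[-(1:K), 1:K]
    new_labels <- apply(dists, 1, which.min)
    ## Stop once the assignments no longer change
    if(all(new_labels == labels)) break
    labels <- new_labels
  }
  labels
}
## Example use (illustration only):
## simple_kmeans(iris[, c("Sepal.Length", "Petal.Length")], K = 3)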
Let’s limit this example to two variables to start so that we can see how the k-means algorithm works.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.1
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.3.0 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
data(iris)
irisSP <- iris %>%
select(Sepal.Length, Petal.Length)
head(irisSP)
## Sepal.Length Petal.Length
## 1 5.1 1.4
## 2 4.9 1.4
## 3 4.7 1.3
## 4 4.6 1.5
## 5 5.0 1.4
## 6 5.4 1.7
### Randomly assign
## Number of clusters
k<-3
## Sample Size
n<-dim(irisSP)[1]
## initialize
iter<-0
## How many times should each cluster label be repeated?
ceiling_k<-ceiling(dim(irisSP)[1]/k)
## Don't forget to set a seed
set.seed(123)
## Creating a random order
rep_k<-sample(rep(1:k, ceiling_k)[1:dim(irisSP)[1]])
## Bind the random assignment to the data
irisKM<-irisSP%>%
cbind(rep_k)
## Initial graphic
ggplot(irisKM, aes(x=Sepal.Length, y=Petal.Length, color=as.factor(rep_k)))+
geom_point()
### Calculate Centroids
irisCentroid<-irisKM%>%
group_by(rep_k)%>%
summarise_all(mean) ## Find means for all variables
irisCentroid
## # A tibble: 3 × 3
## rep_k Sepal.Length Petal.Length
## <int> <dbl> <dbl>
## 1 1 5.76 3.59
## 2 2 6.00 4.04
## 3 3 5.77 3.64
### PLOT
ggplot()+
geom_point(data=irisKM, aes(x=Sepal.Length, y=Petal.Length, color=as.factor(rep_k)), alpha=.6)+
geom_point(data=irisCentroid, aes(x=Sepal.Length, y=Petal.Length, color=as.factor(rep_k)), pch=3, size=6)+
theme_minimal()+
ggtitle(paste("K-Means Clusters: k=", k, ", Step=", iter))+
scale_color_discrete("Cluster")
### Closest
## Create a distance matrix: one row per observation, one column per centroid,
## plus a final column holding the index of the nearest centroid
distMat<-matrix(nrow=n, ncol=k+1)
for(i in 1:n){
for(j in 1:k){
thisRBIND<-irisCentroid[j,]%>%
select(-c(rep_k))%>%
rbind(irisSP[i,])
distMat[i, j]<-dist(thisRBIND, method="euclidean")
}
## Find which centroid is the closest
distMat[i, k+1]<-which.min(distMat[i,])
}
## Check
distMat[sample(1:n, 6),]
## [,1] [,2] [,3] [,4]
## [1,] 1.111563 0.6113101 1.061659 2
## [2,] 3.115505 2.6096935 3.064167 2
## [3,] 2.194350 2.6916352 2.246847 1
## [4,] 2.615334 3.1211697 2.666893 1
## [5,] 1.313153 0.8220097 1.264721 2
## [6,] 1.525638 1.0640019 1.473744 2
We want to update the labels if they differ from the previous iteration.
### UPDATE if new
old<-irisKM$rep_k
new<-distMat[, k+1]
How many changed labels?
sum(old!=new)
## [1] 97
We will need to repeat the algorithm until there are no label changes.
if(sum(old!=new)>0){
iter<-iter+1
irisKM$rep_k<-distMat[, k+1]
### Re-Calculate Centroids
irisCentroid<-irisKM%>%
group_by(rep_k)%>%
summarise_all(mean)
}
ggplot()+
geom_point(data=irisKM, aes(x=Sepal.Length, y=Petal.Length, color=as.factor(rep_k)), alpha=.6)+
geom_point(data=irisCentroid, aes(x=Sepal.Length, y=Petal.Length, color=as.factor(rep_k)), pch=3, size=6)+
theme_minimal()+
ggtitle(paste("K-Means Clusters: k=", k, ", Step=", iter))+
scale_color_discrete("Cluster")
We want to continue assigning observations to clusters until there is no change.
## NUMBER OF CLUSTERS
k<-3
## SAMPLE SIZE
n<-dim(irisSP)[1]
ceiling_k<-ceiling(dim(irisSP)[1]/k)
set.seed(123)
#set.seed(252)
rep_k<-sample(rep(1:k, ceiling_k)[1:dim(irisSP)[1]])
irisKM<-irisSP%>%
cbind(rep_k)
## FIND FIRST CENTROIDS
irisCentroid<-irisKM%>%
group_by(rep_k)%>%
summarise_all(mean)
## INITIALIZE
stop<-0
iter<-0
while(stop==0){
irisCentroid<-irisKM%>%
group_by(rep_k)%>%
summarise_all(mean)
if(iter==0){
holdKM<-irisKM%>%
cbind(iter=rep(0, n))
holdCentroid<-irisCentroid%>%
cbind(iter=rep(0, k))
}
### Closest
distMat<-matrix(nrow=n, ncol=k+1)
for(i in 1:n){
for(j in 1:k){
thisRBIND<-irisCentroid[j,]%>%
select(-c(rep_k))%>%
rbind(irisSP[i,])
distMat[i, j]<-dist(thisRBIND, method="euclidean")
}
distMat[i, k+1]<-which.min(distMat[i,])
}
### UPDATE if new
old<-irisKM$rep_k
new<-distMat[, k+1]
## SUM OF DISTANCES
score<-sum(apply(distMat[, 1:k], 1, min))
## STOP IF NO UPDATES
if(sum(old!=new)==0){
stop<-1
}
if(sum(old!=new)!=0){
iter<-iter+1
## UPDATES CLUSTERS
irisKM$rep_k<-distMat[, k+1]
### STORE DATA FROM ITERATIONS
thisKM<-irisKM%>%
cbind(iter=rep(iter, n))
thisCentroid<-irisCentroid%>%
cbind(iter=rep(iter, k))
holdKM<-holdKM%>%
rbind(thisKM)
holdCentroid<-holdCentroid%>%
rbind(thisCentroid)
}
}
### PLOT LAST CLUSTERING
ggplot()+
geom_point(data=irisKM, aes(x=Sepal.Length, y=Petal.Length, color=as.factor(rep_k)), alpha=.6)+
geom_point(data=irisCentroid, aes(x=Sepal.Length, y=Petal.Length, color=as.factor(rep_k)), pch=3, size=6)+
theme_minimal()+
ggtitle(paste("K-Means Clusters: k=", k, ", Step=", iter))+
scale_color_discrete("Cluster")
How does this compare to the known species?
table(iris$Species, irisKM$rep_k)
##
## 1 2 3
## setosa 50 0 0
## versicolor 1 4 45
## virginica 0 37 13
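The table shows strong but imperfect agreement with the species labels. One way to summarize it with a single number (our own addition, assuming the mclust package is installed) is the adjusted Rand index, which equals 1 for perfect agreement and is near 0 for random labelings.

library(mclust)
## Adjusted Rand index between the cluster labels and the true species
adjustedRandIndex(iris$Species, irisKM$rep_k)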
Questions:
### USE PLOTLY FOR INTERACTIVITY
library(plotly)
g<-ggplot(data=holdKM, aes(x=Sepal.Length, y=Petal.Length, color=as.factor(rep_k), frame=iter), alpha=.6)+
geom_point()+
theme_minimal()+
scale_color_discrete("Cluster")
ggplotly(g)
Now let's wrap the whole procedure in a function so that we can rerun it with different data, numbers of clusters, and seeds.
my_kmeans<-function(this.data, this.k, seed){
## NUMBER OF CLUSTERS
k<-this.k
## SAMPLE SIZE
n<-dim(this.data)[1]
ceiling_k<-ceiling(dim(this.data)[1]/k)
set.seed(seed)
rep_k<-sample(rep(1:k, ceiling_k)[1:dim(this.data)[1]])
this.data_KM<-this.data%>%
cbind(rep_k)
## FIND FIRST CENTROIDS
this.data_Centroid<-this.data_KM%>%
group_by(rep_k)%>%
summarise_all(mean)
## INITIALIZE
stop<-0
iter<-0
while(stop==0){
this.data_Centroid<-this.data_KM%>%
group_by(rep_k)%>%
summarise_all(mean)
if(iter==0){
holdKM<-this.data_KM%>%
cbind(iter=rep(0, n))
holdCentroid<-this.data_Centroid%>%
cbind(iter=rep(0, k))
}
### Closest
distMat<-matrix(nrow=n, ncol=k+1)
for(i in 1:n){
for(j in 1:k){
thisRBIND<-this.data_Centroid[j,]%>%
select(-c(rep_k))%>%
rbind(this.data[i,])
distMat[i, j]<-dist(thisRBIND, method="euclidean")
}
distMat[i, k+1]<-which.min(distMat[i,])
}
### UPDATE if new
old<-this.data_KM$rep_k
new<-distMat[, k+1]
## STOP IF NO UPDATES
if(sum(old!=new)==0){
stop<-1
}
if(sum(old!=new)!=0){
iter<-iter+1
## UPDATES CLUSTERS
this.data_KM$rep_k<-distMat[, k+1]
### STORE DATA FROM ITERATIONS
thisKM<-this.data_KM%>%
cbind(iter=rep(iter, n))
thisCentroid<-this.data_Centroid%>%
cbind(iter=rep(iter, k))
holdKM<-holdKM%>%
rbind(thisKM)
holdCentroid<-holdCentroid%>%
rbind(thisCentroid)
}
}
output<-list(iterations=iter,
final=this.data_KM,
allKM=holdKM,
allCentroids=holdCentroid)
return(output)
}
iris3<-my_kmeans(irisSP, 3, 123)
iris3$iterations
## [1] 9
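As a sanity check (our own addition, using the objects defined above), we can compare our labels to those from the built-in kmeans() function. The label numbers are arbitrary, so agreement shows up as one dominant count in each row of the cross-tabulation.

set.seed(123)
## Built-in k-means on the same two variables, with several random starts
km_check <- kmeans(irisSP, centers = 3, nstart = 25)
table(ours = iris3$final$rep_k, builtin = km_check$cluster)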
iris5<-my_kmeans(irisSP, 5, 123)
g<-ggplot(data=iris5$allKM, aes(x=Sepal.Length, y=Petal.Length, color=as.factor(rep_k), frame=iter), alpha=.6)+
geom_point()+
theme_minimal()+
scale_color_discrete("Cluster")
ggplotly(g)
If we want to incorporate all four variables (sepal length and width, and petal length and width), we can use PCA to perform dimension reduction, which helps for visualization purposes.
irisS<- iris %>%
select(-c(Species))
irisPCA <- princomp(irisS)
irisD1D2 <- irisPCA$scores %>%
as.data.frame() %>%
select(Comp.1, Comp.2)
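Before clustering on the component scores, it is worth checking how much of the total variance the first two components retain. Note that princomp() here is run on the covariance matrix of the unscaled variables, so this is only a rough guide.

## Proportion of variance explained by each principal component
irisPCA$sdev^2 / sum(irisPCA$sdev^2)
summary(irisPCA)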
irisPCA3<-my_kmeans(irisD1D2, 3, 123)
g<-ggplot(data=irisPCA3$allKM, aes(x=Comp.1, y=Comp.2, color=as.factor(rep_k), frame=iter), alpha=.6)+
geom_point()+
theme_minimal()+
scale_color_discrete("Cluster")
ggplotly(g)
### How does this relate to species
table(iris$Species, irisPCA3$final$rep_k)
##
## 1 2 3
## setosa 50 0 0
## versicolor 0 3 47
## virginica 0 36 14
The following examples in this section are from Machine Learning for Biostatistics.
bdiag<- read.csv("https://raw.githubusercontent.com/kitadasmalley/DATA252/main/Data/bdiag.csv", stringsAsFactors = TRUE)
#select a subset of the variables
bdiag.2vars <- bdiag[,c("radius_mean", "texture_mean")]
Now we can use the built-in kmeans() function.
#let's compute the 3 clusters
km <- kmeans(bdiag.2vars, centers = 3)
km
## K-means clustering with 3 clusters of sizes 310, 125, 134
##
## Cluster means:
## radius_mean texture_mean
## 1 12.44730 16.33861
## 2 19.47728 21.76760
## 3 13.02317 23.80515
##
## Clustering vector:
## [1] 1 2 2 3 2 1 2 3 3 3 3 1 2 3 3 3 3 2 2 1 1 1 1 2 2 1 3 2 3 1 2 1 2 2 1 2 3
## [38] 1 3 3 3 3 2 3 3 2 1 1 1 3 3 1 1 2 3 1 2 3 1 1 1 3 3 1 3 3 3 1 1 1 2 1 2 1
## [75] 1 2 1 1 2 1 3 1 2 2 1 2 3 2 3 1 3 3 1 1 1 2 1 1 1 1 3 1 3 1 1 1 1 1 2 3 1
## [112] 3 1 1 1 3 1 1 3 2 1 2 2 1 1 1 3 2 1 2 1 1 2 1 2 3 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 3 3 1 1 1 1 2 2 1 1 3 2 2 3 2 1 1 2 2 1 1 1 1 1 1 1 1 2 3 1 2 2 2 1 3
## [186] 1 2 1 1 1 3 3 1 3 3 1 3 2 2 3 1 2 2 3 1 1 1 2 3 1 2 1 2 2 3 1 1 1 2 2 1 1
## [223] 1 2 1 1 1 1 3 3 2 3 3 2 1 3 2 2 3 3 1 1 1 3 2 1 1 1 3 1 2 1 2 1 2 1 2 1 3
## [260] 3 2 2 2 1 2 2 1 3 1 3 1 1 2 1 2 1 1 2 1 1 2 1 2 2 1 1 3 1 1 1 1 1 1 1 1 1
## [297] 1 1 1 3 2 1 2 1 1 3 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 2 1 2 1 1 1 1 2 2 1 1 1
## [334] 1 1 2 1 2 1 2 1 1 1 2 1 1 1 1 1 1 1 1 2 3 1 1 1 1 1 1 1 3 1 1 1 2 2 1 2 2
## [371] 3 1 2 2 1 1 1 3 1 1 1 1 3 1 1 3 1 1 1 2 1 1 2 2 1 1 1 1 1 1 2 1 1 1 1 1 1
## [408] 3 2 1 1 1 3 3 3 3 3 3 1 3 1 1 1 1 1 3 1 3 1 1 3 1 2 2 1 1 1 1 1 1 1 2 1 1
## [445] 2 3 3 1 1 2 3 2 3 1 1 3 3 3 3 3 3 2 3 1 1 3 3 1 2 1 1 3 1 3 1 1 3 1 1 2 1
## [482] 1 1 1 1 1 1 2 1 2 3 1 2 1 3 3 1 1 2 2 1 3 1 2 1 1 1 1 1 3 1 1 3 1 1 1 2 2
## [519] 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 2 1 2 3 3 3 3 1 3 3 3 3 3 1 1 1 3 3 3 3 3 3
## [556] 3 1 3 3 3 3 3 3 2 2 2 3 2 3
##
## Within cluster sum of squares by cluster:
## [1] 2773.943 2013.213 1958.659
## (between_SS / total_SS = 61.6 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
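Because kmeans() starts from a random configuration, different runs can converge to different local solutions. A common safeguard, not required by this lesson but worth knowing, is to set a seed and use the nstart argument so that several random starts are tried and the best one (smallest total within-cluster sum of squares) is kept.

set.seed(123)
## Try 25 random starts and keep the best solution
km_best <- kmeans(bdiag.2vars, centers = 3, nstart = 25)
km_best$tot.withinss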
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
fviz_cluster(km, data = bdiag.2vars, label=NA)+theme_bw()
We want to choose the optimal number of clusters for our data. One approach is to plot the total within-cluster sum of squares against the number of clusters and look for the point of diminishing returns (the "elbow").
fviz_nbclust(bdiag.2vars, kmeans, method = "wss", k.max = 10)
This plot suggests 2 or 3 clusters for these data.
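The quantity behind this elbow plot is the total within-cluster sum of squares, which kmeans() reports as tot.withinss. As a rough sketch, we could compute and plot it ourselves for k = 1 to 10.

set.seed(123)
## Total within-cluster sum of squares for k = 1, ..., 10
wss <- sapply(1:10, function(k){
  kmeans(bdiag.2vars, centers = k, nstart = 10)$tot.withinss
})
plot(1:10, wss, type = "b", xlab = "Number of clusters k",
     ylab = "Total within-cluster sum of squares")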
We desire high average silhouette width, since this is an indication of how well each data point lies within its cluster.
fviz_nbclust(bdiag.2vars, kmeans, method = "silhouette", k.max = 10)
Two is the optimal number of clusters in this case.
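The silhouette widths themselves can be computed with silhouette() from the cluster package: for each point it compares the average distance to members of its own cluster with the average distance to the nearest other cluster. A quick check for the two-cluster solution (assuming the cluster package is available):

library(cluster)
set.seed(123)
km2 <- kmeans(bdiag.2vars, centers = 2, nstart = 25)
sil <- silhouette(km2$cluster, dist(bdiag.2vars))
## Average silhouette width over all observations
mean(sil[, "sil_width"])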
We wish to compare the total intra-cluster variation for different numbers of clusters \(k\) with the values expected under a reference distribution with no clustering (these reference data are generated using Monte Carlo simulation). The larger the gap between the observed and expected values, the better the clustering.
fviz_nbclust(bdiag.2vars, kmeans, method = "gap", nboot=200, k.max = 10)
This suggests one cluster.
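The same gap statistic can be computed directly with clusGap() from the cluster package, which also reports a standard error for each gap based on the Monte Carlo reference data sets. A sketch with 200 reference sets, matching the plot above:

library(cluster)
set.seed(123)
## Gap statistic for k = 1, ..., 10 with 200 reference data sets
gap <- clusGap(bdiag.2vars, FUNcluster = kmeans, nstart = 10, K.max = 10, B = 200)
gap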
Can you implement k-means with more than two variables? Will you need to change the code? Try it!
Pre-process the Breast Cancer diagnosis data with PCA and then fit k-means.
What would be the best number of clusters?