## Lloyd's Algorithm
```r
values <- c(0, 2, 3, 4, 5, 10, 16, 18, 20)
k <- 3
plot(values)
```
Where should the three groups be centered? Which values should be in which group?
```r
# randomly pick k of the values to serve as the initial centroids
sample.int(length(values), k)
centroids <- values[sample.int(length(values), k)]
centroids

# sort the centroids from smallest to largest
centroids <- centroids[order(centroids)]
centroids
```
```r
# assignment step: assign each value to its nearest centroid
cluster <- rep(NA, length(values))
for (i in 1:length(values)){
  cluster[i] <- which.min(abs(values[i] - centroids))
}
values
cluster

# update step: move each centroid to the mean of the values assigned to it
for (i in 1:k){
  centroids[i] <- sum((cluster == i) * values) / sum(cluster == i)
}
centroids
```
```r
# repeat the assignment and update steps until the centroids stop moving
old.centroids <- rep(centroids[1], k)
iter <- 1
while(sum(centroids == old.centroids) != k){
  old.centroids <- centroids
  # assignment step: assign each value to its nearest centroid
  for (i in 1:length(values)){
    cluster[i] <- which.min(abs(values[i] - centroids))
  }
  # update step: find the average value for each cluster
  for (i in 1:k){
    centroids[i] <- sum((cluster == i) * values) / sum(cluster == i)
  }
  print(iter)
  print(old.centroids)
  print(centroids)
  iter <- iter + 1
}

# mean squared distance between each value and its cluster's centroid
MSE <- mean((centroids[cluster] - values)^2)
MSE
```
Is this algorithm deterministic? Will it always ultimately find the same centroids? The best centroids?
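One way to explore these questions is to wrap the steps above in a function and run it from several different random starts. The helper below, `lloyd1d`, is just a sketch assembled from the code above (the name and the function are made up for illustration, not part of any package); it runs Lloyd's algorithm on a numeric vector such as `values` from one random start.

```r
# a sketch of Lloyd's algorithm for a numeric vector, run from one random start
lloyd1d <- function(values, k){
  # start from k randomly chosen values, sorted, as the initial centroids
  centroids <- sort(values[sample.int(length(values), k)])
  old.centroids <- rep(centroids[1], k)
  cluster <- rep(NA, length(values))
  while(sum(centroids == old.centroids) != k){
    old.centroids <- centroids
    # assignment step: nearest centroid for each value
    for (i in 1:length(values)){
      cluster[i] <- which.min(abs(values[i] - centroids))
    }
    # update step: mean of each cluster
    for (i in 1:k){
      centroids[i] <- sum((cluster == i) * values) / sum(cluster == i)
    }
  }
  list(centroids = centroids, MSE = mean((centroids[cluster] - values)^2))
}

# different random starts can converge to different centroids and different MSEs
lloyd1d(values, 3)
lloyd1d(values, 3)
```

Running it a few times suggests the answer: the centroids you end up with can depend on the random starting points, and some runs may settle into a worse (higher-MSE) grouping than others.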
```r
df <- data.frame(values = c(0, 2, 3, 4, 5, 10, 16, 18, 20))

# nstart = 1 uses a single random start;
# nstart = 25 runs the algorithm from 25 random starts and keeps the best result
k.clusters <- kmeans(df, centers = 3, nstart = 1, algorithm = "Lloyd")
k.clusters <- kmeans(df, centers = 3, nstart = 25, algorithm = "Lloyd")
k.clusters
```
```r
library(cluster.datasets)
data(cake.ingredients.1961)
df <- cake.ingredients.1961

# replace missing ingredient amounts with 0 and use the cake names as row names
df[is.na(df)] <- 0
row.names(df) <- df$Cake
df <- df[,-1]

# standardize each ingredient so that no single ingredient dominates the distances
df <- scale(df)
head(df)

# drop the second ingredient column
df <- df[,-2]

# cluster the cakes into 5 groups and list the cakes in each cluster
k5 <- kmeans(df, centers = 5, nstart = 25)
for (i in 1:5) {print(paste("cluster", i)); print(row.names(df)[k5$cluster == i])}
```
How did we do?
When deciding how many clusters to use, you can look at how much each additional cluster reduces the within-cluster spread. Try the following code:
```r
k.max <- 15
wss <- sapply(1:k.max,
              function(k){kmeans(df, centers = k, nstart = 25)$tot.withinss})
plot(1:k.max, wss,
     type = "b", pch = 19, frame = FALSE,
     xlab = "Number of clusters K",
     ylab = "Total within-clusters sum of squares")
```
Here, there's no big jump and no point after which additional clusters stop reducing the within-cluster spread. If there were an "elbow" in the plot, it would tell us how many clusters to use, but in this case there is no obvious elbow.
The following data set has the speed, horizontal movement, and vertical movement of every pitch that Max Scherzer threw in 2019:
```r
df <- read.csv("/home/jcross/data/max_scherzer_2019.csv")

library(ggplot2)
# color each pitch by its speed, centering the color scale at 85 mph
mid <- 85
ggplot(df, aes(pfx_x, pfx_z, color = start_speed)) +
  geom_point() +
  scale_color_gradient2(midpoint = mid, mid = "yellow", low = "blue", high = "red")
```
We can see how k-means clustering would group these pitches using either 4 or 5 clusters:
```r
library(dplyr)

# standardize the variables, then cluster the pitches with k = 4 and k = 5
scaled.df <- scale(df)
k4 <- kmeans(scaled.df, centers = 4, nstart = 25)
k5 <- kmeans(scaled.df, centers = 5, nstart = 25)

# color the pitches by their 4-cluster assignments
df %>%
  as_tibble() %>%
  mutate(cluster = k4$cluster) %>%
  ggplot(aes(pfx_x, pfx_z, color = factor(cluster))) +
  geom_point()

# color the pitches by their 5-cluster assignments
df %>%
  as_tibble() %>%
  mutate(cluster = k5$cluster) %>%
  ggplot(aes(pfx_x, pfx_z, color = factor(cluster))) +
  geom_point()
```
We can also look at how the within-cluster spread changes based on the number of clusters:
```r
k.max <- 8
wss <- sapply(1:k.max,
              function(k){kmeans(scaled.df, centers = k, nstart = 25)$tot.withinss})
plot(1:k.max, wss,
     type = "b", pch = 19, frame = FALSE,
     xlab = "Number of clusters K",
     ylab = "Total within-clusters sum of squares")
```
Notice how there's very little additional reduction in the within-cluster sum of squares after 5 clusters. That tells us that there's little to be gained by adding a 6th cluster. There's also not much gain between the 4th and 5th clusters, so we might say that Scherzer throws 4 truly distinct pitches, or we might say that he throws 5 pitches, two of which (his cutter and slider, as it turns out) are somewhat similar.
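One quick way to see which group splits when we go from 4 to 5 clusters is to cross-tabulate the two sets of assignments. This is just a sketch using the `k4` and `k5` objects fit above; cluster labels are arbitrary, so the particular numbering will vary from run to run.

```r
# cross-tabulate the 4-cluster and 5-cluster assignments;
# the row whose pitches are split across two columns is the group
# that separates into two pitch types when a 5th cluster is added
table(k4 = k4$cluster, k5 = k5$cluster)
```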