K-means clustering sorts data into a specified number of clusters. The algorithm works by iteratively assigning each data point to the nearest cluster center and then recalculating the cluster centers. The process continues until the cluster centers no longer change.
load("cfb_2021_off.rda")
head(off_dat)
## teams Rush_freq_per_game Pass_freq_per_game Punt_freq_per_game
## 1 Miami 29.66667 40.55556 4.888889
## 2 Alabama 34.40000 36.60000 3.000000
## 4 Arkansas 42.40000 25.70000 4.500000
## 5 Auburn 32.40000 36.70000 3.700000
## 8 Florida 36.80000 31.10000 3.300000
## 9 Clemson 32.30000 35.10000 5.600000
## Rush_avg_epa Pass_avg_epa Rush_prop_per_game Pass_prop_per_game
## 1 -0.04433453 0.23545661 0.3550532 0.4853723
## 2 0.08625343 0.38573148 0.4080664 0.4341637
## 4 0.07531054 0.16662277 0.5196078 0.3149510
## 5 0.19101496 0.09622955 0.3898917 0.4416366
## 8 0.14253222 0.36383853 0.4693878 0.3966837
## 9 0.03755856 -0.06593065 0.3997525 0.4344059
## Rush_avg_epa_first_down Pass_avg_epa_first_down Rush_prop_per_game_first_down
## 1 -0.04879862 0.13776235 0.3556851
## 2 -0.03464534 0.22125503 0.4753695
## 4 0.07371210 0.40727046 0.5312500
## 5 0.10462887 0.15392606 0.4387755
## 8 0.16815587 0.32256052 0.4502618
## 9 -0.01246624 0.03770129 0.4871795
## Pass_prop_per_game_first_down Rush_avg_epa_second_down
## 1 0.4723032 -0.07542871
## 2 0.3177340 0.02949730
## 4 0.2942708 0.07379798
## 5 0.3724490 0.20709742
## 8 0.3926702 0.17540518
## 9 0.3532764 0.10612068
## Pass_avg_epa_second_down Rush_prop_per_game_second_down
## 1 0.174027153 0.4726368
## 2 0.152021185 0.4156379
## 4 -0.085523805 0.6936937
## 5 0.007464204 0.4716157
## 8 0.447040353 0.5772727
## 9 0.014451324 0.4608696
## Pass_prop_per_game_second_down Rush_avg_epa_third_down
## 1 0.5223881 0.007212225
## 2 0.5720165 0.499830318
## 4 0.3063063 0.125645971
## 5 0.5021834 0.080798621
## 8 0.4090909 -0.200355793
## 9 0.5304348 -0.056877863
## Pass_avg_epa_third_down Rush_prop_per_game_third_down
## 1 0.30938022 0.3233083
## 2 0.91875328 0.3098592
## 4 0.02047649 0.4393939
## 5 0.16672555 0.2720588
## 8 0.49457329 0.4793388
## 9 -0.26045731 0.2797203
## Pass_prop_per_game_third_down
## 1 0.6691729
## 2 0.6690141
## 4 0.5378788
## 5 0.7058824
## 8 0.5123967
## 9 0.6923077
The values for the variables in this data set are on very different scales. To ensure that the clustering algorithm pays equal attention to all variables, scale the data in advance.
# Scale offensive data
off_dat_2 <- scale(off_dat[,2:20])
# Add teams back to data frame
off_dat <- cbind.data.frame(off_dat$teams, off_dat_2)
# Fix name of team column
names(off_dat)[1] <- "teams"
For our initial clustering lets try to fit four clusters to the data.
To run K-means on our data we use the kmeans() function.
The parameters we need to set for the K-means algorithm are:
set.seed(12345) # Set seed for reproducibility
fit_1 <- kmeans(x = off_dat[,2:20], # Set data as explantory variables
centers = 4, # Set number of clusters
nstart = 25, # Set number of starts
iter.max = 100) # Set maximum number of iterations to use
The results of our clustering is stored in fit_1. We can
extract the clusters and center values for this as follows:
# Extract clusters
clusters_1 <- fit_1$cluster
# Extract centers
centers_1 <- fit_1$centers
Lets first check how many samples have ended up in each cluster:
# Check samples per cluster
summary(as.factor(clusters_1))
## 1 2 3 4
## 22 3 27 16
Here we see that we have 9 samples in cluster 1, 26 in cluster 2, 230 in cluster 3, and 3 in cluster 4. We can view the teams in each cluster as follows:
# Check teams in cluster 1
cat("Cluster 1 teams:\n")
## Cluster 1 teams:
off_dat$teams[clusters_1 == 1]
## [1] "Miami" "Clemson" "Mississippi State"
## [4] "Missouri" "New Mexico State" "Texas A&M"
## [7] "NC State" "Pittsburgh" "Virginia"
## [10] "Notre Dame" "Indiana" "Maryland"
## [13] "Penn State" "Purdue" "Connecticut"
## [16] "Iowa State" "Stanford" "Washington"
## [19] "Washington State" "Arizona" "Vanderbilt"
## [22] "USC"
# Check teams in cluster 2
cat("Cluster 2 teams:\n")
## Cluster 2 teams:
off_dat$teams[clusters_1 == 2]
## [1] "Navy" "Army" "Air Force"
# Check teams in cluster 3
cat("Cluster 3 teams:\n")
## Cluster 3 teams:
off_dat$teams[clusters_1 == 3]
## [1] "Alabama" "Auburn" "Florida" "Georgia"
## [5] "Kentucky" "UCLA" "Tennessee" "Ole Miss"
## [9] "Florida State" "Nebraska" "Michigan" "Michigan State"
## [13] "Ohio State" "Oregon State" "Texas Tech" "Kansas State"
## [17] "Texas" "Louisiana" "Baylor" "Oklahoma"
## [21] "Utah" "Arizona State" "Oregon" "BYU"
## [25] "North Carolina" "Wake Forest" "TCU"
# Check teams in cluster 4
cat("Cluster 4 teams:\n")
## Cluster 4 teams:
off_dat$teams[clusters_1 == 4]
## [1] "Arkansas" "South Carolina" "Duke" "Louisville"
## [5] "Illinois" "Boston College" "Northwestern" "Minnesota"
## [9] "Wisconsin" "Rutgers" "Kansas" "Oklahoma State"
## [13] "Colorado" "Virginia Tech" "Georgia Tech" "Syracuse"
Lets check how the center values for each of the clusters compare to each other. To make this interpret-able lets just use the overall game level values:
# Create vector of clusters
cluster <- c(1: 4)
# Extract centers
center_df <- data.frame(cluster, centers_1)
# Reshape the data
center_reshape <- gather(center_df, features, values, Pass_freq_per_game:Rush_prop_per_game_third_down)
# View first few rows
head(center_reshape)
## cluster Rush_freq_per_game Pass_prop_per_game_third_down features
## 1 1 -0.8735016 0.7759475 Pass_freq_per_game
## 2 2 3.1342358 -3.3496762 Pass_freq_per_game
## 3 3 0.1114570 -0.1978129 Pass_freq_per_game
## 4 4 0.4253117 -0.1050542 Pass_freq_per_game
## 5 1 -0.8735016 0.7759475 Punt_freq_per_game
## 6 2 3.1342358 -3.3496762 Punt_freq_per_game
## values
## 1 0.9022161
## 2 -2.7623834
## 3 -0.1401969
## 4 -0.4860179
## 5 0.5372105
## 6 -0.6796855
# Create plot
g_heat_1 <- ggplot(data = center_reshape, # Set dataset
aes(x = features, y = cluster, fill = values)) + # Set aesthetics
scale_y_continuous(breaks = seq(1, 4, by = 1)) + # Set y axis breaks
geom_tile() + # Geom tile for heatmap
coord_equal() + # Make scale the same for both axis
theme_set(theme_bw(base_size = 22) ) + # Set theme
scale_fill_gradient2(low = "blue", # Choose low color
mid = "white", # Choose mid color
high = "red", # Choose high color
midpoint =0, # Choose mid point
space = "Lab",
na.value ="grey", # Choose NA value
guide = "colourbar", # Set color bar
aesthetics = "fill") + # Select aesthetics to apply
coord_flip() # Rotate plot to view names clearly
## Coordinate system already present. Adding new coordinate system, which will
## replace the existing one.
# Generate plot
g_heat_1
To find the optimal number of clusters, use the elbow method.
# Create function to try different cluster numbers
kmean_withinss <- function(k) {
cluster <- kmeans( x = off_dat[,2:20], # Set data to use
centers = k, # Set number of clusters as k, changes with input into function
nstart = 25, # Set number of starts
iter.max = 100) # Set max number of iterations
return (cluster$tot.withinss) # Return cluster error/within cluster sum of squares
}
# Set maximum cluster number
max_k <-20
# Run algorithm over a range of cluster numbers
wss <- sapply(2:max_k, kmean_withinss)
# Create a data frame to plot the graph
elbow <-data.frame(2:max_k, wss)
# Plot the graph with ggplot
g_e1 <- ggplot(elbow, # Set dataset
aes(x = X2.max_k, y = wss)) + # Set aesthetics
theme_set(theme_bw(base_size = 22) ) + # Set theme
geom_point(color = "blue") + # Set geom point for scatter
geom_line() + # Geom line for a line between points
scale_x_continuous(breaks = seq(1, 20, by = 1)) + # Set breaks for x-axis
labs(x = "Number of Clusters", y="Within Cluster \nSum of Squares") + # Set labels
theme(panel.grid.major = element_blank(), # Turn of the background grid
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank())
# Generate plot
g_e1