The following code applies K-means clustering to each quarterback's overall college career statistics. Each resulting cluster is then mapped to its members' NFL statistics to determine whether the clusters exhibit distinct trends.
# Standardise every college statistic (all columns except the first, which is
# presumably the player name -- it is excluded here and read as a name further
# down) so that each column is centred and has unit variance. Without this,
# statistics measured on large scales would dominate the distances k-means
# minimises.
stats.college.scaled <- scale(stats.college[, -1])
# K-Means cluster analysis for 2 clusters.
# kmeans() chooses its starting centres at random, so without a fixed seed the
# cluster labels (and occasionally the partition itself) can change from run
# to run. The code further down branches on the label value 1, so the seed is
# fixed to make those labels reproducible.
# NOTE(review): the recorded output that follows was produced by an unseeded
# run; labels 1 and 2 may be swapped relative to it until the document is
# re-rendered.
set.seed(42)
fit.two <- kmeans(stats.college.scaled, centers = 2)
# Show the fit (sizes, centres, assignment vector, sums of squares).
fit.two
## K-means clustering with 2 clusters of sizes 119, 121
##
## Cluster means:
## c_avg_cmpp c_ya c_aya c_rate c_pct c_avg_att c_avg_tds
## 1 0.7403 0.5129 0.6187 0.6294 0.5978 0.6924 0.7318
## 2 -0.7281 -0.5044 -0.6085 -0.6190 -0.5880 -0.6809 -0.7197
## c_avg_inter c_avg_yds c_numyrs
## 1 0.2688 0.7654 0.2374
## 2 -0.2643 -0.7528 -0.2334
##
## Clustering vector:
## [1] 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 1 2 1 1 1 1 1 2 2 2
## [36] 1 1 1 1 2 1 1 1 1 1 1 2 2 1 1 1 1 2 1 1 1 1 1 1 2 1 2 1 1 1 1 1 2 1 1
## [71] 1 2 2 1 1 2 1 1 1 1 1 1 1 1 2 1 2 1 1 1 1 1 1 1 1 1 2 2 2 1 1 2 2 2 2
## [106] 1 2 1 1 1 1 1 1 2 2 1 1 1 2 2 2 2 1 1 1 2 2 1 1 1 2 2 2 2 1 2 2 1 2 1
## [141] 1 1 2 2 2 1 2 1 2 1 1 1 2 2 1 2 2 2 2 2 2 2 1 1 1 1 2 1 2 2 1 1 2 1 2
## [176] 2 2 2 2 2 1 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 2 2 2 2 2 2
## [211] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 669.3 852.9
## (between_SS / total_SS = 36.3 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size"
# Split the quarterbacks by their 2-cluster assignment and compare the mean
# NFL statistics of the two groups.

# Row positions of the members of each cluster. which() replaces the original
# element-by-element loop, which ignored its own loop variable and grew the
# result vectors with append() on every iteration. Everything that is not in
# cluster 1 goes to the second group, exactly as the original else-branch did.
fit.two.c1.indexes <- unname(which(fit.two$cluster == 1))
fit.two.c2.indexes <- unname(which(fit.two$cluster != 1))

# First column of stats.college (presumably the player name -- confirm
# against the data load) for the members of each cluster.
fit.two.c1.name.values <- as.character(stats.college[[1]][fit.two.c1.indexes])
fit.two.c2.name.values <- as.character(stats.college[[1]][fit.two.c2.indexes])

# Kept as lists with one element per quarterback, which is the shape the
# original append()-based accumulation ended up with.
fit.two.c1.names <- as.list(fit.two.c1.name.values)
fit.two.c2.names <- as.list(fit.two.c2.name.values)

# One-column character matrices of the same names (one row per quarterback).
# Unlike the original, the rows carry no auto-generated row names.
fit.two.c1.names.col <- matrix(fit.two.c1.name.values, ncol = 1)
fit.two.c2.names.col <- matrix(fit.two.c2.name.values, ncol = 1)

# NFL rows for each cluster, selected by row position.
# NOTE(review): this assumes stats.nfl has one row per quarterback in the same
# order as stats.college -- confirm where the two tables are built.
fit.two.c1.nfl <- stats.nfl[fit.two.c1.indexes, ]
fit.two.c2.nfl <- stats.nfl[fit.two.c2.indexes, ]

# Per-column means of the NFL statistics, excluding the first column.
# na.omit() performs listwise deletion: a quarterback with a missing value in
# ANY column is dropped from EVERY mean, so each mean may be based on far
# fewer rows than the cluster size.
fit.two.c1.nfl.means <- colMeans(na.omit(fit.two.c1.nfl[, -1]))
fit.two.c2.nfl.means <- colMeans(na.omit(fit.two.c2.nfl[, -1]))

# Side-by-side table: one row per NFL statistic, one column per cluster.
fit.two.combine.nfl.means <- cbind(fit.two.c1.nfl.means, fit.two.c2.nfl.means)
print(fit.two.combine.nfl.means)
## fit.two.c1.nfl.means fit.two.c2.nfl.means
## year 2010.188 2009.333
## age 24.125 25.000
## height 74.938 77.000
## weight 224.750 221.000
## rating 76.900 81.300
## completions 221.688 240.000
## yds_per_comp 11.988 11.533
## yds_lost_by_sack 179.875 219.333
## ints 12.938 11.000
## net_yds_per_att 5.918 5.953
## games_played 13.250 13.667
## longest_pass 68.688 66.000
## adj_net_yds_per_att 5.194 5.433
## td_percentage 3.900 3.433
## completion_percentage 57.200 60.367
## games_started 12.750 13.333
## int_percentage 3.456 2.767
## avg_yds_per_att 6.081 6.400
## avg_value 8.438 9.333
## comebacks 1.750 1.333
## gwds 2.188 2.000
## sacked 27.500 30.000
## wins 5.750 7.667
## tds 15.000 13.000
## yds_per_att 6.844 6.933
## yds 2651.438 2770.333
## qbr 44.581 51.550
## yds_per_game 197.956 206.300
## att 382.938 400.000
## perc_times_sk 7.031 7.033