The following code applies K-means clustering to each of a quarterback's overall college career statistics, paired with a statistic from their first season playing in the NFL.
# Read the complete quarterback table; the college (c_*) and NFL columns
# are both taken from this one data frame below.
stats.all <- read.csv("C:/Users/ird/Desktop/dataMiningFinalProject/qb_stats.csv")

# College-side view: the name column followed by the c_* statistics.
stats.college <- stats.all[c(
  "name",
  "c_avg_cmpp", "c_ya", "c_aya", "c_rate", "c_pct",
  "c_avg_att", "c_avg_tds", "c_avg_inter", "c_avg_yds",
  "c_numyrs"
)]

# NFL-side view. Column ORDER matters: the clustering code selects the NFL
# target by position (nfl.data.col), so do not reorder this list.
stats.nfl <- stats.all[c(
  "name", "year", "age", "height", "weight",
  "rating", "completions", "yds_per_comp", "yds_lost_by_sack", "ints",
  "net_yds_per_att", "games_played", "longest_pass", "adj_net_yds_per_att",
  "td_percentage", "completion_percentage", "games_started",
  "int_percentage", "avg_yds_per_att", "avg_value", "comebacks", "gwds",
  "sacked", "wins", "tds", "yds_per_att", "yds", "qbr", "yds_per_game",
  "att", "perc_times_sk"
)]
# For every college statistic (columns college.data.col..ncol of
# stats.college), pair it with one NFL target column, drop incomplete rows,
# run K-means with k clusters on the two-column data, and draw a scatter plot
# colored by cluster assignment.
#
# Reads from the enclosing script: stats.college, stats.nfl.
# NOTE(review): capitalize() and stats.college.title are not defined in this
# section -- presumably loaded/defined earlier in the file; confirm.
# NOTE(review): kmeans() chooses random starting centers, so cluster
# numbering (and therefore point colors) can differ between runs unless a
# seed is set beforehand.
k <- 2                 # number of clusters
college.data.col <- 2  # first college column to use (column 1 is "name")
nfl.data.col <- 18     # select an nfl target variable

# Loop-invariant pieces: the NFL target's column name and its axis label.
nfl.stat.name <- colnames(stats.nfl)[nfl.data.col]
nfl.label <- paste("NFL", capitalize(nfl.stat.name))

for (i in college.data.col:(ncol(stats.college))) {
  college.stat.name <- colnames(stats.college)[i]
  cat(college.stat.name, nfl.stat.name, "\n")

  # Two-column frame (college stat, NFL stat) restricted to complete rows.
  df <- data.frame(stats.college[, i], stats.nfl[, nfl.data.col])
  colnames(df) <- c(college.stat.name, nfl.stat.name)
  df <- na.omit(df)

  # kmeans() stops with an error when there are fewer distinct points than
  # centers; skip such a pair instead of aborting the remaining plots.
  if (nrow(unique(df)) < k) {
    message("Skipping ", college.stat.name, ": fewer than ", k,
            " distinct complete rows for kmeans()")
    next
  }

  fit <- kmeans(df, k)

  college.title <- capitalize(stats.college.title[i])
  plot(df,
       col = fit$cluster,
       # Title reports the actual k rather than a hard-coded "2".
       main = paste(nfl.label, "vs", college.title,
                    paste0("using K-means(", k, ")")),
       xlab = paste("College", college.title),
       ylab = nfl.label,
       cex.lab = 1.25)
}
## c_avg_cmpp int_percentage
## c_ya int_percentage
## c_aya int_percentage
## c_rate int_percentage
## c_pct int_percentage
## c_avg_att int_percentage
## c_avg_tds int_percentage
## c_avg_inter int_percentage
## c_avg_yds int_percentage
## c_numyrs int_percentage