# Load the quarterback data set
qb_stats <- read.csv("../data/qb_stats.csv")

# Names of the college predictor columns
predictors <- c(
  "height", "weight", "age", "c_avg_cmpp", "c_rate", "c_pct",
  "c_avg_inter", "c_avg_tds", "c_avg_yds", "c_numyrs", "c_avg_att"
)
college_stats <- qb_stats[predictors]

# Response variable, kept as a one-column data frame so that cbind() below
# carries the column name "yds" through
yds <- qb_stats[, "yds", drop = FALSE]

# Clean data set: bind response and predictors, drop rows with any NA,
# centre and scale every column, then convert the matrix back to a data frame
data.scaled.no_combine.for_yds <- data.frame(
  scale(
    na.omit(cbind(yds, college_stats))
  )
)
# Full linear model: yds regressed on every other column of the clean data
lm.scaled.no_combine.yds <- lm(
  formula = yds ~ .,
  data = data.scaled.no_combine.for_yds
)
# Stepwise AIC search for yds, starting from the full model and allowing
# terms to be both dropped and re-added
step_reg.scaled.no_combine.yds <- stepAIC(
  lm.scaled.no_combine.yds,
  direction = "both"
)
## Start: AIC=-16.55
## yds ~ height + weight + age + c_avg_cmpp + c_rate + c_pct + c_avg_inter +
## c_avg_tds + c_avg_yds + c_numyrs + c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_avg_tds 1 0.01 202 -18.5
## - height 1 0.02 202 -18.5
## - c_numyrs 1 0.03 202 -18.5
## - c_avg_cmpp 1 0.07 202 -18.5
## - c_rate 1 0.47 202 -18.0
## - c_avg_att 1 0.67 202 -17.8
## - c_pct 1 0.70 202 -17.7
## - c_avg_inter 1 0.73 202 -17.7
## - age 1 1.49 203 -16.8
## <none> 202 -16.6
## - c_avg_yds 1 2.50 204 -15.6
## - weight 1 6.93 209 -10.5
##
## Step: AIC=-18.53
## yds ~ height + weight + age + c_avg_cmpp + c_rate + c_pct + c_avg_inter +
## c_avg_yds + c_numyrs + c_avg_att
##
## Df Sum of Sq RSS AIC
## - height 1 0.02 202 -20.5
## - c_numyrs 1 0.03 202 -20.5
## - c_avg_cmpp 1 0.06 202 -20.5
## - c_rate 1 0.48 202 -20.0
## - c_pct 1 0.71 202 -19.7
## - c_avg_att 1 0.72 202 -19.7
## - c_avg_inter 1 0.73 202 -19.7
## - age 1 1.48 203 -18.8
## <none> 202 -18.5
## - c_avg_yds 1 2.81 204 -17.2
## + c_avg_tds 1 0.01 202 -16.6
## - weight 1 6.94 209 -12.4
##
## Step: AIC=-20.52
## yds ~ weight + age + c_avg_cmpp + c_rate + c_pct + c_avg_inter +
## c_avg_yds + c_numyrs + c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_numyrs 1 0.03 202 -22.48
## - c_avg_cmpp 1 0.06 202 -22.45
## - c_rate 1 0.49 202 -21.94
## - c_avg_inter 1 0.72 202 -21.66
## - c_avg_att 1 0.73 202 -21.66
## - c_pct 1 0.73 202 -21.65
## - age 1 1.51 203 -20.73
## <none> 202 -20.52
## - c_avg_yds 1 2.81 204 -19.21
## + height 1 0.02 202 -18.53
## + c_avg_tds 1 0.01 202 -18.53
## - weight 1 12.11 214 -8.59
##
## Step: AIC=-22.48
## yds ~ weight + age + c_avg_cmpp + c_rate + c_pct + c_avg_inter +
## c_avg_yds + c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_avg_cmpp 1 0.08 202 -24.4
## - c_rate 1 0.50 202 -23.9
## - c_avg_att 1 0.70 202 -23.6
## - c_pct 1 0.81 203 -23.5
## - c_avg_inter 1 0.83 203 -23.5
## - age 1 1.50 203 -22.7
## <none> 202 -22.5
## - c_avg_yds 1 2.82 205 -21.2
## + c_numyrs 1 0.03 202 -20.5
## + c_avg_tds 1 0.01 202 -20.5
## + height 1 0.01 202 -20.5
## - weight 1 12.57 214 -10.0
##
## Step: AIC=-24.39
## yds ~ weight + age + c_rate + c_pct + c_avg_inter + c_avg_yds +
## c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_rate 1 0.43 202 -25.9
## - c_avg_inter 1 0.80 203 -25.4
## - c_pct 1 1.00 203 -25.2
## - age 1 1.57 203 -24.5
## <none> 202 -24.4
## - c_avg_att 1 2.38 204 -23.6
## - c_avg_yds 1 3.26 205 -22.6
## + c_avg_cmpp 1 0.08 202 -22.5
## + c_numyrs 1 0.05 202 -22.4
## + height 1 0.01 202 -22.4
## + c_avg_tds 1 0.00 202 -22.4
## - weight 1 12.76 215 -11.7
##
## Step: AIC=-25.89
## yds ~ weight + age + c_pct + c_avg_inter + c_avg_yds + c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_pct 1 0.57 203 -27.2
## - c_avg_inter 1 0.60 203 -27.2
## <none> 202 -25.9
## - age 1 1.92 204 -25.6
## - c_avg_att 1 2.16 204 -25.4
## + c_rate 1 0.43 202 -24.4
## + c_numyrs 1 0.04 202 -23.9
## + c_avg_tds 1 0.03 202 -23.9
## + height 1 0.02 202 -23.9
## + c_avg_cmpp 1 0.00 202 -23.9
## - c_avg_yds 1 3.69 206 -23.6
## - weight 1 12.99 215 -13.0
##
## Step: AIC=-27.21
## yds ~ weight + age + c_avg_inter + c_avg_yds + c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_avg_inter 1 1.05 204 -28.0
## <none> 203 -27.2
## - age 1 2.28 205 -26.5
## - c_avg_att 1 2.54 205 -26.2
## + c_pct 1 0.57 202 -25.9
## + c_avg_cmpp 1 0.26 203 -25.5
## + c_numyrs 1 0.08 203 -25.3
## + height 1 0.04 203 -25.3
## + c_avg_tds 1 0.04 203 -25.3
## + c_rate 1 0.01 203 -25.2
## - c_avg_yds 1 5.57 208 -22.7
## - weight 1 13.71 216 -13.6
##
## Step: AIC=-27.97
## yds ~ weight + age + c_avg_yds + c_avg_att
##
## Df Sum of Sq RSS AIC
## <none> 204 -28.0
## - age 1 2.31 206 -27.3
## + c_avg_inter 1 1.05 203 -27.2
## + c_pct 1 1.03 203 -27.2
## + c_avg_cmpp 1 0.88 203 -27.0
## + c_numyrs 1 0.25 204 -26.3
## + c_rate 1 0.18 204 -26.2
## + height 1 0.02 204 -26.0
## + c_avg_tds 1 0.00 204 -26.0
## - c_avg_att 1 5.80 210 -23.3
## - c_avg_yds 1 7.71 212 -21.1
## - weight 1 17.65 222 -10.1
# Show the coefficient table and fit statistics of the model returned by the
# stepwise search
summary(step_reg.scaled.no_combine.yds)
##
## Call:
## lm(formula = yds ~ weight + age + c_avg_yds + c_avg_att, data = data.scaled.no_combine.for_yds)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0219 -0.6273 -0.0863 0.6654 2.3950
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.95e-16 6.04e-02 0.00 1.0000
## weight 2.90e-01 6.43e-02 4.50 1.1e-05 ***
## age 1.01e-01 6.21e-02 1.63 0.1045
## c_avg_yds 7.31e-01 2.46e-01 2.98 0.0032 **
## c_avg_att -6.30e-01 2.44e-01 -2.58 0.0105 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.933 on 234 degrees of freedom
## Multiple R-squared: 0.143, Adjusted R-squared: 0.129
## F-statistic: 9.79 on 4 and 234 DF, p-value: 2.45e-07
# Diagnostic plots for the stepwise-selected model
plot(step_reg.scaled.no_combine.yds)
# Best-subsets search over the same predictors, keeping up to 10 models per
# subset size
leaps.scaled.no_combine.yds <- regsubsets(
  yds ~ .,
  data = data.scaled.no_combine.for_yds,
  nbest = 10
)
# Plot R-squared against subset size. legend = FALSE is needed because, by
# default, subsets() places the legend with a mouse click; when the script is
# rendered non-interactively that step failed with
# "Error: invalid coordinate lengths".
subsets(leaps.scaled.no_combine.yds, statistic = "rsq", legend = FALSE)
# 5 fold cross-validation of the stepwise-selected model. The data frame and
# the model are passed positionally because the name of cv.lm()'s first
# argument differs between DAAG releases (`df` in older ones, `data` in newer
# ones); positional matching works with both.
cv.lm(data.scaled.no_combine.for_yds, step_reg.scaled.no_combine.yds, m = 5)
## Analysis of Variance Table
##
## Response: yds
## Df Sum Sq Mean Sq F value Pr(>F)
## weight 1 23.0 23.03 26.43 5.8e-07 ***
## age 1 2.2 2.22 2.54 0.112
## c_avg_yds 1 3.1 3.06 3.52 0.062 .
## c_avg_att 1 5.8 5.80 6.65 0.011 *
## Residuals 234 203.9 0.87
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Warning:
##
## As there is >1 explanatory variable, cross-validation predicted values for
## a fold are not a linear function of corresponding overall predicted
## values. Lines that are shown for the different folds are approximate
##
## fold 1
## Observations in test set: 47
## 3 12 15 20 22 43 44 45
## Predicted 0.546 0.502 0.0827 0.263 0.431 0.359 0.334 0.1679
## cvpred 0.495 0.495 0.1374 0.308 0.427 0.329 0.348 0.1575
## yds 1.572 -0.188 -0.6541 -0.311 2.375 -0.997 -0.530 -0.0894
## CV residual 1.076 -0.683 -0.7916 -0.619 1.948 -1.327 -0.878 -0.2469
## 49 57 58 61 63 70 71 75 78
## Predicted -0.01481 0.154 0.755 0.325 0.777 0.345 0.170 0.890 0.771
## cvpred 0.00247 0.150 0.724 0.297 0.768 0.346 0.159 0.811 0.720
## yds -1.51661 1.367 0.875 -0.557 1.245 -0.680 2.121 1.013 1.547
## CV residual -1.51907 1.217 0.151 -0.854 0.478 -1.026 1.963 0.202 0.826
## 80 94 97 100 105 106 108 116
## Predicted 0.021834 -0.0478 -0.0415 0.415 0.203 0.484 -0.0167 -0.04997
## cvpred 0.020748 -0.0172 -0.0478 0.401 0.192 0.474 -0.0162 -0.00328
## yds 0.021312 0.1173 -0.9236 0.537 0.792 0.820 -0.5533 -1.23732
## CV residual 0.000564 0.1345 -0.8757 0.136 0.599 0.346 -0.5371 -1.23404
## 122 135 139 143 147 148 160 167
## Predicted -0.154 -0.415 -0.0124 0.0673 -0.406 0.3099 0.0456 -0.0941
## cvpred -0.126 -0.359 -0.0679 0.0236 -0.374 0.3070 0.0128 -0.1442
## yds 0.574 -1.178 0.0139 2.4623 0.762 0.2145 1.5137 -1.2090
## CV residual 0.700 -0.819 0.0818 2.4387 1.136 -0.0925 1.5009 -1.0648
## 173 175 179 180 183 188 197 200
## Predicted -0.0226 -0.297 -0.472 -0.413 0.208 -0.192 0.05805 -0.2263
## cvpred -0.0391 -0.279 -0.438 -0.413 0.190 -0.186 0.00728 -0.2148
## yds 1.3427 -0.385 -0.767 0.229 -1.578 -0.649 -0.13617 -0.1534
## CV residual 1.3818 -0.106 -0.329 0.642 -1.769 -0.464 -0.14345 0.0614
## 202 206 215 218 220 225
## Predicted -0.417 -0.283 -0.385 0.04988 -0.6731 -0.342
## cvpred -0.324 -0.253 -0.401 -0.00704 -0.6472 -0.313
## yds -0.738 -1.104 -1.499 -0.88791 -0.6714 0.409
## CV residual -0.414 -0.852 -1.099 -0.88087 -0.0242 0.722
##
## Sum of squares = 43.5 Mean square = 0.93 n = 47
##
## fold 2
## Observations in test set: 48
## 8 11 16 18 21 26 28 31 32
## Predicted 0.108 0.800 -0.0205 0.142 0.387 1.154 -0.210 0.0773 0.373
## cvpred 0.156 0.936 -0.0074 0.140 0.400 1.257 -0.205 0.0829 0.391
## yds 1.588 -0.466 -0.6763 0.948 -0.946 0.388 -0.708 1.7093 0.164
## CV residual 1.431 -1.402 -0.6689 0.807 -1.346 -0.869 -0.503 1.6264 -0.227
## 33 35 36 39 42 54 56 66
## Predicted 0.0224 0.00478 0.165 0.423 0.173 0.2453 0.209 0.162
## cvpred 0.0814 -0.00179 0.209 0.513 0.210 0.3072 0.289 0.284
## yds -0.2420 -0.41177 -0.712 0.112 0.426 0.0754 -1.079 1.019
## CV residual -0.3234 -0.40997 -0.921 -0.401 0.216 -0.2318 -1.368 0.735
## 68 72 81 83 90 91 107 109 110
## Predicted 0.0328 0.140 0.0821 1.315 0.089 0.765 0.128 -0.0394 0.3622
## cvpred 0.0725 0.194 0.1709 1.434 0.134 0.867 0.188 0.0243 0.4069
## yds 0.2292 0.905 -0.5680 2.251 -1.053 -1.007 0.057 1.1643 0.4753
## CV residual 0.1568 0.710 -0.7389 0.817 -1.186 -1.875 -0.131 1.1400 0.0684
## 115 125 126 133 137 141 142
## Predicted 0.2052 0.2340 -0.00736 -0.5408 -0.262 -0.4320 -0.0838
## cvpred 0.2662 0.3202 0.04878 -0.5186 -0.231 -0.4207 -0.0322
## yds 0.2120 0.0865 -1.03432 0.0139 -0.676 -0.4671 -0.0537
## CV residual -0.0542 -0.2337 -1.08309 0.5325 -0.446 -0.0465 -0.0215
## 146 152 164 171 186 189 198 201
## Predicted 0.13588 0.0146 0.0888 -0.00195 -0.215 -0.857 -0.0695 -0.1486
## cvpred 0.15178 0.0747 0.1062 0.06397 -0.140 -0.834 -0.0416 -0.1185
## yds 0.14435 -0.6517 0.9822 0.25015 -0.301 -1.491 -1.4662 -0.1399
## CV residual -0.00743 -0.7264 0.8760 0.18619 -0.161 -0.657 -1.4245 -0.0213
## 203 212 216 227 229 230 240
## Predicted 0.2232 -0.434 -0.435 -0.274 -0.396 -0.384 -0.927
## cvpred 0.2767 -0.434 -0.456 -0.220 -0.360 -0.328 -0.884
## yds 0.0619 -0.310 1.108 -0.585 -1.122 -2.358 -0.840
## CV residual -0.2148 0.124 1.564 -0.365 -0.762 -2.030 0.044
##
## Sum of squares = 35.3 Mean square = 0.74 n = 48
##
## fold 3
## Observations in test set: 48
## 4 6 19 23 41 59 60 64
## Predicted -0.00468 0.11351 0.1848 0.244 0.6059 0.0163 -0.0701 0.573
## cvpred -0.10506 0.00299 0.0876 0.120 0.5978 -0.0768 -0.1343 0.584
## yds 1.45957 0.08406 0.4138 1.950 0.5405 0.0718 -1.1180 0.896
## CV residual 1.56463 0.08107 0.3262 1.830 -0.0573 0.1485 -0.9837 0.312
## 69 73 77 79 84 88 89 95
## Predicted 0.0393 0.3156 -0.0349 -0.112 -0.0679 0.424 0.1012 0.3478
## cvpred -0.0320 0.2480 -0.0963 -0.190 -0.1490 0.361 0.0260 0.2796
## yds 0.5959 0.2354 -1.2742 0.814 0.4175 2.007 0.0865 0.3191
## CV residual 0.6278 -0.0126 -1.1779 1.004 0.5665 1.646 0.0605 0.0395
## 113 117 119 120 123 124 127 131
## Predicted -0.1020 -0.0437 -0.122 0.1356 -0.0989 -0.1431 0.385 -0.403
## cvpred -0.1563 -0.0799 -0.207 0.0240 -0.1600 -0.2030 0.297 -0.507
## yds -0.2125 -1.4858 1.188 -0.0673 1.2184 -0.1202 2.043 0.648
## CV residual -0.0561 -1.4060 1.395 -0.0913 1.3785 0.0828 1.745 1.154
## 132 134 136 138 140 149 150 153
## Predicted -0.3292 -0.0254 0.0490 0.0671 0.228 -0.222 0.1435 -0.520
## cvpred -0.4309 -0.1391 -0.0399 0.0321 0.178 -0.282 0.0631 -0.618
## yds 0.0312 -0.6012 -0.5717 0.8346 0.603 -1.234 1.6429 -0.142
## CV residual 0.4621 -0.4621 -0.5318 0.8025 0.425 -0.952 1.5798 0.476
## 154 158 163 177 178 182 192 196
## Predicted -0.274 -0.532 0.00389 -0.0632 -0.668 -0.467 -0.129 -0.599
## cvpred -0.369 -0.569 -0.09596 -0.1589 -0.765 -0.548 -0.226 -0.697
## yds -0.582 1.679 0.12589 -0.0845 1.586 -0.130 -0.548 -0.462
## CV residual -0.212 2.247 0.22185 0.0744 2.351 0.418 -0.322 0.235
## 208 209 213 221 223 226 228 235
## Predicted -0.321 -0.304 -0.652 -0.0859 -0.508 -0.136 -0.384 -0.479
## cvpred -0.411 -0.366 -0.730 -0.1869 -0.607 -0.220 -0.463 -0.545
## yds -1.037 -0.945 -1.118 1.3784 0.138 -0.527 -0.697 -1.226
## CV residual -0.626 -0.578 -0.388 1.5653 0.745 -0.307 -0.235 -0.681
##
## Sum of squares = 43.7 Mean square = 0.91 n = 48
##
## fold 4
## Observations in test set: 48
## 2 5 7 9 14 17 24 37 51
## Predicted 0.789 0.544 0.699 -0.208 0.876 0.284 -0.0101 -0.571 0.373
## cvpred 0.696 0.517 0.639 -0.177 0.754 0.315 0.0890 -0.356 0.382
## yds 2.788 -0.361 2.391 0.479 1.728 0.196 1.6392 -2.593 0.971
## CV residual 2.092 -0.879 1.751 0.656 0.974 -0.119 1.5502 -2.237 0.589
## 52 53 55 62 74 76 85 86
## Predicted 0.792 -0.331 0.0146 -0.355 0.4463 0.0395 -0.1208 -0.0446
## cvpred 0.739 -0.208 0.0804 -0.174 0.4166 0.0925 -0.0465 -0.0328
## yds 0.632 0.496 -1.3099 -1.043 -0.0636 -0.1042 -0.5286 -0.5692
## CV residual -0.107 0.704 -1.3903 -0.869 -0.4802 -0.1967 -0.4821 -0.5365
## 87 92 93 99 102 111 118 128 129
## Predicted 0.359 0.342 0.183 0.308 0.226 -0.0438 -0.181 0.101 0.275
## cvpred 0.344 0.303 0.173 0.221 0.186 -0.0307 -0.134 0.110 0.301
## yds -1.400 -1.411 1.122 0.120 1.247 0.8924 -0.582 -0.441 -0.773
## CV residual -1.744 -1.714 0.950 -0.101 1.060 0.9231 -0.448 -0.552 -1.075
## 130 151 155 156 157 159 162 165
## Predicted 0.608 -0.293 0.124 0.0444 -0.0626 -0.552 0.0872 -0.359
## cvpred 0.539 -0.156 0.153 0.0955 -0.0611 -0.416 0.1370 -0.317
## yds 1.393 -0.326 1.378 0.0361 -0.2075 -0.107 -0.4930 0.901
## CV residual 0.854 -0.170 1.225 -0.0594 -0.1464 0.309 -0.6300 1.218
## 166 169 185 191 194 195 199 205
## Predicted -0.0381 -0.474 0.640 -0.325 -0.117 -0.207 -0.546 -0.287
## cvpred 0.0241 -0.347 0.529 -0.260 -0.107 -0.150 -0.442 -0.220
## yds -0.5471 -1.017 -1.154 -1.283 -0.694 -0.612 -1.438 -1.825
## CV residual -0.5712 -0.670 -1.682 -1.023 -0.587 -0.462 -0.995 -1.605
## 207 210 231 236 237 239
## Predicted -0.447 -0.0625 -0.567 -0.248 -0.1181 -0.690
## cvpred -0.289 -0.0109 -0.486 -0.214 -0.0927 -0.565
## yds -1.161 0.6291 0.801 -0.626 -1.3444 -1.881
## CV residual -0.872 0.6400 1.287 -0.412 -1.2517 -1.316
##
## Sum of squares = 51.2 Mean square = 1.07 n = 48
##
## fold 5
## Observations in test set: 48
## 1 10 13 25 27 29 30 34
## Predicted -0.0731 0.137 0.193 0.147 -0.07162 0.299 0.301 0.0591
## cvpred -0.1917 0.162 0.209 0.254 -0.07156 0.296 0.297 0.1140
## yds 1.2430 -0.313 -0.697 1.062 -0.07588 -0.605 2.066 -0.5877
## CV residual 1.4347 -0.475 -0.906 0.808 -0.00432 -0.901 1.769 -0.7017
## 38 40 46 47 48 50 65 67
## Predicted 0.513 0.336 0.0716 -0.203 -0.0896 -0.0173 -0.282 0.126
## cvpred 0.499 0.425 0.0618 -0.238 -0.0937 -0.0227 -0.396 0.103
## yds 1.575 -0.283 -0.2937 -0.676 -1.1684 0.4655 1.447 -0.347
## CV residual 1.077 -0.707 -0.3554 -0.438 -1.0747 0.4882 1.844 -0.450
## 82 96 98 101 103 104 112 114
## Predicted 0.512 0.0170 0.262 0.526 -0.0669 0.6505 0.671 -0.0734
## cvpred 0.500 0.0108 0.382 0.380 -0.0395 0.7827 0.724 -0.0794
## yds -1.052 -0.4524 0.185 0.988 0.0742 0.7497 -0.412 1.3771
## CV residual -1.552 -0.4632 -0.197 0.609 0.1137 -0.0331 -1.135 1.4565
## 121 144 145 161 168 170 172 174
## Predicted 0.1336 0.0998 -0.390 -0.141 -0.131 -0.404 0.0859 -0.0116
## cvpred 0.1990 0.1242 -0.471 -0.158 -0.274 -0.516 0.0881 -0.0914
## yds -0.0931 1.8274 1.402 0.318 -0.746 0.308 -0.5840 1.4510
## CV residual -0.2921 1.7032 1.872 0.476 -0.472 0.824 -0.6721 1.5424
## 176 181 184 187 190 193 204 211
## Predicted -0.3523 -0.146 0.100 -0.513 -0.232 -0.627 0.0231 0.0263
## cvpred -0.3846 -0.203 0.147 -0.587 -0.252 -0.692 -0.0188 0.0547
## yds -0.3170 0.597 -0.278 -1.059 -0.791 -1.140 -0.2777 -1.6421
## CV residual 0.0676 0.800 -0.425 -0.472 -0.539 -0.448 -0.2588 -1.6968
## 214 217 219 222 224 232 233 238
## Predicted -0.6448 -0.386 -0.710 -1.115 -0.511 -0.821 -0.271 -0.262
## cvpred -0.7763 -0.432 -0.795 -1.180 -0.583 -0.956 -0.308 -0.300
## yds -0.8301 -1.223 -0.641 -0.367 -0.235 -0.136 0.282 -0.601
## CV residual -0.0538 -0.791 0.154 0.812 0.348 0.820 0.590 -0.302
##
## Sum of squares = 39.1 Mean square = 0.81 n = 48
##
## Overall (Sum over all 48 folds)
## ms
## 0.891