# Load the quarterback statistics
qb_stats <- read.csv("../data/qb_stats.csv")

# Columns used as predictors (physical attributes plus college statistics)
predictors <- c(
  "height", "weight", "age", "c_avg_cmpp", "c_rate", "c_pct",
  "c_avg_inter", "c_avg_tds", "c_avg_yds", "c_numyrs", "c_avg_att"
)
college_stats <- qb_stats[, predictors]

# Response variable; single-bracket indexing keeps it a one-column data frame
# so the column name survives the cbind() below
cpct <- qb_stats["completion_percentage"]

# Combine response and predictors, drop rows with any NA, then center and
# scale every column
data.scaled.no_combine.for_cpct <- data.frame(
  scale(
    na.omit(
      cbind(cpct, college_stats)
    )
  )
)

# Fit the full linear model on all predictors
lm.scaled.no_combine.cpct <- lm(
  formula = completion_percentage ~ .,
  data = data.scaled.no_combine.for_cpct
)

# Stepwise AIC search (adding and dropping terms) starting from the full model
step_reg.scaled.no_combine.cpct <- stepAIC(
  lm.scaled.no_combine.cpct,
  direction = "both"
)
## Start: AIC=-31.24
## completion_percentage ~ height + weight + age + c_avg_cmpp +
## c_rate + c_pct + c_avg_inter + c_avg_tds + c_avg_yds + c_numyrs +
## c_avg_att
##
## Df Sum of Sq RSS AIC
## - height 1 0.01 187 -33.2
## - c_pct 1 0.01 187 -33.2
## - c_avg_tds 1 0.10 187 -33.1
## - c_avg_yds 1 0.44 187 -32.7
## - c_rate 1 0.56 187 -32.5
## - c_avg_inter 1 0.72 188 -32.3
## - c_numyrs 1 1.51 188 -31.3
## <none> 187 -31.2
## - weight 1 2.15 189 -30.5
## - c_avg_cmpp 1 2.85 190 -29.7
## - c_avg_att 1 4.48 191 -27.7
## - age 1 11.99 199 -18.6
##
## Step: AIC=-33.23
## completion_percentage ~ weight + age + c_avg_cmpp + c_rate +
## c_pct + c_avg_inter + c_avg_tds + c_avg_yds + c_numyrs +
## c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_pct 1 0.01 187 -35.2
## - c_avg_tds 1 0.10 187 -35.1
## - c_avg_yds 1 0.44 187 -34.7
## - c_rate 1 0.56 187 -34.5
## - c_avg_inter 1 0.73 188 -34.3
## - c_numyrs 1 1.54 188 -33.3
## <none> 187 -33.2
## - c_avg_cmpp 1 2.85 190 -31.7
## + height 1 0.01 187 -31.2
## - weight 1 3.25 190 -31.2
## - c_avg_att 1 4.47 191 -29.7
## - age 1 12.02 199 -20.5
##
## Step: AIC=-35.22
## completion_percentage ~ weight + age + c_avg_cmpp + c_rate +
## c_avg_inter + c_avg_tds + c_avg_yds + c_numyrs + c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_avg_tds 1 0.09 187 -37.1
## - c_avg_yds 1 0.46 187 -36.6
## - c_avg_inter 1 0.75 188 -36.3
## - c_rate 1 1.16 188 -35.8
## <none> 187 -35.2
## - c_numyrs 1 1.65 188 -35.1
## + c_pct 1 0.01 187 -33.2
## + height 1 0.01 187 -33.2
## - weight 1 3.29 190 -33.1
## - c_avg_att 1 5.50 192 -30.4
## - c_avg_cmpp 1 6.60 193 -29.0
## - age 1 12.48 199 -22.0
##
## Step: AIC=-37.1
## completion_percentage ~ weight + age + c_avg_cmpp + c_rate +
## c_avg_inter + c_avg_yds + c_numyrs + c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_avg_inter 1 0.83 188 -38.1
## - c_avg_yds 1 0.88 188 -38.0
## - c_rate 1 1.07 188 -37.8
## <none> 187 -37.1
## - c_numyrs 1 1.63 188 -37.1
## + c_avg_tds 1 0.09 187 -35.2
## + height 1 0.01 187 -35.1
## + c_pct 1 0.00 187 -35.1
## - weight 1 3.24 190 -35.0
## - c_avg_att 1 5.51 192 -32.2
## - c_avg_cmpp 1 6.51 193 -31.0
## - age 1 12.52 199 -23.8
##
## Step: AIC=-38.06
## completion_percentage ~ weight + age + c_avg_cmpp + c_rate +
## c_avg_yds + c_numyrs + c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_avg_yds 1 0.68 188 -39.2
## - c_rate 1 0.75 188 -39.1
## <none> 188 -38.1
## - c_numyrs 1 2.13 190 -37.4
## + c_avg_inter 1 0.83 187 -37.1
## + c_avg_tds 1 0.17 188 -36.3
## + c_pct 1 0.10 188 -36.2
## + height 1 0.03 188 -36.1
## - weight 1 4.04 192 -35.0
## - c_avg_att 1 9.09 197 -28.9
## - c_avg_cmpp 1 10.29 198 -27.5
## - age 1 12.41 200 -25.0
##
## Step: AIC=-39.21
## completion_percentage ~ weight + age + c_avg_cmpp + c_rate +
## c_numyrs + c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_rate 1 0.19 189 -41.0
## <none> 188 -39.2
## - c_numyrs 1 1.85 190 -38.9
## + c_avg_yds 1 0.68 188 -38.1
## + c_avg_inter 1 0.63 188 -38.0
## + c_avg_tds 1 0.56 188 -37.9
## + c_pct 1 0.48 188 -37.8
## + height 1 0.04 188 -37.3
## - weight 1 3.97 192 -36.3
## - c_avg_att 1 9.67 198 -29.4
## - c_avg_cmpp 1 11.56 200 -27.1
## - age 1 12.46 201 -26.1
##
## Step: AIC=-40.97
## completion_percentage ~ weight + age + c_avg_cmpp + c_numyrs +
## c_avg_att
##
## Df Sum of Sq RSS AIC
## <none> 189 -41.0
## - c_numyrs 1 1.73 190 -40.8
## + c_pct 1 0.66 188 -39.8
## + c_avg_inter 1 0.51 188 -39.6
## + c_rate 1 0.19 188 -39.2
## + c_avg_tds 1 0.13 188 -39.1
## + c_avg_yds 1 0.12 188 -39.1
## + height 1 0.04 188 -39.0
## - weight 1 3.89 192 -38.2
## - c_avg_att 1 11.47 200 -29.0
## - age 1 12.69 201 -27.6
## - c_avg_cmpp 1 14.97 204 -24.9
# Print coefficients and fit statistics of the model kept by the stepwise search
summary(step_reg.scaled.no_combine.cpct)
##
## Call:
## lm(formula = completion_percentage ~ weight + age + c_avg_cmpp +
## c_numyrs + c_avg_att, data = data.scaled.no_combine.for_cpct)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.066 -0.429 0.012 0.555 1.921
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.45e-16 5.89e-02 0.00 1.00000
## weight 1.42e-01 6.53e-02 2.18 0.03037 *
## age 2.39e-01 6.08e-02 3.93 0.00011 ***
## c_avg_cmpp 1.38e+00 3.24e-01 4.27 2.8e-05 ***
## c_numyrs 9.06e-02 6.23e-02 1.45 0.14721
## c_avg_att -1.20e+00 3.21e-01 -3.74 0.00023 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.905 on 230 degrees of freedom
## Multiple R-squared: 0.198, Adjusted R-squared: 0.18
## F-statistic: 11.3 on 5 and 230 DF, p-value: 8.76e-10
# Diagnostic plots for the stepwise-selected model
plot(step_reg.scaled.no_combine.cpct)

# Subset search over all predictors, keeping up to 10 models per subset size
leaps.scaled.no_combine.cpct <- regsubsets(
  completion_percentage ~ .,
  data = data.scaled.no_combine.for_cpct,
  nbest = 10
)

# Plot R-squared against subset size.
# The legend position is passed explicitly: by default subsets() places the
# legend with a mouse click, and when no click is available (e.g. when the
# script is knitted or run in batch mode) legend() receives no coordinates and
# stops with "invalid coordinate lengths".
subsets(
  leaps.scaled.no_combine.cpct,
  statistic = "rsq",
  legend = "bottomright"
)
# Cross-validate the stepwise-selected model using 5 folds
cv.lm(
  data.scaled.no_combine.for_cpct,
  form.lm = step_reg.scaled.no_combine.cpct,
  m = 5
)
## Analysis of Variance Table
##
## Response: completion_percentage
## Df Sum Sq Mean Sq F value Pr(>F)
## weight 1 10.2 10.18 12.42 0.00051 ***
## age 1 13.4 13.44 16.39 7e-05 ***
## c_avg_cmpp 1 9.5 9.46 11.54 0.00080 ***
## c_numyrs 1 1.9 1.91 2.33 0.12838
## c_avg_att 1 11.5 11.47 13.99 0.00023 ***
## Residuals 230 188.6 0.82
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Warning:
##
## As there is >1 explanatory variable, cross-validation predicted values for
## a fold are not a linear function of corresponding overall predicted
## values. Lines that are shown for the different folds are approximate
##
## fold 1
## Observations in test set: 47
## 3 15 18 24 26 35 37
## Predicted 1.289 1.031 0.349 0.212 0.5560 -0.0707 -0.0735
## cvpred 1.314 1.236 0.415 0.288 0.4713 -0.0744 0.0404
## completion_percentage 0.615 1.134 1.134 1.179 0.0661 0.4016 -8.1397
## CV residual -0.699 -0.102 0.718 0.892 -0.4053 0.4760 -8.1801
## 39 41 45 52 56 62 71
## Predicted 0.162 0.628 0.204 0.778 -0.0221 -0.771 -0.082
## cvpred 0.140 0.758 0.196 0.897 0.0278 -0.725 -0.118
## completion_percentage -0.285 0.524 1.164 1.988 0.1576 -0.239 0.386
## CV residual -0.425 -0.235 0.968 1.091 0.1297 0.486 0.504
## 72 76 79 81 85 88 89
## Predicted 0.196 0.1820 -0.184 -0.267 0.250 0.361 0.0808
## cvpred 0.144 0.1187 -0.125 -0.189 0.239 0.443 0.0929
## completion_percentage 1.607 0.2186 0.158 0.249 0.890 0.508 0.5694
## CV residual 1.463 0.0999 0.283 0.438 0.651 0.065 0.4765
## 103 116 118 119 121 133 136
## Predicted -0.0862 0.0352 -0.214 0.179 0.370 -0.490 -0.153
## cvpred -0.1672 0.1016 -0.180 0.114 0.282 -0.506 -0.165
## completion_percentage 0.7219 -0.6508 -0.315 1.027 1.439 -0.056 -1.413
## CV residual 0.8891 -0.7524 -0.135 0.913 1.157 0.450 -1.248
## 139 149 152 159 165 166 170
## Predicted -0.0623 -0.238 -0.0130 -0.585 0.2258 0.235 -0.4163
## cvpred -0.2553 -0.243 0.0993 -0.621 0.2447 0.320 -0.3834
## completion_percentage 0.3711 -0.666 -0.5135 -0.453 0.0661 -0.895 -0.4830
## CV residual 0.6264 -0.423 -0.6128 0.168 -0.1787 -1.215 -0.0996
## 173 176 189 194 200 207 213
## Predicted 0.0274 -0.6553 -0.784 -0.2291 0.0524 -0.580 -0.784
## cvpred -0.0179 -0.7368 -0.768 -0.4354 0.0488 -0.564 -0.818
## completion_percentage 0.9202 0.0661 0.569 0.0203 0.1881 -0.727 -2.069
## CV residual 0.9381 0.8028 1.337 0.4557 0.1393 -0.163 -1.252
## 222 225 226 232 239
## Predicted -1.403 -0.5906 -0.118 -0.718 -0.738
## cvpred -1.471 -0.6263 -0.179 -0.733 -0.810
## completion_percentage -0.392 -0.7118 -0.590 0.417 -0.254
## CV residual 1.079 -0.0855 -0.411 1.150 0.556
##
## Sum of squares = 91 Mean square = 1.94 n = 47
##
## fold 2
## Observations in test set: 48
## 4 5 7 14 17 33 36
## Predicted 0.235 0.376 0.10329 0.668 -0.2362 -0.324 0.279
## cvpred 0.146 0.317 0.00226 0.591 -0.3162 -0.311 0.193
## completion_percentage 0.752 1.378 1.01170 1.012 -0.0102 0.737 -0.208
## CV residual 0.606 1.060 1.00944 0.421 0.3060 1.048 -0.402
## 42 46 47 66 67 70 74
## Predicted 0.645 0.1648 -0.0734 -0.504 0.329 0.486 0.858
## cvpred 0.581 0.0668 -0.0952 -0.513 0.268 0.518 0.936
## completion_percentage 1.668 -0.2695 0.4626 0.234 1.698 -0.392 0.707
## CV residual 1.087 -0.3363 0.5578 0.747 1.430 -0.909 -0.229
## 78 99 100 102 114 122 126
## Predicted 0.280 0.9733 0.0149 0.698 -0.284 -0.424 0.05217
## cvpred 0.191 1.1609 0.0121 0.850 -0.363 -0.474 -0.00721
## completion_percentage 0.707 -0.0712 -0.1780 0.447 1.637 -0.102 -0.11698
## CV residual 0.516 -1.2321 -0.1901 -0.403 2.000 0.372 -0.10977
## 128 131 138 141 144 147 153
## Predicted 0.10471 -0.291 -0.0251 -0.0788 0.120 -0.0823 -0.475
## cvpred -0.00525 -0.220 -0.0338 -0.1150 0.084 -0.1546 -0.442
## completion_percentage -0.07122 -0.925 0.2338 -0.1627 0.920 0.5389 -0.270
## CV residual -0.06597 -0.706 0.2677 -0.0478 0.836 0.6934 0.172
## 156 158 161 163 164 171 172
## Predicted -0.1568 -0.106 -0.1453 -0.0807 0.226 -0.157 -0.1185
## cvpred -0.1316 -0.130 -0.1230 -0.1933 0.222 -0.161 -0.1240
## completion_percentage 0.0966 0.249 -0.0407 0.7677 0.844 -0.925 -0.1322
## CV residual 0.2281 0.379 0.0823 0.9610 0.622 -0.764 -0.0082
## 174 177 179 184 185 188 192
## Predicted -0.1180 -0.3507 -0.605 -0.2382 -0.738 -0.288 -0.514
## cvpred -0.0337 -0.3479 -0.706 -0.0858 -0.643 -0.332 -0.542
## completion_percentage 0.9507 -0.0255 0.112 -0.2085 -2.405 -1.505 -1.261
## CV residual 0.9844 0.3225 0.818 -0.1227 -1.762 -1.173 -0.719
## 201 203 206 209 215 216
## Predicted -0.389 -0.142 -0.243 0.0295 -0.211 -0.543
## cvpred -0.367 -0.149 -0.255 0.0941 -0.213 -0.574
## completion_percentage -0.941 -0.788 -1.124 -0.9559 -2.329 -0.758
## CV residual -0.573 -0.639 -0.869 -1.0500 -2.115 -0.184
##
## Sum of squares = 33 Mean square = 0.69 n = 48
##
## fold 3
## Observations in test set: 47
## 2 6 25 29 34 43 44
## Predicted 0.755 -0.0275 -0.396 0.553 0.089 0.4856 0.0208
## cvpred 0.734 -0.0292 -0.486 0.524 0.103 0.5403 0.0139
## completion_percentage 0.112 0.4626 1.012 0.417 0.417 -0.0407 -0.0102
## CV residual -0.622 0.4918 1.498 -0.107 0.314 -0.5810 -0.0241
## 48 50 51 55 59 60 65
## Predicted -0.199 0.532 0.496 0.218 0.2510 0.1122 0.246
## cvpred -0.159 0.565 0.556 0.250 0.2694 0.1847 0.291
## completion_percentage -1.215 1.179 1.149 -0.788 -0.0407 -0.0865 1.134
## CV residual -1.056 0.615 0.592 -1.038 -0.3101 -0.2712 0.842
## 77 80 86 87 90 94 101
## Predicted -0.149 0.600 0.532 0.4277 0.657 -0.238 0.79453
## cvpred -0.148 0.703 0.598 0.4413 0.745 -0.203 0.92693
## completion_percentage -0.346 0.585 1.118 0.0203 0.569 -0.056 0.92019
## CV residual -0.198 -0.118 0.521 -0.4210 -0.176 0.147 -0.00675
## 106 107 108 112 123 124 134
## Predicted 0.01727 -0.0855 -0.0893 0.334 0.0016 -0.0967 -0.279
## cvpred -0.00578 -0.0520 -0.0775 0.276 0.0204 -0.0701 -0.303
## completion_percentage 0.09656 -0.3610 -1.2304 0.569 0.1271 -0.0102 0.310
## CV residual 0.10234 -0.3090 -1.1529 0.293 0.1066 0.0599 0.613
## 137 148 157 178 180 181 182
## Predicted -0.0206 0.5234 0.1569 -0.606 -0.377 0.127 -0.709
## cvpred 0.0673 0.5498 0.2266 -0.553 -0.314 0.218 -0.665
## completion_percentage -1.8558 0.0508 0.2033 0.356 0.508 -0.834 -1.032
## CV residual -1.9230 -0.4990 -0.0233 0.909 0.822 -1.052 -0.367
## 186 193 195 199 202 210 217
## Predicted -0.337 -0.462 -0.472 -0.599 -0.600 -0.01361 -0.431
## cvpred -0.297 -0.447 -0.460 -0.563 -0.630 0.00521 -0.383
## completion_percentage -0.483 -1.276 -0.056 -1.352 -0.788 0.03555 -1.932
## CV residual -0.186 -0.829 0.404 -0.789 -0.158 0.03034 -1.549
## 218 219 234 236 238
## Predicted 0.208 -0.735 -0.348 -0.453 -0.6667
## cvpred 0.257 -0.689 -0.364 -0.451 -0.7133
## completion_percentage -0.864 -0.407 0.478 -0.925 0.0966
## CV residual -1.121 0.282 0.842 -0.474 0.8098
##
## Sum of squares = 23.1 Mean square = 0.49 n = 47
##
## fold 4
## Observations in test set: 47
## 9 10 11 13 16 22 27
## Predicted 0.142 0.251 0.679 0.661 0.283 0.470 0.244
## cvpred -0.142 0.427 0.857 0.629 0.237 0.382 0.134
## completion_percentage 1.088 0.142 -1.047 0.142 -0.132 1.561 1.439
## CV residual 1.230 -0.284 -1.904 -0.487 -0.370 1.179 1.304
## 28 31 32 38 49 53 58 64
## Predicted 0.104 0.118 0.952 0.878 -0.1122 -0.330 0.783 0.691
## cvpred -0.175 0.195 1.069 0.846 0.0174 -0.306 1.042 0.466
## completion_percentage 0.752 1.561 1.988 1.271 -0.3763 0.569 0.585 1.332
## CV residual 0.928 1.366 0.918 0.425 -0.3937 0.876 -0.457 0.866
## 68 75 84 92 93 95 97
## Predicted -0.108 0.782 0.622 0.276 0.1315 0.523 -0.1447
## cvpred 0.034 0.816 0.589 0.382 0.2615 0.678 -0.0879
## completion_percentage -0.498 0.142 0.386 -0.376 0.0355 0.249 -0.5440
## CV residual -0.532 -0.673 -0.203 -0.758 -0.2259 -0.429 -0.4562
## 104 105 109 110 113 117 120
## Predicted 0.144 -0.1194 -0.237 -0.347 -0.00504 0.273 0.090
## cvpred 0.464 0.0899 -0.217 -0.230 0.00743 0.125 0.355
## completion_percentage -0.605 -0.4678 -0.361 -0.529 0.31009 -0.758 -0.147
## CV residual -1.069 -0.5576 -0.144 -0.299 0.30266 -0.882 -0.502
## 125 127 130 143 145 146 162
## Predicted 0.421 0.285 0.396 -0.279 -0.3669 0.533 -0.376
## cvpred 0.535 0.464 0.620 -0.227 -0.4004 0.640 -0.258
## completion_percentage 0.661 0.829 -0.880 -0.361 0.0203 -0.681 -0.514
## CV residual 0.126 0.365 -1.500 -0.134 0.4207 -1.321 -0.256
## 183 190 196 198 211 214 224
## Predicted 0.1306 -0.238 -0.7433 0.177 -0.516 -0.6580 -0.624
## cvpred 0.0766 -0.346 -0.7986 0.227 -0.295 -0.8705 -0.524
## completion_percentage -0.0102 -0.925 -0.7423 -0.254 -1.139 -0.0712 -1.307
## CV residual -0.0868 -0.580 0.0562 -0.481 -0.844 0.7993 -0.783
## 227 229 233 237
## Predicted -0.546 -0.345 -0.844 -0.478
## cvpred -0.317 -0.212 -0.730 -0.350
## completion_percentage -1.856 -2.359 -1.139 -1.230
## CV residual -1.539 -2.148 -0.409 -0.880
##
## Sum of squares = 33.7 Mean square = 0.72 n = 47
##
## fold 5
## Observations in test set: 47
## 1 8 12 19 20 21 23
## Predicted 0.227 0.334 0.7609 0.07997 0.0471 0.4065 0.231
## cvpred 0.141 0.262 0.7028 0.06113 0.1041 0.3901 0.141
## completion_percentage 1.637 0.722 0.6609 0.06605 0.1728 -0.0407 1.530
## CV residual 1.497 0.460 -0.0419 0.00492 0.0687 -0.4308 1.390
## 30 40 54 57 61 69 73
## Predicted -0.319 1.004 1.068 0.1346 0.208 0.1051 0.902
## cvpred -0.327 0.906 1.038 0.0756 0.150 0.0148 0.825
## completion_percentage 0.478 1.118 1.149 0.8897 0.402 -0.1322 0.661
## CV residual 0.804 0.212 0.111 0.8141 0.252 -0.1471 -0.164
## 82 83 91 96 98 111 115
## Predicted -0.0771 0.989 -0.205 0.2196 0.517 -0.321 -0.0994
## cvpred -0.0176 1.075 -0.144 0.1358 0.424 -0.417 -0.1028
## completion_percentage -1.3982 1.424 -1.230 -0.0407 1.424 0.463 -0.5746
## CV residual -1.3806 0.348 -1.087 -0.1765 1.000 0.880 -0.4717
## 129 132 135 140 142 150 151
## Predicted 0.1069 -0.252 -0.423 0.4387 0.1122 -0.179 -0.501
## cvpred 0.0644 -0.330 -0.550 0.2925 -0.0428 -0.166 -0.368
## completion_percentage -0.0865 -0.147 -0.300 0.3711 0.0355 0.737 -0.590
## CV residual -0.1508 0.182 0.250 0.0786 0.0783 0.903 -0.222
## 154 155 160 167 168 169 187
## Predicted -0.160 0.331 0.00179 0.211 -0.0677 -0.571 -0.3693
## cvpred -0.242 0.387 -0.01843 0.194 -0.1901 -0.667 -0.4194
## completion_percentage -0.575 1.027 0.64564 -0.300 0.5694 -0.483 -0.5135
## CV residual -0.332 0.640 0.66407 -0.494 0.7595 0.184 -0.0942
## 191 204 205 208 212 220 221
## Predicted -0.324 0.375 -0.425 -0.3678 -0.218 -0.417 -0.1458
## cvpred -0.377 0.413 -0.357 -0.3698 -0.275 -0.493 -0.1924
## completion_percentage 0.127 0.966 -1.963 -0.3458 -0.712 -0.285 -0.2390
## CV residual 0.504 0.553 -1.605 0.0241 -0.437 0.208 -0.0466
## 223 228 230 235 240
## Predicted -0.612 -0.265 -0.668 -0.349 -0.622
## cvpred -0.679 -0.290 -0.729 -0.287 -0.655
## completion_percentage -0.788 0.661 1.240 -0.544 -0.468
## CV residual -0.109 0.951 1.969 -0.257 0.187
##
## Sum of squares = 22.4 Mean square = 0.48 n = 47
##
## Overall (Sum over all 47 folds)
## ms
## 0.861