# Load the quarterback data set (path is relative to the working directory).
qb_stats <- read.csv("../data/qb_stats.csv")

# Response variable, kept as a one-column data frame so that cbind() below
# carries the column name "games_started" into the modelling data.
games_started <- qb_stats["games_started"]

# Columns used as predictors in the regression.
predictors <- c(
  "height", "weight", "age",
  "c_avg_cmpp", "c_rate", "c_pct", "c_avg_inter",
  "c_avg_tds", "c_avg_yds", "c_numyrs", "c_avg_att"
)
college_stats <- qb_stats[predictors]

# Modelling data: response first, then the predictors. Rows containing any NA
# are dropped, and every column is then transformed as log(x + 0.1).
data.log.no_combine.for_games_started <- data.frame(
  log(
    na.omit(cbind(games_started, college_stats)) + 0.1
  )
)

# Full linear model: games_started regressed on all remaining columns.
lm.log.no_combine.games_started <- lm(
  formula = games_started ~ .,
  data = data.log.no_combine.for_games_started
)

# Stepwise AIC search starting from the full model; terms may be both dropped
# and re-added at each step.
step_reg.log.no_combine.games_started <- stepAIC(
  lm.log.no_combine.games_started,
  direction = "both"
)
## Start: AIC=-627.8
## games_started ~ height + weight + age + c_avg_cmpp + c_rate +
## c_pct + c_avg_inter + c_avg_tds + c_avg_yds + c_numyrs +
## c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_avg_tds 1 0.002 15.9 -630
## - c_numyrs 1 0.003 15.9 -630
## - c_avg_inter 1 0.041 15.9 -629
## - c_rate 1 0.075 15.9 -629
## - age 1 0.098 16.0 -628
## - height 1 0.111 16.0 -628
## <none> 15.9 -628
## - c_pct 1 0.136 16.0 -628
## - c_avg_cmpp 1 0.159 16.0 -627
## - c_avg_yds 1 0.164 16.0 -627
## - c_avg_att 1 0.193 16.1 -627
## - weight 1 0.561 16.4 -621
##
## Step: AIC=-629.8
## games_started ~ height + weight + age + c_avg_cmpp + c_rate +
## c_pct + c_avg_inter + c_avg_yds + c_numyrs + c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_numyrs 1 0.002 15.9 -632
## - c_avg_inter 1 0.051 15.9 -631
## - age 1 0.099 16.0 -630
## - height 1 0.112 16.0 -630
## <none> 15.9 -630
## - c_pct 1 0.153 16.0 -629
## - c_avg_cmpp 1 0.171 16.1 -629
## - c_avg_att 1 0.198 16.1 -629
## - c_rate 1 0.205 16.1 -629
## + c_avg_tds 1 0.002 15.9 -628
## - c_avg_yds 1 0.290 16.2 -627
## - weight 1 0.575 16.4 -623
##
## Step: AIC=-631.8
## games_started ~ height + weight + age + c_avg_cmpp + c_rate +
## c_pct + c_avg_inter + c_avg_yds + c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_avg_inter 1 0.049 15.9 -633
## - age 1 0.098 16.0 -632
## - height 1 0.111 16.0 -632
## <none> 15.9 -632
## - c_pct 1 0.167 16.1 -631
## - c_avg_cmpp 1 0.185 16.1 -631
## - c_rate 1 0.205 16.1 -631
## - c_avg_att 1 0.214 16.1 -631
## + c_numyrs 1 0.002 15.9 -630
## + c_avg_tds 1 0.001 15.9 -630
## - c_avg_yds 1 0.291 16.2 -629
## - weight 1 0.580 16.5 -625
##
## Step: AIC=-633
## games_started ~ height + weight + age + c_avg_cmpp + c_rate +
## c_pct + c_avg_yds + c_avg_att
##
## Df Sum of Sq RSS AIC
## - age 1 0.084 16.0 -634
## - height 1 0.116 16.0 -633
## <none> 15.9 -633
## - c_rate 1 0.169 16.1 -632
## + c_avg_inter 1 0.049 15.9 -632
## - c_pct 1 0.248 16.2 -631
## + c_avg_tds 1 0.013 15.9 -631
## - c_avg_yds 1 0.257 16.2 -631
## - c_avg_cmpp 1 0.264 16.2 -631
## + c_numyrs 1 0.000 15.9 -631
## - c_avg_att 1 0.285 16.2 -631
## - weight 1 0.691 16.6 -625
##
## Step: AIC=-633.7
## games_started ~ height + weight + c_avg_cmpp + c_rate + c_pct +
## c_avg_yds + c_avg_att
##
## Df Sum of Sq RSS AIC
## - height 1 0.133 16.1 -634
## <none> 16.0 -634
## - c_rate 1 0.153 16.2 -633
## + age 1 0.084 15.9 -633
## + c_avg_inter 1 0.035 16.0 -632
## - c_avg_yds 1 0.251 16.3 -632
## - c_pct 1 0.259 16.3 -632
## + c_avg_tds 1 0.007 16.0 -632
## + c_numyrs 1 0.000 16.0 -632
## - c_avg_cmpp 1 0.272 16.3 -632
## - c_avg_att 1 0.292 16.3 -631
## - weight 1 0.828 16.8 -624
##
## Step: AIC=-633.8
## games_started ~ weight + c_avg_cmpp + c_rate + c_pct + c_avg_yds +
## c_avg_att
##
## Df Sum of Sq RSS AIC
## <none> 16.1 -634
## + height 1 0.133 16.0 -634
## - c_rate 1 0.143 16.3 -634
## + age 1 0.101 16.0 -633
## - c_pct 1 0.215 16.4 -633
## - c_avg_cmpp 1 0.226 16.4 -632
## + c_avg_inter 1 0.039 16.1 -632
## - c_avg_att 1 0.245 16.4 -632
## - c_avg_yds 1 0.248 16.4 -632
## + c_avg_tds 1 0.006 16.1 -632
## + c_numyrs 1 0.000 16.1 -632
## - weight 1 0.750 16.9 -625
# Print the call, residual quantiles, coefficient table and fit statistics of
# the model selected by the stepwise search.
summary(step_reg.log.no_combine.games_started)
##
## Call:
## lm(formula = games_started ~ weight + c_avg_cmpp + c_rate + c_pct +
## c_avg_yds + c_avg_att, data = data.log.no_combine.for_games_started)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.517 -0.209 0.000 0.209 0.535
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.962 23.918 1.67 0.0961 .
## weight 0.931 0.283 3.29 0.0012 **
## c_avg_cmpp 9.281 5.141 1.81 0.0723 .
## c_rate -0.540 0.376 -1.44 0.1518
## c_pct -8.892 5.043 -1.76 0.0792 .
## c_avg_yds 0.616 0.326 1.89 0.0597 .
## c_avg_att -9.901 5.262 -1.88 0.0611 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.263 on 233 degrees of freedom
## Multiple R-squared: 0.0849, Adjusted R-squared: 0.0613
## F-statistic: 3.6 on 6 and 233 DF, p-value: 0.00196
# Diagnostic plots for the stepwise-selected model.
plot(step_reg.log.no_combine.games_started)

# Subset search over the same modelling data, keeping the 10 best models of
# each size.
leaps.log.no_combine.games_started <- regsubsets(
  games_started ~ .,
  data = data.log.no_combine.for_games_started,
  nbest = 10
)

# Plot R-squared against subset size. The legend position is given explicitly:
# when it is omitted, subsets() tries to place the legend interactively, which
# fails in a non-interactive run with "Error: invalid coordinate lengths".
subsets(
  leaps.log.no_combine.games_started,
  statistic = "rsq",
  legend = "bottomright"
)

# 5-fold cross-validation of the stepwise-selected model.
# NOTE(review): the data frame is passed positionally because the name of
# cv.lm()'s first argument appears to differ between DAAG releases
# (`df` vs `data`) -- confirm against the installed version.
cv.lm(
  data.log.no_combine.for_games_started,
  step_reg.log.no_combine.games_started,
  m = 5
)
## Analysis of Variance Table
##
## Response: games_started
## Df Sum Sq Mean Sq F value Pr(>F)
## weight 1 1.02 1.019 14.70 0.00016 ***
## c_avg_cmpp 1 0.00 0.005 0.07 0.79231
## c_rate 1 0.12 0.115 1.67 0.19817
## c_pct 1 0.02 0.016 0.23 0.62930
## c_avg_yds 1 0.10 0.097 1.40 0.23847
## c_avg_att 1 0.25 0.245 3.54 0.06114 .
## Residuals 233 16.15 0.069
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Warning:
##
## As there is >1 explanatory variable, cross-validation predicted values for
## a fold are not a linear function of corresponding overall predicted
## values. Lines that are shown for the different folds are approximate
##
## fold 1
## Observations in test set: 48
## 3 15 25 27 29 34 38 45 46
## Predicted 2.446 2.383 2.68 2.346 2.418 2.396 2.449 2.4024 2.391
## cvpred 2.408 2.309 1.58 2.327 2.409 2.425 2.418 2.3967 2.406
## games_started 2.715 2.092 2.78 2.092 1.960 2.208 2.779 2.3125 2.715
## CV residual 0.307 -0.217 1.20 -0.235 -0.448 -0.217 0.361 -0.0842 0.309
## 47 48 50 61 63 65 67 71 75
## Predicted 2.310 2.346 2.311 2.405 2.4321 2.323 2.397 2.407 2.483
## cvpred 2.314 2.386 2.303 2.437 2.4118 2.294 2.388 2.419 2.483
## games_started 2.208 1.960 2.092 2.092 2.4932 2.779 1.960 2.779 2.715
## CV residual -0.106 -0.426 -0.211 -0.346 0.0814 0.485 -0.427 0.359 0.232
## 78 94 96 97 98 101 110 113 114
## Predicted 2.492 2.354 2.3725 2.325 2.415 2.342 2.4739 2.32 2.368
## cvpred 2.528 2.362 2.3802 2.385 2.460 2.253 2.4890 2.31 2.384
## games_started 2.779 2.208 2.3125 1.960 2.092 2.407 2.4932 2.57 2.573
## CV residual 0.251 -0.154 -0.0677 -0.425 -0.369 0.154 0.0042 0.26 0.189
## 135 138 139 150 158 171 172 174 176
## Predicted 2.2220 2.364 2.3929 2.404 2.244 2.363 2.410 2.290 2.3504
## cvpred 2.1979 2.350 2.4216 2.409 2.218 2.321 2.409 2.286 2.4034
## games_started 2.2083 2.493 2.4932 2.646 2.779 2.208 2.092 2.779 2.4069
## CV residual 0.0104 0.143 0.0716 0.237 0.561 -0.113 -0.318 0.493 0.0035
## 177 183 184 190 193 200 211 214
## Predicted 2.357 2.412 2.4015 2.3510 2.211 2.3163 2.403 2.187
## cvpred 2.431 2.443 2.4265 2.3757 2.240 2.2935 2.471 2.164
## games_started 2.313 1.960 2.4069 2.3125 2.092 2.2083 1.960 1.960
## CV residual -0.118 -0.483 -0.0195 -0.0632 -0.148 -0.0852 -0.511 -0.204
## 215 217 219 231
## Predicted 2.289 2.275 2.266 2.239
## cvpred 2.355 2.316 2.336 2.254
## games_started 1.960 2.208 2.208 2.646
## CV residual -0.395 -0.107 -0.128 0.393
##
## Sum of squares = 5.28 Mean square = 0.11 n = 48
##
## fold 2
## Observations in test set: 48
## 11 12 18 22 26 33 36 42 43
## Predicted 2.514 2.418 2.387 2.425 2.589 2.312 2.403 2.404 2.475
## cvpred 2.532 2.435 2.405 2.448 2.566 2.239 2.423 2.432 2.495
## games_started 2.407 2.208 2.573 2.779 2.715 2.493 2.092 2.573 1.960
## CV residual -0.126 -0.227 0.168 0.331 0.149 0.254 -0.331 0.141 -0.535
## 44 49 57 58 70 80 90 100
## Predicted 2.405 2.395 2.359 2.5296 2.470 2.36559 2.352 2.5024
## cvpred 2.393 2.437 2.346 2.5379 2.477 2.41065 2.387 2.4953
## games_started 2.092 1.960 2.715 2.5726 2.208 2.40695 1.960 2.5726
## CV residual -0.302 -0.477 0.369 0.0347 -0.269 -0.00371 -0.427 0.0773
## 103 105 106 108 116 123 125 128
## Predicted 2.3499 2.4256 2.456 2.383 2.390 2.375 2.453 2.4191
## cvpred 2.3420 2.3908 2.452 2.411 2.413 2.395 2.468 2.4425
## games_started 2.3125 2.4069 2.779 2.092 1.960 2.779 2.646 2.4069
## CV residual -0.0294 0.0162 0.327 -0.319 -0.453 0.384 0.178 -0.0356
## 137 143 144 146 147 149 152 160
## Predicted 2.329 2.473 2.405 2.4350 2.301 2.370 2.392 2.339
## cvpred 2.343 2.466 2.402 2.4623 2.334 2.373 2.419 2.315
## games_started 2.208 2.779 2.779 2.4069 2.208 1.960 2.313 2.779
## CV residual -0.135 0.313 0.377 -0.0553 -0.126 -0.413 -0.107 0.464
## 161 167 175 179 188 191 197 201 202
## Predicted 2.3744 2.282 2.339 2.323 2.433 2.250 2.38630 2.33 2.314
## cvpred 2.3736 2.344 2.333 2.319 2.439 2.238 2.40526 2.31 2.315
## games_started 2.3125 1.960 1.960 2.493 2.646 2.092 2.40695 2.57 1.960
## CV residual -0.0611 -0.383 -0.373 0.174 0.207 -0.146 0.00168 0.26 -0.355
## 210 218 224 225 226 240
## Predicted 2.345 2.44155 2.245 2.2798 2.392 2.2743
## cvpred 2.351 2.40990 2.218 2.2982 2.416 2.2716
## games_started 2.573 2.40695 2.493 2.3125 2.208 2.2083
## CV residual 0.221 -0.00296 0.276 0.0144 -0.207 -0.0633
##
## Sum of squares = 3.44 Mean square = 0.07 n = 48
##
## fold 3
## Observations in test set: 48
## 6 8 16 21 28 31 32 35
## Predicted 2.3958 2.382 2.3782 2.417 2.312 2.387 2.4438 2.332
## cvpred 2.3848 2.376 2.3662 2.403 2.315 2.376 2.4184 2.339
## games_started 2.4069 2.779 2.3125 2.208 2.092 2.779 2.4069 1.960
## CV residual 0.0222 0.403 -0.0536 -0.195 -0.223 0.403 -0.0115 -0.379
## 39 52 54 60 68 73 79 81 83
## Predicted 2.4881 2.5093 2.322 2.325 2.362 2.443 2.341 2.38 2.61
## cvpred 2.4783 2.4845 2.321 2.332 2.368 2.422 2.342 2.39 2.58
## games_started 2.5726 2.5726 2.208 1.960 2.493 2.573 2.779 2.21 2.78
## CV residual 0.0943 0.0881 -0.112 -0.372 0.125 0.151 0.437 -0.18 0.20
## 84 89 91 92 107 109 115 117 119
## Predicted 2.387 2.4013 2.492 2.44 2.386 2.368 2.42 2.360 2.354
## cvpred 2.367 2.3980 2.486 2.44 2.394 2.377 2.42 2.353 2.339
## games_started 2.646 2.4932 2.208 1.96 2.208 2.715 2.57 1.960 2.573
## CV residual 0.279 0.0952 -0.278 -0.48 -0.186 0.337 0.15 -0.393 0.234
## 120 122 126 142 148 154 166 169
## Predicted 2.4200 2.337 2.397 2.3828 2.429 2.381 2.3734 2.2585
## cvpred 2.4269 2.340 2.401 2.3834 2.421 2.423 2.3666 2.2742
## games_started 2.4069 2.646 1.960 2.4069 2.646 1.960 2.3125 2.2083
## CV residual -0.0199 0.306 -0.441 0.0235 0.225 -0.463 -0.0541 -0.0659
## 170 180 187 189 198 206 212 213
## Predicted 2.30169 2.3052 2.2547 2.171 2.356 2.3546 2.301 2.226
## cvpred 2.31442 2.3294 2.2638 2.206 2.358 2.3564 2.305 2.266
## games_started 2.31254 2.4069 2.3125 1.960 2.092 2.3125 2.407 1.960
## CV residual -0.00188 0.0775 0.0488 -0.245 -0.266 -0.0438 0.102 -0.306
## 221 227 232 234 237 239
## Predicted 2.374 2.344 2.2387 2.337 2.4644 2.25
## cvpred 2.388 2.364 2.2550 2.353 2.4793 2.27
## games_started 2.646 2.313 2.3125 2.646 2.4069 1.96
## CV residual 0.258 -0.051 0.0575 0.293 -0.0723 -0.31
##
## Sum of squares = 2.87 Mean square = 0.06 n = 48
##
## fold 4
## Observations in test set: 48
## 2 4 9 19 20 23 41 51 53
## Predicted 2.499 2.39 2.311 2.423 2.467 2.51 2.4414 2.450 2.303
## cvpred 2.492 2.35 2.318 2.387 2.456 2.41 2.4364 2.442 2.278
## games_started 2.715 2.78 2.493 2.715 2.208 2.71 2.4069 2.573 2.573
## CV residual 0.222 0.43 0.175 0.328 -0.247 0.31 -0.0294 0.131 0.295
## 59 69 72 74 76 77 85 111 121
## Predicted 2.3914 2.386 2.433 2.403 2.393 2.380 2.3231 2.384 2.340
## cvpred 2.3711 2.363 2.404 2.402 2.365 2.378 2.2904 2.385 2.319
## games_started 2.4069 2.779 2.646 2.092 2.493 2.092 2.2083 2.779 2.493
## CV residual 0.0358 0.416 0.242 -0.311 0.128 -0.286 -0.0821 0.394 0.174
## 124 127 129 130 131 132 133 134 136
## Predicted 2.3341 2.448 2.448 2.509 2.208 2.323 2.2784 2.391 2.452
## cvpred 2.3144 2.435 2.438 2.499 2.125 2.301 2.2669 2.372 2.457
## games_started 2.4069 2.646 2.092 2.715 2.573 2.573 2.2083 2.573 2.092
## CV residual 0.0925 0.212 -0.347 0.216 0.447 0.272 -0.0586 0.201 -0.365
## 140 156 157 163 164 168 173 178 181
## Predicted 2.446 2.37 2.344 2.389 2.383 2.326 2.359 2.251 2.332
## cvpred 2.438 2.36 2.304 2.371 2.376 2.325 2.343 2.238 2.337
## games_started 2.493 2.21 2.407 2.208 2.573 1.960 2.715 2.779 2.646
## CV residual 0.055 -0.15 0.103 -0.163 0.197 -0.365 0.371 0.541 0.309
## 182 194 196 203 204 208 209 222
## Predicted 2.3188 2.2878 2.286 2.422 2.3935 2.366 2.2417 2.1105
## cvpred 2.3271 2.2519 2.280 2.431 2.3908 2.351 2.1920 2.0652
## games_started 2.4069 2.2083 2.407 2.646 2.3125 2.208 2.2083 2.0919
## CV residual 0.0798 -0.0437 0.127 0.215 -0.0783 -0.143 0.0163 0.0266
## 223 228 230 236
## Predicted 2.2878 2.312 2.2939 2.3576
## cvpred 2.2588 2.269 2.2853 2.3297
## games_started 2.2083 1.960 2.3125 2.3125
## CV residual -0.0505 -0.309 0.0273 -0.0172
##
## Sum of squares = 2.88 Mean square = 0.06 n = 48
##
## fold 5
## Observations in test set: 48
## 1 5 7 10 13 14 17 24 30
## Predicted 2.315 2.446 2.576 2.420 2.413 2.51 2.457 2.39 2.408
## cvpred 2.314 2.459 2.601 2.432 2.424 2.53 2.465 2.40 2.413
## games_started 2.779 1.960 2.779 2.313 2.092 2.78 2.313 2.78 2.715
## CV residual 0.465 -0.499 0.178 -0.119 -0.332 0.25 -0.153 0.38 0.301
## 37 40 55 56 62 64 66 82
## Predicted 2.266 2.40 2.378 2.393 2.2937 2.4257 2.440 2.4124
## cvpred 2.267 2.40 2.384 2.398 2.2971 2.4273 2.440 2.4170
## games_started 2.208 2.09 1.960 1.960 2.2083 2.4069 2.715 2.4069
## CV residual -0.059 -0.31 -0.424 -0.438 -0.0888 -0.0203 0.275 -0.0101
## 86 87 88 93 95 99 102 104 112
## Predicted 2.29 2.402 2.453 2.428 2.463 2.404 2.36 2.5244 2.477
## cvpred 2.28 2.417 2.465 2.430 2.472 2.406 2.36 2.5438 2.492
## games_started 1.96 1.960 2.779 2.779 2.573 1.960 2.78 2.5726 1.960
## CV residual -0.32 -0.457 0.314 0.349 0.101 -0.446 0.42 0.0288 -0.532
## 118 141 145 151 153 155 159 162 165
## Predicted 2.313 2.2840 2.284 2.3009 2.298 2.398 2.259 2.406 2.276
## cvpred 2.303 2.2735 2.276 2.2947 2.295 2.396 2.251 2.411 2.266
## games_started 2.092 2.2083 2.779 2.3125 2.407 2.573 2.573 2.092 2.493
## CV residual -0.211 -0.0652 0.503 0.0179 0.112 0.176 0.322 -0.319 0.227
## 185 186 192 195 199 205 207 216
## Predicted 2.3957 2.3600 2.3754 2.341 2.30 2.385 2.32172 2.291
## cvpred 2.3993 2.3608 2.3723 2.341 2.29 2.371 2.31999 2.290
## games_started 2.3125 2.4069 2.4069 2.092 2.65 1.960 2.31254 2.646
## CV residual -0.0868 0.0461 0.0347 -0.249 0.36 -0.411 -0.00745 0.356
## 220 229 233 235 238
## Predicted 2.175 2.226 2.346 2.291 2.4064
## cvpred 2.166 2.225 2.340 2.283 2.3996
## games_started 2.313 1.960 2.646 2.092 2.4932
## CV residual 0.147 -0.265 0.307 -0.191 0.0936
##
## Sum of squares = 4.03 Mean square = 0.08 n = 48
##
## Overall (Sum over all 48 folds)
## ms
## 0.0771