# Load the quarterback data set
qb_stats <- read.csv("../data/qb_stats.csv")

# Predictor columns: height/weight/age plus the college (c_*) statistics
predictors <- c(
  "height", "weight", "age",
  "c_avg_cmpp", "c_rate", "c_pct", "c_avg_inter",
  "c_avg_tds", "c_avg_yds", "c_numyrs", "c_avg_att"
)
college_stats <- qb_stats[, predictors]

# Response variable, kept as a one-column data frame so it can be
# column-bound to the predictors below
ints <- qb_stats["ints"]

# Modelling frame: response first, rows containing any NA dropped, then
# log(x + 0.1) applied to every column (the 0.1 offset presumably keeps
# zero values finite under the log)
data.log.no_combine.for_ints <- local({
  complete_rows <- na.omit(cbind(ints, college_stats))
  data.frame(log(complete_rows + 0.1))
})

# Full linear model: ints regressed on every predictor in the frame
lm.log.no_combine.ints <- lm(
  formula = ints ~ .,
  data = data.log.no_combine.for_ints
)

# Stepwise AIC search, allowing terms to be both dropped and re-added
step_reg.log.no_combine.ints <- stepAIC(
  lm.log.no_combine.ints,
  direction = "both"
)
## Start: AIC=-293.4
## ints ~ height + weight + age + c_avg_cmpp + c_rate + c_pct +
## c_avg_inter + c_avg_tds + c_avg_yds + c_numyrs + c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_avg_cmpp 1 0.000 60.9 -295
## - c_pct 1 0.001 60.9 -295
## - c_avg_att 1 0.006 60.9 -295
## - c_numyrs 1 0.013 60.9 -295
## - weight 1 0.014 60.9 -295
## - c_avg_tds 1 0.097 61.0 -295
## - c_rate 1 0.159 61.0 -295
## - c_avg_inter 1 0.231 61.1 -295
## - height 1 0.287 61.2 -294
## - c_avg_yds 1 0.400 61.3 -294
## <none> 60.9 -293
## - age 1 1.434 62.3 -290
##
## Step: AIC=-295.4
## ints ~ height + weight + age + c_rate + c_pct + c_avg_inter +
## c_avg_tds + c_avg_yds + c_numyrs + c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_numyrs 1 0.013 60.9 -297
## - weight 1 0.014 60.9 -297
## - c_pct 1 0.066 60.9 -297
## - c_avg_tds 1 0.099 61.0 -297
## - c_rate 1 0.161 61.0 -297
## - c_avg_inter 1 0.266 61.1 -296
## - height 1 0.289 61.2 -296
## - c_avg_yds 1 0.400 61.3 -296
## <none> 60.9 -295
## - c_avg_att 1 0.620 61.5 -295
## + c_avg_cmpp 1 0.000 60.9 -293
## - age 1 1.437 62.3 -292
##
## Step: AIC=-297.4
## ints ~ height + weight + age + c_rate + c_pct + c_avg_inter +
## c_avg_tds + c_avg_yds + c_avg_att
##
## Df Sum of Sq RSS AIC
## - weight 1 0.010 60.9 -299
## - c_pct 1 0.056 60.9 -299
## - c_avg_tds 1 0.089 61.0 -299
## - c_rate 1 0.149 61.0 -299
## - c_avg_inter 1 0.302 61.2 -298
## - height 1 0.308 61.2 -298
## - c_avg_yds 1 0.387 61.3 -298
## <none> 60.9 -297
## - c_avg_att 1 0.607 61.5 -297
## + c_numyrs 1 0.013 60.9 -295
## + c_avg_cmpp 1 0.000 60.9 -295
## - age 1 1.434 62.3 -294
##
## Step: AIC=-299.4
## ints ~ height + age + c_rate + c_pct + c_avg_inter + c_avg_tds +
## c_avg_yds + c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_pct 1 0.064 61.0 -301
## - c_avg_tds 1 0.098 61.0 -301
## - c_rate 1 0.163 61.1 -301
## - c_avg_inter 1 0.294 61.2 -300
## - c_avg_yds 1 0.409 61.3 -300
## <none> 60.9 -299
## - height 1 0.596 61.5 -299
## - c_avg_att 1 0.629 61.5 -299
## + weight 1 0.010 60.9 -297
## + c_numyrs 1 0.009 60.9 -297
## + c_avg_cmpp 1 0.000 60.9 -297
## - age 1 1.557 62.5 -295
##
## Step: AIC=-301.1
## ints ~ height + age + c_rate + c_avg_inter + c_avg_tds + c_avg_yds +
## c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_avg_tds 1 0.033 61.0 -303
## - c_rate 1 0.158 61.1 -302
## - c_avg_yds 1 0.503 61.5 -301
## <none> 61.0 -301
## - height 1 0.700 61.7 -300
## - c_avg_inter 1 0.957 61.9 -299
## + c_avg_cmpp 1 0.064 60.9 -299
## + c_pct 1 0.064 60.9 -299
## + weight 1 0.018 60.9 -299
## - c_avg_att 1 1.039 62.0 -299
## + c_numyrs 1 0.001 61.0 -299
## - age 1 1.530 62.5 -297
##
## Step: AIC=-303
## ints ~ height + age + c_rate + c_avg_inter + c_avg_yds + c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_rate 1 0.125 61.1 -304
## - c_avg_yds 1 0.470 61.5 -303
## <none> 61.0 -303
## - height 1 0.685 61.7 -302
## + c_avg_tds 1 0.033 61.0 -301
## + weight 1 0.019 61.0 -301
## + c_numyrs 1 0.000 61.0 -301
## + c_avg_cmpp 1 0.000 61.0 -301
## + c_pct 1 0.000 61.0 -301
## - c_avg_inter 1 1.071 62.1 -301
## - c_avg_att 1 1.094 62.1 -301
## - age 1 1.707 62.7 -298
##
## Step: AIC=-304.5
## ints ~ height + age + c_avg_inter + c_avg_yds + c_avg_att
##
## Df Sum of Sq RSS AIC
## <none> 61.1 -304
## - height 1 0.645 61.8 -304
## + c_rate 1 0.125 61.0 -303
## + c_pct 1 0.056 61.1 -303
## + c_avg_cmpp 1 0.055 61.1 -303
## + weight 1 0.021 61.1 -303
## + c_numyrs 1 0.001 61.1 -302
## + c_avg_tds 1 0.001 61.1 -302
## - c_avg_yds 1 1.048 62.2 -302
## - age 1 1.728 62.8 -300
## - c_avg_att 1 2.424 63.5 -297
## - c_avg_inter 1 3.097 64.2 -295
# Print the coefficient table and fit statistics of the model that the
# stepwise search settled on
summary(step_reg.log.no_combine.ints)
##
## Call:
## lm(formula = ints ~ height + age + c_avg_inter + c_avg_yds +
## c_avg_att, data = data.log.no_combine.for_ints)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.408 -0.260 0.090 0.320 0.818
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.7994 7.1189 -0.67 0.50088
## height 2.5196 1.6207 1.55 0.12142
## age -0.9759 0.3835 -2.54 0.01159 *
## c_avg_inter 0.3280 0.0963 3.41 0.00078 ***
## c_avg_yds 0.4740 0.2392 1.98 0.04867 *
## c_avg_att -0.8757 0.2906 -3.01 0.00287 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.517 on 229 degrees of freedom
## Multiple R-squared: 0.09, Adjusted R-squared: 0.0701
## F-statistic: 4.53 on 5 and 229 DF, p-value: 0.000588
# Draw the diagnostic plots for the stepwise-selected model
# (plot() dispatches on the fitted model object)
plot(step_reg.log.no_combine.ints)
# Best-subsets search over the same response and predictors, keeping the
# 10 best models of each subset size.
# NOTE(review): regsubsets() is presumably leaps::regsubsets and subsets()
# car::subsets -- confirm the packages are attached earlier in the document.
leaps.log.no_combine.ints <- regsubsets(ints ~ .,
  data = data.log.no_combine.for_ints,
  nbest = 10
)
# Plot R-squared against subset size. By default subsets() positions its
# legend with a mouse click; in a non-interactive render no coordinates are
# returned and legend() aborts with "invalid coordinate lengths".
# legend = FALSE suppresses the interactive legend so the plot is produced.
subsets(leaps.log.no_combine.ints, statistic = "rsq", legend = FALSE)
# 5-fold cross-validation of the stepwise-selected model.
# The data frame and the model are passed positionally: the first argument of
# cv.lm() is named `df` in older DAAG releases and `data` in newer ones, so
# positional matching works with either signature.
# NOTE(review): confirm against the installed DAAG version.
cv.lm(data.log.no_combine.for_ints, step_reg.log.no_combine.ints, m = 5)
## Analysis of Variance Table
##
## Response: ints
## Df Sum Sq Mean Sq F value Pr(>F)
## height 1 0.1 0.141 0.53 0.4676
## age 1 1.4 1.388 5.20 0.0235 *
## c_avg_inter 1 0.0 0.001 0.00 0.9558
## c_avg_yds 1 2.1 2.090 7.83 0.0056 **
## c_avg_att 1 2.4 2.424 9.08 0.0029 **
## Residuals 229 61.1 0.267
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Warning:
##
## As there is >1 explanatory variable, cross-validation predicted values for
## a fold are not a linear function of corresponding overall predicted
## values. Lines that are shown for the different folds are approximate
##
## fold 1
## Observations in test set: 47
## 3 7 36 39 41 45 46 59 71 75
## Predicted 2.21 2.668 2.2718 2.6289 2.354 2.44 2.260 2.35 2.441 2.289
## cvpred 2.11 2.663 2.2284 2.6118 2.326 2.39 2.227 2.31 2.393 2.192
## ints 2.84 2.839 2.3125 2.5726 2.493 1.96 2.573 2.21 3.096 2.950
## CV residual 0.73 0.176 0.0842 -0.0392 0.168 -0.43 0.345 -0.10 0.703 0.758
## 78 84 87 94 96 99 100 108 110 113
## Predicted 2.403 2.432 2.251 2.404 2.40 2.219 2.615 2.365 2.603 2.361
## cvpred 2.365 2.421 2.163 2.379 2.36 2.064 2.598 2.337 2.612 2.321
## ints 2.573 2.573 1.808 2.715 2.21 2.779 2.715 2.493 2.715 2.208
## CV residual 0.208 0.152 -0.354 0.336 -0.15 0.715 0.117 0.157 0.103 -0.113
## 119 128 135 143 148 153 158 160 161 166 174
## Predicted 2.385 2.485 2.27 2.712 2.413 2.369 2.321 2.24 2.569 2.417 2.50
## cvpred 2.303 2.461 2.20 2.673 2.365 2.343 2.256 2.16 2.515 2.392 2.42
## ints 2.896 2.896 2.31 3.096 2.573 2.839 3.140 2.65 2.646 2.646 3.22
## CV residual 0.593 0.435 0.11 0.423 0.208 0.496 0.884 0.49 0.132 0.254 0.80
## 178 179 188 192 193 201 203 209 211
## Predicted 2.402 2.64107 2.71970 2.568 2.2909 2.476 2.536 2.213 2.728
## cvpred 2.366 2.63644 2.70720 2.549 2.2167 2.430 2.504 2.122 2.728
## ints 3.182 2.64617 2.71469 2.715 2.3125 2.779 2.779 2.407 2.092
## CV residual 0.817 0.00973 0.00749 0.166 0.0958 0.349 0.275 0.285 -0.636
## 220 221 222 232 235 237 240
## Predicted 2.264 2.628 2.148 2.654 2.44 2.888 2.53
## cvpred 2.199 2.577 2.103 2.665 2.40 2.882 2.46
## ints 2.493 3.001 2.646 2.839 2.21 2.573 2.78
## CV residual 0.294 0.424 0.543 0.174 -0.19 -0.309 0.32
##
## Sum of squares = 7.86 Mean square = 0.17 n = 47
##
## fold 2
## Observations in test set: 47
## 4 5 14 17 33 42 43 44 48
## Predicted 2.370 2.26 2.381 2.509 2.4841 2.260 2.458 2.397 2.409
## cvpred 2.368 2.22 2.370 2.566 2.5647 2.226 2.440 2.369 2.421
## ints 2.573 1.13 2.715 3.001 2.4932 2.839 1.808 2.573 2.092
## CV residual 0.204 -1.09 0.344 0.434 -0.0715 0.613 -0.632 0.203 -0.329
## 50 51 55 60 66 69 73 80 101 102
## Predicted 1.861 2.411 2.374 2.161 2.529 2.399 2.185 2.29 2.295 2.205
## cvpred 1.778 2.390 2.374 2.141 2.598 2.413 2.140 2.25 2.272 2.128
## ints 2.313 2.896 2.208 2.092 2.092 2.715 2.313 1.96 2.573 2.313
## CV residual 0.535 0.506 -0.166 -0.049 -0.507 0.302 0.173 -0.29 0.301 0.185
## 106 107 112 114 123 126 129 133 138
## Predicted 2.466 2.355 2.444 2.30 2.492 2.487 2.402 2.513 2.389
## cvpred 2.471 2.382 2.414 2.32 2.525 2.503 2.453 2.559 2.380
## ints 2.896 2.208 2.092 2.57 2.839 1.960 2.208 2.407 2.493
## CV residual 0.424 -0.173 -0.322 0.25 0.314 -0.543 -0.245 -0.152 0.113
## 139 145 150 155 159 163 164 169 176
## Predicted 2.574 2.421 2.4450 2.384 2.262 2.669 2.371 2.477 2.606
## cvpred 2.592 2.441 2.4381 2.367 2.292 2.724 2.355 2.533 2.684
## ints 2.208 3.140 2.4932 2.092 1.960 1.808 3.049 1.960 2.839
## CV residual -0.383 0.699 0.0551 -0.275 -0.332 -0.915 0.694 -0.573 0.155
## 177 182 184 185 189 199 210 213 233 239
## Predicted 2.55 2.805 2.646 2.704 2.47 2.533 2.470 2.47 2.773 2.70
## cvpred 2.59 2.893 2.652 2.740 2.56 2.595 2.467 2.54 2.848 2.81
## ints 2.78 2.779 3.049 2.092 1.41 2.313 2.715 2.78 3.096 1.63
## CV residual 0.19 -0.114 0.397 -0.649 -1.14 -0.283 0.248 0.24 0.247 -1.19
##
## Sum of squares = 10.6 Mean square = 0.23 n = 47
##
## fold 3
## Observations in test set: 47
## 2 6 11 25 27 29 31 32 47
## Predicted 2.385 2.3165 2.248 2.27 2.25 2.351 2.318 2.2625 2.151
## cvpred 2.399 2.3144 2.251 2.08 2.27 2.378 2.321 2.2746 2.144
## ints 2.896 2.4069 1.808 2.49 2.09 2.092 2.646 2.2083 1.808
## CV residual 0.497 0.0925 -0.443 0.41 -0.18 -0.286 0.325 -0.0664 -0.335
## 52 53 58 62 63 64 67 76 79
## Predicted 2.4895 2.454 2.380 2.415 2.227 2.30 2.40 2.362 2.302
## cvpred 2.5065 2.461 2.385 2.403 2.260 2.37 2.42 2.366 2.294
## ints 2.4069 2.313 2.779 2.208 1.808 2.78 1.81 2.092 2.573
## CV residual -0.0995 -0.148 0.394 -0.194 -0.451 0.41 -0.61 -0.274 0.278
## 85 86 89 90 92 93 109 120 121
## Predicted 2.235 2.252 2.363 2.203 2.468 2.266 2.384 2.5499 2.151
## cvpred 2.244 2.303 2.367 2.230 2.499 2.267 2.390 2.5429 2.170
## ints 1.808 1.808 1.808 1.808 2.208 2.839 3.049 2.5726 1.411
## CV residual -0.435 -0.495 -0.559 -0.422 -0.291 0.572 0.659 0.0297 -0.759
## 127 136 146 147 152 165 175 186 191
## Predicted 2.4803 2.621 2.371 2.380 2.617 2.218 2.560 2.593 2.359
## cvpred 2.5058 2.633 2.377 2.368 2.639 2.241 2.573 2.582 2.371
## ints 2.5726 1.960 2.896 2.839 1.960 2.779 2.208 2.839 1.808
## CV residual 0.0669 -0.673 0.519 0.472 -0.679 0.538 -0.364 0.257 -0.562
## 195 202 206 212 215 223 224 225 227
## Predicted 2.5445 2.5589 2.49 2.398 2.515 2.6548 2.482 2.336 2.725
## cvpred 2.5394 2.5669 2.50 2.404 2.506 2.6416 2.462 2.330 2.695
## ints 2.4932 2.4932 2.84 2.839 2.715 2.7147 2.839 3.001 3.182
## CV residual -0.0462 -0.0737 0.34 0.435 0.208 0.0731 0.378 0.671 0.487
## 229 230
## Predicted 2.566 2.57
## cvpred 2.541 2.54
## ints 2.646 1.13
## CV residual 0.105 -1.41
##
## Sum of squares = 9.83 Mean square = 0.21 n = 47
##
## fold 4
## Observations in test set: 47
## 8 9 10 13 16 20 22 23 28
## Predicted 2.255 2.3090 2.376 2.224 2.312 2.537 2.188 2.598 2.135
## cvpred 2.229 2.2953 2.337 2.173 2.259 2.451 2.180 2.685 2.129
## ints 2.573 2.2083 2.573 2.208 2.208 2.896 2.573 2.407 1.131
## CV residual 0.343 -0.0871 0.235 0.035 -0.051 0.445 0.393 -0.278 -0.998
## 38 49 57 61 65 68 72 74 77
## Predicted 2.2536 2.3882 2.309 2.387 2.220 2.349 2.4278 2.123 2.331
## cvpred 2.2203 2.3812 2.299 2.388 2.161 2.327 2.4186 2.130 2.304
## ints 2.2083 2.4069 2.779 2.493 2.779 2.779 2.4932 2.407 1.960
## CV residual -0.0121 0.0257 0.479 0.105 0.618 0.452 0.0746 0.277 -0.344
## 81 83 95 97 98 104 105 115 117
## Predicted 2.428 2.475 2.385 2.35139 2.44 2.575 2.439 2.465 2.2473
## cvpred 2.412 2.445 2.404 2.32009 2.48 2.568 2.389 2.455 2.2254
## ints 2.208 2.779 2.573 2.31254 2.31 2.950 2.573 2.839 2.2083
## CV residual -0.203 0.333 0.169 -0.00755 -0.17 0.382 0.184 0.385 -0.0171
## 122 124 125 131 137 144 149 156 157 162
## Predicted 2.348 2.375 2.431 2.159 2.4356 2.450 2.573 2.485 2.47 2.655
## cvpred 2.306 2.360 2.475 2.121 2.4324 2.460 2.581 2.460 2.52 2.618
## ints 2.779 2.573 1.808 2.646 2.4932 2.839 2.313 2.839 2.65 2.407
## CV residual 0.473 0.212 -0.666 0.525 0.0608 0.379 -0.268 0.379 0.13 -0.211
## 171 187 194 196 197 204 214 218 228 231
## Predicted 2.664 2.421 2.116 2.52 2.504 2.357 2.304 2.560 2.60 2.455
## cvpred 2.632 2.379 2.242 2.44 2.536 2.407 2.298 2.584 2.63 2.458
## ints 2.950 2.646 1.808 2.57 2.313 1.411 2.573 2.839 1.96 2.839
## CV residual 0.318 0.267 -0.433 0.13 -0.224 -0.996 0.274 0.255 -0.67 0.381
##
## Sum of squares = 6.74 Mean square = 0.14 n = 47
##
## fold 5
## Observations in test set: 47
## 1 12 15 18 19 21 24 26 30
## Predicted 2.104 2.14 2.352 2.283 2.488 2.258 2.3967 2.521 2.514
## cvpred 2.187 2.26 2.403 2.387 2.475 2.378 2.4938 2.556 2.585
## ints 2.313 2.09 2.208 2.646 3.001 1.960 2.4069 2.092 2.950
## CV residual 0.126 -0.17 -0.195 0.259 0.526 -0.418 -0.0868 -0.464 0.365
## 34 35 37 40 54 56 70 82 91
## Predicted 2.415 2.133 2.11 2.1515 2.06 2.422 2.40 2.373 2.542
## cvpred 2.503 2.272 2.28 2.2789 2.17 2.463 2.50 2.404 2.558
## ints 2.092 1.808 -2.30 0.0953 2.31 2.208 2.09 1.808 2.715
## CV residual -0.411 -0.463 -4.58 -2.1836 0.14 -0.254 -0.41 -0.596 0.156
## 103 111 116 118 132 134 141 142 151
## Predicted 2.307 2.54 2.494 2.313 2.4034 2.5551 2.295 2.5186 2.233
## cvpred 2.357 2.52 2.542 2.377 2.4644 2.5886 2.349 2.5508 2.368
## ints 1.960 2.84 2.208 2.092 2.4932 2.4932 2.646 2.5726 2.715
## CV residual -0.397 0.32 -0.333 -0.285 0.0288 -0.0954 0.297 0.0218 0.346
## 154 167 168 170 172 173 180 181 183 190
## Predicted 2.670 2.183 2.455 2.49 2.5907 2.512 2.556 2.459 2.478 2.404
## cvpred 2.615 2.280 2.434 2.50 2.5215 2.524 2.555 2.432 2.551 2.504
## ints 2.646 2.092 1.960 2.71 2.5726 2.950 2.839 2.896 1.808 2.646
## CV residual 0.031 -0.188 -0.474 0.21 0.0511 0.425 0.285 0.464 -0.742 0.143
## 198 200 205 207 208 217 219 226 236
## Predicted 2.461 2.334 2.531 2.557 2.639 2.450 2.779 2.67 2.693
## cvpred 2.521 2.408 2.547 2.597 2.649 2.450 2.744 2.62 2.573
## ints 1.960 2.779 2.092 2.208 2.208 2.646 2.950 2.09 2.896
## CV residual -0.561 0.371 -0.455 -0.389 -0.441 0.196 0.205 -0.53 0.323
## 238
## Predicted 2.899
## cvpred 2.726
## ints 2.407
## CV residual -0.319
##
## Sum of squares = 31.3 Mean square = 0.67 n = 47
##
## Overall (Sum over all 47 folds)
## ms
## 0.282