# Load the raw quarterback data set
qb_stats <- read.csv("../data/qb_stats.csv")

# Candidate predictors: height, weight, age and the college (c_*) statistics
predictors <- c(
  "height", "weight", "age",
  "c_avg_cmpp", "c_rate", "c_pct", "c_avg_inter", "c_avg_tds",
  "c_avg_yds", "c_numyrs", "c_avg_att"
)
college_stats <- qb_stats[predictors]

# Response variable, kept as a one-column data frame so that cbind() below
# carries the column name "wins" into the modelling frame
wins <- qb_stats["wins"]

# Modelling frame: drop every row that has a missing value, then add 0.1 to
# all columns (response and predictors alike) and take the natural log.
# The 0.1 offset keeps zero values finite: log(0 + 0.1) is about -2.30.
data.log.no_combine.for_wins <- data.frame(
  log(na.omit(cbind(wins, college_stats)) + 0.1)
)

# Full model: wins regressed on every other column of the modelling frame
lm.log.no_combine.wins <- lm(formula = wins ~ ., data = data.log.no_combine.for_wins)

# Stepwise search in both directions, starting from the full model; each
# step is chosen by AIC
step_reg.log.no_combine.wins <- stepAIC(lm.log.no_combine.wins, direction = "both")
## Start: AIC=-23.43
## wins ~ height + weight + age + c_avg_cmpp + c_rate + c_pct +
## c_avg_inter + c_avg_tds + c_avg_yds + c_numyrs + c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_avg_inter 1 0.045 194 -25.4
## - c_avg_tds 1 0.243 194 -25.1
## - age 1 0.251 194 -25.1
## - c_numyrs 1 0.440 194 -24.9
## - c_rate 1 0.515 194 -24.8
## - height 1 0.852 195 -24.4
## - weight 1 1.385 195 -23.7
## <none> 194 -23.4
## - c_pct 1 1.933 196 -23.1
## - c_avg_cmpp 1 2.108 196 -22.9
## - c_avg_yds 1 2.282 196 -22.7
## - c_avg_att 1 2.569 197 -22.3
##
## Step: AIC=-25.37
## wins ~ height + weight + age + c_avg_cmpp + c_rate + c_pct +
## c_avg_tds + c_avg_yds + c_numyrs + c_avg_att
##
## Df Sum of Sq RSS AIC
## - age 1 0.26 194 -27.1
## - c_numyrs 1 0.40 194 -26.9
## - c_avg_tds 1 0.68 195 -26.5
## - height 1 0.88 195 -26.3
## - c_rate 1 0.93 195 -26.2
## - weight 1 1.55 196 -25.5
## <none> 194 -25.4
## - c_pct 1 2.62 197 -24.2
## - c_avg_cmpp 1 2.66 197 -24.1
## - c_avg_att 1 2.97 197 -23.8
## + c_avg_inter 1 0.05 194 -23.4
## - c_avg_yds 1 4.75 199 -21.6
##
## Step: AIC=-27.05
## wins ~ height + weight + c_avg_cmpp + c_rate + c_pct + c_avg_tds +
## c_avg_yds + c_numyrs + c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_numyrs 1 0.42 195 -28.5
## - height 1 0.83 195 -28.1
## - c_avg_tds 1 0.83 195 -28.0
## - c_rate 1 0.87 195 -28.0
## - weight 1 1.35 196 -27.4
## <none> 194 -27.1
## - c_pct 1 2.56 197 -25.9
## - c_avg_cmpp 1 2.60 197 -25.9
## - c_avg_att 1 2.90 197 -25.5
## + age 1 0.26 194 -25.4
## + c_avg_inter 1 0.05 194 -25.1
## - c_avg_yds 1 4.66 199 -23.4
##
## Step: AIC=-28.54
## wins ~ height + weight + c_avg_cmpp + c_rate + c_pct + c_avg_tds +
## c_avg_yds + c_avg_att
##
## Df Sum of Sq RSS AIC
## - height 1 0.76 196 -29.6
## - c_avg_tds 1 0.87 196 -29.5
## - c_rate 1 0.91 196 -29.4
## - weight 1 1.10 196 -29.2
## <none> 195 -28.5
## + c_numyrs 1 0.42 194 -27.1
## - c_pct 1 2.95 198 -27.0
## - c_avg_cmpp 1 2.98 198 -26.9
## + age 1 0.28 194 -26.9
## + c_avg_inter 1 0.01 195 -26.6
## - c_avg_att 1 3.31 198 -26.6
## - c_avg_yds 1 4.88 200 -24.7
##
## Step: AIC=-29.62
## wins ~ weight + c_avg_cmpp + c_rate + c_pct + c_avg_tds + c_avg_yds +
## c_avg_att
##
## Df Sum of Sq RSS AIC
## - weight 1 0.44 196 -31.1
## - c_avg_tds 1 0.75 196 -30.7
## - c_rate 1 0.96 196 -30.4
## <none> 196 -29.6
## + height 1 0.76 195 -28.5
## - c_pct 1 2.61 198 -28.5
## - c_avg_cmpp 1 2.64 198 -28.4
## - c_avg_att 1 2.96 198 -28.1
## + c_numyrs 1 0.35 195 -28.1
## + age 1 0.22 195 -27.9
## + c_avg_inter 1 0.02 196 -27.6
## - c_avg_yds 1 4.92 200 -25.7
##
## Step: AIC=-31.08
## wins ~ c_avg_cmpp + c_rate + c_pct + c_avg_tds + c_avg_yds +
## c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_rate 1 0.78 197 -32.1
## - c_avg_tds 1 0.95 197 -31.9
## <none> 196 -31.1
## + weight 1 0.44 196 -29.6
## + c_numyrs 1 0.19 196 -29.3
## - c_pct 1 3.17 199 -29.3
## - c_avg_cmpp 1 3.18 199 -29.3
## + height 1 0.10 196 -29.2
## + age 1 0.10 196 -29.2
## + c_avg_inter 1 0.09 196 -29.2
## - c_avg_att 1 3.47 199 -28.9
## - c_avg_yds 1 4.61 201 -27.6
##
## Step: AIC=-32.15
## wins ~ c_avg_cmpp + c_pct + c_avg_tds + c_avg_yds + c_avg_att
##
## Df Sum of Sq RSS AIC
## <none> 197 -32.1
## - c_avg_cmpp 1 2.52 199 -31.1
## + c_rate 1 0.78 196 -31.1
## - c_pct 1 2.59 199 -31.0
## - c_avg_att 1 2.73 200 -30.9
## + c_avg_inter 1 0.31 196 -30.5
## + weight 1 0.25 196 -30.5
## + c_numyrs 1 0.24 196 -30.4
## + height 1 0.18 196 -30.4
## + age 1 0.08 197 -30.2
## - c_avg_tds 1 6.36 203 -26.6
## - c_avg_yds 1 6.86 204 -26.0
# Coefficient table and overall fit statistics for the model that the
# stepwise search settled on
summary(step_reg.log.no_combine.wins)
##
## Call:
## lm(formula = wins ~ c_avg_cmpp + c_pct + c_avg_tds + c_avg_yds +
## c_avg_att, data = data.log.no_combine.for_wins)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.659 -0.316 0.175 0.585 1.339
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 130.799 76.665 1.71 0.0893 .
## c_avg_cmpp 29.062 16.897 1.72 0.0868 .
## c_pct -29.322 16.819 -1.74 0.0826 .
## c_avg_tds -0.649 0.238 -2.73 0.0068 **
## c_avg_yds 1.965 0.692 2.84 0.0049 **
## c_avg_att -30.457 17.004 -1.79 0.0746 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.923 on 231 degrees of freedom
## Multiple R-squared: 0.0495, Adjusted R-squared: 0.0289
## F-statistic: 2.41 on 5 and 231 DF, p-value: 0.0376
# Diagnostic plots for the stepwise-selected linear model
plot(step_reg.log.no_combine.wins)

# Subset search over the same modelling frame, recording up to 10 candidate
# models for each subset size
leaps.log.no_combine.wins <- regsubsets(wins ~ ., data = data.log.no_combine.for_wins,
  nbest = 10)

# Plot R-squared against subset size.
# legend = FALSE is required when this script is run non-interactively: the
# default legend placement asks for a mouse click via locator(), which returns
# NULL when no interactive device is available, and the call then stops with
# "invalid coordinate lengths". With legend = FALSE the function returns the
# table of predictor abbreviations instead, which is printed as the key.
subsets(leaps.log.no_combine.wins, statistic = "rsq", legend = FALSE)
# 5-fold cross-validation of the stepwise-selected model.
# The data frame is passed positionally rather than as `df =` because the name
# of that first parameter is not stable across DAAG releases (`df` in older
# versions, `data` in newer ones); a positional argument binds correctly under
# either signature, whereas `df =` would fall through to `...` on newer ones.
cv.lm(data.log.no_combine.for_wins, step_reg.log.no_combine.wins, m = 5)
## Analysis of Variance Table
##
## Response: wins
## Df Sum Sq Mean Sq F value Pr(>F)
## c_avg_cmpp 1 1.4 1.36 1.60 0.207
## c_pct 1 0.3 0.32 0.38 0.540
## c_avg_tds 1 1.0 0.98 1.16 0.283
## c_avg_yds 1 4.8 4.85 5.69 0.018 *
## c_avg_att 1 2.7 2.73 3.21 0.075 .
## Residuals 231 196.7 0.85
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Warning:
##
## As there is >1 explanatory variable, cross-validation predicted values for
## a fold are not a linear function of corresponding overall predicted
## values. Lines that are shown for the different folds are approximate
##
## fold 1
## Observations in test set: 47
## 3 8 15 18 19 20 21 23 26 35
## Predicted 1.306 1.306 1.199 1.012 1.15 1.321 1.132 2.22 1.317 1.3052
## cvpred 1.207 1.261 1.077 0.898 1.03 1.286 1.072 2.17 1.275 1.3348
## wins 1.629 2.208 0.742 1.960 2.09 1.131 0.742 2.31 1.629 0.0953
## CV residual 0.422 0.947 -0.335 1.062 1.06 -0.154 -0.330 0.14 0.355 -1.2395
## 46 55 57 69 71 72 73 76 79 81
## Predicted 1.19 1.1705 1.27 1.250 1.43 1.39 1.630 1.398 1.14 1.5257
## cvpred 1.12 1.0904 1.28 1.174 1.45 1.33 1.601 1.362 1.08 1.5862
## wins 2.31 0.0953 2.31 1.411 1.96 2.41 2.407 1.629 2.31 0.0953
## CV residual 1.19 -0.9951 1.03 0.237 0.51 1.08 0.806 0.267 1.23 -1.4909
## 82 91 96 115 118 121 122 124 131 132
## Predicted 1.373 1.349 1.12 1.377 1.346 1.353 1.302 1.199 1.30 1.189
## cvpred 1.401 1.387 1.00 1.374 1.356 1.360 1.329 1.129 1.19 1.130
## wins 0.742 1.131 1.96 1.131 1.808 1.960 1.808 1.808 1.41 1.629
## CV residual -0.659 -0.255 0.96 -0.242 0.452 0.601 0.479 0.679 0.22 0.499
## 133 135 140 150 155 164 176 183 187
## Predicted 1.383 1.438 1.448 1.324 1.445 1.288 1.6648 1.4899 1.063
## cvpred 1.401 1.503 1.414 1.322 1.434 1.263 1.7500 1.5353 0.983
## wins 1.131 1.808 1.131 1.960 1.960 0.742 1.8083 1.6292 1.131
## CV residual -0.269 0.305 -0.283 0.638 0.526 -0.522 0.0583 0.0939 0.149
## 194 205 214 223 228 235 236 237
## Predicted 1.652 2.06 1.0394 1.1655 1.2483 1.148 1.411 1.854
## cvpred 1.781 2.21 0.9783 1.0983 1.1471 1.059 1.405 1.953
## wins 1.960 1.13 0.0953 1.1314 1.1314 1.411 1.629 0.742
## CV residual 0.179 -1.08 -0.8829 0.0331 -0.0157 0.352 0.224 -1.211
##
## Sum of squares = 22.5 Mean square = 0.48 n = 47
##
## fold 2
## Observations in test set: 48
## 24 31 33 36 38 40 42 43 60 63
## Predicted 1.23 1.194 1.091 1.1928 1.31 1.427 1.392 1.555 1.265 1.204
## cvpred 1.26 1.207 0.972 1.2190 1.36 1.492 1.460 1.665 1.290 1.246
## wins 2.41 1.960 2.092 1.1314 2.65 1.629 1.411 0.742 1.629 2.092
## CV residual 1.15 0.753 1.120 -0.0876 1.29 0.137 -0.049 -0.923 0.339 0.845
## 74 85 88 89 99 103 108 110 111
## Predicted 1.354 1.097 1.270 1.32 1.5793 1.297 1.2445 1.391 1.544
## cvpred 1.404 1.083 1.318 1.37 1.6999 1.320 1.2901 1.435 1.636
## wins 0.742 1.808 1.131 1.63 0.0953 1.131 0.0953 1.629 1.808
## CV residual -0.662 0.725 -0.186 0.26 -1.6046 -0.189 -1.1948 0.194 0.172
## 116 119 126 128 138 139 146 149 152 153
## Predicted 1.353 1.337 1.466 1.36 1.29 1.313 1.28 1.593 1.3742 1.200
## cvpred 1.407 1.364 1.544 1.43 1.33 1.369 1.31 1.695 1.4375 1.200
## wins 1.131 1.629 0.742 -2.30 1.96 2.313 -2.30 1.960 1.4110 1.411
## CV residual -0.276 0.266 -0.802 -3.73 0.63 0.943 -3.62 0.265 -0.0265 0.211
## 159 166 170 173 175 178 191 192 198 202
## Predicted 1.243 1.167 1.416 1.23 1.551 1.158 1.083 1.544 1.25 1.09
## cvpred 1.232 1.187 1.476 1.25 1.632 1.128 1.056 1.615 1.27 1.08
## wins 2.208 1.411 1.808 2.41 0.742 1.960 0.742 1.411 1.41 -2.30
## CV residual 0.976 0.224 0.333 1.15 -0.890 0.832 -0.314 -0.204 0.14 -3.38
## 208 209 211 213 215 217 232 233 239
## Predicted 1.858 0.999 1.3690 1.3575 1.385 1.293 1.058 1.2311 1.039
## cvpred 1.989 0.969 1.4074 1.3932 1.366 1.316 1.052 1.2271 0.982
## wins 1.131 1.629 0.0953 0.0953 1.131 1.960 0.742 0.0953 1.131
## CV residual -0.858 0.660 -1.3121 -1.2979 -0.234 0.644 -0.310 -1.1318 0.149
##
## Sum of squares = 62.9 Mean square = 1.31 n = 48
##
## fold 3
## Observations in test set: 48
## 2 4 5 6 7 14 17 47 48 52
## Predicted 1.32 1.169 1.267 1.16 1.479 1.301 1.395 1.355 1.272 1.29
## cvpred 1.29 1.194 1.207 1.12 1.444 1.263 1.322 1.296 1.190 1.25
## wins 2.41 1.960 1.629 1.41 1.808 1.960 0.742 0.742 0.742 2.57
## CV residual 1.12 0.766 0.422 0.29 0.365 0.697 -0.580 -0.554 -0.448 1.32
## 56 61 66 67 70 77 78 80 86
## Predicted 1.2742 1.2149 1.714 1.33 1.394 1.477 1.33 1.3957 1.1891
## cvpred 1.2051 1.1526 1.585 1.30 1.388 1.410 1.25 1.3751 1.1443
## wins 0.0953 0.0953 2.092 1.81 0.742 1.131 2.41 1.4110 1.1314
## CV residual -1.1098 -1.0573 0.507 0.51 -0.646 -0.278 1.16 0.0359 -0.0129
## 90 100 102 112 114 141 144 156 157
## Predicted 1.3205 1.6542 1.39 1.2514 1.178 1.327 1.4568 1.223 1.7548
## cvpred 1.2860 1.5858 1.38 1.1869 1.129 1.287 1.3963 1.165 1.7138
## wins 0.0953 1.6292 2.57 1.1314 2.092 0.742 1.4110 1.808 1.8083
## CV residual -1.1907 0.0435 1.19 -0.0555 0.963 -0.545 0.0147 0.644 0.0944
## 158 160 163 165 167 171 174 182 184 190
## Predicted 1.308 1.32 1.021 1.265 0.951 1.341 1.34 1.572 1.223 1.325
## cvpred 1.280 1.25 0.998 1.264 1.009 1.264 1.22 1.424 1.157 1.258
## wins 1.629 1.13 1.960 1.629 -2.303 1.629 2.49 1.808 1.131 1.411
## CV residual 0.349 -0.12 0.963 0.365 -3.311 0.365 1.28 0.384 -0.026 0.153
## 199 201 203 207 210 218 225 231 234 238
## Predicted 1.462 1.30 1.389 1.134 1.14 1.682 1.198 1.09 1.381 1.530
## cvpred 1.394 1.21 1.303 1.108 1.11 1.632 1.143 1.08 1.347 1.441
## wins 2.208 2.31 1.808 1.131 2.31 1.131 1.131 2.41 2.313 2.208
## CV residual 0.815 1.10 0.506 0.023 1.21 -0.501 -0.012 1.33 0.965 0.767
##
## Sum of squares = 35.7 Mean square = 0.74 n = 48
##
## fold 4
## Observations in test set: 47
## 9 13 25 27 29 34 44 45 49 51
## Predicted 1.536 1.21 2.32 1.314 1.10 1.13 1.178 1.263 1.328 1.370
## cvpred 1.580 1.34 4.15 1.405 1.27 1.26 1.283 1.351 1.355 1.438
## wins 1.808 -2.30 2.41 1.629 -2.30 1.63 0.742 1.808 0.742 1.808
## CV residual 0.229 -3.65 -1.74 0.225 -3.58 0.37 -0.541 0.458 -0.613 0.371
## 54 64 65 68 75 97 101 106 107
## Predicted 1.1654 1.419 1.099 1.189 1.4290 1.11 1.534 1.487 1.251
## cvpred 1.3254 1.445 1.266 1.290 1.4664 1.19 1.572 1.487 1.284
## wins 0.0953 1.960 2.092 1.131 0.0953 -2.30 1.960 1.960 1.131
## CV residual -1.2301 0.515 0.826 -0.159 -1.3711 -3.50 0.388 0.473 -0.153
## 113 117 123 129 130 134 137 147 148
## Predicted 1.0540 1.428 1.337 1.339 1.46 1.194 1.38 1.117 1.398
## cvpred 1.1913 1.513 1.388 1.390 1.44 1.313 1.42 1.264 1.451
## wins 1.1314 0.742 1.629 1.131 1.63 2.208 1.13 1.131 0.742
## CV residual -0.0599 -0.771 0.242 -0.258 0.19 0.895 -0.29 -0.133 -0.709
## 154 161 169 177 180 181 185 189 193
## Predicted 1.6455 1.589 0.938 1.481 1.554 1.509 1.2702 1.042 1.07
## cvpred 1.8178 1.552 1.111 1.424 1.530 1.468 1.3549 1.147 1.19
## wins 0.0953 1.808 -2.303 1.629 1.808 1.960 1.4110 1.629 1.41
## CV residual -1.7225 0.256 -3.414 0.206 0.278 0.492 0.0561 0.482 0.22
## 195 197 200 204 212 219 221 224 226
## Predicted 1.3006 1.223 1.20 1.833 1.232 0.9831 1.39 1.386 1.330
## cvpred 1.3574 1.268 1.33 1.717 1.335 1.0750 1.39 1.359 1.356
## wins 0.0953 2.092 1.81 2.313 0.742 1.1314 2.57 0.742 1.131
## CV residual -1.2621 0.823 0.48 0.596 -0.593 0.0564 1.19 -0.617 -0.224
## 229
## Predicted 0.829
## cvpred 0.977
## wins 1.629
## CV residual 0.652
##
## Sum of squares = 70.3 Mean square = 1.5 n = 47
##
## fold 5
## Observations in test set: 47
## 1 10 11 12 16 22 28 30 32 39
## Predicted 1.14 1.186 1.37 1.42 1.1922 1.279 1.496 1.27 1.119 1.429
## cvpred 1.10 1.158 1.32 1.36 1.1506 1.233 1.444 1.24 1.086 1.383
## wins 2.41 0.742 1.96 1.13 0.0953 1.808 1.131 2.31 1.411 2.092
## CV residual 1.31 -0.416 0.64 -0.23 -1.0553 0.576 -0.313 1.07 0.325 0.709
## 41 50 53 58 59 62 83 84 87
## Predicted 1.236 0.983 1.082 1.350 1.089 0.947 1.35 1.032 1.1049
## cvpred 1.184 0.963 1.080 1.300 1.062 0.949 1.29 0.992 1.0996
## wins 1.411 0.742 1.808 1.629 1.411 1.629 2.41 0.742 0.0953
## CV residual 0.227 -0.221 0.729 0.329 0.349 0.680 1.11 -0.250 -1.0042
## 92 93 94 95 98 104 105 109 120
## Predicted 1.4075 1.642 1.298 1.479 1.190 1.412 1.262 1.43 1.3949
## cvpred 1.3703 1.594 1.267 1.436 1.192 1.385 1.250 1.39 1.3905
## wins 0.0953 2.407 1.131 1.629 1.629 1.960 1.131 1.63 1.4110
## CV residual -1.2750 0.812 -0.136 0.194 0.437 0.575 -0.118 0.24 0.0205
## 125 127 136 142 143 145 151 162 168
## Predicted 1.670 1.426 1.713 1.452 1.845 1.208 1.2449 1.187 1.371
## cvpred 1.619 1.407 1.663 1.416 1.808 1.192 1.2098 1.175 1.325
## wins 2.313 2.208 1.411 1.629 2.493 1.411 1.1314 1.411 1.131
## CV residual 0.693 0.802 -0.252 0.213 0.685 0.219 -0.0784 0.236 -0.194
## 172 179 186 188 196 206 216 220 222
## Predicted 1.4919 1.648 1.449 1.754 1.1635 1.44865 1.102 0.872 1.230
## cvpred 1.4646 1.619 1.445 1.707 1.1479 1.41215 1.112 0.904 1.259
## wins 0.0953 1.808 1.131 1.960 1.1314 1.41099 1.808 1.131 0.742
## CV residual -1.3693 0.189 -0.314 0.253 -0.0165 -0.00116 0.696 0.227 -0.517
## 227
## Predicted 1.1509
## cvpred 1.1755
## wins 0.0953
## CV residual -1.0802
##
## Sum of squares = 18.1 Mean square = 0.38 n = 47
##
## Overall (Sum over all 47 folds)
## ms
## 0.884