# Load the quarterback statistics
qb_stats <- read.csv("../data/qb_stats.csv")

# Names of the physical and college predictor columns
predictors <- c(
  "height", "weight", "age", "c_avg_cmpp", "c_rate", "c_pct",
  "c_avg_inter", "c_avg_tds", "c_avg_yds", "c_numyrs", "c_avg_att"
)
college_stats <- qb_stats[, predictors]

# Response variable, kept as a one-column data frame so that cbind()
# below carries the "wins" column name through
wins <- qb_stats["wins"]

# Clean data set: bind response and predictors, drop every row containing
# an NA, then centre and scale all columns (the response included)
data.scaled.no_combine.for_wins <- data.frame(
  scale(
    na.omit(
      cbind(wins, college_stats)
    )
  )
)

# Full linear model: wins regressed on every other column
lm.scaled.no_combine.wins <- lm(
  formula = wins ~ .,
  data = data.scaled.no_combine.for_wins
)

# Stepwise AIC search (terms may be dropped and re-added), starting from
# the full model
step_reg.scaled.no_combine.wins <- stepAIC(
  lm.scaled.no_combine.wins,
  direction = "both"
)
## Start: AIC=3.38
## wins ~ height + weight + age + c_avg_cmpp + c_rate + c_pct +
## c_avg_inter + c_avg_tds + c_avg_yds + c_numyrs + c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_avg_cmpp 1 0.03 217 1.41
## - c_avg_inter 1 0.23 218 1.63
## - c_avg_tds 1 0.33 218 1.74
## - c_pct 1 0.47 218 1.89
## - height 1 0.52 218 1.95
## - age 1 0.94 218 2.40
## - c_numyrs 1 1.22 218 2.70
## <none> 217 3.38
## - c_rate 1 2.20 219 3.76
## - c_avg_att 1 3.51 221 5.17
## - weight 1 4.24 222 5.96
## - c_avg_yds 1 7.49 225 9.41
##
## Step: AIC=1.41
## wins ~ height + weight + age + c_rate + c_pct + c_avg_inter +
## c_avg_tds + c_avg_yds + c_numyrs + c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_avg_inter 1 0.21 218 -0.36
## - c_avg_tds 1 0.39 218 -0.17
## - height 1 0.53 218 -0.01
## - c_pct 1 0.69 218 0.16
## - age 1 0.96 218 0.46
## - c_numyrs 1 1.19 218 0.70
## <none> 217 1.41
## - c_rate 1 2.58 220 2.21
## + c_avg_cmpp 1 0.03 217 3.38
## - weight 1 4.28 222 4.04
## - c_avg_yds 1 8.82 226 8.83
## - c_avg_att 1 8.99 226 9.01
##
## Step: AIC=-0.36
## wins ~ height + weight + age + c_rate + c_pct + c_avg_tds + c_avg_yds +
## c_numyrs + c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_avg_tds 1 0.33 218 -2.01
## - height 1 0.55 218 -1.77
## - c_pct 1 0.74 218 -1.56
## - age 1 1.00 218 -1.28
## - c_numyrs 1 1.06 218 -1.21
## <none> 218 -0.36
## - c_rate 1 2.42 220 0.26
## + c_avg_inter 1 0.21 217 1.41
## + c_avg_cmpp 1 0.01 218 1.63
## - weight 1 4.96 222 2.97
## - c_avg_yds 1 8.68 226 6.91
## - c_avg_att 1 10.02 228 8.31
##
## Step: AIC=-2.01
## wins ~ height + weight + age + c_rate + c_pct + c_avg_yds + c_numyrs +
## c_avg_att
##
## Df Sum of Sq RSS AIC
## - height 1 0.50 218 -3.47
## - c_numyrs 1 1.05 219 -2.87
## - c_pct 1 1.11 219 -2.80
## - age 1 1.14 219 -2.77
## <none> 218 -2.01
## + c_avg_tds 1 0.33 218 -0.36
## + c_avg_inter 1 0.15 218 -0.17
## - c_rate 1 3.56 221 -0.17
## + c_avg_cmpp 1 0.00 218 -0.01
## - weight 1 4.88 223 1.24
## - c_avg_yds 1 8.77 227 5.34
## - c_avg_att 1 9.91 228 6.53
##
## Step: AIC=-3.47
## wins ~ weight + age + c_rate + c_pct + c_avg_yds + c_numyrs +
## c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_numyrs 1 0.92 219 -4.47
## - c_pct 1 0.98 219 -4.41
## - age 1 1.04 219 -4.34
## <none> 218 -3.47
## + height 1 0.50 218 -2.01
## - c_rate 1 3.43 222 -1.77
## + c_avg_tds 1 0.28 218 -1.77
## + c_avg_inter 1 0.17 218 -1.65
## + c_avg_cmpp 1 0.00 218 -1.47
## - weight 1 5.05 223 -0.05
## - c_avg_yds 1 8.75 227 3.84
## - c_avg_att 1 9.86 228 5.00
##
## Step: AIC=-4.47
## wins ~ weight + age + c_rate + c_pct + c_avg_yds + c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_pct 1 0.84 220 -5.57
## - age 1 1.10 220 -5.28
## <none> 219 -4.47
## + c_numyrs 1 0.92 218 -3.47
## + height 1 0.37 219 -2.87
## + c_avg_tds 1 0.28 219 -2.77
## + c_avg_inter 1 0.06 219 -2.53
## - c_rate 1 3.68 223 -2.52
## + c_avg_cmpp 1 0.00 219 -2.47
## - weight 1 4.42 224 -1.74
## - c_avg_yds 1 9.37 229 3.45
## - c_avg_att 1 10.58 230 4.70
##
## Step: AIC=-5.57
## wins ~ weight + age + c_rate + c_avg_yds + c_avg_att
##
## Df Sum of Sq RSS AIC
## - age 1 1.63 222 -5.82
## <none> 220 -5.57
## + c_pct 1 0.84 219 -4.47
## + c_numyrs 1 0.78 219 -4.41
## + c_avg_tds 1 0.58 220 -4.19
## + c_avg_cmpp 1 0.41 220 -4.01
## - c_rate 1 3.40 224 -3.94
## + height 1 0.27 220 -3.86
## + c_avg_inter 1 0.08 220 -3.65
## - weight 1 5.00 225 -2.24
## - c_avg_yds 1 8.55 229 1.46
## - c_avg_att 1 9.86 230 2.81
##
## Step: AIC=-5.82
## wins ~ weight + c_rate + c_avg_yds + c_avg_att
##
## Df Sum of Sq RSS AIC
## <none> 222 -5.82
## + age 1 1.63 220 -5.57
## + c_pct 1 1.37 220 -5.28
## + c_avg_tds 1 0.94 221 -4.83
## + c_numyrs 1 0.80 221 -4.68
## + c_avg_cmpp 1 0.58 221 -4.44
## - c_rate 1 3.50 225 -4.10
## + height 1 0.15 222 -3.98
## + c_avg_inter 1 0.10 222 -3.92
## - weight 1 4.17 226 -3.40
## - c_avg_yds 1 8.75 230 1.35
## - c_avg_att 1 10.20 232 2.85
# Coefficient table and fit statistics for the model returned by stepAIC()
summary(step_reg.scaled.no_combine.wins)
##
## Call:
## lm(formula = wins ~ weight + c_rate + c_avg_yds + c_avg_att,
## data = data.scaled.no_combine.for_wins)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.694 -0.739 -0.125 0.550 2.864
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.36e-16 6.35e-02 0.00 1.0000
## weight 1.40e-01 6.69e-02 2.09 0.0378 *
## c_rate -2.06e-01 1.08e-01 -1.91 0.0568 .
## c_avg_yds 1.21e+00 3.99e-01 3.03 0.0028 **
## c_avg_att -1.18e+00 3.61e-01 -3.27 0.0012 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.978 on 232 degrees of freedom
## Multiple R-squared: 0.0606, Adjusted R-squared: 0.0444
## F-statistic: 3.74 on 4 and 232 DF, p-value: 0.00569
# Diagnostic plots for the stepwise-selected model
plot(step_reg.scaled.no_combine.wins)

# Best-subsets search over the same scaled data, keeping up to the 10 best
# models of each size
leaps.scaled.no_combine.wins <- regsubsets(
  wins ~ .,
  data = data.scaled.no_combine.for_wins,
  nbest = 10
)

# Plot the candidate subsets by R-squared.
# This call previously stopped with "Error: invalid coordinate lengths".
# NOTE(review): the likely cause is that subsets() by default asks for the
# legend position with a mouse click, which is unavailable when the script is
# rendered non-interactively. Passing legend = FALSE skips the on-plot legend;
# per the car documentation the abbreviation key is then returned as a data
# frame instead -- confirm against the installed car version.
subsets(leaps.scaled.no_combine.wins, statistic = "rsq", legend = FALSE)
# 5-fold cross-validation of the stepwise-selected model
cv.lm(
  df = data.scaled.no_combine.for_wins,
  form.lm = step_reg.scaled.no_combine.wins,
  m = 5
)
## Analysis of Variance Table
##
## Response: wins
## Df Sum Sq Mean Sq F value Pr(>F)
## weight 1 3.3 3.29 3.44 0.0648 .
## c_rate 1 0.0 0.00 0.00 0.9956
## c_avg_yds 1 0.8 0.81 0.85 0.3589
## c_avg_att 1 10.2 10.20 10.68 0.0012 **
## Residuals 232 221.7 0.96
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Warning:
##
## As there is >1 explanatory variable, cross-validation predicted values for
## a fold are not a linear function of corresponding overall predicted
## values. Lines that are shown for the different folds are approximate
##
## fold 1
## Observations in test set: 47
## 3 8 15 18 19 20 21 23
## Predicted 0.0644 -0.0486 -0.138 -0.256 0.00605 -0.0789 -0.0984 0.510
## cvpred 0.0547 -0.0459 -0.100 -0.367 -0.07929 -0.1846 -0.1647 0.479
## wins 0.0312 1.3191 -0.935 0.675 0.99711 -0.6127 -0.9346 1.641
## CV residual -0.0235 1.3650 -0.834 1.043 1.07640 -0.4280 -0.7699 1.162
## 26 35 46 55 57 69 71 72
## Predicted 0.5710 -0.168 -0.175 -0.186 -0.0484 -0.0284 0.117 0.0612
## cvpred 0.4932 -0.221 -0.266 -0.232 -0.0551 -0.0601 0.104 -0.0108
## wins 0.0312 -1.257 1.641 -1.257 1.6410 -0.2907 0.675 1.9630
## CV residual -0.4619 -1.036 1.907 -1.025 1.6961 -0.2306 0.571 1.9738
## 73 76 79 81 82 91 96 115
## Predicted 0.0200 0.00544 -0.192 0.208 0.359 0.509 -0.112 0.149
## cvpred -0.0636 -0.07339 -0.242 0.305 0.439 0.515 -0.183 0.112
## wins 1.9630 0.03124 1.641 -1.257 -0.935 -0.613 0.675 -0.613
## CV residual 2.0266 0.10464 1.883 -1.562 -1.373 -1.128 0.858 -0.725
## 118 121 122 124 131 132 133 135
## Predicted -0.181 0.0250 -0.097 -0.150 0.109 -0.2661 -0.0994 -0.1157
## cvpred -0.155 -0.0162 -0.150 -0.173 0.102 -0.3413 -0.0967 0.0285
## wins 0.353 0.6752 0.353 0.353 -0.291 0.0312 -0.6127 0.3532
## CV residual 0.509 0.6913 0.503 0.526 -0.392 0.3726 -0.5160 0.3247
## 140 150 155 164 176 183 187 194
## Predicted 0.162 0.0135 -0.0867 -0.0791 0.167 0.0572 -0.336 0.141
## cvpred 0.180 -0.0272 -0.0470 -0.1178 0.166 0.0428 -0.390 0.104
## wins -0.613 0.6752 0.6752 -0.9346 0.353 0.0312 -0.613 0.675
## CV residual -0.793 0.7023 0.7221 -0.8168 0.187 -0.0116 -0.222 0.571
## 205 214 223 228 235 236 237
## Predicted 0.0724 -0.396 -0.120 -0.0607 -0.3424 -0.0815 0.213
## cvpred 0.0868 -0.396 -0.143 -0.0817 -0.3781 -0.1426 0.214
## wins -0.6127 -1.257 -0.613 -0.6127 -0.2907 0.0312 -0.935
## CV residual -0.6994 -0.860 -0.470 -0.5310 0.0874 0.1738 -1.149
##
## Sum of squares = 42.3 Mean square = 0.9 n = 47
##
## fold 2
## Observations in test set: 48
## 24 31 33 36 38 40 42 43
## Predicted -0.245 -0.156 0.7759 -0.0468 0.0652 -0.02206 -0.079 0.185
## cvpred -0.312 -0.202 0.9472 -0.0400 -0.0216 0.02538 -0.131 0.240
## wins 1.963 0.675 0.9971 -0.6127 2.9288 0.03124 -0.291 -0.935
## CV residual 2.275 0.877 0.0499 -0.5727 2.9504 0.00587 -0.159 -1.175
## 60 63 74 85 88 89 99 103
## Predicted -0.1089 0.335 -0.0392 -0.264 0.171 0.01073 -0.0330 0.0587
## cvpred -0.0732 0.272 -0.1065 -0.263 0.142 0.02425 0.0934 0.1443
## wins 0.0312 0.997 -0.9346 0.353 -0.613 0.03124 -1.2566 -0.6127
## CV residual 0.1045 0.725 -0.8281 0.616 -0.754 0.00699 -1.3500 -0.7569
## 108 110 111 116 119 126 128 138
## Predicted -0.0514 0.1970 0.127 -0.142 -0.2142 0.0635 0.0427 -0.0402
## cvpred 0.0129 0.1623 0.206 -0.190 -0.2130 0.1427 0.1124 -0.0814
## wins -1.2566 0.0312 0.353 -0.613 0.0312 -0.9346 -1.5785 0.6752
## CV residual -1.2695 -0.1311 0.147 -0.423 0.2442 -1.0773 -1.6909 0.7565
## 139 146 149 152 153 159 166 170
## Predicted 0.0761 -0.134 0.0526 0.0136 -0.522 -0.292 -0.1774 -0.140
## cvpred 0.2185 -0.163 0.1316 -0.0307 -0.597 -0.275 -0.2088 -0.112
## wins 1.6410 -1.579 0.6752 -0.2907 -0.291 1.319 -0.2907 0.353
## CV residual 1.4225 -1.416 0.5435 -0.2600 0.306 1.594 -0.0819 0.465
## 173 175 178 191 192 198 202 208
## Predicted -0.00429 0.0191 -0.456 -0.1380 0.0741 0.0044 -0.445 0.0913
## cvpred 0.06216 0.0931 -0.488 -0.0818 0.1114 0.0881 -0.546 0.1918
## wins 1.96298 -0.9346 0.675 -0.9346 -0.2907 -0.2907 -1.579 -0.6127
## CV residual 1.90082 -1.0277 1.163 -0.8529 -0.4022 -0.3788 -1.033 -0.8044
## 209 211 213 215 217 232 233 239
## Predicted -0.2328 0.248 -0.1153 0.290 -0.0330 -0.243 0.0685 0.126
## cvpred -0.1492 0.341 -0.0147 0.415 0.0681 -0.147 0.1036 0.253
## wins 0.0312 -1.257 -1.2566 -0.613 0.6752 -0.935 -1.2566 -0.613
## CV residual 0.1805 -1.598 -1.2418 -1.028 0.6071 -0.787 -1.3602 -0.866
##
## Sum of squares = 52.2 Mean square = 1.09 n = 48
##
## fold 3
## Observations in test set: 48
## 2 4 5 6 7 14 17 47 48
## Predicted 0.429 -0.121 0.1672 -0.1351 0.215 0.470 0.181 -0.123 -0.0758
## cvpred 0.320 -0.136 0.1403 -0.0688 0.161 0.333 0.154 -0.195 -0.0829
## wins 1.963 0.675 0.0312 -0.2907 0.353 0.675 -0.935 -0.935 -0.9346
## CV residual 1.643 0.811 -0.1090 -0.2219 0.192 0.342 -1.089 -0.740 -0.8517
## 52 56 61 66 67 70 77 78
## Predicted 0.373 0.1070 0.1172 0.303 0.00862 -0.06936 -0.0386 0.437
## cvpred 0.329 0.0176 0.0664 0.138 -0.06500 0.00442 -0.0864 0.353
## wins 2.607 -1.2566 -1.2566 0.997 0.35320 -0.93462 -0.6127 1.963
## CV residual 2.278 -1.2742 -1.3230 0.859 0.41820 -0.93905 -0.5263 1.610
## 80 86 90 100 102 112 114 141
## Predicted -0.1032 -0.277 -0.112 0.3417 0.0952 0.187 -0.210 -0.297
## cvpred -0.2058 -0.346 -0.188 0.2650 0.0249 0.205 -0.148 -0.356
## wins -0.2907 -0.613 -1.257 0.0312 2.6069 -0.613 0.997 -0.935
## CV residual -0.0849 -0.266 -1.069 -0.2338 2.5820 -0.818 1.145 -0.579
## 144 156 157 158 160 163 165 167
## Predicted 0.0968 0.0739 0.5195 -0.3294 0.03462 -0.1140 -0.3512 -0.316
## cvpred 0.0213 0.0627 0.3819 -0.4828 0.00584 -0.0948 -0.4503 -0.402
## wins -0.2907 0.3532 0.3532 0.0312 -0.61267 0.6752 0.0312 -1.579
## CV residual -0.3121 0.2905 -0.0287 0.5140 -0.61851 0.7700 0.4816 -1.176
## 171 174 182 184 190 199 201 203
## Predicted 0.0477 0.061 0.0565 0.1150 -0.3759 -0.154 -0.0497 0.1384
## cvpred -0.0645 -0.137 -0.0996 0.0774 -0.2932 -0.246 -0.1102 0.0834
## wins 0.0312 2.285 0.3532 -0.6127 -0.2907 1.319 1.6410 0.3532
## CV residual 0.0958 2.422 0.4528 -0.6900 0.0025 1.565 1.7512 0.2698
## 207 210 218 225 231 234 238
## Predicted -0.301 -0.107 0.294 -0.183 -0.194 0.244 0.0481
## cvpred -0.267 -0.136 0.209 -0.274 -0.297 0.116 -0.0921
## wins -0.613 1.641 -0.613 -0.613 1.963 1.641 1.3191
## CV residual -0.346 1.777 -0.821 -0.339 2.260 1.525 1.4112
##
## Sum of squares = 58.2 Mean square = 1.21 n = 48
##
## fold 4
## Observations in test set: 47
## 9 13 25 27 29 34 44 45
## Predicted -0.196 -0.359 1.106 -0.2179 -0.237 -0.0857 0.0169 0.0248
## cvpred -0.132 -0.242 0.980 -0.1708 -0.110 -0.0353 0.1133 0.0870
## wins 0.353 -1.579 1.963 0.0312 -1.579 0.0312 -0.9346 0.3532
## CV residual 0.485 -1.337 0.983 0.2020 -1.468 0.0665 -1.0479 0.2662
## 49 51 54 64 65 68 75 97
## Predicted -0.0238 0.0816 -0.0850 0.166 -0.502 -0.0361 0.437 -0.0858
## cvpred 0.0539 0.1637 0.0442 0.240 -0.394 0.0180 0.521 -0.0547
## wins -0.9346 0.3532 -1.2566 0.675 0.997 -0.6127 -1.257 -1.5785
## CV residual -0.9885 0.1895 -1.3007 0.435 1.391 -0.6306 -1.777 -1.5239
## 101 106 107 113 117 123 129 130
## Predicted 0.578 0.385 0.0912 -0.230 -0.268 -0.0745 0.127 0.3227
## cvpred 0.647 0.462 0.1349 -0.148 -0.200 -0.0197 0.198 0.3919
## wins 0.675 0.675 -0.6127 -0.613 -0.935 0.0312 -0.613 0.0312
## CV residual 0.028 0.213 -0.7476 -0.465 -0.735 0.0509 -0.810 -0.3606
## 134 137 147 148 154 161 169 177
## Predicted -0.0205 -0.0823 -0.291 0.154 0.405 0.0420 -0.267 0.2221
## cvpred 0.0228 -0.0734 -0.262 0.236 0.327 0.0519 -0.233 0.2101
## wins 1.3191 -0.6127 -0.613 -0.935 -1.257 0.3532 -1.579 0.0312
## CV residual 1.2963 -0.5392 -0.350 -1.170 -1.584 0.3013 -1.345 -0.1789
## 180 181 185 189 193 195 197 200
## Predicted 0.0417 -0.01175 0.601 -0.3554 -0.3434 -0.00526 0.0286 -0.241
## cvpred 0.0104 0.00461 0.686 -0.3532 -0.3502 0.01845 0.0441 -0.201
## wins 0.3532 0.67516 -0.291 0.0312 -0.2907 -1.25658 0.9971 0.353
## CV residual 0.3428 0.67055 -0.976 0.3845 0.0595 -1.27503 0.9530 0.554
## 204 212 219 221 224 226 229
## Predicted 0.0141 -0.191 0.1245 0.289 -0.0281 -0.0168 0.0983
## cvpred 0.0513 -0.188 0.0555 0.252 -0.0467 0.0122 0.0584
## wins 1.6410 -0.935 -0.6127 2.607 -0.9346 -0.6127 0.0312
## CV residual 1.5898 -0.747 -0.6682 2.355 -0.8879 -0.6248 -0.0271
##
## Sum of squares = 41.9 Mean square = 0.89 n = 47
##
## fold 5
## Observations in test set: 47
## 1 10 11 12 16 22 28 30 32
## Predicted -0.212 -0.0501 0.389 0.180 -0.255 0.1170 -0.279 0.0567 -0.124
## cvpred -0.211 -0.0516 0.300 0.130 -0.242 0.0746 -0.249 0.0393 -0.124
## wins 1.963 -0.9346 0.675 -0.613 -1.257 0.3532 -0.613 1.6410 -0.291
## CV residual 2.174 -0.8830 0.375 -0.743 -1.014 0.2786 -0.364 1.6017 -0.166
## 39 41 50 53 58 59 62 83
## Predicted 0.268 0.292 -0.442 -0.253 0.3264 -0.2970 -0.4400 0.668
## cvpred 0.216 0.214 -0.400 -0.204 0.2566 -0.2689 -0.3781 0.545
## wins 0.997 -0.291 -0.935 0.353 0.0312 -0.2907 0.0312 1.963
## CV residual 0.781 -0.505 -0.535 0.557 -0.2254 -0.0218 0.4094 1.418
## 84 87 92 93 94 95 98 104
## Predicted -0.414 0.0630 0.189 0.161 -0.0155 0.1290 0.0834 0.397
## cvpred -0.387 0.0539 0.152 0.134 -0.0225 0.1000 0.0762 0.338
## wins -0.935 -1.2566 -1.257 1.963 -0.6127 0.0312 0.0312 0.675
## CV residual -0.548 -1.3105 -1.409 1.829 -0.5902 -0.0687 -0.0449 0.337
## 105 109 120 125 127 136 142 143
## Predicted 0.0925 0.0258 0.182 0.133 0.255 0.184 -0.00952 0.0682
## cvpred 0.0934 0.0149 0.168 0.106 0.230 0.166 -0.00954 0.0517
## wins -0.6127 0.0312 -0.291 1.641 1.319 -0.291 0.03124 2.2849
## CV residual -0.7060 0.0163 -0.459 1.535 1.089 -0.457 0.04079 2.2332
## 145 151 162 168 172 179 186
## Predicted -0.2328 -0.5814 0.1066 0.026599 0.0604 -0.00834 -0.0853
## cvpred -0.1964 -0.5164 0.0961 -0.000214 0.0388 0.00432 -0.0777
## wins -0.2907 -0.6127 -0.2907 -0.612667 -1.2566 0.35320 -0.6127
## CV residual -0.0943 -0.0963 -0.3868 -0.612453 -1.2954 0.34888 -0.5350
## 188 196 206 216 220 222 227
## Predicted 0.113 -0.488 -0.1111 -0.1158 -0.205 -0.685 0.190
## cvpred 0.106 -0.414 -0.0916 -0.0681 -0.141 -0.537 0.192
## wins 0.675 -0.613 -0.2907 0.3532 -0.613 -0.935 -1.257
## CV residual 0.569 -0.199 -0.1991 0.4213 -0.472 -0.398 -1.449
##
## Sum of squares = 36.7 Mean square = 0.78 n = 47
##
## Overall (Sum over all 47 folds)
## ms
## 0.976