# Load the quarterback data set.
qb_stats <- read.csv("../data/qb_stats.csv")

# College-era predictor columns that go into the model.
predictors <- c(
  "height", "weight", "age",
  "c_avg_cmpp", "c_rate", "c_pct",
  "c_avg_inter", "c_avg_tds", "c_avg_yds",
  "c_numyrs", "c_avg_att"
)
college_stats <- qb_stats[predictors]

# Response variable, kept as a one-column data frame so that cbind() below
# carries the column name "yds" into the modelling frame.
yds <- qb_stats["yds"]

# Build the modelling frame: response first, then predictors; rows with any
# NA are dropped, and every column (response included) is transformed as
# log(x + 0.1).
# NOTE(review): the 0.1 offset maps a raw value of 0 to log(0.1) = -2.30. The
# cross-validation listing further down shows an observation with yds = -2.30
# and a CV residual of -9.84, so zero-valued rows appear to act as extreme
# outliers in the fit -- confirm whether they should be excluded instead.
data.log.no_combine.for_yds <- data.frame(
  log(na.omit(cbind(yds, college_stats)) + 0.1)
)
# Full model: log-transformed yds regressed on every log-transformed college
# predictor in the modelling frame.
lm.log.no_combine.yds <- lm(yds ~ ., data = data.log.no_combine.for_yds)

# Stepwise AIC search starting from the full model; direction = "both" lets
# each step either drop a term or re-add one that was dropped earlier.
# NOTE(review): stepAIC() is presumably MASS::stepAIC; no library() call is
# visible in this file, so confirm MASS is attached before this line runs.
step_reg.log.no_combine.yds <- stepAIC(
  lm.log.no_combine.yds,
  direction = "both"
)
## Start: AIC=-123.2
## yds ~ height + weight + age + c_avg_cmpp + c_rate + c_pct + c_avg_inter +
## c_avg_tds + c_avg_yds + c_numyrs + c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_numyrs 1 0.004 129 -125
## - c_avg_tds 1 0.009 129 -125
## - height 1 0.018 129 -125
## - c_rate 1 0.039 129 -125
## - c_avg_inter 1 0.075 129 -125
## - c_avg_yds 1 0.307 129 -125
## - age 1 0.807 130 -124
## <none> 129 -123
## - c_pct 1 1.154 130 -123
## - c_avg_cmpp 1 1.257 130 -123
## - c_avg_att 1 1.414 130 -123
## - weight 1 1.860 131 -122
##
## Step: AIC=-125.2
## yds ~ height + weight + age + c_avg_cmpp + c_rate + c_pct + c_avg_inter +
## c_avg_tds + c_avg_yds + c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_avg_tds 1 0.007 129 -127
## - height 1 0.020 129 -127
## - c_rate 1 0.035 129 -127
## - c_avg_inter 1 0.091 129 -127
## - c_avg_yds 1 0.304 129 -127
## - age 1 0.810 130 -126
## <none> 129 -125
## - c_pct 1 1.267 130 -125
## - c_avg_cmpp 1 1.362 130 -125
## - c_avg_att 1 1.513 131 -124
## - weight 1 1.886 131 -124
## + c_numyrs 1 0.004 129 -123
##
## Step: AIC=-127.2
## yds ~ height + weight + age + c_avg_cmpp + c_rate + c_pct + c_avg_inter +
## c_avg_yds + c_avg_att
##
## Df Sum of Sq RSS AIC
## - height 1 0.020 129 -129
## - c_rate 1 0.052 129 -129
## - c_avg_inter 1 0.213 129 -129
## - c_avg_yds 1 0.518 130 -128
## - age 1 0.805 130 -128
## <none> 129 -127
## - c_pct 1 1.367 130 -127
## - c_avg_cmpp 1 1.418 130 -127
## - c_avg_att 1 1.528 131 -126
## - weight 1 1.943 131 -126
## + c_avg_tds 1 0.007 129 -125
## + c_numyrs 1 0.002 129 -125
##
## Step: AIC=-129.2
## yds ~ weight + age + c_avg_cmpp + c_rate + c_pct + c_avg_inter +
## c_avg_yds + c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_rate 1 0.05 129 -131
## - c_avg_inter 1 0.22 129 -131
## - c_avg_yds 1 0.52 130 -130
## - age 1 0.83 130 -130
## <none> 129 -129
## - c_pct 1 1.44 131 -128
## - c_avg_cmpp 1 1.50 131 -128
## - c_avg_att 1 1.61 131 -128
## + height 1 0.02 129 -127
## + c_avg_tds 1 0.01 129 -127
## + c_numyrs 1 0.00 129 -127
## - weight 1 3.46 133 -125
##
## Step: AIC=-131.1
## yds ~ weight + age + c_avg_cmpp + c_pct + c_avg_inter + c_avg_yds +
## c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_avg_inter 1 0.71 130 -132
## - age 1 0.90 130 -131
## <none> 129 -131
## - c_pct 1 1.43 131 -130
## - c_avg_cmpp 1 1.47 131 -130
## - c_avg_att 1 1.56 131 -130
## - c_avg_yds 1 1.92 131 -130
## + c_rate 1 0.05 129 -129
## + c_avg_tds 1 0.02 129 -129
## + height 1 0.02 129 -129
## + c_numyrs 1 0.00 129 -129
## - weight 1 3.56 133 -127
##
## Step: AIC=-131.7
## yds ~ weight + age + c_avg_cmpp + c_pct + c_avg_yds + c_avg_att
##
## Df Sum of Sq RSS AIC
## - age 1 0.854 131 -132
## - c_pct 1 0.854 131 -132
## - c_avg_cmpp 1 0.882 131 -132
## - c_avg_att 1 0.956 131 -132
## <none> 130 -132
## + c_avg_inter 1 0.713 129 -131
## + c_rate 1 0.548 129 -131
## - c_avg_yds 1 1.644 132 -131
## + c_avg_tds 1 0.100 130 -130
## + height 1 0.038 130 -130
## + c_numyrs 1 0.033 130 -130
## - weight 1 2.986 133 -128
##
## Step: AIC=-132.2
## yds ~ weight + c_avg_cmpp + c_pct + c_avg_yds + c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_pct 1 0.774 132 -133
## - c_avg_cmpp 1 0.813 132 -133
## - c_avg_att 1 0.881 132 -133
## <none> 131 -132
## + age 1 0.854 130 -132
## - c_avg_yds 1 1.419 132 -132
## + c_avg_inter 1 0.664 130 -131
## + c_rate 1 0.650 130 -131
## + c_avg_tds 1 0.188 131 -130
## + height 1 0.076 131 -130
## + c_numyrs 1 0.048 131 -130
## - weight 1 2.467 133 -130
##
## Step: AIC=-132.8
## yds ~ weight + c_avg_cmpp + c_avg_yds + c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_avg_cmpp 1 0.29 132 -134
## - c_avg_yds 1 1.08 133 -133
## <none> 132 -133
## + c_pct 1 0.77 131 -132
## + age 1 0.77 131 -132
## + c_rate 1 0.24 131 -131
## + height 1 0.14 131 -131
## + c_avg_inter 1 0.13 131 -131
## + c_numyrs 1 0.12 131 -131
## + c_avg_tds 1 0.05 132 -131
## - c_avg_att 1 2.38 134 -130
## - weight 1 3.42 135 -129
##
## Step: AIC=-134.2
## yds ~ weight + c_avg_yds + c_avg_att
##
## Df Sum of Sq RSS AIC
## <none> 132 -134
## + age 1 0.92 131 -134
## + c_avg_cmpp 1 0.29 132 -133
## + c_pct 1 0.25 132 -133
## + height 1 0.20 132 -133
## + c_numyrs 1 0.09 132 -132
## + c_avg_tds 1 0.07 132 -132
## + c_avg_inter 1 0.06 132 -132
## + c_rate 1 0.00 132 -132
## - c_avg_yds 1 2.70 134 -131
## - c_avg_att 1 2.75 135 -131
## - weight 1 4.00 136 -129
# Coefficient table and fit statistics for the model chosen by the stepwise
# search (the recorded result keeps weight, c_avg_yds and c_avg_att).
summary(step_reg.log.no_combine.yds)
##
## Call:
## lm(formula = yds ~ weight + c_avg_yds + c_avg_att, data = data.log.no_combine.for_yds)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.590 -0.213 0.071 0.315 0.904
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.320 4.010 -1.08 0.2825
## weight 2.022 0.757 2.67 0.0081 **
## c_avg_yds 0.733 0.334 2.19 0.0292 *
## c_avg_att -0.820 0.370 -2.22 0.0277 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.749 on 235 degrees of freedom
## Multiple R-squared: 0.0512, Adjusted R-squared: 0.0391
## F-statistic: 4.23 on 3 and 235 DF, p-value: 0.00621
# Plot the selected model object (for a fitted lm this presumably produces the
# standard residual diagnostic plots).
plot(step_reg.log.no_combine.yds)
# Best-subsets search over the same predictors, keeping up to the 10 best
# models of each size.
leaps.log.no_combine.yds <- regsubsets(yds ~ ., data = data.log.no_combine.for_yds,
    nbest = 10)
# Plot R-squared against subset size. By default subsets() asks for the legend
# position with a mouse click; in a non-interactive (knitted or batch) run no
# click is available, and legend() then stops with
# "invalid coordinate lengths". legend = FALSE skips the interactive legend so
# the plot is drawn.
# NOTE(review): with the legend suppressed, subsets() is expected to return the
# table of predictor abbreviations instead -- confirm against the installed
# car version.
subsets(leaps.log.no_combine.yds, statistic = "rsq", legend = FALSE)
# 5-fold cross-validation of the stepwise-selected model.
# The data frame is passed positionally rather than as `df = ...`: DAAG renamed
# this first argument from `df` to `data` in later releases, and a positional
# call works under either signature.
# NOTE(review): confirm the argument name against the installed DAAG version.
cv.lm(data.log.no_combine.for_yds, step_reg.log.no_combine.yds, m = 5) # 5 fold cross-validation
## Analysis of Variance Table
##
## Response: yds
## Df Sum Sq Mean Sq F value Pr(>F)
## weight 1 4.3 4.35 7.76 0.0058 **
## c_avg_yds 1 0.0 0.01 0.01 0.9173
## c_avg_att 1 2.8 2.75 4.91 0.0277 *
## Residuals 235 131.8 0.56
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Warning:
##
## As there is >1 explanatory variable, cross-validation predicted values for
## a fold are not a linear function of corresponding overall predicted
## values. Lines that are shown for the different folds are approximate
##
## fold 1
## Observations in test set: 47
## 3 12 15 20 22 43 44 45 49
## Predicted 7.672 7.6035 7.49 7.678 7.658 7.725 7.647 7.62322 7.656
## cvpred 7.659 7.5903 7.48 7.661 7.644 7.713 7.633 7.61295 7.647
## yds 8.127 7.5782 7.36 7.526 8.304 7.168 7.425 7.61830 6.774
## CV residual 0.468 -0.0121 -0.12 -0.136 0.659 -0.545 -0.208 0.00535 -0.873
## 57 58 61 63 70 71 75 78 80 94
## Predicted 7.512 7.8633 7.674 7.68 7.657 7.626 7.811 7.89 7.488 7.532
## cvpred 7.502 7.8446 7.663 7.67 7.642 7.616 7.795 7.87 7.481 7.523
## yds 8.077 7.9442 7.412 8.05 7.349 8.251 7.983 8.12 7.662 7.698
## CV residual 0.575 0.0996 -0.251 0.38 -0.293 0.635 0.189 0.25 0.181 0.175
## 97 100 105 106 108 116 122 135 139
## Predicted 7.495 7.755 7.63 7.740 7.628 7.508 7.478 7.227 7.6425
## cvpred 7.488 7.741 7.62 7.725 7.620 7.497 7.470 7.226 7.6439
## yds 7.213 7.842 7.92 7.928 7.413 7.005 7.853 7.048 7.6587
## CV residual -0.275 0.101 0.30 0.203 -0.207 -0.492 0.384 -0.178 0.0149
## 143 147 148 160 167 173 175 179 180
## Predicted 7.841 7.356 7.618 7.512 7.543 7.559 7.4750 7.461 7.354
## cvpred 7.835 7.353 7.606 7.506 7.543 7.553 7.4715 7.459 7.356
## yds 8.321 7.911 7.733 8.113 7.026 8.071 7.4928 7.303 7.738
## CV residual 0.485 0.558 0.126 0.607 -0.518 0.518 0.0213 -0.157 0.382
## 183 188 197 200 202 206 215 218 220
## Predicted 7.554 7.584 7.59419 7.392 7.3750 7.43 7.269 7.534 7.113
## cvpred 7.544 7.579 7.59031 7.388 7.3670 7.43 7.274 7.531 7.119
## yds 6.716 7.365 7.59945 7.592 7.3186 7.10 6.790 7.234 7.354
## CV residual -0.828 -0.214 0.00914 0.205 -0.0484 -0.33 -0.484 -0.297 0.235
## 225
## Predicted 7.42
## cvpred 7.42
## yds 7.80
## CV residual 0.38
##
## Sum of squares = 6.71 Mean square = 0.14 n = 47
##
## fold 2
## Observations in test set: 48
## 8 11 16 18 21 26 28 31 32
## Predicted 7.549 7.890 7.510 7.582 7.639 8.034 7.317 7.564 7.6666
## cvpred 7.545 7.924 7.491 7.556 7.614 8.044 7.302 7.544 7.6522
## yds 8.131 7.455 7.351 7.965 7.200 7.793 7.334 8.160 7.7147
## CV residual 0.586 -0.469 -0.139 0.409 -0.415 -0.252 0.032 0.616 0.0626
## 33 35 36 39 42 54 56 66 68
## Predicted 7.397 7.4625 7.625 7.7655 7.548 7.399 7.631 7.756 7.586
## cvpred 7.401 7.4432 7.626 7.7850 7.536 7.384 7.641 7.813 7.590
## yds 7.555 7.4805 7.332 7.6958 7.806 7.682 7.116 7.985 7.738
## CV residual 0.154 0.0373 -0.293 -0.0892 0.269 0.298 -0.526 0.172 0.148
## 72 81 83 90 91 107 109 110 115
## Predicted 7.668 7.552 8.017 7.466 7.879 7.6689 7.544 7.727 7.6797
## cvpred 7.685 7.565 8.017 7.462 7.889 7.6906 7.560 7.715 7.6952
## yds 7.953 7.406 8.278 7.133 7.162 7.6751 8.024 7.822 7.7320
## CV residual 0.268 -0.159 0.261 -0.329 -0.727 -0.0154 0.465 0.107 0.0368
## 125 126 133 137 141 142 146 152 164
## Predicted 7.6987 7.603 7.33 7.4371 7.320 7.5526 7.596 7.510 7.523
## cvpred 7.7453 7.630 7.34 7.4508 7.313 7.5685 7.581 7.505 7.513
## yds 7.6862 7.144 7.66 7.3512 7.455 7.6324 7.708 7.364 7.975
## CV residual -0.0591 -0.485 0.32 -0.0996 0.141 0.0639 0.127 -0.141 0.461
## 171 186 189 198 201 203 212 216 227
## Predicted 7.486 7.691 7.197 7.477 7.466 7.5991 7.314 7.311 7.549
## cvpred 7.489 7.775 7.211 7.488 7.467 7.5949 7.304 7.272 7.586
## yds 7.745 7.530 6.798 6.820 7.598 7.6770 7.526 8.009 7.398
## CV residual 0.256 -0.245 -0.413 -0.668 0.131 0.0821 0.222 0.737 -0.188
## 229 230 240
## Predicted 7.346 7.52 7.2423
## cvpred 7.363 7.56 7.3388
## yds 7.087 5.25 7.2620
## CV residual -0.276 -2.31 -0.0768
##
## Sum of squares = 10.4 Mean square = 0.22 n = 48
##
## fold 3
## Observations in test set: 48
## 4 6 19 23 41 59 60 64 69 73
## Predicted 7.57 7.587 7.731 7.779 7.689 7.534 7.481 7.610 7.621 7.59
## cvpred 7.53 7.530 7.704 7.718 7.666 7.482 7.444 7.582 7.600 7.54
## yds 8.10 7.685 7.801 8.214 7.843 7.681 7.089 7.950 7.860 7.74
## CV residual 0.57 0.155 0.097 0.496 0.177 0.199 -0.355 0.369 0.261 0.20
## 77 79 84 88 89 95 113 117 119 120
## Predicted 7.517 7.500 7.511 7.707 7.598 7.731 7.472 7.421 7.40 7.7265
## cvpred 7.485 7.461 7.467 7.675 7.564 7.706 7.440 7.381 7.34 7.6914
## yds 6.977 7.926 7.803 8.227 7.686 7.769 7.568 6.803 8.03 7.6271
## CV residual -0.508 0.465 0.336 0.551 0.122 0.063 0.128 -0.579 0.69 -0.0643
## 123 124 127 131 132 134 136 138 140 149
## Predicted 7.499 7.499 7.725 7.198 7.416 7.57 7.619 7.516 7.671 7.519
## cvpred 7.470 7.475 7.685 7.114 7.368 7.52 7.587 7.489 7.655 7.506
## yds 8.039 7.606 8.234 7.876 7.665 7.39 7.404 7.932 7.863 7.008
## CV residual 0.569 0.131 0.549 0.762 0.298 -0.13 -0.183 0.444 0.208 -0.498
## 150 153 154 158 163 177 178 182 192
## Predicted 7.600 7.320 7.5117 7.368 7.572 7.6059 7.226 7.5219 7.534
## cvpred 7.560 7.268 7.4608 7.365 7.528 7.5739 7.172 7.5161 7.495
## yds 8.144 7.597 7.3995 8.153 7.701 7.6203 8.131 7.6020 7.416
## CV residual 0.585 0.328 -0.0613 0.787 0.173 0.0464 0.958 0.0858 -0.079
## 196 208 209 213 221 223 226 228 235
## Predicted 7.283 7.443 7.374 7.354 7.541 7.431 7.589 7.4391 7.307
## cvpred 7.232 7.405 7.337 7.336 7.494 7.403 7.568 7.4096 7.270
## yds 7.457 7.143 7.200 7.089 8.080 7.705 7.426 7.3403 7.013
## CV residual 0.225 -0.262 -0.136 -0.246 0.586 0.302 -0.142 -0.0694 -0.257
##
## Sum of squares = 7.4 Mean square = 0.15 n = 48
##
## fold 4
## Observations in test set: 48
## 2 5 7 9 14 17 24 37 51
## Predicted 7.805 7.725 8.119 7.310 7.881 7.68575 7.539 7.29 7.665
## cvpred 7.779 7.756 7.878 7.425 7.800 7.73194 7.670 7.54 7.726
## yds 8.383 7.503 8.307 7.823 8.164 7.72626 8.143 -2.30 7.971
## CV residual 0.604 -0.252 0.429 0.397 0.364 -0.00569 0.473 -9.84 0.245
## 52 53 55 62 74 76 85 86
## Predicted 7.7906 7.367 7.535 7.418 7.5522 7.5395 7.435 7.3353
## cvpred 7.8162 7.552 7.634 7.622 7.6519 7.6448 7.578 7.4601
## yds 7.8713 7.828 6.950 7.139 7.6286 7.6124 7.425 7.4056
## CV residual 0.0551 0.276 -0.684 -0.483 -0.0233 -0.0324 -0.153 -0.0545
## 87 92 93 99 102 111 118 128 129
## Predicted 7.69 7.70 7.661 7.661 7.534 7.624 7.3742 7.687 7.681
## cvpred 7.71 7.67 7.641 7.561 7.577 7.581 7.4864 7.649 7.725
## yds 6.88 6.87 8.013 7.699 8.046 7.949 7.3995 7.467 7.299
## CV residual -0.83 -0.80 0.372 0.137 0.469 0.368 -0.0869 -0.182 -0.426
## 130 151 155 156 157 159 162 165 166
## Predicted 7.888 7.3311 7.488 7.5436 7.4390 7.260 7.626 7.305 7.523
## cvpred 7.779 7.5293 7.595 7.6453 7.4894 7.459 7.680 7.397 7.610
## yds 8.083 7.5192 8.080 7.6672 7.5700 7.611 7.443 7.952 7.416
## CV residual 0.305 -0.0101 0.485 0.0219 0.0805 0.153 -0.237 0.555 -0.194
## 169 185 191 194 195 199 205 207 210
## Predicted 7.383 7.719 7.365 7.694 7.548 7.337 7.41 7.377 7.497
## cvpred 7.514 7.681 7.473 7.510 7.562 7.434 7.51 7.558 7.588
## yds 7.155 7.065 6.971 7.342 7.384 6.845 6.44 7.060 7.871
## CV residual -0.358 -0.617 -0.502 -0.168 -0.178 -0.589 -1.07 -0.499 0.282
## 231 236 237 239
## Predicted 7.252 7.692 7.723 7.24
## cvpred 7.370 7.535 7.591 7.40
## yds 7.923 7.377 6.923 6.36
## CV residual 0.553 -0.158 -0.668 -1.04
##
## Sum of squares = 106 Mean square = 2.21 n = 48
##
## fold 5
## Observations in test set: 48
## 1 10 13 25 27 29 30 34 38
## Predicted 7.436 7.6339 7.560 7.304 7.445 7.599 7.626 7.567 7.65
## cvpred 7.400 7.6122 7.506 7.042 7.386 7.565 7.607 7.504 7.65
## yds 8.045 7.5246 7.340 7.997 7.624 7.388 8.239 7.396 8.13
## CV residual 0.645 -0.0876 -0.165 0.955 0.238 -0.178 0.633 -0.108 0.48
## 40 46 47 48 50 65 67 82
## Predicted 7.55961 7.5784 7.4504 7.470 7.445 7.394 7.5838 7.711
## cvpred 7.54016 7.5417 7.4080 7.409 7.378 7.321 7.5813 7.760
## yds 7.53802 7.5332 7.3512 7.055 7.818 8.097 7.5099 7.133
## CV residual -0.00215 -0.0085 -0.0568 -0.354 0.441 0.776 -0.0714 -0.626
## 96 98 101 103 104 112 114 121 144
## Predicted 7.582 7.71228 7.438 7.56 7.9201 7.746 7.524 7.535 7.616
## cvpred 7.566 7.71897 7.458 7.52 7.9836 7.756 7.461 7.500 7.617
## yds 7.462 7.72228 7.976 7.68 7.9073 7.480 8.079 7.617 8.187
## CV residual -0.104 0.00331 0.518 0.16 -0.0763 -0.276 0.618 0.117 0.569
## 145 161 168 170 172 174 176 181 184
## Predicted 7.36 7.564 7.472 7.370 7.736 7.48 7.4938 7.452 7.56418
## cvpred 7.29 7.572 7.481 7.321 7.809 7.50 7.4474 7.454 7.53051
## yds 8.09 7.769 7.314 7.766 7.398 8.10 7.5230 7.861 7.54014
## CV residual 0.80 0.197 -0.167 0.445 -0.411 0.60 0.0756 0.407 0.00964
## 187 190 193 204 211 214 217 219 222
## Predicted 7.2903 7.3734 7.1652 7.5098 7.69 7.214 7.435 7.207 6.79
## cvpred 7.1848 7.2704 7.0114 7.5257 7.69 7.123 7.394 7.063 6.41
## yds 7.1286 7.2897 7.0742 7.5401 6.65 7.268 7.016 7.370 7.50
## CV residual -0.0562 0.0193 0.0628 0.0144 -1.04 0.145 -0.378 0.307 1.09
## 224 232 233 238
## Predicted 7.453 7.275 7.480 7.682
## cvpred 7.434 7.186 7.421 7.744
## yds 7.559 7.599 7.757 7.390
## CV residual 0.125 0.414 0.336 -0.354
##
## Sum of squares = 9.08 Mean square = 0.19 n = 48
##
## Overall (Sum over all 48 folds)
## ms
## 0.585