# Load the quarterback data set.
qb_stats <- read.csv(file = "../data/qb_stats.csv")

# Columns used as predictors (referred to as the college predictors).
predictors <- c(
  "height", "weight", "age",
  "c_avg_cmpp", "c_rate", "c_pct", "c_avg_inter", "c_avg_tds",
  "c_avg_yds", "c_numyrs", "c_avg_att"
)
college_stats <- qb_stats[, predictors, drop = FALSE]

# Response variable, kept as a one-column data frame so the column name
# "sacked" survives the cbind() below.
sacks <- qb_stats[, "sacked", drop = FALSE]

# Build the modelling data set: bind response and predictors, drop rows with
# any missing value, then center and scale every column (response included).
data.scaled.no_combine.for_sacks <- data.frame(
  scale(
    na.omit(
      cbind(sacks, college_stats)
    )
  )
)

# Full linear model: sacked regressed on every other column.
lm.scaled.no_combine.sacks <- lm(
  formula = sacked ~ .,
  data = data.scaled.no_combine.for_sacks
)

# Stepwise (add and drop) AIC search starting from the full model.
step_reg.scaled.no_combine.sacks <- stepAIC(
  object = lm.scaled.no_combine.sacks,
  direction = "both"
)
## Start: AIC=13.46
## sacked ~ height + weight + age + c_avg_cmpp + c_rate + c_pct +
## c_avg_inter + c_avg_tds + c_avg_yds + c_numyrs + c_avg_att
##
## Df Sum of Sq RSS AIC
## - height 1 0.000 194 11.5
## - c_avg_cmpp 1 0.036 194 11.5
## - c_rate 1 0.211 194 11.7
## - c_avg_att 1 0.326 194 11.8
## - weight 1 0.361 194 11.8
## - c_pct 1 0.513 194 12.0
## - c_avg_tds 1 0.793 194 12.3
## - c_numyrs 1 0.989 195 12.5
## - c_avg_inter 1 1.666 195 13.2
## - age 1 1.867 196 13.4
## <none> 194 13.5
## - c_avg_yds 1 2.043 196 13.6
##
## Step: AIC=11.46
## sacked ~ weight + age + c_avg_cmpp + c_rate + c_pct + c_avg_inter +
## c_avg_tds + c_avg_yds + c_numyrs + c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_avg_cmpp 1 0.036 194 9.50
## - c_rate 1 0.211 194 9.69
## - c_avg_att 1 0.327 194 9.81
## - c_pct 1 0.515 194 10.01
## - weight 1 0.542 194 10.03
## - c_avg_tds 1 0.793 194 10.30
## - c_numyrs 1 1.028 195 10.54
## - c_avg_inter 1 1.670 195 11.22
## - age 1 1.869 196 11.42
## <none> 194 11.46
## - c_avg_yds 1 2.044 196 11.61
## + height 1 0.000 194 13.46
##
## Step: AIC=9.5
## sacked ~ weight + age + c_rate + c_pct + c_avg_inter + c_avg_tds +
## c_avg_yds + c_numyrs + c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_rate 1 0.177 194 7.69
## - weight 1 0.538 194 8.07
## - c_pct 1 0.711 194 8.25
## - c_avg_tds 1 0.898 195 8.45
## - c_numyrs 1 0.992 195 8.54
## - c_avg_att 1 1.103 195 8.66
## - age 1 1.849 196 9.44
## <none> 194 9.50
## - c_avg_inter 1 1.983 196 9.58
## - c_avg_yds 1 2.312 196 9.92
## + c_avg_cmpp 1 0.036 194 11.46
## + height 1 0.000 194 11.50
##
## Step: AIC=7.69
## sacked ~ weight + age + c_pct + c_avg_inter + c_avg_tds + c_avg_yds +
## c_numyrs + c_avg_att
##
## Df Sum of Sq RSS AIC
## - weight 1 0.510 194 6.23
## - c_pct 1 0.664 195 6.39
## - c_avg_att 1 1.032 195 6.77
## - c_numyrs 1 1.067 195 6.81
## - c_avg_tds 1 1.517 196 7.28
## - age 1 1.731 196 7.50
## - c_avg_inter 1 1.897 196 7.67
## <none> 194 7.69
## - c_avg_yds 1 2.244 196 8.04
## + c_rate 1 0.177 194 9.50
## + c_avg_cmpp 1 0.002 194 9.69
## + height 1 0.001 194 9.69
##
## Step: AIC=6.23
## sacked ~ age + c_pct + c_avg_inter + c_avg_tds + c_avg_yds +
## c_numyrs + c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_pct 1 0.590 195 4.84
## - c_avg_att 1 1.144 196 5.42
## - c_numyrs 1 1.164 196 5.44
## - age 1 1.389 196 5.68
## - c_avg_tds 1 1.498 196 5.79
## - c_avg_inter 1 1.542 196 5.84
## <none> 194 6.23
## - c_avg_yds 1 2.228 197 6.55
## + weight 1 0.510 194 7.69
## + height 1 0.160 194 8.06
## + c_rate 1 0.150 194 8.07
## + c_avg_cmpp 1 0.001 194 8.22
##
## Step: AIC=4.84
## sacked ~ age + c_avg_inter + c_avg_tds + c_avg_yds + c_numyrs +
## c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_numyrs 1 0.90 196 3.78
## - age 1 1.17 196 4.06
## - c_avg_att 1 1.50 196 4.40
## - c_avg_tds 1 1.69 197 4.60
## <none> 195 4.84
## - c_avg_inter 1 2.37 197 5.31
## + c_pct 1 0.59 194 6.23
## - c_avg_yds 1 3.41 198 6.38
## + weight 1 0.44 195 6.39
## + c_avg_cmpp 1 0.27 195 6.56
## + c_rate 1 0.12 195 6.72
## + height 1 0.09 195 6.75
##
## Step: AIC=3.78
## sacked ~ age + c_avg_inter + c_avg_tds + c_avg_yds + c_avg_att
##
## Df Sum of Sq RSS AIC
## - age 1 1.05 197 2.87
## - c_avg_tds 1 1.76 198 3.61
## - c_avg_inter 1 1.84 198 3.69
## <none> 196 3.78
## - c_avg_att 1 2.01 198 3.86
## + c_numyrs 1 0.90 195 4.84
## + weight 1 0.53 195 5.23
## + c_pct 1 0.33 196 5.44
## + c_avg_cmpp 1 0.30 196 5.47
## - c_avg_yds 1 3.76 200 5.66
## + height 1 0.04 196 5.74
## + c_rate 1 0.02 196 5.76
##
## Step: AIC=2.87
## sacked ~ c_avg_inter + c_avg_tds + c_avg_yds + c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_avg_tds 1 1.43 198 2.34
## - c_avg_att 1 1.76 199 2.68
## <none> 197 2.87
## - c_avg_inter 1 1.99 199 2.92
## + age 1 1.05 196 3.78
## + c_numyrs 1 0.78 196 4.06
## - c_avg_yds 1 3.42 200 4.38
## + weight 1 0.22 197 4.65
## + c_pct 1 0.19 197 4.67
## + c_avg_cmpp 1 0.19 197 4.68
## + c_rate 1 0.02 197 4.85
## + height 1 0.01 197 4.86
##
## Step: AIC=2.34
## sacked ~ c_avg_inter + c_avg_yds + c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_avg_att 1 1.038 200 1.41
## - c_avg_inter 1 1.406 200 1.78
## <none> 198 2.34
## - c_avg_yds 1 2.027 200 2.42
## + c_avg_tds 1 1.427 197 2.87
## + c_numyrs 1 0.859 198 3.46
## + age 1 0.714 198 3.61
## + c_pct 1 0.312 198 4.02
## + c_avg_cmpp 1 0.301 198 4.03
## + weight 1 0.240 198 4.10
## + c_rate 1 0.015 198 4.33
## + height 1 0.009 198 4.33
##
## Step: AIC=1.41
## sacked ~ c_avg_inter + c_avg_yds
##
## Df Sum of Sq RSS AIC
## <none> 200 1.41
## - c_avg_yds 1 2.09 202 1.54
## + c_numyrs 1 1.21 198 2.17
## + c_avg_att 1 1.04 198 2.34
## + c_avg_tds 1 0.71 199 2.68
## + age 1 0.62 199 2.77
## - c_avg_inter 1 3.35 203 2.81
## + c_pct 1 0.47 199 2.93
## + weight 1 0.34 199 3.06
## + c_rate 1 0.25 199 3.15
## + c_avg_cmpp 1 0.20 199 3.20
## + height 1 0.02 199 3.39
# Coefficient table and fit statistics for the model kept by the stepwise
# search.
summary(object = step_reg.scaled.no_combine.sacks)
##
## Call:
## lm(formula = sacked ~ c_avg_inter + c_avg_yds, data = data.scaled.no_combine.for_sacks)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.5214 -0.7665 -0.0516 0.5967 2.8683
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.52e-17 6.97e-02 0.00 1.000
## c_avg_inter -1.61e-01 8.75e-02 -1.84 0.068 .
## c_avg_yds 1.27e-01 8.75e-02 1.45 0.148
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.996 on 201 degrees of freedom
## Multiple R-squared: 0.0174, Adjusted R-squared: 0.00766
## F-statistic: 1.78 on 2 and 201 DF, p-value: 0.171
# Diagnostic plots for the stepwise-selected model.
plot(step_reg.scaled.no_combine.sacks)

# Best-subsets search over the same predictors, keeping up to 10 candidate
# models per subset size.
leaps.scaled.no_combine.sacks <- regsubsets(
  sacked ~ .,
  data = data.scaled.no_combine.for_sacks,
  nbest = 10
)

# Plot R-squared against subset size. The legend is switched off: when this
# script previously ran, the call aborted with "invalid coordinate lengths",
# which is what legend() raises when it is given no coordinates -- the result
# of subsets() asking for a mouse click to place the legend in a
# non-interactive session.
# NOTE(review): with legend = FALSE, subsets() is documented to return the
# table of predictor abbreviations instead of drawing it -- confirm it prints
# here so the plot labels remain interpretable.
subsets(leaps.scaled.no_combine.sacks, statistic = "rsq", legend = FALSE)
# 5 fold cross-validation of the stepwise-selected model.
# The data frame is passed positionally rather than as `df =`: the first
# argument of cv.lm() is named `df` in older DAAG releases and `data` in
# newer ones (NOTE(review): confirm against the installed DAAG version), so
# the positional form works with either. `form.lm` takes the fitted model.
cv.lm(
  data.scaled.no_combine.for_sacks,
  form.lm = step_reg.scaled.no_combine.sacks,
  m = 5
)
## Analysis of Variance Table
##
## Response: sacked
## Df Sum Sq Mean Sq F value Pr(>F)
## c_avg_inter 1 1.4 1.447 1.46 0.23
## c_avg_yds 1 2.1 2.094 2.11 0.15
## Residuals 201 199.5 0.992
## Warning:
##
## As there is >1 explanatory variable, cross-validation predicted values for
## a fold are not a linear function of corresponding overall predicted
## values. Lines that are shown for the different folds are approximate
##
## fold 1
## Observations in test set: 40
## 17 29 30 38 40 45 46 62
## Predicted 0.0125 -0.00280 -0.0888 0.244 0.000743 0.0241 0.171 -0.1220
## cvpred 0.0281 0.00922 -0.0626 0.222 0.029975 0.0466 0.169 -0.0829
## sacked -0.1800 -1.12636 -1.1264 0.104 -0.937090 0.2932 0.388 -0.8425
## CV residual -0.2081 -1.13558 -1.0638 -0.118 -0.967065 0.2466 0.218 -0.7596
## 64 72 79 91 97 98 103 115
## Predicted -0.14145 0.0686 0.0554 0.125 -0.0100 0.0474 0.0390 0.0210
## cvpred -0.10698 0.0909 0.0696 0.127 0.0222 0.0809 0.0692 0.0462
## sacked 0.00928 1.4288 1.0503 -0.369 -0.4639 -1.0317 1.2396 0.1039
## CV residual 0.11626 1.3379 0.9806 -0.496 -0.4861 -1.1126 1.1703 0.0577
## 117 125 126 127 134 135 139
## Predicted -0.08966 0.0551 -0.00144 -0.0541 -0.1082 -0.0975 0.0330
## cvpred -0.05875 0.0779 0.03105 -0.0148 -0.0652 -0.0688 0.0701
## sacked 0.00928 0.4825 -0.36927 -0.9371 -0.7478 -1.4103 -0.5585
## CV residual 0.06803 0.4046 -0.40031 -0.9223 -0.6826 -1.3414 -0.6286
## 141 142 144 148 154 156 166 169
## Predicted -0.0610 0.0271 -0.0176 0.0126 0.0558 -0.14490 0.0279 -0.0508
## cvpred -0.0289 0.0523 0.0153 0.0282 0.0921 -0.09371 0.0440 -0.0136
## sacked -0.0854 -0.8425 1.6181 1.9967 -0.3693 -0.08536 0.1986 -0.9371
## CV residual -0.0565 -0.8948 1.6029 1.9685 -0.4614 0.00835 0.1545 -0.9235
## 174 176 181 183 194 195 197 200
## Predicted -0.1248 -0.0292 -0.191 -0.277 0.0851 0.0112 -0.0657 -0.111
## cvpred -0.0851 0.0116 -0.137 -0.218 0.1176 0.0421 -0.0206 -0.069
## sacked 0.3878 -0.8425 1.523 -1.600 -1.3156 0.9556 0.7664 -1.126
## CV residual 0.4729 -0.8540 1.661 -1.382 -1.4332 0.9135 0.7870 -1.057
## 209
## Predicted 0.0572
## cvpred 0.0857
## sacked 0.1986
## CV residual 0.1128
##
## Sum of squares = 32.4 Mean square = 0.81 n = 40
##
## fold 2
## Observations in test set: 41
## 1 4 8 10 11 12 14 42 43
## Predicted 0.217 0.0658 0.162 0.02601 0.270 0.256 0.289 0.170 0.1068
## cvpred 0.162 0.0769 0.117 0.00826 0.261 0.188 0.255 0.115 0.0975
## sacked 0.672 0.8610 -0.180 0.38783 0.672 0.388 0.766 1.713 -1.7888
## CV residual 0.510 0.7841 -0.297 0.37956 0.410 0.200 0.512 1.598 -1.8863
## 53 54 55 65 68 73 76 88
## Predicted -0.197 0.248 0.1009 0.0496 0.0989 -0.0358 -0.0231 0.165
## cvpred -0.233 0.118 0.0478 -0.0342 0.1034 -0.0371 -0.0147 0.107
## sacked 0.482 1.050 -1.2210 -0.1800 -1.6942 -1.4103 1.1449 -0.369
## CV residual 0.715 0.933 -1.2688 -0.1458 -1.7976 -1.3731 1.1596 -0.477
## 101 102 116 119 120 123 130 133
## Predicted -0.0234 0.0166 -0.0635 -0.209 0.0354 -0.136 0.0471 -0.0972
## cvpred -0.1826 0.0634 -0.1452 -0.246 0.0894 -0.200 0.0456 -0.0946
## sacked 0.1039 -0.4639 -0.5585 2.659 -0.1800 1.618 0.6717 0.4825
## CV residual 0.2865 -0.5273 -0.4133 2.905 -0.2694 1.818 0.6261 0.5770
## 146 147 150 151 159 167 168
## Predicted -0.00893 -0.0209 0.0202 -0.0669 -0.0996 0.103 0.0885
## cvpred -0.05244 -0.0130 -0.0209 -0.1888 -0.1087 0.168 0.0420
## sacked 0.29319 1.9967 0.5771 1.2396 -0.1800 -0.369 0.1039
## CV residual 0.34562 2.0096 0.5980 1.4284 -0.0713 -0.537 0.0619
## 177 186 187 196 198 199 203 205
## Predicted -0.03445 0.0615 -0.138 -0.377 -0.0959 -0.147 -0.196 -0.228
## cvpred -0.00467 0.1212 -0.164 -0.486 -0.0812 -0.182 -0.309 -0.252
## sacked 0.00928 -0.1800 0.293 -1.316 -0.9371 -0.559 0.956 -1.126
## CV residual 0.01395 -0.3012 0.457 -0.829 -0.8559 -0.376 1.265 -0.874
## 206
## Predicted -0.122
## cvpred -0.163
## sacked -1.410
## CV residual -1.248
##
## Sum of squares = 43.1 Mean square = 1.05 n = 41
##
## fold 3
## Observations in test set: 41
## 5 16 19 20 25 26 28 37
## Predicted 0.217 0.120 0.05590 0.0167 0.0761 0.125 -0.083375 0.0701
## cvpred 0.274 0.198 0.01595 0.0991 -0.0799 0.159 0.000381 0.1063
## sacked -0.937 0.672 0.00928 -0.5585 0.5771 0.482 -1.126364 -2.4513
## CV residual -1.211 0.473 -0.00668 -0.6576 0.6570 0.324 -1.126745 -2.5576
## 41 44 48 52 58 59 60 61
## Predicted 0.310 0.110 -0.219 0.1615 0.212 -0.1056 0.0626 -0.0284
## cvpred 0.453 0.184 -0.214 0.3162 0.256 -0.0507 0.0301 -0.0236
## sacked -0.464 1.902 -0.653 0.3878 -0.653 0.3878 -0.6532 -0.6532
## CV residual -0.917 1.718 -0.439 0.0716 -0.909 0.4385 -0.6833 -0.6296
## 67 71 86 89 100 105 118 121
## Predicted 0.034 0.0249 -0.0901 0.0438 -0.0471 -0.114 -0.159 5.83e-02
## cvpred 0.058 0.0248 0.0336 0.0550 -0.0162 -0.140 -0.101 -4.97e-05
## sacked -1.316 2.2806 -1.2210 1.0503 2.0913 -0.275 -1.126 -5.59e-01
## CV residual -1.374 2.2558 -1.2546 0.9953 2.1075 -0.134 -1.025 -5.58e-01
## 128 132 140 158 161 162 163 164
## Predicted 0.1108 -0.0774 -0.0411 0.0992 -0.01163 -0.255 -0.321 -0.139
## cvpred 0.0786 -0.0794 -0.0190 0.0317 -0.07728 -0.253 -0.268 -0.113
## sacked -0.6532 -0.1800 -0.8425 2.5645 0.00928 -1.126 -1.505 1.240
## CV residual -0.7318 -0.1006 -0.8234 2.5328 0.08656 -0.874 -1.237 1.352
## 178 182 185 192 201 202 208 210
## Predicted -0.235 -0.0891 -0.1738 -0.0758 -0.123 -0.367 -0.0841 -0.129
## cvpred -0.239 -0.1698 -0.0288 -0.0947 -0.110 -0.278 -0.1664 -0.143
## sacked 0.956 0.1986 -1.8835 -0.6532 -0.275 1.145 0.2932 0.388
## CV residual 1.194 0.3683 -1.8546 -0.5585 -0.164 1.423 0.4595 0.531
## 211
## Predicted -0.0616
## cvpred -0.1286
## sacked 1.0503
## CV residual 1.1788
##
## Sum of squares = 52.7 Mean square = 1.29 n = 41
##
## fold 4
## Observations in test set: 41
## 13 18 21 24 27 33 34
## Predicted 0.02493 0.06365 0.123 0.0478 0.0467 0.0259 -0.0623
## cvpred -0.00101 0.04284 0.104 0.0264 0.0296 0.0162 -0.0864
## sacked -0.93709 0.00928 -0.653 -0.8425 -0.2746 -0.6532 -1.3156
## CV residual -0.93608 -0.03356 -0.757 -0.8688 -0.3043 -0.6693 -1.2292
## 39 47 49 51 56 63 66 70
## Predicted -0.00534 0.1108 0.2344 0.0252 0.154 0.251 0.1004 -0.0635
## cvpred -0.02895 0.0999 0.2308 0.0020 0.140 0.236 0.0871 -0.0929
## sacked -0.08536 0.5771 0.2932 -0.0854 -1.221 -0.369 0.6717 0.7664
## CV residual -0.05641 0.4772 0.0624 -0.0874 -1.361 -0.605 0.5847 0.8593
## 75 78 80 81 82 83 85 95
## Predicted 0.12181 0.125 0.1105 0.0575 0.161 0.111 -0.0125 0.0925
## cvpred 0.10526 0.111 0.0951 0.0352 0.147 0.090 -0.0334 0.0797
## sacked 0.00928 1.807 2.1859 0.1039 0.956 0.766 -1.4103 1.2396
## CV residual -0.09598 1.696 2.0908 0.0687 0.809 0.676 -1.3768 1.1599
## 99 106 109 112 113 114 124 131
## Predicted 0.0647 0.164 0.0748 0.00711 0.0418 0.01588 0.0872 0.0401
## cvpred 0.0551 0.150 0.0581 -0.02141 0.0197 -0.00557 0.0720 0.0301
## sacked -1.1264 1.997 0.8610 -1.78882 1.3342 0.76637 -1.5049 0.9556
## CV residual -1.1815 1.847 0.8029 -1.76741 1.3145 0.77194 -1.5769 0.9255
## 137 149 153 157 165 170 172 184
## Predicted -0.0135 -0.0893 -0.134 0.0245 0.00881 -0.101 0.02501 -0.361
## cvpred -0.0316 -0.1157 -0.168 0.0144 -0.01211 -0.130 0.00903 -0.408
## sacked 0.4825 -2.0727 1.618 0.3878 1.05028 0.104 -0.18000 0.577
## CV residual 0.5141 -1.9571 1.786 0.3734 1.06239 0.234 -0.18903 0.985
## 190 204
## Predicted -0.508 0.0448
## cvpred -0.570 0.0269
## sacked 0.199 -0.2746
## CV residual 0.769 -0.3015
##
## Sum of squares = 44 Mean square = 1.07 n = 41
##
## fold 5
## Observations in test set: 41
## 2 3 6 7 9 15 22 31
## Predicted 0.258 0.1522 0.00302 0.128 -0.0270 0.118 0.229 0.0341
## cvpred 0.236 0.1523 0.02322 0.150 -0.0175 0.107 0.216 0.0476
## sacked 1.429 0.1986 -0.08536 0.861 0.9556 -0.275 0.766 0.1039
## CV residual 1.192 0.0463 -0.10858 0.711 0.9732 -0.382 0.551 0.0563
## 32 35 36 50 57 74 77 87
## Predicted 0.0935 0.0270 0.1169 0.240 -0.169 -0.120 0.0587 0.0492
## cvpred 0.1055 0.0446 0.1264 0.234 -0.138 -0.101 0.0701 0.0777
## sacked -0.9371 -0.0854 0.1039 0.388 -0.275 -1.126 -1.3156 -1.4103
## CV residual -1.0425 -0.1300 -0.0225 0.154 -0.137 -1.025 -1.3858 -1.4880
## 90 92 93 94 96 104 107 108
## Predicted 0.1002 -0.0130 0.128 0.0526 0.0489 0.060 0.0639 0.119
## cvpred 0.0994 0.0101 0.140 0.0645 0.0692 0.083 0.0871 0.136
## sacked -0.6532 0.8610 -0.559 2.4698 -0.6532 -0.180 -0.6532 -1.316
## CV residual -0.7526 0.8509 -0.699 2.4053 -0.7223 -0.263 -0.7403 -1.452
## 110 122 129 136 143 145 152
## Predicted -0.02041 -0.0851 -0.0484 -0.13674 0.0540 -0.1235 -0.0721
## cvpred -0.00573 -0.0533 -0.0253 -0.09914 0.0868 -0.0871 -0.0600
## sacked -0.93709 -0.6532 0.1039 0.00928 0.1986 0.6717 -0.6532
## CV residual -0.93136 -0.5999 0.1293 0.10841 0.1118 0.7588 -0.5932
## 160 171 173 175 179 180 188 189
## Predicted 0.00521 -0.200 -0.0881 -0.0637 0.0227 -0.2071 -0.131 -0.1075
## cvpred 0.03679 -0.171 -0.0500 -0.0295 0.0526 -0.1526 -0.087 -0.0652
## sacked 1.99665 0.388 -0.2746 -1.0317 1.0503 -0.0854 -0.180 -1.2210
## CV residual 1.95986 0.559 -0.2247 -1.0022 0.9977 0.0672 -0.093 -1.1558
## 191 193
## Predicted -0.1177 -0.206
## cvpred -0.0747 -0.153
## sacked -0.1800 -1.126
## CV residual -0.1053 -0.973
##
## Sum of squares = 31.2 Mean square = 0.76 n = 41
##
## Overall (Sum over all 41 folds)
## ms
## 0.997