# Load the quarterback data set
qb_stats <- read.csv("../data/qb_stats.csv")

# Physical attributes and college statistics used as candidate predictors
predictors <- c(
  "height", "weight", "age", "c_avg_cmpp", "c_rate", "c_pct",
  "c_avg_inter", "c_avg_tds", "c_avg_yds", "c_numyrs", "c_avg_att"
)
college_stats <- qb_stats[, predictors]

# Response variable, selected with single brackets so it stays a one-column
# data frame and keeps the name "sacked" when bound to the predictors
sacks <- qb_stats["sacked"]

# Modelling frame: bind response and predictors, drop rows containing any NA,
# then take log(x + 0.1) of every column (the response included).
# NOTE(review): with this offset a raw value of 0 becomes log(0.1), about -2.30;
# the recorded cross-validation output contains such a value for `sacked`, so
# zero-valued rows may act as strong outliers -- confirm the offset is intended.
data.log.no_combine.for_sacks <- data.frame(
  log(na.omit(cbind(sacks, college_stats)) + 0.1)
)

# Full linear model: log-scale response regressed on every log-scale predictor
lm.log.no_combine.sacks <- lm(
  formula = sacked ~ .,
  data = data.log.no_combine.for_sacks
)

# Stepwise search in both directions (terms may be dropped or re-added),
# starting from the full model and guided by AIC
step_reg.log.no_combine.sacks <- stepAIC(
  lm.log.no_combine.sacks,
  direction = "both"
)
## Start: AIC=-202.1
## sacked ~ height + weight + age + c_avg_cmpp + c_rate + c_pct +
## c_avg_inter + c_avg_tds + c_avg_yds + c_numyrs + c_avg_att
##
## Df Sum of Sq RSS AIC
## - height 1 0.001 67.3 -204
## - weight 1 0.001 67.3 -204
## - c_pct 1 0.011 67.4 -204
## - age 1 0.057 67.4 -204
## - c_avg_cmpp 1 0.065 67.4 -204
## - c_numyrs 1 0.194 67.5 -204
## - c_avg_att 1 0.221 67.6 -203
## <none> 67.3 -202
## - c_avg_inter 1 0.803 68.2 -202
## - c_avg_tds 1 0.829 68.2 -202
## - c_rate 1 1.332 68.7 -200
## - c_avg_yds 1 1.779 69.1 -199
##
## Step: AIC=-204.1
## sacked ~ weight + age + c_avg_cmpp + c_rate + c_pct + c_avg_inter +
## c_avg_tds + c_avg_yds + c_numyrs + c_avg_att
##
## Df Sum of Sq RSS AIC
## - weight 1 0.000 67.3 -206
## - c_pct 1 0.011 67.4 -206
## - age 1 0.058 67.4 -206
## - c_avg_cmpp 1 0.064 67.4 -206
## - c_numyrs 1 0.195 67.5 -206
## - c_avg_att 1 0.220 67.6 -205
## <none> 67.3 -204
## - c_avg_inter 1 0.814 68.2 -204
## - c_avg_tds 1 0.836 68.2 -204
## + height 1 0.001 67.3 -202
## - c_rate 1 1.342 68.7 -202
## - c_avg_yds 1 1.794 69.1 -201
##
## Step: AIC=-206.1
## sacked ~ age + c_avg_cmpp + c_rate + c_pct + c_avg_inter + c_avg_tds +
## c_avg_yds + c_numyrs + c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_pct 1 0.011 67.4 -208
## - age 1 0.064 67.4 -208
## - c_avg_cmpp 1 0.065 67.4 -208
## - c_numyrs 1 0.194 67.5 -208
## - c_avg_att 1 0.223 67.6 -207
## <none> 67.3 -206
## - c_avg_tds 1 0.842 68.2 -206
## - c_avg_inter 1 0.847 68.2 -206
## + weight 1 0.000 67.3 -204
## + height 1 0.000 67.3 -204
## - c_rate 1 1.355 68.7 -204
## - c_avg_yds 1 1.807 69.2 -203
##
## Step: AIC=-208.1
## sacked ~ age + c_avg_cmpp + c_rate + c_avg_inter + c_avg_tds +
## c_avg_yds + c_numyrs + c_avg_att
##
## Df Sum of Sq RSS AIC
## - age 1 0.070 67.4 -210
## - c_numyrs 1 0.243 67.6 -209
## <none> 67.4 -208
## - c_avg_tds 1 1.082 68.4 -207
## - c_avg_inter 1 1.222 68.6 -206
## + c_pct 1 0.011 67.3 -206
## + weight 1 0.001 67.4 -206
## + height 1 0.000 67.4 -206
## - c_rate 1 1.660 69.0 -205
## - c_avg_cmpp 1 1.794 69.2 -205
## - c_avg_yds 1 2.025 69.4 -204
## - c_avg_att 1 2.040 69.4 -204
##
## Step: AIC=-209.8
## sacked ~ c_avg_cmpp + c_rate + c_avg_inter + c_avg_tds + c_avg_yds +
## c_numyrs + c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_numyrs 1 0.230 67.7 -211
## <none> 67.4 -210
## - c_avg_tds 1 1.185 68.6 -208
## + age 1 0.070 67.4 -208
## + c_pct 1 0.016 67.4 -208
## + weight 1 0.008 67.4 -208
## - c_avg_inter 1 1.330 68.8 -208
## + height 1 0.001 67.4 -208
## - c_rate 1 1.754 69.2 -207
## - c_avg_cmpp 1 1.824 69.3 -206
## - c_avg_att 1 2.112 69.5 -206
## - c_avg_yds 1 2.129 69.6 -206
##
## Step: AIC=-211.1
## sacked ~ c_avg_cmpp + c_rate + c_avg_inter + c_avg_tds + c_avg_yds +
## c_avg_att
##
## Df Sum of Sq RSS AIC
## <none> 67.7 -211
## - c_avg_tds 1 1.039 68.7 -210
## + c_numyrs 1 0.230 67.4 -210
## - c_avg_inter 1 1.198 68.9 -210
## + c_pct 1 0.067 67.6 -209
## + age 1 0.058 67.6 -209
## + height 1 0.007 67.7 -209
## + weight 1 0.003 67.7 -209
## - c_rate 1 1.604 69.3 -208
## - c_avg_cmpp 1 1.650 69.3 -208
## - c_avg_att 1 1.948 69.6 -207
## - c_avg_yds 1 1.991 69.7 -207
# Print the summary of the stepwise-selected model: residual quantiles,
# coefficient table, R-squared and the overall F-test
summary(step_reg.log.no_combine.sacks)
##
## Call:
## lm(formula = sacked ~ c_avg_cmpp + c_rate + c_avg_inter + c_avg_tds +
## c_avg_yds + c_avg_att, data = data.log.no_combine.for_sacks)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.219 -0.247 0.073 0.340 1.042
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 31.896 13.811 2.31 0.022 *
## c_avg_cmpp 3.283 1.498 2.19 0.030 *
## c_rate -7.094 3.283 -2.16 0.032 *
## c_avg_inter -0.461 0.247 -1.87 0.063 .
## c_avg_tds 0.761 0.438 1.74 0.084 .
## c_avg_yds 4.404 1.829 2.41 0.017 *
## c_avg_att -8.085 3.394 -2.38 0.018 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.586 on 197 degrees of freedom
## Multiple R-squared: 0.0421, Adjusted R-squared: 0.0129
## F-statistic: 1.44 on 6 and 197 DF, p-value: 0.201
# Standard lm diagnostic plots for the stepwise-selected model
plot(step_reg.log.no_combine.sacks)

# Best-subsets search over the same predictors; nbest = 10 asks regsubsets to
# record up to 10 candidate subsets per model size
leaps.log.no_combine.sacks <- regsubsets(
  sacked ~ .,
  data = data.log.no_combine.for_sacks,
  nbest = 10
)

# Plot R-squared against subset size.
# This call previously stopped with "Error: invalid coordinate lengths".
# NOTE(review): that is presumably because subsets() by default waits for a
# mouse click to position its legend, which cannot happen in a non-interactive
# render. legend = FALSE skips the on-plot legend; the predictor abbreviation
# key is then returned (and printed) as a data frame instead.
subsets(leaps.log.no_combine.sacks, statistic = "rsq", legend = FALSE)
# 5-fold cross-validation of the stepwise-selected model.
# The data frame and the fitted model are passed positionally (first and second
# arguments) rather than as `df = ...` followed by an unnamed argument.
# NOTE(review): the first argument appears to be named `df` in older DAAG
# releases and `data` in newer ones; positional matching works with either --
# confirm against the installed DAAG version.
cv.lm(data.log.no_combine.for_sacks, step_reg.log.no_combine.sacks, m = 5) # 5 fold cross-validation
## Analysis of Variance Table
##
## Response: sacked
## Df Sum Sq Mean Sq F value Pr(>F)
## c_avg_cmpp 1 0.0 0.015 0.04 0.835
## c_rate 1 0.7 0.727 2.12 0.147
## c_avg_inter 1 0.0 0.011 0.03 0.861
## c_avg_tds 1 0.2 0.200 0.58 0.446
## c_avg_yds 1 0.1 0.069 0.20 0.654
## c_avg_att 1 1.9 1.948 5.67 0.018 *
## Residuals 197 67.7 0.343
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Warning:
##
## As there is >1 explanatory variable, cross-validation predicted values for
## a fold are not a linear function of corresponding overall predicted
## values. Lines that are shown for the different folds are approximate
##
## fold 1
## Observations in test set: 40
## 17 29 30 38 40 45 46 62 64
## Predicted 3.1544 3.032 3.053 3.2432 3.173 3.142 3.166 2.8911 3.1686
## cvpred 3.1526 3.005 3.061 3.2220 3.173 3.128 3.156 2.8964 3.1663
## sacked 3.1822 2.646 2.646 3.2995 2.779 3.371 3.405 2.8391 3.2619
## CV residual 0.0296 -0.358 -0.415 0.0775 -0.395 0.243 0.248 -0.0573 0.0956
## 72 79 91 97 98 103 115 117 125
## Predicted 3.217 3.066 3.1691 3.016 3.177 3.127 3.179 3.060 3.3472
## cvpred 3.215 3.055 3.1712 3.026 3.159 3.144 3.184 3.069 3.3519
## sacked 3.716 3.614 3.0956 3.049 2.715 3.666 3.300 3.262 3.4372
## CV residual 0.501 0.559 -0.0756 0.023 -0.445 0.522 0.116 0.193 0.0853
## 126 127 134 135 139 141 142 144 148
## Predicted 3.221 3.186 3.060 3.150 3.248 3.11 3.204 3.200 3.232
## cvpred 3.231 3.203 3.058 3.161 3.242 3.11 3.208 3.210 3.209
## sacked 3.096 2.779 2.896 2.407 3.001 3.22 2.839 3.764 3.852
## CV residual -0.135 -0.424 -0.162 -0.755 -0.241 0.11 -0.369 0.553 0.643
## 154 156 166 169 174 176 181 183 194
## Predicted 3.494 3.090 3.098 2.898 3.055 3.154 3.302 3.149 2.663
## cvpred 3.553 3.106 3.071 2.885 3.049 3.214 3.303 3.171 2.749
## sacked 3.096 3.223 3.336 2.779 3.405 2.839 3.740 2.208 2.493
## CV residual -0.457 0.117 0.265 -0.107 0.356 -0.375 0.437 -0.962 -0.256
## 195 197 200 209
## Predicted 3.114 3.188 3.071 3.004
## cvpred 3.119 3.174 3.062 2.992
## sacked 3.586 3.529 2.646 3.336
## CV residual 0.467 0.356 -0.416 0.344
##
## Sum of squares = 5.8 Mean square = 0.15 n = 40
##
## fold 2
## Observations in test set: 41
## 1 4 8 10 11 12 14 42 43 53
## Predicted 3.165 3.117 3.2270 3.10 3.4277 3.3056 3.344 3.219 3.33 2.998
## cvpred 3.125 3.056 3.2121 3.04 3.4624 3.3190 3.353 3.204 3.35 2.912
## sacked 3.500 3.558 3.1822 3.40 3.4995 3.4045 3.529 3.786 1.96 3.437
## CV residual 0.374 0.502 -0.0299 0.36 0.0371 0.0855 0.176 0.583 -1.39 0.525
## 54 55 65 68 73 76 88 101 102
## Predicted 3.178 3.116 3.047 3.116 2.972 3.042 3.219 3.2453 3.1397
## cvpred 3.128 3.061 2.964 3.091 2.935 3.003 3.197 3.2627 3.1270
## sacked 3.614 2.573 3.182 2.092 2.407 3.640 3.096 3.2995 3.0493
## CV residual 0.486 -0.488 0.219 -0.999 -0.529 0.637 -0.102 0.0369 -0.0777
## 116 119 120 123 130 133 146 147 150 151
## Predicted 3.112 2.95 3.241 3.153 3.277 3.143 3.081 2.963 3.115 3.006
## cvpred 3.069 2.87 3.254 3.118 3.302 3.134 3.014 2.867 3.093 2.951
## sacked 3.001 3.99 3.182 3.764 3.500 3.437 3.371 3.852 3.469 3.666
## CV residual -0.068 1.12 -0.072 0.646 0.197 0.303 0.357 0.985 0.376 0.715
## 159 167 168 177 186 187 196 198 199
## Predicted 2.989 3.056 3.219 3.1823 3.1856 3.002 3.03 3.138 3.17
## cvpred 2.929 2.993 3.222 3.2275 3.2281 2.919 2.95 3.087 3.16
## sacked 3.182 3.096 3.300 3.2619 3.1822 3.371 2.49 2.779 3.00
## CV residual 0.253 0.103 0.078 0.0344 -0.0458 0.452 -0.46 -0.308 -0.16
## 203 205 206
## Predicted 3.138 3.012 3.15
## cvpred 3.118 3.051 3.13
## sacked 3.586 2.646 2.41
## CV residual 0.468 -0.405 -0.72
##
## Sum of squares = 10.4 Mean square = 0.25 n = 41
##
## fold 3
## Observations in test set: 41
## 5 16 19 20 25 26 28 37 41 44
## Predicted 3.237 3.132 3.14 3.067 3.219 3.245 3.10 2.92 3.276 3.098
## cvpred 3.290 3.245 3.17 3.191 3.024 3.263 3.18 3.15 3.337 3.196
## sacked 2.779 3.500 3.26 3.001 3.469 3.437 2.65 -2.30 3.049 3.831
## CV residual -0.511 0.255 0.09 -0.191 0.445 0.175 -0.53 -5.45 -0.287 0.635
## 48 52 58 59 60 61 67 71 86
## Predicted 3.092 3.22 3.329 2.989 3.133 3.091 3.194 3.18 3.085
## cvpred 3.104 3.28 3.346 3.128 3.181 3.129 3.219 3.18 3.174
## sacked 2.950 3.40 2.950 3.405 2.950 2.950 2.493 3.91 2.573
## CV residual -0.155 0.12 -0.396 0.276 -0.231 -0.179 -0.726 0.73 -0.601
## 89 100 105 118 121 128 132 140 158
## Predicted 3.162 3.237 3.0345 3.108 3.149 3.299 3.0286 3.254 3.256
## cvpred 3.202 3.211 3.0937 3.143 3.165 3.273 3.1236 3.220 3.224
## sacked 3.614 3.873 3.1398 2.646 3.001 2.950 3.1822 2.839 3.972
## CV residual 0.412 0.662 0.0462 -0.497 -0.164 -0.323 0.0586 -0.381 0.748
## 161 162 163 164 178 182 185 192 201
## Predicted 3.2602 3.092 3.018 3.092 2.992 3.256 2.86 3.121 3.1063
## cvpred 3.1620 3.086 3.092 3.137 3.073 3.074 2.98 3.125 3.1197
## sacked 3.2619 2.646 2.313 3.666 3.586 3.336 1.81 2.950 3.1398
## CV residual 0.0999 -0.439 -0.779 0.529 0.513 0.262 -1.17 -0.175 0.0201
## 202 208 210 211
## Predicted 3.014 3.092 3.066 3.163
## cvpred 3.103 3.068 3.108 3.093
## sacked 3.640 3.371 3.405 3.614
## CV residual 0.537 0.303 0.296 0.521
##
## Sum of squares = 38.1 Mean square = 0.93 n = 41
##
## fold 4
## Observations in test set: 41
## 13 18 21 24 27 33 34 39 47
## Predicted 3.026 3.019 3.090 3.071 3.109 3.298 2.98 3.2265 3.196
## cvpred 2.925 2.963 3.055 2.985 3.035 3.473 2.90 3.2580 3.170
## sacked 2.779 3.262 2.950 2.839 3.140 2.950 2.49 3.2229 3.469
## CV residual -0.146 0.299 -0.105 -0.146 0.105 -0.524 -0.41 -0.0352 0.299
## 49 51 56 63 66 70 75 78 80
## Predicted 3.483 3.148 3.176 3.204 3.336 3.025 3.2391 3.200 3.26
## cvpred 3.558 3.116 3.185 3.220 3.386 2.893 3.2737 3.226 3.25
## sacked 3.371 3.223 2.573 3.096 3.500 3.529 3.2619 3.809 3.89
## CV residual -0.188 0.106 -0.613 -0.125 0.113 0.636 -0.0118 0.583 0.64
## 81 82 83 85 95 99 106 109 112 113
## Predicted 3.2140 3.202 3.233 3.007 3.31 3.36 3.298 3.204 3.10 3.024
## cvpred 3.2296 3.227 3.248 2.936 3.32 3.38 3.330 3.212 3.09 2.994
## sacked 3.2995 3.586 3.529 2.407 3.67 2.65 3.852 3.558 1.96 3.691
## CV residual 0.0699 0.359 0.281 -0.529 0.35 -0.73 0.522 0.346 -1.13 0.697
## 114 124 131 137 149 153 157 165 170 172
## Predicted 3.017 3.143 2.63 3.138 3.27 2.965 3.3354 3.151 3.162 3.205
## cvpred 2.951 3.138 2.22 3.115 3.32 2.865 3.3886 3.136 3.175 3.260
## sacked 3.529 2.313 3.59 3.437 1.41 3.764 3.4045 3.614 3.300 3.182
## CV residual 0.578 -0.826 1.36 0.322 -1.91 0.898 0.0159 0.478 0.125 -0.078
## 184 190 204
## Predicted 3.2463 3.079 3.316
## cvpred 3.4183 3.086 3.275
## sacked 3.4689 3.336 3.140
## CV residual 0.0506 0.249 -0.135
##
## Sum of squares = 13.4 Mean square = 0.33 n = 41
##
## fold 5
## Observations in test set: 41
## 2 3 6 7 9 15 22 31 32 35
## Predicted 3.319 3.2822 3.020 3.380 3.156 3.1699 3.291 3.074 3.10 3.051
## cvpred 3.314 3.3020 3.010 3.406 3.146 3.1927 3.278 3.067 3.13 3.012
## sacked 3.716 3.3358 3.223 3.558 3.586 3.1398 3.529 3.300 2.78 3.223
## CV residual 0.402 0.0338 0.213 0.152 0.441 -0.0529 0.251 0.232 -0.35 0.211
## 36 50 57 74 77 87 90 92 93
## Predicted 3.171 3.191 3.0847 3.110 3.166 3.094 3.197 3.19 3.31
## cvpred 3.174 3.171 3.0717 3.102 3.147 3.119 3.194 3.20 3.29
## sacked 3.300 3.405 3.1398 2.646 2.493 2.407 2.950 3.56 3.00
## CV residual 0.126 0.234 0.0681 -0.456 -0.654 -0.712 -0.245 0.36 -0.29
## 94 96 104 107 108 110 122 129 136
## Predicted 3.142 3.108 3.2226 3.140 3.240 3.116 3.0504 3.145 3.1847
## cvpred 3.126 3.134 3.2213 3.135 3.251 3.095 3.0249 3.148 3.1821
## sacked 3.953 2.950 3.1822 2.950 2.493 2.779 2.9497 3.300 3.2619
## CV residual 0.827 -0.184 -0.0391 -0.185 -0.758 -0.316 -0.0752 0.151 0.0798
## 143 145 152 160 171 173 175 179 180
## Predicted 3.440 3.070 3.163 3.073 3.13 3.0959 3.210 3.23 3.338
## cvpred 3.457 3.073 3.169 3.054 3.13 3.1157 3.201 3.21 3.324
## sacked 3.336 3.500 2.950 3.852 3.40 3.1398 2.715 3.61 3.223
## CV residual -0.122 0.427 -0.219 0.798 0.27 0.0241 -0.486 0.40 -0.101
## 188 189 191 193
## Predicted 3.324 2.993 3.005 3.02
## cvpred 3.349 3.015 3.005 3.03
## sacked 3.182 2.573 3.182 2.65
## CV residual -0.166 -0.442 0.177 -0.38
##
## Sum of squares = 5.43 Mean square = 0.13 n = 41
##
## Overall (Sum over all 41 folds)
## ms
## 0.358