# Load the quarterback data set
qb_stats <- read.csv("../data/qb_stats.csv")

# Physical and college-career columns used as predictors
predictors <- c(
  "height", "weight", "age", "c_avg_cmpp", "c_rate", "c_pct",
  "c_avg_inter", "c_avg_tds", "c_avg_yds", "c_numyrs", "c_avg_att"
)
college_stats <- qb_stats[predictors]

# Response variable, kept as a one-column data frame so it can be
# column-bound to the predictors
sacks <- qb_stats["sacked"]

# Modelling data: drop rows with any missing value, then log-transform every
# column. The 0.1 offset keeps zero values finite under the log.
data.log.no_combine.for_sacks <- data.frame(
  log(na.omit(cbind(sacks, college_stats)) + 0.1)
)

# Full model: log sacks regressed on all log-transformed predictors
lm.log.no_combine.sacks <- lm(
  formula = sacked ~ .,
  data = data.log.no_combine.for_sacks
)

# Stepwise AIC selection (terms may be dropped or re-added), starting from
# the full model
step_reg.log.no_combine.sacks <- stepAIC(
  lm.log.no_combine.sacks,
  direction = "both"
)
## Start:  AIC=-202.1
## sacked ~ height + weight + age + c_avg_cmpp + c_rate + c_pct + 
##     c_avg_inter + c_avg_tds + c_avg_yds + c_numyrs + c_avg_att
## 
##               Df Sum of Sq  RSS  AIC
## - height       1     0.001 67.3 -204
## - weight       1     0.001 67.3 -204
## - c_pct        1     0.011 67.4 -204
## - age          1     0.057 67.4 -204
## - c_avg_cmpp   1     0.065 67.4 -204
## - c_numyrs     1     0.194 67.5 -204
## - c_avg_att    1     0.221 67.6 -203
## <none>                     67.3 -202
## - c_avg_inter  1     0.803 68.2 -202
## - c_avg_tds    1     0.829 68.2 -202
## - c_rate       1     1.332 68.7 -200
## - c_avg_yds    1     1.779 69.1 -199
## 
## Step:  AIC=-204.1
## sacked ~ weight + age + c_avg_cmpp + c_rate + c_pct + c_avg_inter + 
##     c_avg_tds + c_avg_yds + c_numyrs + c_avg_att
## 
##               Df Sum of Sq  RSS  AIC
## - weight       1     0.000 67.3 -206
## - c_pct        1     0.011 67.4 -206
## - age          1     0.058 67.4 -206
## - c_avg_cmpp   1     0.064 67.4 -206
## - c_numyrs     1     0.195 67.5 -206
## - c_avg_att    1     0.220 67.6 -205
## <none>                     67.3 -204
## - c_avg_inter  1     0.814 68.2 -204
## - c_avg_tds    1     0.836 68.2 -204
## + height       1     0.001 67.3 -202
## - c_rate       1     1.342 68.7 -202
## - c_avg_yds    1     1.794 69.1 -201
## 
## Step:  AIC=-206.1
## sacked ~ age + c_avg_cmpp + c_rate + c_pct + c_avg_inter + c_avg_tds + 
##     c_avg_yds + c_numyrs + c_avg_att
## 
##               Df Sum of Sq  RSS  AIC
## - c_pct        1     0.011 67.4 -208
## - age          1     0.064 67.4 -208
## - c_avg_cmpp   1     0.065 67.4 -208
## - c_numyrs     1     0.194 67.5 -208
## - c_avg_att    1     0.223 67.6 -207
## <none>                     67.3 -206
## - c_avg_tds    1     0.842 68.2 -206
## - c_avg_inter  1     0.847 68.2 -206
## + weight       1     0.000 67.3 -204
## + height       1     0.000 67.3 -204
## - c_rate       1     1.355 68.7 -204
## - c_avg_yds    1     1.807 69.2 -203
## 
## Step:  AIC=-208.1
## sacked ~ age + c_avg_cmpp + c_rate + c_avg_inter + c_avg_tds + 
##     c_avg_yds + c_numyrs + c_avg_att
## 
##               Df Sum of Sq  RSS  AIC
## - age          1     0.070 67.4 -210
## - c_numyrs     1     0.243 67.6 -209
## <none>                     67.4 -208
## - c_avg_tds    1     1.082 68.4 -207
## - c_avg_inter  1     1.222 68.6 -206
## + c_pct        1     0.011 67.3 -206
## + weight       1     0.001 67.4 -206
## + height       1     0.000 67.4 -206
## - c_rate       1     1.660 69.0 -205
## - c_avg_cmpp   1     1.794 69.2 -205
## - c_avg_yds    1     2.025 69.4 -204
## - c_avg_att    1     2.040 69.4 -204
## 
## Step:  AIC=-209.8
## sacked ~ c_avg_cmpp + c_rate + c_avg_inter + c_avg_tds + c_avg_yds + 
##     c_numyrs + c_avg_att
## 
##               Df Sum of Sq  RSS  AIC
## - c_numyrs     1     0.230 67.7 -211
## <none>                     67.4 -210
## - c_avg_tds    1     1.185 68.6 -208
## + age          1     0.070 67.4 -208
## + c_pct        1     0.016 67.4 -208
## + weight       1     0.008 67.4 -208
## - c_avg_inter  1     1.330 68.8 -208
## + height       1     0.001 67.4 -208
## - c_rate       1     1.754 69.2 -207
## - c_avg_cmpp   1     1.824 69.3 -206
## - c_avg_att    1     2.112 69.5 -206
## - c_avg_yds    1     2.129 69.6 -206
## 
## Step:  AIC=-211.1
## sacked ~ c_avg_cmpp + c_rate + c_avg_inter + c_avg_tds + c_avg_yds + 
##     c_avg_att
## 
##               Df Sum of Sq  RSS  AIC
## <none>                     67.7 -211
## - c_avg_tds    1     1.039 68.7 -210
## + c_numyrs     1     0.230 67.4 -210
## - c_avg_inter  1     1.198 68.9 -210
## + c_pct        1     0.067 67.6 -209
## + age          1     0.058 67.6 -209
## + height       1     0.007 67.7 -209
## + weight       1     0.003 67.7 -209
## - c_rate       1     1.604 69.3 -208
## - c_avg_cmpp   1     1.650 69.3 -208
## - c_avg_att    1     1.948 69.6 -207
## - c_avg_yds    1     1.991 69.7 -207
# Print the coefficient table and overall fit statistics of the
# stepwise-selected model
summary(step_reg.log.no_combine.sacks)
## 
## Call:
## lm(formula = sacked ~ c_avg_cmpp + c_rate + c_avg_inter + c_avg_tds + 
##     c_avg_yds + c_avg_att, data = data.log.no_combine.for_sacks)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -5.219 -0.247  0.073  0.340  1.042 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)   31.896     13.811    2.31    0.022 *
## c_avg_cmpp     3.283      1.498    2.19    0.030 *
## c_rate        -7.094      3.283   -2.16    0.032 *
## c_avg_inter   -0.461      0.247   -1.87    0.063 .
## c_avg_tds      0.761      0.438    1.74    0.084 .
## c_avg_yds      4.404      1.829    2.41    0.017 *
## c_avg_att     -8.085      3.394   -2.38    0.018 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 0.586 on 197 degrees of freedom
## Multiple R-squared: 0.0421,  Adjusted R-squared: 0.0129 
## F-statistic: 1.44 on 6 and 197 DF,  p-value: 0.201
# Plot the stepwise-selected lm fit (four plots were rendered in the
# recorded output)
plot(step_reg.log.no_combine.sacks)

plot of chunk unnamed-chunk-1 plot of chunk unnamed-chunk-1 plot of chunk unnamed-chunk-1 plot of chunk unnamed-chunk-1

# Best-subsets search over the same predictors, recording the 10 best
# models of each size.
leaps.log.no_combine.sacks <- regsubsets(
  sacked ~ .,
  data = data.log.no_combine.for_sacks,
  nbest = 10
)

# Plot R-squared against subset size. legend = FALSE skips the
# mouse-positioned legend: it cannot be placed when the script is knitted or
# run in batch mode, which is the likely source of the "invalid coordinate
# lengths" error recorded in the output for the default call.
# NOTE(review): with legend = FALSE, subsets() is documented to return the
# name/abbreviation key as a data frame instead of drawing it -- confirm
# against the installed car version.
subsets(leaps.log.no_combine.sacks, statistic = "rsq", legend = FALSE)
## Error: invalid coordinate lengths

plot of chunk unnamed-chunk-1

# 5-fold cross-validation of the stepwise-selected model.
# The data frame is passed positionally because its argument name differs
# between DAAG releases (`df` in older ones, `data` in newer ones); the
# positional form works with both.
# NOTE(review): confirm the argument name against the installed DAAG version.
cv.lm(
  data.log.no_combine.for_sacks,
  form.lm = step_reg.log.no_combine.sacks,
  m = 5
)
## Analysis of Variance Table
## 
## Response: sacked
##              Df Sum Sq Mean Sq F value Pr(>F)  
## c_avg_cmpp    1    0.0   0.015    0.04  0.835  
## c_rate        1    0.7   0.727    2.12  0.147  
## c_avg_inter   1    0.0   0.011    0.03  0.861  
## c_avg_tds     1    0.2   0.200    0.58  0.446  
## c_avg_yds     1    0.1   0.069    0.20  0.654  
## c_avg_att     1    1.9   1.948    5.67  0.018 *
## Residuals   197   67.7   0.343                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Warning:
## 
## As there is >1 explanatory variable, cross-validation predicted values for
## a fold are not a linear function of corresponding overall predicted
## values.  Lines that are shown for the different folds are approximate

plot of chunk unnamed-chunk-1

## 
## fold 1 
## Observations in test set: 40 
##                 17     29     30     38     40    45    46      62     64
## Predicted   3.1544  3.032  3.053 3.2432  3.173 3.142 3.166  2.8911 3.1686
## cvpred      3.1526  3.005  3.061 3.2220  3.173 3.128 3.156  2.8964 3.1663
## sacked      3.1822  2.646  2.646 3.2995  2.779 3.371 3.405  2.8391 3.2619
## CV residual 0.0296 -0.358 -0.415 0.0775 -0.395 0.243 0.248 -0.0573 0.0956
##                72    79      91    97     98   103   115   117    125
## Predicted   3.217 3.066  3.1691 3.016  3.177 3.127 3.179 3.060 3.3472
## cvpred      3.215 3.055  3.1712 3.026  3.159 3.144 3.184 3.069 3.3519
## sacked      3.716 3.614  3.0956 3.049  2.715 3.666 3.300 3.262 3.4372
## CV residual 0.501 0.559 -0.0756 0.023 -0.445 0.522 0.116 0.193 0.0853
##                126    127    134    135    139  141    142   144   148
## Predicted    3.221  3.186  3.060  3.150  3.248 3.11  3.204 3.200 3.232
## cvpred       3.231  3.203  3.058  3.161  3.242 3.11  3.208 3.210 3.209
## sacked       3.096  2.779  2.896  2.407  3.001 3.22  2.839 3.764 3.852
## CV residual -0.135 -0.424 -0.162 -0.755 -0.241 0.11 -0.369 0.553 0.643
##                154   156   166    169   174    176   181    183    194
## Predicted    3.494 3.090 3.098  2.898 3.055  3.154 3.302  3.149  2.663
## cvpred       3.553 3.106 3.071  2.885 3.049  3.214 3.303  3.171  2.749
## sacked       3.096 3.223 3.336  2.779 3.405  2.839 3.740  2.208  2.493
## CV residual -0.457 0.117 0.265 -0.107 0.356 -0.375 0.437 -0.962 -0.256
##               195   197    200   209
## Predicted   3.114 3.188  3.071 3.004
## cvpred      3.119 3.174  3.062 2.992
## sacked      3.586 3.529  2.646 3.336
## CV residual 0.467 0.356 -0.416 0.344
## 
## Sum of squares = 5.8    Mean square = 0.15    n = 40 
## 
## fold 2 
## Observations in test set: 41 
##                 1     4       8   10     11     12    14    42    43    53
## Predicted   3.165 3.117  3.2270 3.10 3.4277 3.3056 3.344 3.219  3.33 2.998
## cvpred      3.125 3.056  3.2121 3.04 3.4624 3.3190 3.353 3.204  3.35 2.912
## sacked      3.500 3.558  3.1822 3.40 3.4995 3.4045 3.529 3.786  1.96 3.437
## CV residual 0.374 0.502 -0.0299 0.36 0.0371 0.0855 0.176 0.583 -1.39 0.525
##                54     55    65     68     73    76     88    101     102
## Predicted   3.178  3.116 3.047  3.116  2.972 3.042  3.219 3.2453  3.1397
## cvpred      3.128  3.061 2.964  3.091  2.935 3.003  3.197 3.2627  3.1270
## sacked      3.614  2.573 3.182  2.092  2.407 3.640  3.096 3.2995  3.0493
## CV residual 0.486 -0.488 0.219 -0.999 -0.529 0.637 -0.102 0.0369 -0.0777
##                116  119    120   123   130   133   146   147   150   151
## Predicted    3.112 2.95  3.241 3.153 3.277 3.143 3.081 2.963 3.115 3.006
## cvpred       3.069 2.87  3.254 3.118 3.302 3.134 3.014 2.867 3.093 2.951
## sacked       3.001 3.99  3.182 3.764 3.500 3.437 3.371 3.852 3.469 3.666
## CV residual -0.068 1.12 -0.072 0.646 0.197 0.303 0.357 0.985 0.376 0.715
##               159   167   168    177     186   187   196    198   199
## Predicted   2.989 3.056 3.219 3.1823  3.1856 3.002  3.03  3.138  3.17
## cvpred      2.929 2.993 3.222 3.2275  3.2281 2.919  2.95  3.087  3.16
## sacked      3.182 3.096 3.300 3.2619  3.1822 3.371  2.49  2.779  3.00
## CV residual 0.253 0.103 0.078 0.0344 -0.0458 0.452 -0.46 -0.308 -0.16
##               203    205   206
## Predicted   3.138  3.012  3.15
## cvpred      3.118  3.051  3.13
## sacked      3.586  2.646  2.41
## CV residual 0.468 -0.405 -0.72
## 
## Sum of squares = 10.4    Mean square = 0.25    n = 41 
## 
## fold 3 
## Observations in test set: 41 
##                  5    16   19     20    25    26    28    37     41    44
## Predicted    3.237 3.132 3.14  3.067 3.219 3.245  3.10  2.92  3.276 3.098
## cvpred       3.290 3.245 3.17  3.191 3.024 3.263  3.18  3.15  3.337 3.196
## sacked       2.779 3.500 3.26  3.001 3.469 3.437  2.65 -2.30  3.049 3.831
## CV residual -0.511 0.255 0.09 -0.191 0.445 0.175 -0.53 -5.45 -0.287 0.635
##                 48   52     58    59     60     61     67   71     86
## Predicted    3.092 3.22  3.329 2.989  3.133  3.091  3.194 3.18  3.085
## cvpred       3.104 3.28  3.346 3.128  3.181  3.129  3.219 3.18  3.174
## sacked       2.950 3.40  2.950 3.405  2.950  2.950  2.493 3.91  2.573
## CV residual -0.155 0.12 -0.396 0.276 -0.231 -0.179 -0.726 0.73 -0.601
##                89   100    105    118    121    128    132    140   158
## Predicted   3.162 3.237 3.0345  3.108  3.149  3.299 3.0286  3.254 3.256
## cvpred      3.202 3.211 3.0937  3.143  3.165  3.273 3.1236  3.220 3.224
## sacked      3.614 3.873 3.1398  2.646  3.001  2.950 3.1822  2.839 3.972
## CV residual 0.412 0.662 0.0462 -0.497 -0.164 -0.323 0.0586 -0.381 0.748
##                161    162    163   164   178   182   185    192    201
## Predicted   3.2602  3.092  3.018 3.092 2.992 3.256  2.86  3.121 3.1063
## cvpred      3.1620  3.086  3.092 3.137 3.073 3.074  2.98  3.125 3.1197
## sacked      3.2619  2.646  2.313 3.666 3.586 3.336  1.81  2.950 3.1398
## CV residual 0.0999 -0.439 -0.779 0.529 0.513 0.262 -1.17 -0.175 0.0201
##               202   208   210   211
## Predicted   3.014 3.092 3.066 3.163
## cvpred      3.103 3.068 3.108 3.093
## sacked      3.640 3.371 3.405 3.614
## CV residual 0.537 0.303 0.296 0.521
## 
## Sum of squares = 38.1    Mean square = 0.93    n = 41 
## 
## fold 4 
## Observations in test set: 41 
##                 13    18     21     24    27     33    34      39    47
## Predicted    3.026 3.019  3.090  3.071 3.109  3.298  2.98  3.2265 3.196
## cvpred       2.925 2.963  3.055  2.985 3.035  3.473  2.90  3.2580 3.170
## sacked       2.779 3.262  2.950  2.839 3.140  2.950  2.49  3.2229 3.469
## CV residual -0.146 0.299 -0.105 -0.146 0.105 -0.524 -0.41 -0.0352 0.299
##                 49    51     56     63    66    70      75    78   80
## Predicted    3.483 3.148  3.176  3.204 3.336 3.025  3.2391 3.200 3.26
## cvpred       3.558 3.116  3.185  3.220 3.386 2.893  3.2737 3.226 3.25
## sacked       3.371 3.223  2.573  3.096 3.500 3.529  3.2619 3.809 3.89
## CV residual -0.188 0.106 -0.613 -0.125 0.113 0.636 -0.0118 0.583 0.64
##                 81    82    83     85   95    99   106   109   112   113
## Predicted   3.2140 3.202 3.233  3.007 3.31  3.36 3.298 3.204  3.10 3.024
## cvpred      3.2296 3.227 3.248  2.936 3.32  3.38 3.330 3.212  3.09 2.994
## sacked      3.2995 3.586 3.529  2.407 3.67  2.65 3.852 3.558  1.96 3.691
## CV residual 0.0699 0.359 0.281 -0.529 0.35 -0.73 0.522 0.346 -1.13 0.697
##               114    124  131   137   149   153    157   165   170    172
## Predicted   3.017  3.143 2.63 3.138  3.27 2.965 3.3354 3.151 3.162  3.205
## cvpred      2.951  3.138 2.22 3.115  3.32 2.865 3.3886 3.136 3.175  3.260
## sacked      3.529  2.313 3.59 3.437  1.41 3.764 3.4045 3.614 3.300  3.182
## CV residual 0.578 -0.826 1.36 0.322 -1.91 0.898 0.0159 0.478 0.125 -0.078
##                184   190    204
## Predicted   3.2463 3.079  3.316
## cvpred      3.4183 3.086  3.275
## sacked      3.4689 3.336  3.140
## CV residual 0.0506 0.249 -0.135
## 
## Sum of squares = 13.4    Mean square = 0.33    n = 41 
## 
## fold 5 
## Observations in test set: 41 
##                 2      3     6     7     9      15    22    31    32    35
## Predicted   3.319 3.2822 3.020 3.380 3.156  3.1699 3.291 3.074  3.10 3.051
## cvpred      3.314 3.3020 3.010 3.406 3.146  3.1927 3.278 3.067  3.13 3.012
## sacked      3.716 3.3358 3.223 3.558 3.586  3.1398 3.529 3.300  2.78 3.223
## CV residual 0.402 0.0338 0.213 0.152 0.441 -0.0529 0.251 0.232 -0.35 0.211
##                36    50     57     74     77     87     90   92    93
## Predicted   3.171 3.191 3.0847  3.110  3.166  3.094  3.197 3.19  3.31
## cvpred      3.174 3.171 3.0717  3.102  3.147  3.119  3.194 3.20  3.29
## sacked      3.300 3.405 3.1398  2.646  2.493  2.407  2.950 3.56  3.00
## CV residual 0.126 0.234 0.0681 -0.456 -0.654 -0.712 -0.245 0.36 -0.29
##                94     96     104    107    108    110     122   129    136
## Predicted   3.142  3.108  3.2226  3.140  3.240  3.116  3.0504 3.145 3.1847
## cvpred      3.126  3.134  3.2213  3.135  3.251  3.095  3.0249 3.148 3.1821
## sacked      3.953  2.950  3.1822  2.950  2.493  2.779  2.9497 3.300 3.2619
## CV residual 0.827 -0.184 -0.0391 -0.185 -0.758 -0.316 -0.0752 0.151 0.0798
##                143   145    152   160  171    173    175  179    180
## Predicted    3.440 3.070  3.163 3.073 3.13 3.0959  3.210 3.23  3.338
## cvpred       3.457 3.073  3.169 3.054 3.13 3.1157  3.201 3.21  3.324
## sacked       3.336 3.500  2.950 3.852 3.40 3.1398  2.715 3.61  3.223
## CV residual -0.122 0.427 -0.219 0.798 0.27 0.0241 -0.486 0.40 -0.101
##                188    189   191   193
## Predicted    3.324  2.993 3.005  3.02
## cvpred       3.349  3.015 3.005  3.03
## sacked       3.182  2.573 3.182  2.65
## CV residual -0.166 -0.442 0.177 -0.38
## 
## Sum of squares = 5.43    Mean square = 0.13    n = 41 
## 
## Overall (Sum over all 41 folds) 
##    ms 
## 0.358