# Load the quarterback statistics
qb_stats <- read.csv("../data/qb_stats.csv")

# College-level predictor columns (order matters: it fixes the column order
# of the modelling data frame)
predictors <- c(
  "height", "weight", "age",
  "c_avg_cmpp", "c_rate", "c_pct",
  "c_avg_inter", "c_avg_tds", "c_avg_yds",
  "c_numyrs", "c_avg_att"
)
college_stats <- qb_stats[, predictors]

# Response variable, kept as a one-column data frame so that cbind() below
# carries the column name "ints" through
ints <- qb_stats["ints"]

# Build the clean modelling set: bind response and predictors, drop rows with
# any missing value, then take log(x + 0.1) of every column. Adding 0.1 first
# means a zero maps to log(0.1) instead of -Inf.
data.log.no_combine.for_ints <- data.frame(
  log(na.omit(cbind(ints, college_stats)) + 0.1)
)

# Full model: ints regressed on every other column of the log-transformed data
lm.log.no_combine.ints <- lm(
  formula = ints ~ .,
  data = data.log.no_combine.for_ints
)

# Stepwise search by AIC starting from the full model; direction = "both"
# lets terms be dropped and added back at each step
step_reg.log.no_combine.ints <- stepAIC(
  lm.log.no_combine.ints,
  direction = "both"
)
## Start:  AIC=-293.4
## ints ~ height + weight + age + c_avg_cmpp + c_rate + c_pct + 
##     c_avg_inter + c_avg_tds + c_avg_yds + c_numyrs + c_avg_att
## 
##               Df Sum of Sq  RSS  AIC
## - c_avg_cmpp   1     0.000 60.9 -295
## - c_pct        1     0.001 60.9 -295
## - c_avg_att    1     0.006 60.9 -295
## - c_numyrs     1     0.013 60.9 -295
## - weight       1     0.014 60.9 -295
## - c_avg_tds    1     0.097 61.0 -295
## - c_rate       1     0.159 61.0 -295
## - c_avg_inter  1     0.231 61.1 -295
## - height       1     0.287 61.2 -294
## - c_avg_yds    1     0.400 61.3 -294
## <none>                     60.9 -293
## - age          1     1.434 62.3 -290
## 
## Step:  AIC=-295.4
## ints ~ height + weight + age + c_rate + c_pct + c_avg_inter + 
##     c_avg_tds + c_avg_yds + c_numyrs + c_avg_att
## 
##               Df Sum of Sq  RSS  AIC
## - c_numyrs     1     0.013 60.9 -297
## - weight       1     0.014 60.9 -297
## - c_pct        1     0.066 60.9 -297
## - c_avg_tds    1     0.099 61.0 -297
## - c_rate       1     0.161 61.0 -297
## - c_avg_inter  1     0.266 61.1 -296
## - height       1     0.289 61.2 -296
## - c_avg_yds    1     0.400 61.3 -296
## <none>                     60.9 -295
## - c_avg_att    1     0.620 61.5 -295
## + c_avg_cmpp   1     0.000 60.9 -293
## - age          1     1.437 62.3 -292
## 
## Step:  AIC=-297.4
## ints ~ height + weight + age + c_rate + c_pct + c_avg_inter + 
##     c_avg_tds + c_avg_yds + c_avg_att
## 
##               Df Sum of Sq  RSS  AIC
## - weight       1     0.010 60.9 -299
## - c_pct        1     0.056 60.9 -299
## - c_avg_tds    1     0.089 61.0 -299
## - c_rate       1     0.149 61.0 -299
## - c_avg_inter  1     0.302 61.2 -298
## - height       1     0.308 61.2 -298
## - c_avg_yds    1     0.387 61.3 -298
## <none>                     60.9 -297
## - c_avg_att    1     0.607 61.5 -297
## + c_numyrs     1     0.013 60.9 -295
## + c_avg_cmpp   1     0.000 60.9 -295
## - age          1     1.434 62.3 -294
## 
## Step:  AIC=-299.4
## ints ~ height + age + c_rate + c_pct + c_avg_inter + c_avg_tds + 
##     c_avg_yds + c_avg_att
## 
##               Df Sum of Sq  RSS  AIC
## - c_pct        1     0.064 61.0 -301
## - c_avg_tds    1     0.098 61.0 -301
## - c_rate       1     0.163 61.1 -301
## - c_avg_inter  1     0.294 61.2 -300
## - c_avg_yds    1     0.409 61.3 -300
## <none>                     60.9 -299
## - height       1     0.596 61.5 -299
## - c_avg_att    1     0.629 61.5 -299
## + weight       1     0.010 60.9 -297
## + c_numyrs     1     0.009 60.9 -297
## + c_avg_cmpp   1     0.000 60.9 -297
## - age          1     1.557 62.5 -295
## 
## Step:  AIC=-301.1
## ints ~ height + age + c_rate + c_avg_inter + c_avg_tds + c_avg_yds + 
##     c_avg_att
## 
##               Df Sum of Sq  RSS  AIC
## - c_avg_tds    1     0.033 61.0 -303
## - c_rate       1     0.158 61.1 -302
## - c_avg_yds    1     0.503 61.5 -301
## <none>                     61.0 -301
## - height       1     0.700 61.7 -300
## - c_avg_inter  1     0.957 61.9 -299
## + c_avg_cmpp   1     0.064 60.9 -299
## + c_pct        1     0.064 60.9 -299
## + weight       1     0.018 60.9 -299
## - c_avg_att    1     1.039 62.0 -299
## + c_numyrs     1     0.001 61.0 -299
## - age          1     1.530 62.5 -297
## 
## Step:  AIC=-303
## ints ~ height + age + c_rate + c_avg_inter + c_avg_yds + c_avg_att
## 
##               Df Sum of Sq  RSS  AIC
## - c_rate       1     0.125 61.1 -304
## - c_avg_yds    1     0.470 61.5 -303
## <none>                     61.0 -303
## - height       1     0.685 61.7 -302
## + c_avg_tds    1     0.033 61.0 -301
## + weight       1     0.019 61.0 -301
## + c_numyrs     1     0.000 61.0 -301
## + c_avg_cmpp   1     0.000 61.0 -301
## + c_pct        1     0.000 61.0 -301
## - c_avg_inter  1     1.071 62.1 -301
## - c_avg_att    1     1.094 62.1 -301
## - age          1     1.707 62.7 -298
## 
## Step:  AIC=-304.5
## ints ~ height + age + c_avg_inter + c_avg_yds + c_avg_att
## 
##               Df Sum of Sq  RSS  AIC
## <none>                     61.1 -304
## - height       1     0.645 61.8 -304
## + c_rate       1     0.125 61.0 -303
## + c_pct        1     0.056 61.1 -303
## + c_avg_cmpp   1     0.055 61.1 -303
## + weight       1     0.021 61.1 -303
## + c_numyrs     1     0.001 61.1 -302
## + c_avg_tds    1     0.001 61.1 -302
## - c_avg_yds    1     1.048 62.2 -302
## - age          1     1.728 62.8 -300
## - c_avg_att    1     2.424 63.5 -297
## - c_avg_inter  1     3.097 64.2 -295
# Coefficient table and fit statistics for the model chosen by the stepwise search
summary(step_reg.log.no_combine.ints)
## 
## Call:
## lm(formula = ints ~ height + age + c_avg_inter + c_avg_yds + 
##     c_avg_att, data = data.log.no_combine.for_ints)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -4.408 -0.260  0.090  0.320  0.818 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -4.7994     7.1189   -0.67  0.50088    
## height        2.5196     1.6207    1.55  0.12142    
## age          -0.9759     0.3835   -2.54  0.01159 *  
## c_avg_inter   0.3280     0.0963    3.41  0.00078 ***
## c_avg_yds     0.4740     0.2392    1.98  0.04867 *  
## c_avg_att    -0.8757     0.2906   -3.01  0.00287 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 0.517 on 229 degrees of freedom
## Multiple R-squared: 0.09,    Adjusted R-squared: 0.0701 
## F-statistic: 4.53 on 5 and 229 DF,  p-value: 0.000588
# Diagnostic plots for the model chosen by the stepwise search
plot(step_reg.log.no_combine.ints)

plot of chunk unnamed-chunk-1 plot of chunk unnamed-chunk-1 plot of chunk unnamed-chunk-1 plot of chunk unnamed-chunk-1

# Best-subsets search over the same response and predictors, keeping the
# 10 best models of each subset size.
leaps.log.no_combine.ints <- regsubsets(
  ints ~ .,
  data = data.log.no_combine.for_ints,
  nbest = 10
)

# Plot R-squared against subset size. The legend position is given
# explicitly: left to its default, subsets() asks for a mouse click via
# locator(), which returns nothing on a non-interactive device (e.g. while
# knitting), so legend() stops with "invalid coordinate lengths".
subsets(leaps.log.no_combine.ints, statistic = "rsq", legend = "topleft")

plot of chunk unnamed-chunk-1

# 5 fold cross-validation of the stepwise-selected model
cv.lm(
  df = data.log.no_combine.for_ints,
  form.lm = step_reg.log.no_combine.ints,
  m = 5
)
## Analysis of Variance Table
## 
## Response: ints
##              Df Sum Sq Mean Sq F value Pr(>F)   
## height        1    0.1   0.141    0.53 0.4676   
## age           1    1.4   1.388    5.20 0.0235 * 
## c_avg_inter   1    0.0   0.001    0.00 0.9558   
## c_avg_yds     1    2.1   2.090    7.83 0.0056 **
## c_avg_att     1    2.4   2.424    9.08 0.0029 **
## Residuals   229   61.1   0.267                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Warning:
## 
## As there is >1 explanatory variable, cross-validation predicted values for
## a fold are not a linear function of corresponding overall predicted
## values.  Lines that are shown for the different folds are approximate

plot of chunk unnamed-chunk-1

## 
## fold 1 
## Observations in test set: 47 
##                3     7     36      39    41    45    46    59    71    75
## Predicted   2.21 2.668 2.2718  2.6289 2.354  2.44 2.260  2.35 2.441 2.289
## cvpred      2.11 2.663 2.2284  2.6118 2.326  2.39 2.227  2.31 2.393 2.192
## ints        2.84 2.839 2.3125  2.5726 2.493  1.96 2.573  2.21 3.096 2.950
## CV residual 0.73 0.176 0.0842 -0.0392 0.168 -0.43 0.345 -0.10 0.703 0.758
##                78    84     87    94    96    99   100   108   110    113
## Predicted   2.403 2.432  2.251 2.404  2.40 2.219 2.615 2.365 2.603  2.361
## cvpred      2.365 2.421  2.163 2.379  2.36 2.064 2.598 2.337 2.612  2.321
## ints        2.573 2.573  1.808 2.715  2.21 2.779 2.715 2.493 2.715  2.208
## CV residual 0.208 0.152 -0.354 0.336 -0.15 0.715 0.117 0.157 0.103 -0.113
##               119   128  135   143   148   153   158  160   161   166  174
## Predicted   2.385 2.485 2.27 2.712 2.413 2.369 2.321 2.24 2.569 2.417 2.50
## cvpred      2.303 2.461 2.20 2.673 2.365 2.343 2.256 2.16 2.515 2.392 2.42
## ints        2.896 2.896 2.31 3.096 2.573 2.839 3.140 2.65 2.646 2.646 3.22
## CV residual 0.593 0.435 0.11 0.423 0.208 0.496 0.884 0.49 0.132 0.254 0.80
##               178     179     188   192    193   201   203   209    211
## Predicted   2.402 2.64107 2.71970 2.568 2.2909 2.476 2.536 2.213  2.728
## cvpred      2.366 2.63644 2.70720 2.549 2.2167 2.430 2.504 2.122  2.728
## ints        3.182 2.64617 2.71469 2.715 2.3125 2.779 2.779 2.407  2.092
## CV residual 0.817 0.00973 0.00749 0.166 0.0958 0.349 0.275 0.285 -0.636
##               220   221   222   232   235    237  240
## Predicted   2.264 2.628 2.148 2.654  2.44  2.888 2.53
## cvpred      2.199 2.577 2.103 2.665  2.40  2.882 2.46
## ints        2.493 3.001 2.646 2.839  2.21  2.573 2.78
## CV residual 0.294 0.424 0.543 0.174 -0.19 -0.309 0.32
## 
## Sum of squares = 7.86    Mean square = 0.17    n = 47 
## 
## fold 2 
## Observations in test set: 47 
##                 4     5    14    17      33    42     43    44     48
## Predicted   2.370  2.26 2.381 2.509  2.4841 2.260  2.458 2.397  2.409
## cvpred      2.368  2.22 2.370 2.566  2.5647 2.226  2.440 2.369  2.421
## ints        2.573  1.13 2.715 3.001  2.4932 2.839  1.808 2.573  2.092
## CV residual 0.204 -1.09 0.344 0.434 -0.0715 0.613 -0.632 0.203 -0.329
##                50    51     55     60     66    69    73    80   101   102
## Predicted   1.861 2.411  2.374  2.161  2.529 2.399 2.185  2.29 2.295 2.205
## cvpred      1.778 2.390  2.374  2.141  2.598 2.413 2.140  2.25 2.272 2.128
## ints        2.313 2.896  2.208  2.092  2.092 2.715 2.313  1.96 2.573 2.313
## CV residual 0.535 0.506 -0.166 -0.049 -0.507 0.302 0.173 -0.29 0.301 0.185
##               106    107    112  114   123    126    129    133   138
## Predicted   2.466  2.355  2.444 2.30 2.492  2.487  2.402  2.513 2.389
## cvpred      2.471  2.382  2.414 2.32 2.525  2.503  2.453  2.559 2.380
## ints        2.896  2.208  2.092 2.57 2.839  1.960  2.208  2.407 2.493
## CV residual 0.424 -0.173 -0.322 0.25 0.314 -0.543 -0.245 -0.152 0.113
##                139   145    150    155    159    163   164    169   176
## Predicted    2.574 2.421 2.4450  2.384  2.262  2.669 2.371  2.477 2.606
## cvpred       2.592 2.441 2.4381  2.367  2.292  2.724 2.355  2.533 2.684
## ints         2.208 3.140 2.4932  2.092  1.960  1.808 3.049  1.960 2.839
## CV residual -0.383 0.699 0.0551 -0.275 -0.332 -0.915 0.694 -0.573 0.155
##              177    182   184    185   189    199   210  213   233   239
## Predicted   2.55  2.805 2.646  2.704  2.47  2.533 2.470 2.47 2.773  2.70
## cvpred      2.59  2.893 2.652  2.740  2.56  2.595 2.467 2.54 2.848  2.81
## ints        2.78  2.779 3.049  2.092  1.41  2.313 2.715 2.78 3.096  1.63
## CV residual 0.19 -0.114 0.397 -0.649 -1.14 -0.283 0.248 0.24 0.247 -1.19
## 
## Sum of squares = 10.6    Mean square = 0.23    n = 47 
## 
## fold 3 
## Observations in test set: 47 
##                 2      6     11   25    27     29    31      32     47
## Predicted   2.385 2.3165  2.248 2.27  2.25  2.351 2.318  2.2625  2.151
## cvpred      2.399 2.3144  2.251 2.08  2.27  2.378 2.321  2.2746  2.144
## ints        2.896 2.4069  1.808 2.49  2.09  2.092 2.646  2.2083  1.808
## CV residual 0.497 0.0925 -0.443 0.41 -0.18 -0.286 0.325 -0.0664 -0.335
##                  52     53    58     62     63   64    67     76    79
## Predicted    2.4895  2.454 2.380  2.415  2.227 2.30  2.40  2.362 2.302
## cvpred       2.5065  2.461 2.385  2.403  2.260 2.37  2.42  2.366 2.294
## ints         2.4069  2.313 2.779  2.208  1.808 2.78  1.81  2.092 2.573
## CV residual -0.0995 -0.148 0.394 -0.194 -0.451 0.41 -0.61 -0.274 0.278
##                 85     86     89     90     92    93   109    120    121
## Predicted    2.235  2.252  2.363  2.203  2.468 2.266 2.384 2.5499  2.151
## cvpred       2.244  2.303  2.367  2.230  2.499 2.267 2.390 2.5429  2.170
## ints         1.808  1.808  1.808  1.808  2.208 2.839 3.049 2.5726  1.411
## CV residual -0.435 -0.495 -0.559 -0.422 -0.291 0.572 0.659 0.0297 -0.759
##                127    136   146   147    152   165    175   186    191
## Predicted   2.4803  2.621 2.371 2.380  2.617 2.218  2.560 2.593  2.359
## cvpred      2.5058  2.633 2.377 2.368  2.639 2.241  2.573 2.582  2.371
## ints        2.5726  1.960 2.896 2.839  1.960 2.779  2.208 2.839  1.808
## CV residual 0.0669 -0.673 0.519 0.472 -0.679 0.538 -0.364 0.257 -0.562
##                 195     202  206   212   215    223   224   225   227
## Predicted    2.5445  2.5589 2.49 2.398 2.515 2.6548 2.482 2.336 2.725
## cvpred       2.5394  2.5669 2.50 2.404 2.506 2.6416 2.462 2.330 2.695
## ints         2.4932  2.4932 2.84 2.839 2.715 2.7147 2.839 3.001 3.182
## CV residual -0.0462 -0.0737 0.34 0.435 0.208 0.0731 0.378 0.671 0.487
##               229   230
## Predicted   2.566  2.57
## cvpred      2.541  2.54
## ints        2.646  1.13
## CV residual 0.105 -1.41
## 
## Sum of squares = 9.83    Mean square = 0.21    n = 47 
## 
## fold 4 
## Observations in test set: 47 
##                 8       9    10    13     16    20    22     23     28
## Predicted   2.255  2.3090 2.376 2.224  2.312 2.537 2.188  2.598  2.135
## cvpred      2.229  2.2953 2.337 2.173  2.259 2.451 2.180  2.685  2.129
## ints        2.573  2.2083 2.573 2.208  2.208 2.896 2.573  2.407  1.131
## CV residual 0.343 -0.0871 0.235 0.035 -0.051 0.445 0.393 -0.278 -0.998
##                  38     49    57    61    65    68     72    74     77
## Predicted    2.2536 2.3882 2.309 2.387 2.220 2.349 2.4278 2.123  2.331
## cvpred       2.2203 2.3812 2.299 2.388 2.161 2.327 2.4186 2.130  2.304
## ints         2.2083 2.4069 2.779 2.493 2.779 2.779 2.4932 2.407  1.960
## CV residual -0.0121 0.0257 0.479 0.105 0.618 0.452 0.0746 0.277 -0.344
##                 81    83    95       97    98   104   105   115     117
## Predicted    2.428 2.475 2.385  2.35139  2.44 2.575 2.439 2.465  2.2473
## cvpred       2.412 2.445 2.404  2.32009  2.48 2.568 2.389 2.455  2.2254
## ints         2.208 2.779 2.573  2.31254  2.31 2.950 2.573 2.839  2.2083
## CV residual -0.203 0.333 0.169 -0.00755 -0.17 0.382 0.184 0.385 -0.0171
##               122   124    125   131    137   144    149   156  157    162
## Predicted   2.348 2.375  2.431 2.159 2.4356 2.450  2.573 2.485 2.47  2.655
## cvpred      2.306 2.360  2.475 2.121 2.4324 2.460  2.581 2.460 2.52  2.618
## ints        2.779 2.573  1.808 2.646 2.4932 2.839  2.313 2.839 2.65  2.407
## CV residual 0.473 0.212 -0.666 0.525 0.0608 0.379 -0.268 0.379 0.13 -0.211
##               171   187    194  196    197    204   214   218   228   231
## Predicted   2.664 2.421  2.116 2.52  2.504  2.357 2.304 2.560  2.60 2.455
## cvpred      2.632 2.379  2.242 2.44  2.536  2.407 2.298 2.584  2.63 2.458
## ints        2.950 2.646  1.808 2.57  2.313  1.411 2.573 2.839  1.96 2.839
## CV residual 0.318 0.267 -0.433 0.13 -0.224 -0.996 0.274 0.255 -0.67 0.381
## 
## Sum of squares = 6.74    Mean square = 0.14    n = 47 
## 
## fold 5 
## Observations in test set: 47 
##                 1    12     15    18    19     21      24     26    30
## Predicted   2.104  2.14  2.352 2.283 2.488  2.258  2.3967  2.521 2.514
## cvpred      2.187  2.26  2.403 2.387 2.475  2.378  2.4938  2.556 2.585
## ints        2.313  2.09  2.208 2.646 3.001  1.960  2.4069  2.092 2.950
## CV residual 0.126 -0.17 -0.195 0.259 0.526 -0.418 -0.0868 -0.464 0.365
##                 34     35    37      40   54     56    70     82    91
## Predicted    2.415  2.133  2.11  2.1515 2.06  2.422  2.40  2.373 2.542
## cvpred       2.503  2.272  2.28  2.2789 2.17  2.463  2.50  2.404 2.558
## ints         2.092  1.808 -2.30  0.0953 2.31  2.208  2.09  1.808 2.715
## CV residual -0.411 -0.463 -4.58 -2.1836 0.14 -0.254 -0.41 -0.596 0.156
##                103  111    116    118    132     134   141    142   151
## Predicted    2.307 2.54  2.494  2.313 2.4034  2.5551 2.295 2.5186 2.233
## cvpred       2.357 2.52  2.542  2.377 2.4644  2.5886 2.349 2.5508 2.368
## ints         1.960 2.84  2.208  2.092 2.4932  2.4932 2.646 2.5726 2.715
## CV residual -0.397 0.32 -0.333 -0.285 0.0288 -0.0954 0.297 0.0218 0.346
##               154    167    168  170    172   173   180   181    183   190
## Predicted   2.670  2.183  2.455 2.49 2.5907 2.512 2.556 2.459  2.478 2.404
## cvpred      2.615  2.280  2.434 2.50 2.5215 2.524 2.555 2.432  2.551 2.504
## ints        2.646  2.092  1.960 2.71 2.5726 2.950 2.839 2.896  1.808 2.646
## CV residual 0.031 -0.188 -0.474 0.21 0.0511 0.425 0.285 0.464 -0.742 0.143
##                198   200    205    207    208   217   219   226   236
## Predicted    2.461 2.334  2.531  2.557  2.639 2.450 2.779  2.67 2.693
## cvpred       2.521 2.408  2.547  2.597  2.649 2.450 2.744  2.62 2.573
## ints         1.960 2.779  2.092  2.208  2.208 2.646 2.950  2.09 2.896
## CV residual -0.561 0.371 -0.455 -0.389 -0.441 0.196 0.205 -0.53 0.323
##                238
## Predicted    2.899
## cvpred       2.726
## ints         2.407
## CV residual -0.319
## 
## Sum of squares = 31.3    Mean square = 0.67    n = 47 
## 
## Overall (Sum over all 47 folds) 
##    ms 
## 0.282