# Fetch data
qb_stats <- read.csv("../data/qb_stats.csv")

# Predictor columns to pull out of qb_stats
predictors <- c(
  "height", "weight", "age", "c_avg_cmpp", "c_rate", "c_pct",
  "c_avg_inter", "c_avg_tds", "c_avg_yds", "c_numyrs", "c_avg_att"
)
college_stats <- qb_stats[, predictors]

# Set the response variable. Single-bracket indexing keeps it a one-column
# data frame, so the column is still called "wins" after cbind().
wins <- qb_stats["wins"]

# Generate the clean data set: bind response and predictors, drop every row
# that has a missing value, then log-transform all columns. The 0.1 offset
# keeps zero values finite under log().
data.log.no_combine.for_wins <- data.frame(
  log(na.omit(cbind(wins, college_stats)) + 0.1)
)

# Fit the full linear model: wins against every other column
lm.log.no_combine.wins <- lm(
  formula = wins ~ .,
  data = data.log.no_combine.for_wins
)

# Stepwise AIC search (terms may be dropped or re-added) starting from the
# full model. Namespace-qualified so the call does not depend on MASS having
# been attached earlier in the session.
step_reg.log.no_combine.wins <- MASS::stepAIC(
  lm.log.no_combine.wins,
  direction = "both"
)
## Start:  AIC=-23.43
## wins ~ height + weight + age + c_avg_cmpp + c_rate + c_pct + 
##     c_avg_inter + c_avg_tds + c_avg_yds + c_numyrs + c_avg_att
## 
##               Df Sum of Sq RSS   AIC
## - c_avg_inter  1     0.045 194 -25.4
## - c_avg_tds    1     0.243 194 -25.1
## - age          1     0.251 194 -25.1
## - c_numyrs     1     0.440 194 -24.9
## - c_rate       1     0.515 194 -24.8
## - height       1     0.852 195 -24.4
## - weight       1     1.385 195 -23.7
## <none>                     194 -23.4
## - c_pct        1     1.933 196 -23.1
## - c_avg_cmpp   1     2.108 196 -22.9
## - c_avg_yds    1     2.282 196 -22.7
## - c_avg_att    1     2.569 197 -22.3
## 
## Step:  AIC=-25.37
## wins ~ height + weight + age + c_avg_cmpp + c_rate + c_pct + 
##     c_avg_tds + c_avg_yds + c_numyrs + c_avg_att
## 
##               Df Sum of Sq RSS   AIC
## - age          1      0.26 194 -27.1
## - c_numyrs     1      0.40 194 -26.9
## - c_avg_tds    1      0.68 195 -26.5
## - height       1      0.88 195 -26.3
## - c_rate       1      0.93 195 -26.2
## - weight       1      1.55 196 -25.5
## <none>                     194 -25.4
## - c_pct        1      2.62 197 -24.2
## - c_avg_cmpp   1      2.66 197 -24.1
## - c_avg_att    1      2.97 197 -23.8
## + c_avg_inter  1      0.05 194 -23.4
## - c_avg_yds    1      4.75 199 -21.6
## 
## Step:  AIC=-27.05
## wins ~ height + weight + c_avg_cmpp + c_rate + c_pct + c_avg_tds + 
##     c_avg_yds + c_numyrs + c_avg_att
## 
##               Df Sum of Sq RSS   AIC
## - c_numyrs     1      0.42 195 -28.5
## - height       1      0.83 195 -28.1
## - c_avg_tds    1      0.83 195 -28.0
## - c_rate       1      0.87 195 -28.0
## - weight       1      1.35 196 -27.4
## <none>                     194 -27.1
## - c_pct        1      2.56 197 -25.9
## - c_avg_cmpp   1      2.60 197 -25.9
## - c_avg_att    1      2.90 197 -25.5
## + age          1      0.26 194 -25.4
## + c_avg_inter  1      0.05 194 -25.1
## - c_avg_yds    1      4.66 199 -23.4
## 
## Step:  AIC=-28.54
## wins ~ height + weight + c_avg_cmpp + c_rate + c_pct + c_avg_tds + 
##     c_avg_yds + c_avg_att
## 
##               Df Sum of Sq RSS   AIC
## - height       1      0.76 196 -29.6
## - c_avg_tds    1      0.87 196 -29.5
## - c_rate       1      0.91 196 -29.4
## - weight       1      1.10 196 -29.2
## <none>                     195 -28.5
## + c_numyrs     1      0.42 194 -27.1
## - c_pct        1      2.95 198 -27.0
## - c_avg_cmpp   1      2.98 198 -26.9
## + age          1      0.28 194 -26.9
## + c_avg_inter  1      0.01 195 -26.6
## - c_avg_att    1      3.31 198 -26.6
## - c_avg_yds    1      4.88 200 -24.7
## 
## Step:  AIC=-29.62
## wins ~ weight + c_avg_cmpp + c_rate + c_pct + c_avg_tds + c_avg_yds + 
##     c_avg_att
## 
##               Df Sum of Sq RSS   AIC
## - weight       1      0.44 196 -31.1
## - c_avg_tds    1      0.75 196 -30.7
## - c_rate       1      0.96 196 -30.4
## <none>                     196 -29.6
## + height       1      0.76 195 -28.5
## - c_pct        1      2.61 198 -28.5
## - c_avg_cmpp   1      2.64 198 -28.4
## - c_avg_att    1      2.96 198 -28.1
## + c_numyrs     1      0.35 195 -28.1
## + age          1      0.22 195 -27.9
## + c_avg_inter  1      0.02 196 -27.6
## - c_avg_yds    1      4.92 200 -25.7
## 
## Step:  AIC=-31.08
## wins ~ c_avg_cmpp + c_rate + c_pct + c_avg_tds + c_avg_yds + 
##     c_avg_att
## 
##               Df Sum of Sq RSS   AIC
## - c_rate       1      0.78 197 -32.1
## - c_avg_tds    1      0.95 197 -31.9
## <none>                     196 -31.1
## + weight       1      0.44 196 -29.6
## + c_numyrs     1      0.19 196 -29.3
## - c_pct        1      3.17 199 -29.3
## - c_avg_cmpp   1      3.18 199 -29.3
## + height       1      0.10 196 -29.2
## + age          1      0.10 196 -29.2
## + c_avg_inter  1      0.09 196 -29.2
## - c_avg_att    1      3.47 199 -28.9
## - c_avg_yds    1      4.61 201 -27.6
## 
## Step:  AIC=-32.15
## wins ~ c_avg_cmpp + c_pct + c_avg_tds + c_avg_yds + c_avg_att
## 
##               Df Sum of Sq RSS   AIC
## <none>                     197 -32.1
## - c_avg_cmpp   1      2.52 199 -31.1
## + c_rate       1      0.78 196 -31.1
## - c_pct        1      2.59 199 -31.0
## - c_avg_att    1      2.73 200 -30.9
## + c_avg_inter  1      0.31 196 -30.5
## + weight       1      0.25 196 -30.5
## + c_numyrs     1      0.24 196 -30.4
## + height       1      0.18 196 -30.4
## + age          1      0.08 197 -30.2
## - c_avg_tds    1      6.36 203 -26.6
## - c_avg_yds    1      6.86 204 -26.0
# Coefficient table and fit statistics for the stepwise-selected model
summary(object = step_reg.log.no_combine.wins)
## 
## Call:
## lm(formula = wins ~ c_avg_cmpp + c_pct + c_avg_tds + c_avg_yds + 
##     c_avg_att, data = data.log.no_combine.for_wins)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.659 -0.316  0.175  0.585  1.339 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  130.799     76.665    1.71   0.0893 . 
## c_avg_cmpp    29.062     16.897    1.72   0.0868 . 
## c_pct        -29.322     16.819   -1.74   0.0826 . 
## c_avg_tds     -0.649      0.238   -2.73   0.0068 **
## c_avg_yds      1.965      0.692    2.84   0.0049 **
## c_avg_att    -30.457     17.004   -1.79   0.0746 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 0.923 on 231 degrees of freedom
## Multiple R-squared: 0.0495,  Adjusted R-squared: 0.0289 
## F-statistic: 2.41 on 5 and 231 DF,  p-value: 0.0376
# Diagnostic plots for the stepwise-selected model
plot(x = step_reg.log.no_combine.wins)

plot of chunk unnamed-chunk-1 plot of chunk unnamed-chunk-1 plot of chunk unnamed-chunk-1 plot of chunk unnamed-chunk-1

# Best-subsets search over the same predictors; nbest = 10 asks regsubsets
# to record up to ten models for each subset size.
leaps.log.no_combine.wins <- leaps::regsubsets(
  wins ~ .,
  data = data.log.no_combine.for_wins,
  nbest = 10
)

# Plot R-squared against subset size. legend = FALSE is required here: by
# default subsets() waits for a mouse click to position the legend, which in
# a non-interactive (knitr) render drew the plot and then failed with
# "Error: invalid coordinate lengths".
car::subsets(leaps.log.no_combine.wins, statistic = "rsq", legend = FALSE)

plot of chunk unnamed-chunk-1

# 5-fold cross-validation of the stepwise-selected model.
# The data frame and the model are passed positionally on purpose: DAAG
# renamed the first formal of cv.lm() from `df` to `data`, so naming it
# `df =` only works with older releases, while positional matching works
# with both.
DAAG::cv.lm(data.log.no_combine.for_wins, step_reg.log.no_combine.wins, m = 5)
## Analysis of Variance Table
## 
## Response: wins
##             Df Sum Sq Mean Sq F value Pr(>F)  
## c_avg_cmpp   1    1.4    1.36    1.60  0.207  
## c_pct        1    0.3    0.32    0.38  0.540  
## c_avg_tds    1    1.0    0.98    1.16  0.283  
## c_avg_yds    1    4.8    4.85    5.69  0.018 *
## c_avg_att    1    2.7    2.73    3.21  0.075 .
## Residuals  231  196.7    0.85                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Warning:
## 
## As there is >1 explanatory variable, cross-validation predicted values for
## a fold are not a linear function of corresponding overall predicted
## values.  Lines that are shown for the different folds are approximate

plot of chunk unnamed-chunk-1

## 
## fold 1 
## Observations in test set: 47 
##                 3     8     15    18   19     20     21   23    26      35
## Predicted   1.306 1.306  1.199 1.012 1.15  1.321  1.132 2.22 1.317  1.3052
## cvpred      1.207 1.261  1.077 0.898 1.03  1.286  1.072 2.17 1.275  1.3348
## wins        1.629 2.208  0.742 1.960 2.09  1.131  0.742 2.31 1.629  0.0953
## CV residual 0.422 0.947 -0.335 1.062 1.06 -0.154 -0.330 0.14 0.355 -1.2395
##               46      55   57    69   71   72    73    76   79      81
## Predicted   1.19  1.1705 1.27 1.250 1.43 1.39 1.630 1.398 1.14  1.5257
## cvpred      1.12  1.0904 1.28 1.174 1.45 1.33 1.601 1.362 1.08  1.5862
## wins        2.31  0.0953 2.31 1.411 1.96 2.41 2.407 1.629 2.31  0.0953
## CV residual 1.19 -0.9951 1.03 0.237 0.51 1.08 0.806 0.267 1.23 -1.4909
##                 82     91   96    115   118   121   122   124  131   132
## Predicted    1.373  1.349 1.12  1.377 1.346 1.353 1.302 1.199 1.30 1.189
## cvpred       1.401  1.387 1.00  1.374 1.356 1.360 1.329 1.129 1.19 1.130
## wins         0.742  1.131 1.96  1.131 1.808 1.960 1.808 1.808 1.41 1.629
## CV residual -0.659 -0.255 0.96 -0.242 0.452 0.601 0.479 0.679 0.22 0.499
##                133   135    140   150   155    164    176    183   187
## Predicted    1.383 1.438  1.448 1.324 1.445  1.288 1.6648 1.4899 1.063
## cvpred       1.401 1.503  1.414 1.322 1.434  1.263 1.7500 1.5353 0.983
## wins         1.131 1.808  1.131 1.960 1.960  0.742 1.8083 1.6292 1.131
## CV residual -0.269 0.305 -0.283 0.638 0.526 -0.522 0.0583 0.0939 0.149
##               194   205     214    223     228   235   236    237
## Predicted   1.652  2.06  1.0394 1.1655  1.2483 1.148 1.411  1.854
## cvpred      1.781  2.21  0.9783 1.0983  1.1471 1.059 1.405  1.953
## wins        1.960  1.13  0.0953 1.1314  1.1314 1.411 1.629  0.742
## CV residual 0.179 -1.08 -0.8829 0.0331 -0.0157 0.352 0.224 -1.211
## 
## Sum of squares = 22.5    Mean square = 0.48    n = 47 
## 
## fold 2 
## Observations in test set: 48 
##               24    31    33      36   38    40     42     43    60    63
## Predicted   1.23 1.194 1.091  1.1928 1.31 1.427  1.392  1.555 1.265 1.204
## cvpred      1.26 1.207 0.972  1.2190 1.36 1.492  1.460  1.665 1.290 1.246
## wins        2.41 1.960 2.092  1.1314 2.65 1.629  1.411  0.742 1.629 2.092
## CV residual 1.15 0.753 1.120 -0.0876 1.29 0.137 -0.049 -0.923 0.339 0.845
##                 74    85     88   89      99    103     108   110   111
## Predicted    1.354 1.097  1.270 1.32  1.5793  1.297  1.2445 1.391 1.544
## cvpred       1.404 1.083  1.318 1.37  1.6999  1.320  1.2901 1.435 1.636
## wins         0.742 1.808  1.131 1.63  0.0953  1.131  0.0953 1.629 1.808
## CV residual -0.662 0.725 -0.186 0.26 -1.6046 -0.189 -1.1948 0.194 0.172
##                116   119    126   128  138   139   146   149     152   153
## Predicted    1.353 1.337  1.466  1.36 1.29 1.313  1.28 1.593  1.3742 1.200
## cvpred       1.407 1.364  1.544  1.43 1.33 1.369  1.31 1.695  1.4375 1.200
## wins         1.131 1.629  0.742 -2.30 1.96 2.313 -2.30 1.960  1.4110 1.411
## CV residual -0.276 0.266 -0.802 -3.73 0.63 0.943 -3.62 0.265 -0.0265 0.211
##               159   166   170  173    175   178    191    192  198   202
## Predicted   1.243 1.167 1.416 1.23  1.551 1.158  1.083  1.544 1.25  1.09
## cvpred      1.232 1.187 1.476 1.25  1.632 1.128  1.056  1.615 1.27  1.08
## wins        2.208 1.411 1.808 2.41  0.742 1.960  0.742  1.411 1.41 -2.30
## CV residual 0.976 0.224 0.333 1.15 -0.890 0.832 -0.314 -0.204 0.14 -3.38
##                208   209     211     213    215   217    232     233   239
## Predicted    1.858 0.999  1.3690  1.3575  1.385 1.293  1.058  1.2311 1.039
## cvpred       1.989 0.969  1.4074  1.3932  1.366 1.316  1.052  1.2271 0.982
## wins         1.131 1.629  0.0953  0.0953  1.131 1.960  0.742  0.0953 1.131
## CV residual -0.858 0.660 -1.3121 -1.2979 -0.234 0.644 -0.310 -1.1318 0.149
## 
## Sum of squares = 62.9    Mean square = 1.31    n = 48 
## 
## fold 3 
## Observations in test set: 48 
##                2     4     5    6     7    14     17     47     48   52
## Predicted   1.32 1.169 1.267 1.16 1.479 1.301  1.395  1.355  1.272 1.29
## cvpred      1.29 1.194 1.207 1.12 1.444 1.263  1.322  1.296  1.190 1.25
## wins        2.41 1.960 1.629 1.41 1.808 1.960  0.742  0.742  0.742 2.57
## CV residual 1.12 0.766 0.422 0.29 0.365 0.697 -0.580 -0.554 -0.448 1.32
##                  56      61    66   67     70     77   78     80      86
## Predicted    1.2742  1.2149 1.714 1.33  1.394  1.477 1.33 1.3957  1.1891
## cvpred       1.2051  1.1526 1.585 1.30  1.388  1.410 1.25 1.3751  1.1443
## wins         0.0953  0.0953 2.092 1.81  0.742  1.131 2.41 1.4110  1.1314
## CV residual -1.1098 -1.0573 0.507 0.51 -0.646 -0.278 1.16 0.0359 -0.0129
##                  90    100  102     112   114    141    144   156    157
## Predicted    1.3205 1.6542 1.39  1.2514 1.178  1.327 1.4568 1.223 1.7548
## cvpred       1.2860 1.5858 1.38  1.1869 1.129  1.287 1.3963 1.165 1.7138
## wins         0.0953 1.6292 2.57  1.1314 2.092  0.742 1.4110 1.808 1.8083
## CV residual -1.1907 0.0435 1.19 -0.0555 0.963 -0.545 0.0147 0.644 0.0944
##               158   160   163   165    167   171  174   182    184   190
## Predicted   1.308  1.32 1.021 1.265  0.951 1.341 1.34 1.572  1.223 1.325
## cvpred      1.280  1.25 0.998 1.264  1.009 1.264 1.22 1.424  1.157 1.258
## wins        1.629  1.13 1.960 1.629 -2.303 1.629 2.49 1.808  1.131 1.411
## CV residual 0.349 -0.12 0.963 0.365 -3.311 0.365 1.28 0.384 -0.026 0.153
##               199  201   203   207  210    218    225  231   234   238
## Predicted   1.462 1.30 1.389 1.134 1.14  1.682  1.198 1.09 1.381 1.530
## cvpred      1.394 1.21 1.303 1.108 1.11  1.632  1.143 1.08 1.347 1.441
## wins        2.208 2.31 1.808 1.131 2.31  1.131  1.131 2.41 2.313 2.208
## CV residual 0.815 1.10 0.506 0.023 1.21 -0.501 -0.012 1.33 0.965 0.767
## 
## Sum of squares = 35.7    Mean square = 0.74    n = 48 
## 
## fold 4 
## Observations in test set: 47 
##                 9    13    25    27    29   34     44    45     49    51
## Predicted   1.536  1.21  2.32 1.314  1.10 1.13  1.178 1.263  1.328 1.370
## cvpred      1.580  1.34  4.15 1.405  1.27 1.26  1.283 1.351  1.355 1.438
## wins        1.808 -2.30  2.41 1.629 -2.30 1.63  0.742 1.808  0.742 1.808
## CV residual 0.229 -3.65 -1.74 0.225 -3.58 0.37 -0.541 0.458 -0.613 0.371
##                  54    64    65     68      75    97   101   106    107
## Predicted    1.1654 1.419 1.099  1.189  1.4290  1.11 1.534 1.487  1.251
## cvpred       1.3254 1.445 1.266  1.290  1.4664  1.19 1.572 1.487  1.284
## wins         0.0953 1.960 2.092  1.131  0.0953 -2.30 1.960 1.960  1.131
## CV residual -1.2301 0.515 0.826 -0.159 -1.3711 -3.50 0.388 0.473 -0.153
##                 113    117   123    129  130   134   137    147    148
## Predicted    1.0540  1.428 1.337  1.339 1.46 1.194  1.38  1.117  1.398
## cvpred       1.1913  1.513 1.388  1.390 1.44 1.313  1.42  1.264  1.451
## wins         1.1314  0.742 1.629  1.131 1.63 2.208  1.13  1.131  0.742
## CV residual -0.0599 -0.771 0.242 -0.258 0.19 0.895 -0.29 -0.133 -0.709
##                 154   161    169   177   180   181    185   189  193
## Predicted    1.6455 1.589  0.938 1.481 1.554 1.509 1.2702 1.042 1.07
## cvpred       1.8178 1.552  1.111 1.424 1.530 1.468 1.3549 1.147 1.19
## wins         0.0953 1.808 -2.303 1.629 1.808 1.960 1.4110 1.629 1.41
## CV residual -1.7225 0.256 -3.414 0.206 0.278 0.492 0.0561 0.482 0.22
##                 195   197  200   204    212    219  221    224    226
## Predicted    1.3006 1.223 1.20 1.833  1.232 0.9831 1.39  1.386  1.330
## cvpred       1.3574 1.268 1.33 1.717  1.335 1.0750 1.39  1.359  1.356
## wins         0.0953 2.092 1.81 2.313  0.742 1.1314 2.57  0.742  1.131
## CV residual -1.2621 0.823 0.48 0.596 -0.593 0.0564 1.19 -0.617 -0.224
##               229
## Predicted   0.829
## cvpred      0.977
## wins        1.629
## CV residual 0.652
## 
## Sum of squares = 70.3    Mean square = 1.5    n = 47 
## 
## fold 5 
## Observations in test set: 47 
##                1     10   11    12      16    22     28   30    32    39
## Predicted   1.14  1.186 1.37  1.42  1.1922 1.279  1.496 1.27 1.119 1.429
## cvpred      1.10  1.158 1.32  1.36  1.1506 1.233  1.444 1.24 1.086 1.383
## wins        2.41  0.742 1.96  1.13  0.0953 1.808  1.131 2.31 1.411 2.092
## CV residual 1.31 -0.416 0.64 -0.23 -1.0553 0.576 -0.313 1.07 0.325 0.709
##                41     50    53    58    59    62   83     84      87
## Predicted   1.236  0.983 1.082 1.350 1.089 0.947 1.35  1.032  1.1049
## cvpred      1.184  0.963 1.080 1.300 1.062 0.949 1.29  0.992  1.0996
## wins        1.411  0.742 1.808 1.629 1.411 1.629 2.41  0.742  0.0953
## CV residual 0.227 -0.221 0.729 0.329 0.349 0.680 1.11 -0.250 -1.0042
##                  92    93     94    95    98   104    105  109    120
## Predicted    1.4075 1.642  1.298 1.479 1.190 1.412  1.262 1.43 1.3949
## cvpred       1.3703 1.594  1.267 1.436 1.192 1.385  1.250 1.39 1.3905
## wins         0.0953 2.407  1.131 1.629 1.629 1.960  1.131 1.63 1.4110
## CV residual -1.2750 0.812 -0.136 0.194 0.437 0.575 -0.118 0.24 0.0205
##               125   127    136   142   143   145     151   162    168
## Predicted   1.670 1.426  1.713 1.452 1.845 1.208  1.2449 1.187  1.371
## cvpred      1.619 1.407  1.663 1.416 1.808 1.192  1.2098 1.175  1.325
## wins        2.313 2.208  1.411 1.629 2.493 1.411  1.1314 1.411  1.131
## CV residual 0.693 0.802 -0.252 0.213 0.685 0.219 -0.0784 0.236 -0.194
##                 172   179    186   188     196      206   216   220    222
## Predicted    1.4919 1.648  1.449 1.754  1.1635  1.44865 1.102 0.872  1.230
## cvpred       1.4646 1.619  1.445 1.707  1.1479  1.41215 1.112 0.904  1.259
## wins         0.0953 1.808  1.131 1.960  1.1314  1.41099 1.808 1.131  0.742
## CV residual -1.3693 0.189 -0.314 0.253 -0.0165 -0.00116 0.696 0.227 -0.517
##                 227
## Predicted    1.1509
## cvpred       1.1755
## wins         0.0953
## CV residual -1.0802
## 
## Sum of squares = 18.1    Mean square = 0.38    n = 47 
## 
## Overall (Sum over all 47 folds) 
##    ms 
## 0.884