# Fetch data (path is relative to the current working directory)
qb_stats <- read.csv("../data/qb_stats.csv")

# Grab the college predictors
predictors <- c(
  "height", "weight", "age", "c_avg_cmpp", "c_rate", "c_pct",
  "c_avg_inter", "c_avg_tds", "c_avg_yds", "c_numyrs", "c_avg_att"
)
college_stats <- qb_stats[, predictors]

# Set the response variable; single-bracket indexing keeps it as a
# one-column data frame
games_started <- qb_stats["games_started"]

# Generate clean data set: bind response and predictors, drop every row that
# has a missing value, then log-transform all columns. The 0.1 offset is added
# before the log, so a zero entry maps to log(0.1) rather than -Inf.
data.log.no_combine.for_games_started <- data.frame(
  log(na.omit(cbind(games_started, college_stats)) + 0.1)
)

# Generate the full linear model: games_started regressed on every other
# column of the log-transformed data set
lm.log.no_combine.games_started <- lm(
  formula = games_started ~ .,
  data = data.log.no_combine.for_games_started
)

# Find optimum linear regression model for games_started by stepwise AIC
# selection, allowing terms to be both dropped and re-added. stepAIC() is
# namespace-qualified so this call does not rely on library(MASS) having been
# attached somewhere outside this chunk.
step_reg.log.no_combine.games_started <- MASS::stepAIC(
  lm.log.no_combine.games_started,
  direction = "both"
)
## Start:  AIC=-627.8
## games_started ~ height + weight + age + c_avg_cmpp + c_rate + 
##     c_pct + c_avg_inter + c_avg_tds + c_avg_yds + c_numyrs + 
##     c_avg_att
## 
##               Df Sum of Sq  RSS  AIC
## - c_avg_tds    1     0.002 15.9 -630
## - c_numyrs     1     0.003 15.9 -630
## - c_avg_inter  1     0.041 15.9 -629
## - c_rate       1     0.075 15.9 -629
## - age          1     0.098 16.0 -628
## - height       1     0.111 16.0 -628
## <none>                     15.9 -628
## - c_pct        1     0.136 16.0 -628
## - c_avg_cmpp   1     0.159 16.0 -627
## - c_avg_yds    1     0.164 16.0 -627
## - c_avg_att    1     0.193 16.1 -627
## - weight       1     0.561 16.4 -621
## 
## Step:  AIC=-629.8
## games_started ~ height + weight + age + c_avg_cmpp + c_rate + 
##     c_pct + c_avg_inter + c_avg_yds + c_numyrs + c_avg_att
## 
##               Df Sum of Sq  RSS  AIC
## - c_numyrs     1     0.002 15.9 -632
## - c_avg_inter  1     0.051 15.9 -631
## - age          1     0.099 16.0 -630
## - height       1     0.112 16.0 -630
## <none>                     15.9 -630
## - c_pct        1     0.153 16.0 -629
## - c_avg_cmpp   1     0.171 16.1 -629
## - c_avg_att    1     0.198 16.1 -629
## - c_rate       1     0.205 16.1 -629
## + c_avg_tds    1     0.002 15.9 -628
## - c_avg_yds    1     0.290 16.2 -627
## - weight       1     0.575 16.4 -623
## 
## Step:  AIC=-631.8
## games_started ~ height + weight + age + c_avg_cmpp + c_rate + 
##     c_pct + c_avg_inter + c_avg_yds + c_avg_att
## 
##               Df Sum of Sq  RSS  AIC
## - c_avg_inter  1     0.049 15.9 -633
## - age          1     0.098 16.0 -632
## - height       1     0.111 16.0 -632
## <none>                     15.9 -632
## - c_pct        1     0.167 16.1 -631
## - c_avg_cmpp   1     0.185 16.1 -631
## - c_rate       1     0.205 16.1 -631
## - c_avg_att    1     0.214 16.1 -631
## + c_numyrs     1     0.002 15.9 -630
## + c_avg_tds    1     0.001 15.9 -630
## - c_avg_yds    1     0.291 16.2 -629
## - weight       1     0.580 16.5 -625
## 
## Step:  AIC=-633
## games_started ~ height + weight + age + c_avg_cmpp + c_rate + 
##     c_pct + c_avg_yds + c_avg_att
## 
##               Df Sum of Sq  RSS  AIC
## - age          1     0.084 16.0 -634
## - height       1     0.116 16.0 -633
## <none>                     15.9 -633
## - c_rate       1     0.169 16.1 -632
## + c_avg_inter  1     0.049 15.9 -632
## - c_pct        1     0.248 16.2 -631
## + c_avg_tds    1     0.013 15.9 -631
## - c_avg_yds    1     0.257 16.2 -631
## - c_avg_cmpp   1     0.264 16.2 -631
## + c_numyrs     1     0.000 15.9 -631
## - c_avg_att    1     0.285 16.2 -631
## - weight       1     0.691 16.6 -625
## 
## Step:  AIC=-633.7
## games_started ~ height + weight + c_avg_cmpp + c_rate + c_pct + 
##     c_avg_yds + c_avg_att
## 
##               Df Sum of Sq  RSS  AIC
## - height       1     0.133 16.1 -634
## <none>                     16.0 -634
## - c_rate       1     0.153 16.2 -633
## + age          1     0.084 15.9 -633
## + c_avg_inter  1     0.035 16.0 -632
## - c_avg_yds    1     0.251 16.3 -632
## - c_pct        1     0.259 16.3 -632
## + c_avg_tds    1     0.007 16.0 -632
## + c_numyrs     1     0.000 16.0 -632
## - c_avg_cmpp   1     0.272 16.3 -632
## - c_avg_att    1     0.292 16.3 -631
## - weight       1     0.828 16.8 -624
## 
## Step:  AIC=-633.8
## games_started ~ weight + c_avg_cmpp + c_rate + c_pct + c_avg_yds + 
##     c_avg_att
## 
##               Df Sum of Sq  RSS  AIC
## <none>                     16.1 -634
## + height       1     0.133 16.0 -634
## - c_rate       1     0.143 16.3 -634
## + age          1     0.101 16.0 -633
## - c_pct        1     0.215 16.4 -633
## - c_avg_cmpp   1     0.226 16.4 -632
## + c_avg_inter  1     0.039 16.1 -632
## - c_avg_att    1     0.245 16.4 -632
## - c_avg_yds    1     0.248 16.4 -632
## + c_avg_tds    1     0.006 16.1 -632
## + c_numyrs     1     0.000 16.1 -632
## - weight       1     0.750 16.9 -625
# Print the coefficient table and fit statistics of the model kept by stepAIC()
summary(step_reg.log.no_combine.games_started)
## 
## Call:
## lm(formula = games_started ~ weight + c_avg_cmpp + c_rate + c_pct + 
##     c_avg_yds + c_avg_att, data = data.log.no_combine.for_games_started)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -0.517 -0.209  0.000  0.209  0.535 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   39.962     23.918    1.67   0.0961 . 
## weight         0.931      0.283    3.29   0.0012 **
## c_avg_cmpp     9.281      5.141    1.81   0.0723 . 
## c_rate        -0.540      0.376   -1.44   0.1518   
## c_pct         -8.892      5.043   -1.76   0.0792 . 
## c_avg_yds      0.616      0.326    1.89   0.0597 . 
## c_avg_att     -9.901      5.262   -1.88   0.0611 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 0.263 on 233 degrees of freedom
## Multiple R-squared: 0.0849,  Adjusted R-squared: 0.0613 
## F-statistic:  3.6 on 6 and 233 DF,  p-value: 0.00196
# Draw the diagnostic plots for the stepwise-selected model
plot(step_reg.log.no_combine.games_started)

## plot of chunk unnamed-chunk-1 (four plots)

# Best-subsets search over the same predictors, recording up to 10 models
# per subset size
leaps.log.no_combine.games_started <- leaps::regsubsets(
  games_started ~ .,
  data = data.log.no_combine.for_games_started,
  nbest = 10
)

# Plot R-squared against subset size. By default subsets() asks for the legend
# position with the mouse, which is not possible in a non-interactive (knitr)
# run; the call previously aborted with "Error: invalid coordinate lengths".
# legend = FALSE skips the interactive legend so the call completes.
car::subsets(
  leaps.log.no_combine.games_started,
  statistic = "rsq",
  legend = FALSE
)

## plot of chunk unnamed-chunk-1

# 5-fold cross-validation of the stepwise-selected model. The data frame is
# passed positionally because DAAG renamed the first argument of cv.lm() from
# `df` to `data`; a positional first argument works with either signature.
DAAG::cv.lm(
  data.log.no_combine.for_games_started,
  step_reg.log.no_combine.games_started,
  m = 5
)
## Analysis of Variance Table
## 
## Response: games_started
##             Df Sum Sq Mean Sq F value  Pr(>F)    
## weight       1   1.02   1.019   14.70 0.00016 ***
## c_avg_cmpp   1   0.00   0.005    0.07 0.79231    
## c_rate       1   0.12   0.115    1.67 0.19817    
## c_pct        1   0.02   0.016    0.23 0.62930    
## c_avg_yds    1   0.10   0.097    1.40 0.23847    
## c_avg_att    1   0.25   0.245    3.54 0.06114 .  
## Residuals  233  16.15   0.069                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Warning:
## 
## As there is >1 explanatory variable, cross-validation predicted values for
## a fold are not a linear function of corresponding overall predicted
## values.  Lines that are shown for the different folds are approximate

## plot of chunk unnamed-chunk-1

## 
## fold 1 
## Observations in test set: 48 
##                   3     15   25     27     29     34    38      45    46
## Predicted     2.446  2.383 2.68  2.346  2.418  2.396 2.449  2.4024 2.391
## cvpred        2.408  2.309 1.58  2.327  2.409  2.425 2.418  2.3967 2.406
## games_started 2.715  2.092 2.78  2.092  1.960  2.208 2.779  2.3125 2.715
## CV residual   0.307 -0.217 1.20 -0.235 -0.448 -0.217 0.361 -0.0842 0.309
##                   47     48     50     61     63    65     67    71    75
## Predicted      2.310  2.346  2.311  2.405 2.4321 2.323  2.397 2.407 2.483
## cvpred         2.314  2.386  2.303  2.437 2.4118 2.294  2.388 2.419 2.483
## games_started  2.208  1.960  2.092  2.092 2.4932 2.779  1.960 2.779 2.715
## CV residual   -0.106 -0.426 -0.211 -0.346 0.0814 0.485 -0.427 0.359 0.232
##                  78     94      96     97     98   101    110  113   114
## Predicted     2.492  2.354  2.3725  2.325  2.415 2.342 2.4739 2.32 2.368
## cvpred        2.528  2.362  2.3802  2.385  2.460 2.253 2.4890 2.31 2.384
## games_started 2.779  2.208  2.3125  1.960  2.092 2.407 2.4932 2.57 2.573
## CV residual   0.251 -0.154 -0.0677 -0.425 -0.369 0.154 0.0042 0.26 0.189
##                  135   138    139   150   158    171    172   174    176
## Predicted     2.2220 2.364 2.3929 2.404 2.244  2.363  2.410 2.290 2.3504
## cvpred        2.1979 2.350 2.4216 2.409 2.218  2.321  2.409 2.286 2.4034
## games_started 2.2083 2.493 2.4932 2.646 2.779  2.208  2.092 2.779 2.4069
## CV residual   0.0104 0.143 0.0716 0.237 0.561 -0.113 -0.318 0.493 0.0035
##                  177    183     184     190    193     200    211    214
## Predicted      2.357  2.412  2.4015  2.3510  2.211  2.3163  2.403  2.187
## cvpred         2.431  2.443  2.4265  2.3757  2.240  2.2935  2.471  2.164
## games_started  2.313  1.960  2.4069  2.3125  2.092  2.2083  1.960  1.960
## CV residual   -0.118 -0.483 -0.0195 -0.0632 -0.148 -0.0852 -0.511 -0.204
##                  215    217    219   231
## Predicted      2.289  2.275  2.266 2.239
## cvpred         2.355  2.316  2.336 2.254
## games_started  1.960  2.208  2.208 2.646
## CV residual   -0.395 -0.107 -0.128 0.393
## 
## Sum of squares = 5.28    Mean square = 0.11    n = 48 
## 
## fold 2 
## Observations in test set: 48 
##                   11     12    18    22    26    33     36    42     43
## Predicted      2.514  2.418 2.387 2.425 2.589 2.312  2.403 2.404  2.475
## cvpred         2.532  2.435 2.405 2.448 2.566 2.239  2.423 2.432  2.495
## games_started  2.407  2.208 2.573 2.779 2.715 2.493  2.092 2.573  1.960
## CV residual   -0.126 -0.227 0.168 0.331 0.149 0.254 -0.331 0.141 -0.535
##                   44     49    57     58     70       80     90    100
## Predicted      2.405  2.395 2.359 2.5296  2.470  2.36559  2.352 2.5024
## cvpred         2.393  2.437 2.346 2.5379  2.477  2.41065  2.387 2.4953
## games_started  2.092  1.960 2.715 2.5726  2.208  2.40695  1.960 2.5726
## CV residual   -0.302 -0.477 0.369 0.0347 -0.269 -0.00371 -0.427 0.0773
##                   103    105   106    108    116   123   125     128
## Predicted      2.3499 2.4256 2.456  2.383  2.390 2.375 2.453  2.4191
## cvpred         2.3420 2.3908 2.452  2.411  2.413 2.395 2.468  2.4425
## games_started  2.3125 2.4069 2.779  2.092  1.960 2.779 2.646  2.4069
## CV residual   -0.0294 0.0162 0.327 -0.319 -0.453 0.384 0.178 -0.0356
##                  137   143   144     146    147    149    152   160
## Predicted      2.329 2.473 2.405  2.4350  2.301  2.370  2.392 2.339
## cvpred         2.343 2.466 2.402  2.4623  2.334  2.373  2.419 2.315
## games_started  2.208 2.779 2.779  2.4069  2.208  1.960  2.313 2.779
## CV residual   -0.135 0.313 0.377 -0.0553 -0.126 -0.413 -0.107 0.464
##                   161    167    175   179   188    191     197  201    202
## Predicted      2.3744  2.282  2.339 2.323 2.433  2.250 2.38630 2.33  2.314
## cvpred         2.3736  2.344  2.333 2.319 2.439  2.238 2.40526 2.31  2.315
## games_started  2.3125  1.960  1.960 2.493 2.646  2.092 2.40695 2.57  1.960
## CV residual   -0.0611 -0.383 -0.373 0.174 0.207 -0.146 0.00168 0.26 -0.355
##                 210      218   224    225    226     240
## Predicted     2.345  2.44155 2.245 2.2798  2.392  2.2743
## cvpred        2.351  2.40990 2.218 2.2982  2.416  2.2716
## games_started 2.573  2.40695 2.493 2.3125  2.208  2.2083
## CV residual   0.221 -0.00296 0.276 0.0144 -0.207 -0.0633
## 
## Sum of squares = 3.44    Mean square = 0.07    n = 48 
## 
## fold 3 
## Observations in test set: 48 
##                    6     8      16     21     28    31      32     35
## Predicted     2.3958 2.382  2.3782  2.417  2.312 2.387  2.4438  2.332
## cvpred        2.3848 2.376  2.3662  2.403  2.315 2.376  2.4184  2.339
## games_started 2.4069 2.779  2.3125  2.208  2.092 2.779  2.4069  1.960
## CV residual   0.0222 0.403 -0.0536 -0.195 -0.223 0.403 -0.0115 -0.379
##                   39     52     54     60    68    73    79    81   83
## Predicted     2.4881 2.5093  2.322  2.325 2.362 2.443 2.341  2.38 2.61
## cvpred        2.4783 2.4845  2.321  2.332 2.368 2.422 2.342  2.39 2.58
## games_started 2.5726 2.5726  2.208  1.960 2.493 2.573 2.779  2.21 2.78
## CV residual   0.0943 0.0881 -0.112 -0.372 0.125 0.151 0.437 -0.18 0.20
##                  84     89     91    92    107   109  115    117   119
## Predicted     2.387 2.4013  2.492  2.44  2.386 2.368 2.42  2.360 2.354
## cvpred        2.367 2.3980  2.486  2.44  2.394 2.377 2.42  2.353 2.339
## games_started 2.646 2.4932  2.208  1.96  2.208 2.715 2.57  1.960 2.573
## CV residual   0.279 0.0952 -0.278 -0.48 -0.186 0.337 0.15 -0.393 0.234
##                   120   122    126    142   148    154     166     169
## Predicted      2.4200 2.337  2.397 2.3828 2.429  2.381  2.3734  2.2585
## cvpred         2.4269 2.340  2.401 2.3834 2.421  2.423  2.3666  2.2742
## games_started  2.4069 2.646  1.960 2.4069 2.646  1.960  2.3125  2.2083
## CV residual   -0.0199 0.306 -0.441 0.0235 0.225 -0.463 -0.0541 -0.0659
##                    170    180    187    189    198     206   212    213
## Predicted      2.30169 2.3052 2.2547  2.171  2.356  2.3546 2.301  2.226
## cvpred         2.31442 2.3294 2.2638  2.206  2.358  2.3564 2.305  2.266
## games_started  2.31254 2.4069 2.3125  1.960  2.092  2.3125 2.407  1.960
## CV residual   -0.00188 0.0775 0.0488 -0.245 -0.266 -0.0438 0.102 -0.306
##                 221    227    232   234     237   239
## Predicted     2.374  2.344 2.2387 2.337  2.4644  2.25
## cvpred        2.388  2.364 2.2550 2.353  2.4793  2.27
## games_started 2.646  2.313 2.3125 2.646  2.4069  1.96
## CV residual   0.258 -0.051 0.0575 0.293 -0.0723 -0.31
## 
## Sum of squares = 2.87    Mean square = 0.06    n = 48 
## 
## fold 4 
## Observations in test set: 48 
##                   2    4     9    19     20   23      41    51    53
## Predicted     2.499 2.39 2.311 2.423  2.467 2.51  2.4414 2.450 2.303
## cvpred        2.492 2.35 2.318 2.387  2.456 2.41  2.4364 2.442 2.278
## games_started 2.715 2.78 2.493 2.715  2.208 2.71  2.4069 2.573 2.573
## CV residual   0.222 0.43 0.175 0.328 -0.247 0.31 -0.0294 0.131 0.295
##                   59    69    72     74    76     77      85   111   121
## Predicted     2.3914 2.386 2.433  2.403 2.393  2.380  2.3231 2.384 2.340
## cvpred        2.3711 2.363 2.404  2.402 2.365  2.378  2.2904 2.385 2.319
## games_started 2.4069 2.779 2.646  2.092 2.493  2.092  2.2083 2.779 2.493
## CV residual   0.0358 0.416 0.242 -0.311 0.128 -0.286 -0.0821 0.394 0.174
##                  124   127    129   130   131   132     133   134    136
## Predicted     2.3341 2.448  2.448 2.509 2.208 2.323  2.2784 2.391  2.452
## cvpred        2.3144 2.435  2.438 2.499 2.125 2.301  2.2669 2.372  2.457
## games_started 2.4069 2.646  2.092 2.715 2.573 2.573  2.2083 2.573  2.092
## CV residual   0.0925 0.212 -0.347 0.216 0.447 0.272 -0.0586 0.201 -0.365
##                 140   156   157    163   164    168   173   178   181
## Predicted     2.446  2.37 2.344  2.389 2.383  2.326 2.359 2.251 2.332
## cvpred        2.438  2.36 2.304  2.371 2.376  2.325 2.343 2.238 2.337
## games_started 2.493  2.21 2.407  2.208 2.573  1.960 2.715 2.779 2.646
## CV residual   0.055 -0.15 0.103 -0.163 0.197 -0.365 0.371 0.541 0.309
##                  182     194   196   203     204    208    209    222
## Predicted     2.3188  2.2878 2.286 2.422  2.3935  2.366 2.2417 2.1105
## cvpred        2.3271  2.2519 2.280 2.431  2.3908  2.351 2.1920 2.0652
## games_started 2.4069  2.2083 2.407 2.646  2.3125  2.208 2.2083 2.0919
## CV residual   0.0798 -0.0437 0.127 0.215 -0.0783 -0.143 0.0163 0.0266
##                   223    228    230     236
## Predicted      2.2878  2.312 2.2939  2.3576
## cvpred         2.2588  2.269 2.2853  2.3297
## games_started  2.2083  1.960 2.3125  2.3125
## CV residual   -0.0505 -0.309 0.0273 -0.0172
## 
## Sum of squares = 2.88    Mean square = 0.06    n = 48 
## 
## fold 5 
## Observations in test set: 48 
##                   1      5     7     10     13   14     17   24    30
## Predicted     2.315  2.446 2.576  2.420  2.413 2.51  2.457 2.39 2.408
## cvpred        2.314  2.459 2.601  2.432  2.424 2.53  2.465 2.40 2.413
## games_started 2.779  1.960 2.779  2.313  2.092 2.78  2.313 2.78 2.715
## CV residual   0.465 -0.499 0.178 -0.119 -0.332 0.25 -0.153 0.38 0.301
##                   37    40     55     56      62      64    66      82
## Predicted      2.266  2.40  2.378  2.393  2.2937  2.4257 2.440  2.4124
## cvpred         2.267  2.40  2.384  2.398  2.2971  2.4273 2.440  2.4170
## games_started  2.208  2.09  1.960  1.960  2.2083  2.4069 2.715  2.4069
## CV residual   -0.059 -0.31 -0.424 -0.438 -0.0888 -0.0203 0.275 -0.0101
##                  86     87    88    93    95     99  102    104    112
## Predicted      2.29  2.402 2.453 2.428 2.463  2.404 2.36 2.5244  2.477
## cvpred         2.28  2.417 2.465 2.430 2.472  2.406 2.36 2.5438  2.492
## games_started  1.96  1.960 2.779 2.779 2.573  1.960 2.78 2.5726  1.960
## CV residual   -0.32 -0.457 0.314 0.349 0.101 -0.446 0.42 0.0288 -0.532
##                  118     141   145    151   153   155   159    162   165
## Predicted      2.313  2.2840 2.284 2.3009 2.298 2.398 2.259  2.406 2.276
## cvpred         2.303  2.2735 2.276 2.2947 2.295 2.396 2.251  2.411 2.266
## games_started  2.092  2.2083 2.779 2.3125 2.407 2.573 2.573  2.092 2.493
## CV residual   -0.211 -0.0652 0.503 0.0179 0.112 0.176 0.322 -0.319 0.227
##                   185    186    192    195  199    205      207   216
## Predicted      2.3957 2.3600 2.3754  2.341 2.30  2.385  2.32172 2.291
## cvpred         2.3993 2.3608 2.3723  2.341 2.29  2.371  2.31999 2.290
## games_started  2.3125 2.4069 2.4069  2.092 2.65  1.960  2.31254 2.646
## CV residual   -0.0868 0.0461 0.0347 -0.249 0.36 -0.411 -0.00745 0.356
##                 220    229   233    235    238
## Predicted     2.175  2.226 2.346  2.291 2.4064
## cvpred        2.166  2.225 2.340  2.283 2.3996
## games_started 2.313  1.960 2.646  2.092 2.4932
## CV residual   0.147 -0.265 0.307 -0.191 0.0936
## 
## Sum of squares = 4.03    Mean square = 0.08    n = 48 
## 
## Overall (Sum over all 48 folds) 
##     ms 
## 0.0771