# Fetch data
# NOTE: the path is relative, so the working directory must be one level
# below the folder that contains `data/qb_stats.csv`.
qb_stats <- read.csv("../data/qb_stats.csv")

# Grab the college predictors
predictors <- c(
  "height", "weight", "age", "c_avg_cmpp", "c_rate", "c_pct",
  "c_avg_inter", "c_avg_tds", "c_avg_yds", "c_numyrs", "c_avg_att"
)
college_stats <- qb_stats[, predictors]

# Set the response variable. Single-bracket indexing keeps it as a one-column
# data frame, so cbind() below produces a data frame with a column named "tds".
tds <- qb_stats["tds"]

# Generate clean data set:
#   1. bind the response and predictors and drop every row containing an NA;
#   2. add 0.1 to every column so zero values stay finite under log()
#      (a zero becomes log(0.1), roughly -2.30);
#   3. log-transform ALL columns, i.e. the response and every predictor.
data.log.no_combine.for_tds <- data.frame(
  log(na.omit(cbind(tds, college_stats)) + 0.1)
)

# Generate the full linear model: tds regressed on every other column
lm.log.no_combine.tds <- lm(
  formula = tds ~ .,
  data = data.log.no_combine.for_tds
)

# Find optimum linear regression model for tds by stepwise AIC selection,
# allowing terms to be both dropped and re-added at each step.
# stepAIC is namespace-qualified so this does not rely on MASS being attached
# elsewhere.
step_reg.log.no_combine.tds <- MASS::stepAIC(
  lm.log.no_combine.tds,
  direction = "both"
)
## Start:  AIC=-154.6
## tds ~ height + weight + age + c_avg_cmpp + c_rate + c_pct + c_avg_inter + 
##     c_avg_tds + c_avg_yds + c_numyrs + c_avg_att
## 
##               Df Sum of Sq RSS  AIC
## - c_avg_yds    1     0.008 111 -157
## - height       1     0.014 111 -156
## - c_rate       1     0.016 111 -156
## - c_numyrs     1     0.052 111 -156
## - c_avg_tds    1     0.106 111 -156
## - c_avg_inter  1     0.262 111 -156
## - weight       1     0.639 111 -155
## - age          1     0.805 112 -155
## - c_pct        1     0.931 112 -155
## <none>                     111 -155
## - c_avg_cmpp   1     0.990 112 -154
## - c_avg_att    1     1.015 112 -154
## 
## Step:  AIC=-156.6
## tds ~ height + weight + age + c_avg_cmpp + c_rate + c_pct + c_avg_inter + 
##     c_avg_tds + c_numyrs + c_avg_att
## 
##               Df Sum of Sq RSS  AIC
## - height       1     0.013 111 -158
## - c_numyrs     1     0.046 111 -158
## - c_avg_tds    1     0.341 111 -158
## - c_rate       1     0.503 111 -158
## - weight       1     0.654 111 -157
## - age          1     0.808 112 -157
## - c_avg_inter  1     0.883 112 -157
## <none>                     111 -157
## - c_pct        1     0.970 112 -156
## - c_avg_cmpp   1     1.005 112 -156
## - c_avg_att    1     1.009 112 -156
## + c_avg_yds    1     0.008 111 -155
## 
## Step:  AIC=-158.5
## tds ~ weight + age + c_avg_cmpp + c_rate + c_pct + c_avg_inter + 
##     c_avg_tds + c_numyrs + c_avg_att
## 
##               Df Sum of Sq RSS  AIC
## - c_numyrs     1     0.042 111 -160
## - c_avg_tds    1     0.345 111 -160
## - c_rate       1     0.511 111 -159
## - age          1     0.796 112 -159
## - weight       1     0.875 112 -159
## - c_avg_inter  1     0.876 112 -159
## <none>                     111 -158
## - c_pct        1     0.956 112 -158
## - c_avg_cmpp   1     0.991 112 -158
## - c_avg_att    1     0.996 112 -158
## + height       1     0.013 111 -157
## + c_avg_yds    1     0.008 111 -156
## 
## Step:  AIC=-160.4
## tds ~ weight + age + c_avg_cmpp + c_rate + c_pct + c_avg_inter + 
##     c_avg_tds + c_avg_att
## 
##               Df Sum of Sq RSS  AIC
## - c_avg_tds    1     0.410 111 -162
## - c_rate       1     0.591 111 -161
## - age          1     0.805 112 -161
## - weight       1     0.834 112 -161
## <none>                     111 -160
## - c_avg_inter  1     0.977 112 -160
## - c_pct        1     1.122 112 -160
## - c_avg_cmpp   1     1.153 112 -160
## - c_avg_att    1     1.157 112 -160
## + c_numyrs     1     0.042 111 -158
## + height       1     0.009 111 -158
## + c_avg_yds    1     0.002 111 -158
## 
## Step:  AIC=-161.6
## tds ~ weight + age + c_avg_cmpp + c_rate + c_pct + c_avg_inter + 
##     c_avg_att
## 
##               Df Sum of Sq RSS  AIC
## - c_rate       1     0.185 111 -163
## - weight       1     0.753 112 -162
## - c_avg_inter  1     0.803 112 -162
## - c_pct        1     0.884 112 -162
## - c_avg_cmpp   1     0.936 112 -162
## - age          1     0.937 112 -162
## <none>                     111 -162
## - c_avg_att    1     0.952 112 -162
## + c_avg_tds    1     0.410 111 -160
## + c_avg_yds    1     0.251 111 -160
## + c_numyrs     1     0.107 111 -160
## + height       1     0.010 111 -160
## 
## Step:  AIC=-163.2
## tds ~ weight + age + c_avg_cmpp + c_pct + c_avg_inter + c_avg_att
## 
##               Df Sum of Sq RSS  AIC
## - c_avg_inter  1     0.622 112 -164
## - weight       1     0.683 112 -164
## - c_pct        1     0.753 112 -164
## - c_avg_cmpp   1     0.818 112 -163
## - c_avg_att    1     0.831 112 -163
## - age          1     0.836 112 -163
## <none>                     111 -163
## + c_avg_yds    1     0.340 111 -162
## + c_rate       1     0.185 111 -162
## + c_numyrs     1     0.124 111 -161
## + height       1     0.013 111 -161
## + c_avg_tds    1     0.004 111 -161
## 
## Step:  AIC=-163.9
## tds ~ weight + age + c_avg_cmpp + c_pct + c_avg_att
## 
##               Df Sum of Sq RSS  AIC
## - c_pct        1     0.360 112 -165
## - c_avg_cmpp   1     0.403 112 -165
## - c_avg_att    1     0.408 112 -165
## - weight       1     0.417 112 -165
## - age          1     0.805 113 -164
## <none>                     112 -164
## + c_avg_inter  1     0.622 111 -163
## + c_avg_yds    1     0.223 112 -162
## + c_numyrs     1     0.207 112 -162
## + c_avg_tds    1     0.062 112 -162
## + c_rate       1     0.004 112 -162
## + height       1     0.002 112 -162
## 
## Step:  AIC=-165.1
## tds ~ weight + age + c_avg_cmpp + c_avg_att
## 
##               Df Sum of Sq RSS  AIC
## - weight       1     0.668 113 -166
## - age          1     0.778 113 -166
## <none>                     112 -165
## - c_avg_att    1     0.974 113 -165
## - c_avg_cmpp   1     1.010 113 -165
## + c_pct        1     0.360 112 -164
## + c_numyrs     1     0.293 112 -164
## + c_avg_inter  1     0.229 112 -164
## + c_avg_yds    1     0.117 112 -163
## + c_avg_tds    1     0.041 112 -163
## + c_rate       1     0.002 112 -163
## + height       1     0.001 112 -163
## 
## Step:  AIC=-165.7
## tds ~ age + c_avg_cmpp + c_avg_att
## 
##               Df Sum of Sq RSS  AIC
## - age          1     0.530 114 -167
## <none>                     113 -166
## + weight       1     0.668 112 -165
## + c_pct        1     0.610 112 -165
## - c_avg_att    1     1.359 114 -165
## - c_avg_cmpp   1     1.466 114 -165
## + height       1     0.300 113 -164
## + c_numyrs     1     0.133 113 -164
## + c_avg_yds    1     0.067 113 -164
## + c_avg_inter  1     0.032 113 -164
## + c_avg_tds    1     0.027 113 -164
## + c_rate       1     0.002 113 -164
## 
## Step:  AIC=-166.6
## tds ~ c_avg_cmpp + c_avg_att
## 
##               Df Sum of Sq RSS  AIC
## <none>                     114 -167
## + age          1     0.530 113 -166
## + c_pct        1     0.522 113 -166
## + weight       1     0.420 113 -166
## - c_avg_att    1     1.609 115 -165
## + height       1     0.249 113 -165
## - c_avg_cmpp   1     1.690 115 -165
## + c_numyrs     1     0.173 113 -165
## + c_avg_tds    1     0.086 114 -165
## + c_avg_inter  1     0.050 114 -165
## + c_avg_yds    1     0.040 114 -165
## + c_rate       1     0.003 114 -165
# Print the coefficient table and overall fit statistics (residual standard
# error, R-squared, F-test) for the model retained by the stepwise search.
summary(step_reg.log.no_combine.tds)
## 
## Call:
## lm(formula = tds ~ c_avg_cmpp + c_avg_att, data = data.log.no_combine.for_tds)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -4.637 -0.274  0.080  0.430  1.185 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    3.146      0.601    5.24  3.6e-07 ***
## c_avg_cmpp     0.805      0.432    1.86    0.064 .  
## c_avg_att     -0.875      0.481   -1.82    0.071 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 0.698 on 233 degrees of freedom
## Multiple R-squared: 0.0148,  Adjusted R-squared: 0.00637 
## F-statistic: 1.75 on 2 and 233 DF,  p-value: 0.175
# Draw the diagnostic plots for the stepwise-selected linear model.
plot(step_reg.log.no_combine.tds)

plot of chunk unnamed-chunk-1 plot of chunk unnamed-chunk-1 plot of chunk unnamed-chunk-1 plot of chunk unnamed-chunk-1

# Best-subsets search over the same log-transformed data, recording up to
# 10 candidate models for each subset size (nbest = 10).
leaps.log.no_combine.tds <- leaps::regsubsets(
  tds ~ .,
  data = data.log.no_combine.for_tds,
  nbest = 10
)

# Plot R-squared against subset size for the recorded models.
# The legend position is given explicitly: the default legend = "interactive"
# waits for a mouse click to place the legend, which is not available when the
# script is rendered non-interactively. With the default, this call drew the
# plot and then aborted with "Error: invalid coordinate lengths".
car::subsets(leaps.log.no_combine.tds, statistic = "rsq", legend = "topleft")

plot of chunk unnamed-chunk-1

# 5 fold cross-validation of the stepwise-selected model.
# The data frame is passed positionally rather than as `df =`: the first
# argument is believed to be named `df` in older DAAG releases and `data` in
# newer ones (TODO confirm against the installed version), and a positional
# argument works with either.
DAAG::cv.lm(
  data.log.no_combine.for_tds,
  form.lm = step_reg.log.no_combine.tds,
  m = 5
)
## Analysis of Variance Table
## 
## Response: tds
##             Df Sum Sq Mean Sq F value Pr(>F)  
## c_avg_cmpp   1    0.1   0.100    0.21  0.650  
## c_avg_att    1    1.6   1.609    3.30  0.071 .
## Residuals  233  113.6   0.487                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Warning:
## 
## As there is >1 explanatory variable, cross-validation predicted values for
## a fold are not a linear function of corresponding overall predicted
## values.  Lines that are shown for the different folds are approximate

plot of chunk unnamed-chunk-1

## 
## fold 1 
## Observations in test set: 47 
##                 3     15    18    24   26    35    37    39     41      45
## Predicted   2.462  2.444 2.324 2.329 2.37  2.18  2.27 2.378 2.3775  2.3400
## cvpred      2.437  2.430 2.357 2.360 2.38  2.27  2.33 2.386 2.3905  2.3629
## tds         2.646  1.808 2.493 2.779 2.57  1.81 -2.30 2.493 2.4069  2.3125
## CV residual 0.209 -0.622 0.136 0.419 0.19 -0.46 -4.63 0.107 0.0164 -0.0503
##                52    56     62     70    71     75    78     80    85
## Predicted   2.383  2.31  2.149  2.362 2.285 2.3262 2.327 2.4073 2.311
## cvpred      2.394  2.35  2.253  2.379 2.330 2.3564 2.355 2.4041 2.345
## tds         2.839  1.81  1.960  1.960 3.262 2.4069 3.049 2.4932 2.646
## CV residual 0.445 -0.54 -0.293 -0.419 0.932 0.0505 0.694 0.0891 0.301
##                88     89  103    116   118   119    121     133    136
## Predicted   2.361 2.3238 2.28  2.351 2.255  2.32  2.256  2.2295  2.301
## cvpred      2.379 2.3541 2.32  2.373 2.315  2.35  2.310  2.2935  2.339
## tds         3.262 2.4069 2.71  1.629 2.407  2.21  1.629  2.2083  2.092
## CV residual 0.883 0.0529 0.39 -0.743 0.092 -0.14 -0.681 -0.0852 -0.248
##               139    149    152   159     165    166   170  173  175   188
## Predicted   2.512  2.318  2.353 2.216 2.39720  2.363 2.287 2.30 2.26  2.38
## cvpred      2.450  2.347  2.375 2.288 2.39697  2.380 2.331 2.34 2.31  2.38
## tds         2.715  0.742  2.092 2.573 2.40695  1.960 2.646 3.40 2.71  2.09
## CV residual 0.265 -1.605 -0.283 0.285 0.00998 -0.419 0.315 1.07 0.40 -0.29
##                 193    198      205    211    220    224   225     230
## Predicted    2.2049  2.344  2.26745  2.224  2.215  2.177 2.293  2.1539
## cvpred       2.2809  2.360  2.31643  2.289  2.283  2.256 2.334  2.2436
## tds          2.2083  1.808  2.31254  1.629  1.960  2.092 2.779  0.0953
## CV residual -0.0726 -0.551 -0.00389 -0.659 -0.323 -0.165 0.445 -2.1483
##                239
## Predicted    2.247
## cvpred       2.297
## tds          1.808
## CV residual -0.489
## 
## Sum of squares = 37    Mean square = 0.79    n = 47 
## 
## fold 2 
## Observations in test set: 48 
##                 4      5     7    14   17   33     36      42      46
## Predicted   2.402 2.3097 2.486 2.435 2.29 2.10  2.364  2.3720  2.3246
## cvpred      2.375 2.2714 2.472 2.408 2.25 2.06  2.332  2.3382  2.2876
## tds         2.493 2.3125 3.049 2.896 2.57 2.21  1.629  2.3125  2.2083
## CV residual 0.118 0.0411 0.577 0.488 0.32 0.15 -0.702 -0.0256 -0.0793
##                 47    65   66     69    73     77   99   100  102   114
## Predicted    2.281 2.332 2.32  2.398 2.405  2.295 2.48 2.360 2.35 2.231
## cvpred       2.245 2.294 2.29  2.370 2.379  2.256 2.46 2.328 2.32 2.187
## tds          1.960 2.839 2.78  2.208 2.573  1.629 2.71 2.715 3.05 2.896
## CV residual -0.284 0.545 0.49 -0.162 0.193 -0.627 0.25 0.386 0.73 0.709
##               122    126    128   131   138   141   144    147    153
## Predicted   2.187  2.354  2.419 2.305 2.301 2.312 2.317  2.340  2.270
## cvpred      2.139  2.325  2.394 2.277 2.261 2.275 2.284  2.308  2.228
## tds         2.779  2.092  2.208 2.407 3.140 2.407 3.096  2.092  1.960
## CV residual 0.639 -0.233 -0.186 0.129 0.879 0.132 0.811 -0.216 -0.268
##               156   158   161   163   164   171    172  174  176  178
## Predicted   2.214 2.395  2.32 2.304 2.280 2.297  2.355 2.26 2.22 2.18
## cvpred      2.172 2.371  2.29 2.266 2.240 2.257  2.328 2.22 2.19 2.14
## tds         2.779 2.646  1.96 3.001 2.896 2.646  1.629 3.34 2.65 3.14
## CV residual 0.607 0.275 -0.33 0.735 0.656 0.389 -0.699 1.11 0.46 1.00
##               183   184     187    191    199   201   204    207   213
## Predicted   2.245 2.234  2.2609  2.190  2.273 2.224 2.454  2.247 2.208
## cvpred      2.202 2.193  2.2214  2.146  2.235 2.180 2.432  2.205 2.168
## tds         2.493 2.313  2.2083  1.411  1.411 2.573 2.715  1.808 2.493
## CV residual 0.291 0.119 -0.0131 -0.735 -0.824 0.392 0.283 -0.397 0.325
##                214
## Predicted    2.247
## cvpred       2.207
## tds          1.960
## CV residual -0.247
## 
## Sum of squares = 12.1    Mean square = 0.25    n = 48 
## 
## fold 3 
## Observations in test set: 47 
##                 2      6    25     29     34     43    44     48    50
## Predicted   2.415 2.2551 1.989  2.322  2.308  2.422  2.25  2.198 2.341
## cvpred      2.468 2.2436 1.739  2.348  2.306  2.460  2.25  2.160 2.362
## tds         3.140 2.3125 2.646  2.092  1.960  2.313  1.13  1.629 2.896
## CV residual 0.672 0.0689 0.907 -0.256 -0.346 -0.147 -1.12 -0.531 0.534
##                51     55    59      60   64     76   79      86     87
## Predicted   2.324  2.340 2.314  2.2860 2.31  2.315 2.28  2.3187  2.364
## cvpred      2.339  2.365 2.329  2.2731 2.32  2.308 2.28  2.3412  2.368
## tds         2.896  1.808 2.646  2.2083 3.00  1.960 2.90  2.3125  1.808
## CV residual 0.556 -0.556 0.317 -0.0648 0.68 -0.348 0.62 -0.0286 -0.559
##                 90   94   101    106   107    108   112   123  124    134
## Predicted    2.364 2.28 2.357  2.328 2.294  2.398 2.285 2.348 2.35  2.279
## cvpred       2.398 2.27 2.393  2.343 2.278  2.421 2.294 2.367 2.36  2.266
## tds          1.411 2.71 2.715  1.411 2.646  2.313 2.493 2.779 2.49  2.092
## CV residual -0.987 0.44 0.322 -0.932 0.368 -0.108 0.199 0.412 0.13 -0.175
##                137    148   157   177    179   180   181    185   192
## Predicted    2.322  2.404 2.204 2.175  2.274 2.177 2.336  2.203 2.227
## cvpred       2.311  2.448 2.092 2.100  2.245 2.110 2.336  2.179 2.194
## tds          1.960  2.092 2.646 2.407  1.808 2.896 2.573  1.960 2.573
## CV residual -0.351 -0.356 0.555 0.307 -0.436 0.786 0.237 -0.219 0.379
##               194   197   200     208    215   216    217   233    236
## Predicted   2.035 2.406 2.310  2.3141  2.083 2.216 2.2369 2.139  2.353
## cvpred      1.831 2.413 2.311  2.2914  1.965 2.173 2.1864 2.074  2.321
## tds         2.407 2.646 2.896  2.2083  1.411 2.950 2.2083 2.407  0.742
## CV residual 0.576 0.233 0.585 -0.0831 -0.554 0.777 0.0219 0.333 -1.579
##                238
## Predicted   2.3405
## cvpred      2.3141
## tds         2.4069
## CV residual 0.0928
## 
## Sum of squares = 13.8    Mean square = 0.29    n = 47 
## 
## fold 4 
## Observations in test set: 47 
##                 9    10     11     13    16    22    27    28  31     32
## Predicted   2.324 2.376 2.4330  2.314  2.36 2.380 2.348 2.295 2.3  2.435
## cvpred      2.334 2.397 2.4641  2.321  2.37 2.400 2.363 2.299 2.3  2.466
## tds         2.779 2.573 2.4932  1.808  1.13 3.336 2.573 2.407 3.0  2.208
## CV residual 0.445 0.176 0.0291 -0.513 -1.24 0.936 0.209 0.108 0.7 -0.258
##                38      49    53    58    63    67    74   84    92    93
## Predicted   2.359  2.4478 2.232 2.407 2.357 2.370 2.289 2.41  2.33 2.342
## cvpred      2.375  2.4824 2.224 2.432 2.372 2.389 2.291 2.43  2.35 2.356
## tds         3.096  0.0953 2.407 2.646 3.096 2.646 2.493 2.71 -2.30 3.049
## CV residual 0.721 -2.3871 0.183 0.214 0.723 0.257 0.202 0.28 -4.65 0.694
##                 95     97   104   105  109   110    113   117    120
## Predicted   2.4299  2.233 2.329 2.218 2.30 2.252  2.301  2.29  2.382
## cvpred      2.4613  2.226 2.341 2.208 2.31 2.248  2.307  2.29  2.407
## tds         2.4932  1.808 2.646 2.573 2.90 2.715  1.808  1.13  1.808
## CV residual 0.0319 -0.418 0.305 0.364 0.59 0.467 -0.498 -1.16 -0.598
##                125   127   130   143   145    146    162   182    189
## Predicted    2.416 2.288 2.395 2.421 2.258  2.392  2.242 2.201  2.261
## cvpred       2.445 2.292 2.420 2.452 2.256  2.415  2.237 2.190  2.260
## tds          2.313 3.096 2.573 3.096 2.839  2.092  1.808 2.573  2.092
## CV residual -0.132 0.803 0.153 0.643 0.583 -0.323 -0.428 0.382 -0.168
##                195     196    209  212   223   226    228   232    237
## Predicted    2.276  2.1938  2.332 2.28 2.319 2.409  2.392 2.393  2.350
## cvpred       2.278  2.1783  2.346 2.29 2.330 2.437  2.418 2.419  2.368
## tds          2.092  2.0919  1.960 2.90 2.896 2.715  2.313 2.896  1.960
## CV residual -0.186 -0.0864 -0.386 0.61 0.566 0.278 -0.106 0.477 -0.408
## 
## Sum of squares = 38.8    Mean square = 0.83    n = 47 
## 
## fold 5 
## Observations in test set: 47 
##                 1     8     12    19     20     21    23   30     40
## Predicted   2.335 2.353  2.341 2.436 2.3083  2.290 2.602 2.19 2.3688
## cvpred      2.316 2.334  2.320 2.419 2.2927  2.276 2.617 2.19 2.3578
## tds         3.262 3.001  2.208 2.493 2.3125  2.092 3.049 3.37 2.4069
## CV residual 0.946 0.666 -0.112 0.074 0.0198 -0.184 0.432 1.18 0.0491
##                  54    57    61    68    72     81    82     91     96
## Predicted    2.3693 2.229  2.30 2.282 2.399  2.274  2.29  2.251 2.3905
## cvpred       2.3400 2.228  2.29 2.280 2.387  2.268  2.29  2.248 2.3746
## tds          2.3125 2.950  2.09 2.493 2.896  2.092  1.13  0.742 2.4069
## CV residual -0.0275 0.722 -0.20 0.213 0.509 -0.176 -1.15 -1.506 0.0323
##               98   111    115    129   132    135    140     142  150
## Predicted   2.51 2.291  2.307  2.322 2.282  2.228 2.4139  2.3590 2.26
## cvpred      2.50 2.292  2.303  2.311 2.278  2.226 2.3937  2.3491 2.26
## tds         2.84 2.493  1.960  1.629 2.407  2.092 2.4069  2.3125 3.30
## CV residual 0.34 0.201 -0.343 -0.682 0.129 -0.134 0.0133 -0.0366 1.04
##               151    154   155   160    167    168    169    186    190
## Predicted   2.207  2.299 2.332 2.213  2.469  2.368  2.245 2.3545  2.191
## cvpred      2.201  2.349 2.311 2.226  2.461  2.351  2.250 2.3656  2.194
## tds         2.407  1.411 3.140 2.493  1.629  2.208  1.629 2.4069  1.960
## CV residual 0.206 -0.938 0.828 0.267 -0.832 -0.143 -0.621 0.0413 -0.233
##                202  203    206   210    218   219    222   227    229
## Predicted    2.229 2.27  2.315 2.297  2.288  2.18  1.955 2.272 2.2537
## cvpred       2.225 2.26  2.308 2.294  2.301  2.21  1.997 2.292 2.2789
## tds          1.960 2.95  1.960 3.049  2.092  2.09  1.411 2.715 2.3125
## CV residual -0.264 0.69 -0.348 0.756 -0.209 -0.12 -0.586 0.422 0.0337
##                235   240
## Predicted    2.325 2.328
## cvpred       2.316 2.357
## tds          1.808 2.646
## CV residual -0.508 0.289
## 
## Sum of squares = 14.3    Mean square = 0.31    n = 47 
## 
## Overall (Sum over all 47 folds) 
##    ms 
## 0.492