# Fetch Data
qb_stats_w_combine <- read.csv("../data/qb_stats_w_combine.csv")

# Grab the college predictors
predictors <- c(
  "height", "weight", "age", "c_avg_cmpp", "c_rate", "c_pct",
  "c_avg_inter", "c_avg_tds", "c_avg_yds", "c_numyrs", "c_avg_att", "X40",
  "wonderlic", "cone", "shuttle", "vert_leap", "broad_jump"
)
college_stats <- qb_stats_w_combine[, predictors]

# Set the response variable
# Single-bracket indexing keeps `tds` as a one-column data frame, so the
# column keeps its name when it is bound to the predictors below.
tds <- qb_stats_w_combine["tds"]

# Generate clean data set: bind response and predictors, drop every row that
# has a missing value, then log-transform all columns (response included).
# The 0.1 offset is added before log() so zero-valued entries stay finite.
data.log.w_combine.for_tds <- data.frame(
  log(na.omit(cbind(tds, college_stats)) + 0.1)
)

# Generate the linear model: tds regressed on every other column
lm.log.w_combine.tds <- lm(
  formula = tds ~ .,
  data = data.log.w_combine.for_tds
)

# Find optimum linear regression model for tds by stepwise AIC search;
# direction = "both" lets each step either drop a term or re-add one.
# stepAIC is namespace-qualified so this does not depend on MASS having been
# attached earlier in the session.
step_reg.log.w_combine.tds <- MASS::stepAIC(
  lm.log.w_combine.tds,
  direction = "both"
)
## Start:  AIC=-19.68
## tds ~ height + weight + age + c_avg_cmpp + c_rate + c_pct + c_avg_inter + 
##     c_avg_tds + c_avg_yds + c_numyrs + c_avg_att + X40 + wonderlic + 
##     cone + shuttle + vert_leap + broad_jump
## 
##               Df Sum of Sq   RSS   AIC
## - X40          1     0.000  8.78 -21.7
## - age          1     0.001  8.78 -21.7
## - cone         1     0.040  8.82 -21.5
## - shuttle      1     0.065  8.84 -21.4
## - broad_jump   1     0.119  8.90 -21.2
## - height       1     0.166  8.95 -21.0
## - vert_leap    1     0.186  8.97 -20.9
## - c_avg_yds    1     0.192  8.97 -20.9
## - c_rate       1     0.215  8.99 -20.8
## - c_avg_inter  1     0.256  9.04 -20.6
## - weight       1     0.268  9.05 -20.5
## - c_avg_tds    1     0.269  9.05 -20.5
## <none>                      8.78 -19.7
## - wonderlic    1     0.514  9.29 -19.5
## - c_numyrs     1     1.073  9.85 -17.3
## - c_avg_att    1     1.530 10.31 -15.6
## - c_avg_cmpp   1     1.743 10.52 -14.8
## - c_pct        1     1.804 10.58 -14.6
## 
## Step:  AIC=-21.68
## tds ~ height + weight + age + c_avg_cmpp + c_rate + c_pct + c_avg_inter + 
##     c_avg_tds + c_avg_yds + c_numyrs + c_avg_att + wonderlic + 
##     cone + shuttle + vert_leap + broad_jump
## 
##               Df Sum of Sq   RSS   AIC
## - age          1     0.001  8.78 -23.7
## - cone         1     0.040  8.82 -23.5
## - shuttle      1     0.072  8.85 -23.4
## - broad_jump   1     0.159  8.94 -23.0
## - height       1     0.180  8.96 -22.9
## - vert_leap    1     0.191  8.97 -22.9
## - c_avg_yds    1     0.204  8.98 -22.8
## - c_rate       1     0.228  9.01 -22.7
## - weight       1     0.274  9.05 -22.5
## - c_avg_inter  1     0.286  9.06 -22.5
## - c_avg_tds    1     0.287  9.07 -22.5
## <none>                      8.78 -21.7
## - wonderlic    1     0.522  9.30 -21.5
## + X40          1     0.000  8.78 -19.7
## - c_numyrs     1     1.073  9.85 -19.3
## - c_avg_att    1     1.530 10.31 -17.6
## - c_avg_cmpp   1     1.752 10.53 -16.8
## - c_pct        1     1.826 10.61 -16.5
## 
## Step:  AIC=-23.67
## tds ~ height + weight + c_avg_cmpp + c_rate + c_pct + c_avg_inter + 
##     c_avg_tds + c_avg_yds + c_numyrs + c_avg_att + wonderlic + 
##     cone + shuttle + vert_leap + broad_jump
## 
##               Df Sum of Sq   RSS   AIC
## - cone         1     0.041  8.82 -25.5
## - shuttle      1     0.072  8.85 -25.4
## - broad_jump   1     0.159  8.94 -25.0
## - height       1     0.187  8.97 -24.9
## - vert_leap    1     0.192  8.97 -24.9
## - c_avg_yds    1     0.203  8.98 -24.8
## - c_rate       1     0.228  9.01 -24.7
## - c_avg_tds    1     0.286  9.07 -24.4
## - c_avg_inter  1     0.300  9.08 -24.4
## - weight       1     0.309  9.09 -24.4
## <none>                      8.78 -23.7
## - wonderlic    1     0.606  9.39 -23.1
## + age          1     0.001  8.78 -21.7
## + X40          1     0.000  8.78 -21.7
## - c_numyrs     1     1.073  9.85 -21.3
## - c_avg_att    1     1.545 10.32 -19.5
## - c_avg_cmpp   1     1.767 10.55 -18.7
## - c_pct        1     1.835 10.62 -18.5
## 
## Step:  AIC=-25.49
## tds ~ height + weight + c_avg_cmpp + c_rate + c_pct + c_avg_inter + 
##     c_avg_tds + c_avg_yds + c_numyrs + c_avg_att + wonderlic + 
##     shuttle + vert_leap + broad_jump
## 
##               Df Sum of Sq   RSS   AIC
## - shuttle      1     0.034  8.86 -27.4
## - broad_jump   1     0.167  8.99 -26.8
## - vert_leap    1     0.175  9.00 -26.8
## - c_avg_yds    1     0.237  9.06 -26.5
## - c_rate       1     0.256  9.08 -26.4
## - c_avg_inter  1     0.259  9.08 -26.4
## - c_avg_tds    1     0.291  9.11 -26.3
## - height       1     0.303  9.12 -26.2
## <none>                      8.82 -25.5
## - wonderlic    1     0.568  9.39 -25.1
## - weight       1     0.571  9.39 -25.1
## + cone         1     0.041  8.78 -23.7
## + age          1     0.002  8.82 -23.5
## + X40          1     0.000  8.82 -23.5
## - c_numyrs     1     1.034  9.86 -23.3
## - c_avg_att    1     1.687 10.51 -20.9
## - c_avg_cmpp   1     1.930 10.75 -20.0
## - c_pct        1     1.996 10.82 -19.7
## 
## Step:  AIC=-27.35
## tds ~ height + weight + c_avg_cmpp + c_rate + c_pct + c_avg_inter + 
##     c_avg_tds + c_avg_yds + c_numyrs + c_avg_att + wonderlic + 
##     vert_leap + broad_jump
## 
##               Df Sum of Sq   RSS   AIC
## - broad_jump   1     0.147  9.00 -28.7
## - c_avg_inter  1     0.247  9.10 -28.3
## - vert_leap    1     0.285  9.14 -28.1
## - c_avg_yds    1     0.287  9.14 -28.1
## - c_rate       1     0.291  9.15 -28.1
## - c_avg_tds    1     0.292  9.15 -28.1
## - height       1     0.304  9.16 -28.1
## <none>                      8.86 -27.4
## - wonderlic    1     0.801  9.66 -26.1
## - weight       1     0.929  9.78 -25.6
## + shuttle      1     0.034  8.82 -25.5
## + X40          1     0.005  8.85 -25.4
## + cone         1     0.003  8.85 -25.4
## + age          1     0.001  8.85 -25.4
## - c_numyrs     1     1.018  9.87 -25.2
## - c_avg_att    1     1.816 10.67 -22.3
## - c_avg_cmpp   1     2.055 10.91 -21.4
## - c_pct        1     2.110 10.97 -21.2
## 
## Step:  AIC=-28.72
## tds ~ height + weight + c_avg_cmpp + c_rate + c_pct + c_avg_inter + 
##     c_avg_tds + c_avg_yds + c_numyrs + c_avg_att + wonderlic + 
##     vert_leap
## 
##               Df Sum of Sq   RSS   AIC
## - vert_leap    1     0.140  9.14 -30.1
## - c_avg_yds    1     0.219  9.22 -29.8
## - c_avg_tds    1     0.220  9.22 -29.8
## - c_rate       1     0.220  9.22 -29.8
## - c_avg_inter  1     0.341  9.34 -29.3
## - height       1     0.345  9.35 -29.3
## <none>                      9.00 -28.7
## - wonderlic    1     0.786  9.79 -27.5
## + broad_jump   1     0.147  8.86 -27.4
## - c_numyrs     1     0.888  9.89 -27.1
## + X40          1     0.019  8.98 -26.8
## + shuttle      1     0.014  8.99 -26.8
## + cone         1     0.012  8.99 -26.8
## + age          1     0.001  9.00 -26.7
## - weight       1     1.091 10.09 -26.4
## - c_avg_att    1     1.756 10.76 -23.9
## - c_avg_cmpp   1     2.029 11.03 -23.0
## - c_pct        1     2.122 11.12 -22.7
## 
## Step:  AIC=-30.13
## tds ~ height + weight + c_avg_cmpp + c_rate + c_pct + c_avg_inter + 
##     c_avg_tds + c_avg_yds + c_numyrs + c_avg_att + wonderlic
## 
##               Df Sum of Sq   RSS   AIC
## - c_avg_yds    1     0.207  9.35 -31.3
## - c_rate       1     0.228  9.37 -31.2
## - c_avg_tds    1     0.246  9.39 -31.1
## - height       1     0.293  9.44 -30.9
## - c_avg_inter  1     0.331  9.47 -30.8
## <none>                      9.14 -30.1
## - wonderlic    1     0.811  9.95 -28.9
## + vert_leap    1     0.140  9.00 -28.7
## + shuttle      1     0.111  9.03 -28.6
## + X40          1     0.032  9.11 -28.3
## - c_numyrs     1     0.991 10.13 -28.2
## + cone         1     0.016  9.13 -28.2
## + broad_jump   1     0.002  9.14 -28.1
## + age          1     0.001  9.14 -28.1
## - weight       1     1.025 10.17 -28.1
## - c_avg_att    1     1.840 10.98 -25.2
## - c_avg_cmpp   1     2.146 11.29 -24.1
## - c_pct        1     2.231 11.37 -23.8
## 
## Step:  AIC=-31.28
## tds ~ height + weight + c_avg_cmpp + c_rate + c_pct + c_avg_inter + 
##     c_avg_tds + c_numyrs + c_avg_att + wonderlic
## 
##               Df Sum of Sq   RSS   AIC
## - c_rate       1     0.025  9.37 -33.2
## - c_avg_tds    1     0.040  9.39 -33.1
## - height       1     0.377  9.73 -31.8
## <none>                      9.35 -31.3
## - wonderlic    1     0.672 10.02 -30.6
## - c_numyrs     1     0.802 10.15 -30.2
## + c_avg_yds    1     0.207  9.14 -30.1
## + shuttle      1     0.161  9.19 -29.9
## + vert_leap    1     0.128  9.22 -29.8
## - c_avg_inter  1     0.961 10.31 -29.6
## + X40          1     0.026  9.32 -29.4
## + cone         1     0.016  9.33 -29.3
## - weight       1     1.026 10.38 -29.3
## + broad_jump   1     0.008  9.34 -29.3
## + age          1     0.002  9.35 -29.3
## - c_pct        1     2.098 11.45 -25.6
## - c_avg_cmpp   1     2.236 11.59 -25.1
## - c_avg_att    1     2.263 11.61 -25.0
## 
## Step:  AIC=-33.18
## tds ~ height + weight + c_avg_cmpp + c_pct + c_avg_inter + c_avg_tds + 
##     c_numyrs + c_avg_att + wonderlic
## 
##               Df Sum of Sq   RSS   AIC
## - c_avg_tds    1     0.016  9.39 -35.1
## - height       1     0.407  9.78 -33.6
## <none>                      9.37 -33.2
## - wonderlic    1     0.653 10.03 -32.6
## - c_numyrs     1     0.847 10.22 -31.9
## + shuttle      1     0.161  9.21 -31.8
## + vert_leap    1     0.150  9.22 -31.8
## + X40          1     0.035  9.34 -31.3
## + c_rate       1     0.025  9.35 -31.3
## + cone         1     0.021  9.35 -31.3
## + broad_jump   1     0.017  9.36 -31.3
## + c_avg_yds    1     0.004  9.37 -31.2
## + age          1     0.003  9.37 -31.2
## - weight       1     1.058 10.43 -31.1
## - c_avg_inter  1     1.157 10.53 -30.8
## - c_pct        1     2.458 11.83 -26.3
## - c_avg_att    1     2.461 11.84 -26.3
## - c_avg_cmpp   1     2.465 11.84 -26.3
## 
## Step:  AIC=-35.12
## tds ~ height + weight + c_avg_cmpp + c_pct + c_avg_inter + c_numyrs + 
##     c_avg_att + wonderlic
## 
##               Df Sum of Sq   RSS   AIC
## - height       1     0.433  9.82 -35.4
## <none>                      9.39 -35.1
## - wonderlic    1     0.710 10.10 -34.3
## - c_numyrs     1     0.836 10.23 -33.9
## + vert_leap    1     0.146  9.24 -33.7
## + shuttle      1     0.131  9.26 -33.7
## + X40          1     0.028  9.36 -33.2
## + cone         1     0.021  9.37 -33.2
## + broad_jump   1     0.016  9.37 -33.2
## + c_avg_tds    1     0.016  9.37 -33.2
## + age          1     0.001  9.39 -33.1
## + c_avg_yds    1     0.001  9.39 -33.1
## + c_rate       1     0.000  9.39 -33.1
## - weight       1     1.181 10.57 -32.6
## - c_avg_inter  1     1.235 10.62 -32.4
## - c_avg_cmpp   1     2.454 11.84 -28.3
## - c_pct        1     2.455 11.84 -28.3
## - c_avg_att    1     2.458 11.85 -28.3
## 
## Step:  AIC=-35.41
## tds ~ weight + c_avg_cmpp + c_pct + c_avg_inter + c_numyrs + 
##     c_avg_att + wonderlic
## 
##               Df Sum of Sq   RSS   AIC
## <none>                      9.82 -35.4
## + height       1     0.433  9.39 -35.1
## - weight       1     0.763 10.59 -34.6
## - c_numyrs     1     0.895 10.72 -34.1
## + vert_leap    1     0.095  9.73 -33.8
## + shuttle      1     0.086  9.74 -33.7
## + c_avg_tds    1     0.041  9.78 -33.6
## + cone         1     0.006  9.82 -33.4
## + age          1     0.004  9.82 -33.4
## + broad_jump   1     0.003  9.82 -33.4
## + c_rate       1     0.001  9.82 -33.4
## + X40          1     0.001  9.82 -33.4
## + c_avg_yds    1     0.001  9.82 -33.4
## - wonderlic    1     1.234 11.06 -32.9
## - c_avg_inter  1     1.277 11.10 -32.8
## - c_pct        1     2.364 12.19 -29.2
## - c_avg_cmpp   1     2.383 12.21 -29.2
## - c_avg_att    1     2.391 12.21 -29.1
# Print the coefficient table and overall fit statistics of the model
# returned by the stepwise search.
summary(step_reg.log.w_combine.tds)
## 
## Call:
## lm(formula = tds ~ weight + c_avg_cmpp + c_pct + c_avg_inter + 
##     c_numyrs + c_avg_att + wonderlic, data = data.log.w_combine.for_tds)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -1.297 -0.295 -0.067  0.347  1.183 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  284.429    111.167    2.56    0.016 *
## weight         2.992      1.960    1.53    0.137  
## c_avg_cmpp    66.302     24.578    2.70    0.011 *
## c_pct        -63.631     23.680   -2.69    0.012 *
## c_avg_inter    0.992      0.503    1.97    0.058 .
## c_numyrs       0.862      0.521    1.65    0.109  
## c_avg_att    -67.085     24.828   -2.70    0.011 *
## wonderlic     -0.728      0.375   -1.94    0.062 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 0.572 on 30 degrees of freedom
## Multiple R-squared: 0.314,   Adjusted R-squared: 0.154 
## F-statistic: 1.96 on 7 and 30 DF,  p-value: 0.0937
# Draw the diagnostic plots for the model returned by the stepwise search.
# NOTE(review): the rendered output shows "NaNs produced" warnings for this
# call; presumably caused by an extreme-leverage observation -- confirm.
plot(step_reg.log.w_combine.tds)

plot of chunk unnamed-chunk-1 plot of chunk unnamed-chunk-1 plot of chunk unnamed-chunk-1

## Warning: NaNs produced
## Warning: NaNs produced

plot of chunk unnamed-chunk-1

# Best-subsets search over the same log-transformed data, keeping the 10
# best candidate models of each size (nbest = 10).
leaps.log.w_combine.tds <- leaps::regsubsets(
  tds ~ .,
  data = data.log.w_combine.for_tds,
  nbest = 10
)

# Plot R-squared against subset size. The legend position is fixed because
# interactive legend placement waits for a mouse click, which is not
# available in a non-interactive render and aborted this call with
# "invalid coordinate lengths".
car::subsets(leaps.log.w_combine.tds, statistic = "rsq", legend = "topleft")
## Error: invalid coordinate lengths

plot of chunk unnamed-chunk-1

# 5 fold cross-validation of the model returned by the stepwise search.
# The data frame and the model are passed positionally (first and second
# argument) because the name of cv.lm's first argument differs between DAAG
# releases (`df` vs `data`); a mismatched name would leave the model object
# bound to the data argument.
DAAG::cv.lm(data.log.w_combine.for_tds, step_reg.log.w_combine.tds, m = 5)
## Analysis of Variance Table
## 
## Response: tds
##             Df Sum Sq Mean Sq F value Pr(>F)  
## weight       1   1.01   1.006    3.07  0.090 .
## c_avg_cmpp   1   0.06   0.062    0.19  0.666  
## c_pct        1   0.03   0.030    0.09  0.765  
## c_avg_inter  1   0.14   0.141    0.43  0.516  
## c_numyrs     1   0.12   0.120    0.37  0.549  
## c_avg_att    1   1.91   1.910    5.83  0.022 *
## wonderlic    1   1.23   1.234    3.77  0.062 .
## Residuals   30   9.82   0.327                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Warning:
## 
## As there is >1 explanatory variable, cross-validation predicted values for
## a fold are not a linear function of corresponding overall predicted
## values.  Lines that are shown for the different folds are approximate

plot of chunk unnamed-chunk-1

## 
## fold 1 
## Observations in test set: 7 
##                 6    21     26     32    38     46     59
## Predicted   2.282 2.049  2.446  2.728 2.001  2.563  2.364
## cvpred      2.184 1.726  2.441  2.692 1.767  2.695  2.758
## tds         3.001 2.313  2.208  2.313 2.493  2.493  2.407
## CV residual 0.816 0.587 -0.233 -0.379 0.727 -0.202 -0.351
## 
## Sum of squares = 1.9    Mean square = 0.27    n = 7 
## 
## fold 2 
## Observations in test set: 8 
##                    7     18     20    27     37    43     55    65
## Predicted   2.420181  2.819  2.313 2.795   2.71 2.177  2.048 2.468
## cvpred      2.207996  3.022  2.480 2.703  22.16 1.752  2.098 2.430
## tds         2.208274  2.313  1.808 3.371   2.65 1.960  1.629 2.896
## CV residual 0.000278 -0.709 -0.671 0.668 -19.51 0.208 -0.469 0.466
## 
## Sum of squares = 382    Mean square = 47.8    n = 8 
## 
## fold 3 
## Observations in test set: 8 
##                5     12    13    16      39    40    50   56
## Predicted   2.32  2.323 2.773 2.427  2.6119 2.240 2.308 2.08
## cvpred      1.98  2.307 2.612 2.164  2.3683 2.214 2.082 1.96
## tds         3.14  2.092 3.049 3.096  2.3125 2.407 2.573 3.26
## CV residual 1.16 -0.215 0.437 0.932 -0.0557 0.193 0.491 1.30
## 
## Sum of squares = 4.43    Mean square = 0.55    n = 8 
## 
## fold 4 
## Observations in test set: 8 
##                   4     15    17     19    28     52      63     64
## Predicted    1.3922  2.777  2.22  2.593 3.116  2.541  2.1600  2.652
## cvpred       2.1257  2.702  2.48  2.679 2.884  2.455  2.3647  2.793
## tds          0.0953  2.493  2.31  2.573 3.049  1.629  2.3125  2.493
## CV residual -2.0304 -0.209 -0.17 -0.106 0.165 -0.826 -0.0522 -0.299
## 
## Sum of squares = 5.01    Mean square = 0.63    n = 8 
## 
## fold 5 
## Observations in test set: 7 
##                 1     3     24     30     42     49     61
## Predicted   2.562  1.74  2.683 2.4681  2.285  2.777  1.666
## cvpred      2.334  1.48  3.043 2.7544  2.516  3.108 -0.116
## tds         3.262  1.13  2.092 2.8391  2.208  2.208  2.208
## CV residual 0.928 -0.35 -0.951 0.0847 -0.308 -0.899  2.324
## 
## Sum of squares = 8.2    Mean square = 1.17    n = 7 
## 
## Overall (Sum over all 7 folds) 
##   ms 
## 10.6