# Load the raw data set from CSV
qb_stats_w_combine <- read.csv(file = "../data/qb_stats_w_combine.csv")

# Candidate predictor columns. The order here fixes the column order of
# college_stats and therefore the term order of any `~ .` model built on it.
predictors <- c(
    "height", "weight", "age",
    "c_avg_cmpp", "c_rate", "c_pct", "c_avg_inter", "c_avg_tds", "c_avg_yds",
    "c_numyrs", "c_avg_att",
    "X40", "wonderlic", "cone", "shuttle", "vert_leap", "broad_jump"
)
college_stats <- qb_stats_w_combine[predictors]

# Response variable, kept as a one-column data frame so that it retains its
# column name when combined with the predictors
cpct <- qb_stats_w_combine[, "completion_percentage", drop = FALSE]

# Build the modelling frame:
#   1. put the response and the predictors side by side,
#   2. drop every row that has an NA in any of those columns,
#   3. apply log(x + 0.1) to every column, the response included
#      (the 0.1 offset presumably keeps zero-valued entries finite -- confirm).
data.log.w_combine.for_cpct <- local({
    complete_rows <- na.omit(cbind(cpct, college_stats))
    data.frame(log(complete_rows + 0.1))
})

# Full linear model: log response regressed on all log predictors
lm.log.w_combine.cpct <- lm(
    formula = completion_percentage ~ .,
    data = data.log.w_combine.for_cpct
)

# Stepwise model selection by AIC, starting from the full model; with
# direction = "both" a term that was dropped can be added back at a later step
step_reg.log.w_combine.cpct <- stepAIC(
    object = lm.log.w_combine.cpct,
    direction = "both"
)
## Start:  AIC=-170.9
## completion_percentage ~ height + weight + age + c_avg_cmpp + 
##     c_rate + c_pct + c_avg_inter + c_avg_tds + c_avg_yds + c_numyrs + 
##     c_avg_att + X40 + wonderlic + cone + shuttle + vert_leap + 
##     broad_jump
## 
##               Df Sum of Sq   RSS  AIC
## - height       1   0.00004 0.164 -173
## - c_avg_yds    1   0.00007 0.164 -173
## - broad_jump   1   0.00011 0.164 -173
## - c_avg_inter  1   0.00012 0.164 -173
## - c_avg_tds    1   0.00018 0.164 -173
## - c_rate       1   0.00063 0.165 -173
## - wonderlic    1   0.00252 0.167 -172
## - age          1   0.00306 0.167 -172
## - weight       1   0.00378 0.168 -172
## - shuttle      1   0.00403 0.168 -172
## - c_numyrs     1   0.00703 0.171 -171
## <none>                     0.164 -171
## - X40          1   0.00895 0.173 -171
## - c_avg_att    1   0.00985 0.174 -171
## - cone         1   0.01061 0.175 -170
## - c_pct        1   0.01171 0.176 -170
## - c_avg_cmpp   1   0.01339 0.177 -170
## - vert_leap    1   0.02265 0.187 -168
## 
## Step:  AIC=-172.9
## completion_percentage ~ weight + age + c_avg_cmpp + c_rate + 
##     c_pct + c_avg_inter + c_avg_tds + c_avg_yds + c_numyrs + 
##     c_avg_att + X40 + wonderlic + cone + shuttle + vert_leap + 
##     broad_jump
## 
##               Df Sum of Sq   RSS  AIC
## - c_avg_yds    1   0.00006 0.164 -175
## - broad_jump   1   0.00010 0.164 -175
## - c_avg_inter  1   0.00014 0.164 -175
## - c_avg_tds    1   0.00022 0.164 -175
## - c_rate       1   0.00059 0.165 -175
## - wonderlic    1   0.00273 0.167 -174
## - age          1   0.00320 0.167 -174
## - shuttle      1   0.00433 0.168 -174
## - c_numyrs     1   0.00702 0.171 -173
## - weight       1   0.00802 0.172 -173
## <none>                     0.164 -173
## - X40          1   0.00935 0.173 -173
## - c_avg_att    1   0.00990 0.174 -173
## - c_pct        1   0.01170 0.176 -172
## - cone         1   0.01185 0.176 -172
## - c_avg_cmpp   1   0.01345 0.177 -172
## + height       1   0.00004 0.164 -171
## - vert_leap    1   0.02261 0.187 -170
## 
## Step:  AIC=-174.9
## completion_percentage ~ weight + age + c_avg_cmpp + c_rate + 
##     c_pct + c_avg_inter + c_avg_tds + c_numyrs + c_avg_att + 
##     X40 + wonderlic + cone + shuttle + vert_leap + broad_jump
## 
##               Df Sum of Sq   RSS  AIC
## - c_avg_inter  1   0.00008 0.164 -177
## - broad_jump   1   0.00017 0.164 -177
## - c_avg_tds    1   0.00187 0.166 -176
## - wonderlic    1   0.00268 0.167 -176
## - age          1   0.00332 0.168 -176
## - shuttle      1   0.00432 0.169 -176
## - c_rate       1   0.00654 0.171 -175
## - c_numyrs     1   0.00738 0.172 -175
## - weight       1   0.00802 0.172 -175
## <none>                     0.164 -175
## - X40          1   0.01093 0.175 -174
## - cone         1   0.01180 0.176 -174
## - c_pct        1   0.01275 0.177 -174
## - c_avg_att    1   0.01638 0.180 -173
## - c_avg_cmpp   1   0.01711 0.181 -173
## + c_avg_yds    1   0.00006 0.164 -173
## + height       1   0.00003 0.164 -173
## - vert_leap    1   0.02292 0.187 -172
## 
## Step:  AIC=-176.9
## completion_percentage ~ weight + age + c_avg_cmpp + c_rate + 
##     c_pct + c_avg_tds + c_numyrs + c_avg_att + X40 + wonderlic + 
##     cone + shuttle + vert_leap + broad_jump
## 
##               Df Sum of Sq   RSS  AIC
## - broad_jump   1   0.00023 0.165 -179
## - c_avg_tds    1   0.00195 0.166 -178
## - wonderlic    1   0.00260 0.167 -178
## - age          1   0.00349 0.168 -178
## - shuttle      1   0.00424 0.169 -178
## - c_rate       1   0.00675 0.171 -177
## - weight       1   0.00794 0.172 -177
## - c_numyrs     1   0.00843 0.173 -177
## <none>                     0.164 -177
## - X40          1   0.01186 0.176 -176
## - cone         1   0.01238 0.177 -176
## + c_avg_inter  1   0.00008 0.164 -175
## + height       1   0.00005 0.164 -175
## + c_avg_yds    1   0.00000 0.164 -175
## - vert_leap    1   0.02333 0.188 -174
## - c_pct        1   0.02366 0.188 -174
## - c_avg_att    1   0.03062 0.195 -172
## - c_avg_cmpp   1   0.03086 0.195 -172
## 
## Step:  AIC=-178.8
## completion_percentage ~ weight + age + c_avg_cmpp + c_rate + 
##     c_pct + c_avg_tds + c_numyrs + c_avg_att + X40 + wonderlic + 
##     cone + shuttle + vert_leap
## 
##               Df Sum of Sq   RSS  AIC
## - c_avg_tds    1    0.0018 0.166 -180
## - wonderlic    1    0.0026 0.167 -180
## - age          1    0.0034 0.168 -180
## - shuttle      1    0.0043 0.169 -180
## - c_rate       1    0.0070 0.171 -179
## - weight       1    0.0077 0.172 -179
## <none>                     0.165 -179
## - c_numyrs     1    0.0094 0.174 -179
## - cone         1    0.0124 0.177 -178
## - X40          1    0.0125 0.177 -178
## + broad_jump   1    0.0002 0.164 -177
## + c_avg_inter  1    0.0001 0.164 -177
## + height       1    0.0000 0.164 -177
## + c_avg_yds    1    0.0000 0.164 -177
## - c_pct        1    0.0235 0.188 -176
## - c_avg_att    1    0.0304 0.195 -174
## - c_avg_cmpp   1    0.0306 0.195 -174
## - vert_leap    1    0.0321 0.197 -174
## 
## Step:  AIC=-180.4
## completion_percentage ~ weight + age + c_avg_cmpp + c_rate + 
##     c_pct + c_numyrs + c_avg_att + X40 + wonderlic + cone + shuttle + 
##     vert_leap
## 
##               Df Sum of Sq   RSS  AIC
## - wonderlic    1    0.0026 0.169 -182
## - shuttle      1    0.0032 0.170 -182
## - age          1    0.0037 0.170 -182
## - weight       1    0.0065 0.173 -181
## - c_numyrs     1    0.0089 0.175 -180
## <none>                     0.166 -180
## - cone         1    0.0117 0.178 -180
## - X40          1    0.0117 0.178 -180
## + c_avg_tds    1    0.0018 0.165 -179
## + c_avg_yds    1    0.0013 0.165 -179
## + c_avg_inter  1    0.0002 0.166 -178
## + height       1    0.0002 0.166 -178
## + broad_jump   1    0.0001 0.166 -178
## - c_pct        1    0.0221 0.188 -178
## - c_avg_cmpp   1    0.0297 0.196 -176
## - c_avg_att    1    0.0300 0.196 -176
## - vert_leap    1    0.0305 0.197 -176
## - c_rate       1    0.0533 0.220 -172
## 
## Step:  AIC=-181.8
## completion_percentage ~ weight + age + c_avg_cmpp + c_rate + 
##     c_pct + c_numyrs + c_avg_att + X40 + cone + shuttle + vert_leap
## 
##               Df Sum of Sq   RSS  AIC
## - shuttle      1    0.0028 0.172 -183
## - age          1    0.0077 0.177 -182
## - c_numyrs     1    0.0081 0.177 -182
## - weight       1    0.0082 0.177 -182
## <none>                     0.169 -182
## - X40          1    0.0146 0.183 -181
## + wonderlic    1    0.0026 0.166 -180
## + c_avg_tds    1    0.0018 0.167 -180
## + c_avg_yds    1    0.0011 0.168 -180
## + broad_jump   1    0.0001 0.169 -180
## + height       1    0.0001 0.169 -180
## + c_avg_inter  1    0.0000 0.169 -180
## - cone         1    0.0193 0.188 -180
## - c_pct        1    0.0211 0.190 -179
## - c_avg_cmpp   1    0.0284 0.197 -178
## - c_avg_att    1    0.0286 0.198 -178
## - vert_leap    1    0.0287 0.198 -178
## - c_rate       1    0.0507 0.220 -174
## 
## Step:  AIC=-183.2
## completion_percentage ~ weight + age + c_avg_cmpp + c_rate + 
##     c_pct + c_numyrs + c_avg_att + X40 + cone + vert_leap
## 
##               Df Sum of Sq   RSS  AIC
## - weight       1    0.0055 0.177 -184
## - age          1    0.0064 0.178 -184
## <none>                     0.172 -183
## - c_numyrs     1    0.0100 0.182 -183
## + shuttle      1    0.0028 0.169 -182
## - cone         1    0.0165 0.188 -182
## + wonderlic    1    0.0022 0.170 -182
## + c_avg_tds    1    0.0008 0.171 -181
## + c_avg_yds    1    0.0004 0.171 -181
## + broad_jump   1    0.0002 0.172 -181
## + c_avg_inter  1    0.0000 0.172 -181
## + height       1    0.0000 0.172 -181
## - c_pct        1    0.0214 0.193 -181
## - X40          1    0.0226 0.194 -180
## - vert_leap    1    0.0265 0.198 -180
## - c_avg_cmpp   1    0.0290 0.201 -179
## - c_avg_att    1    0.0293 0.201 -179
## - c_rate       1    0.0560 0.228 -174
## 
## Step:  AIC=-184
## completion_percentage ~ age + c_avg_cmpp + c_rate + c_pct + c_numyrs + 
##     c_avg_att + X40 + cone + vert_leap
## 
##               Df Sum of Sq   RSS  AIC
## - age          1    0.0030 0.180 -185
## <none>                     0.177 -184
## - c_numyrs     1    0.0119 0.189 -184
## + weight       1    0.0055 0.172 -183
## + wonderlic    1    0.0039 0.173 -183
## + height       1    0.0019 0.175 -182
## + c_avg_tds    1    0.0004 0.177 -182
## + c_avg_yds    1    0.0002 0.177 -182
## + shuttle      1    0.0001 0.177 -182
## + c_avg_inter  1    0.0001 0.177 -182
## + broad_jump   1    0.0000 0.177 -182
## - X40          1    0.0215 0.199 -182
## - vert_leap    1    0.0226 0.200 -181
## - cone         1    0.0230 0.200 -181
## - c_pct        1    0.0259 0.203 -181
## - c_avg_cmpp   1    0.0347 0.212 -179
## - c_avg_att    1    0.0352 0.212 -179
## - c_rate       1    0.0548 0.232 -176
## 
## Step:  AIC=-185.3
## completion_percentage ~ c_avg_cmpp + c_rate + c_pct + c_numyrs + 
##     c_avg_att + X40 + cone + vert_leap
## 
##               Df Sum of Sq   RSS  AIC
## <none>                     0.180 -185
## - c_numyrs     1    0.0117 0.192 -185
## + wonderlic    1    0.0060 0.174 -185
## + age          1    0.0030 0.177 -184
## + weight       1    0.0021 0.178 -184
## + c_avg_tds    1    0.0008 0.179 -184
## + height       1    0.0006 0.180 -184
## + c_avg_yds    1    0.0004 0.180 -183
## + shuttle      1    0.0002 0.180 -183
## + c_avg_inter  1    0.0000 0.180 -183
## + broad_jump   1    0.0000 0.180 -183
## - X40          1    0.0218 0.202 -183
## - vert_leap    1    0.0221 0.202 -183
## - c_pct        1    0.0270 0.207 -182
## - cone         1    0.0282 0.208 -182
## - c_avg_cmpp   1    0.0359 0.216 -180
## - c_avg_att    1    0.0364 0.216 -180
## - c_rate       1    0.0573 0.237 -177
# Print the coefficient table and fit statistics of the model chosen by stepAIC
summary(step_reg.log.w_combine.cpct)
## 
## Call:
## lm(formula = completion_percentage ~ c_avg_cmpp + c_rate + c_pct + 
##     c_numyrs + c_avg_att + X40 + cone + vert_leap, data = data.log.w_combine.for_cpct)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.16827 -0.03139 -0.00799  0.05066  0.11271 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  33.8886    12.0750    2.81   0.0089 **
## c_avg_cmpp    6.5937     2.7422    2.40   0.0228 * 
## c_rate       -0.5702     0.1877   -3.04   0.0050 **
## c_pct        -5.5621     2.6670   -2.09   0.0459 * 
## c_numyrs      0.0975     0.0709    1.37   0.1798   
## c_avg_att    -6.6036     2.7301   -2.42   0.0221 * 
## X40          -1.1154     0.5960   -1.87   0.0714 . 
## cone          1.0250     0.4809    2.13   0.0417 * 
## vert_leap    -0.3484     0.1849   -1.88   0.0695 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 0.0788 on 29 degrees of freedom
## Multiple R-squared: 0.464,   Adjusted R-squared: 0.317 
## F-statistic: 3.14 on 8 and 29 DF,  p-value: 0.011
# Draw the diagnostic plots for the selected model (the rendered run reports
# "NaNs produced" warnings while drawing them)
plot(step_reg.log.w_combine.cpct)

plot of chunk unnamed-chunk-1 plot of chunk unnamed-chunk-1 plot of chunk unnamed-chunk-1

## Warning: NaNs produced
## Warning: NaNs produced

plot of chunk unnamed-chunk-1

# Best-subsets search over all predictors, recording the 10 best models of
# each size
leaps.log.w_combine.cpct <- regsubsets(completion_percentage ~ ., 
    data = data.log.w_combine.for_cpct, nbest = 10)

# Plot the recorded subsets by R-squared.
# legend = FALSE: by default the legend is positioned interactively with the
# mouse, which is not possible when the document is rendered non-interactively;
# the call then drew the plot but stopped with
# "Error: invalid coordinate lengths" while placing the legend.
# The abbreviation key is therefore not drawn on the plot.
subsets(leaps.log.w_combine.cpct, statistic = "rsq", legend = FALSE)

plot of chunk unnamed-chunk-1

# 5-fold cross-validation of the stepwise-selected model
cv.lm(
    df = data.log.w_combine.for_cpct,
    form.lm = step_reg.log.w_combine.cpct,
    m = 5
)
## Analysis of Variance Table
## 
## Response: completion_percentage
##            Df Sum Sq Mean Sq F value Pr(>F)   
## c_avg_cmpp  1 0.0004  0.0004    0.06 0.8107   
## c_rate      1 0.0022  0.0022    0.35 0.5602   
## c_pct       1 0.0620  0.0620    9.98 0.0037 **
## c_numyrs    1 0.0042  0.0042    0.68 0.4157   
## c_avg_att   1 0.0281  0.0281    4.53 0.0420 * 
## X40         1 0.0010  0.0010    0.17 0.6856   
## cone        1 0.0362  0.0362    5.83 0.0223 * 
## vert_leap   1 0.0221  0.0221    3.55 0.0695 . 
## Residuals  29 0.1802  0.0062                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Warning:
## 
## As there is >1 explanatory variable, cross-validation predicted values for
## a fold are not a linear function of corresponding overall predicted
## values.  Lines that are shown for the different folds are approximate

plot of chunk unnamed-chunk-1

## 
## fold 1 
## Observations in test set: 7 
##                             6      24      26     32      38      46    59
## Predicted              4.0618  4.0824  4.0420 4.0176  3.9484  3.9988 3.985
## cvpred                 4.0684  4.0714  4.0578 4.0116  3.9692  4.0135 3.943
## completion_percentage  4.0639  4.0289  3.9627 4.0342  3.9160  3.9871 4.067
## CV residual           -0.0045 -0.0425 -0.0951 0.0226 -0.0532 -0.0264 0.125
## 
## Sum of squares = 0.03    Mean square = 0    n = 7 
## 
## fold 2 
## Observations in test set: 8 
##                            7      19     21     27    37       43    55
## Predicted              4.101  4.0888 4.0262 4.0023  4.11  3.97233  4.03
## cvpred                 4.097  4.0915 4.0528 3.9725  5.57  3.95735  4.06
## completion_percentage  3.968  3.9964 4.1352 4.0360  4.10  3.94932  3.94
## CV residual           -0.129 -0.0951 0.0823 0.0635 -1.47 -0.00803 -0.12
##                          65
## Predicted             4.105
## cvpred                4.048
## completion_percentage 4.159
## CV residual           0.111
## 
## Sum of squares = 2.23    Mean square = 0.28    n = 8 
## 
## fold 3 
## Observations in test set: 8 
##                            5     12    13     17      39     40      50
## Predicted             4.0093 3.9948 3.983 4.0769  4.0254 3.9676  4.0064
## cvpred                3.9790 3.9827 3.907 4.0666  4.0464 3.9612  4.0101
## completion_percentage 3.9927 4.0091 4.096 4.1651  4.0000 4.0466  3.9778
## CV residual           0.0137 0.0265 0.189 0.0985 -0.0464 0.0854 -0.0323
##                          56
## Predicted             4.054
## cvpred                4.020
## completion_percentage 4.162
## CV residual           0.142
## 
## Sum of squares = 0.08    Mean square = 0.01    n = 8 
## 
## fold 4 
## Observations in test set: 8 
##                            4       15     18    20     28    52      63
## Predicted              4.011  4.12449 4.0760 3.933 4.0108  3.99 4.13289
## cvpred                 4.036  4.11635 4.0682 3.927 4.0049  4.00 4.10705
## completion_percentage  3.932  4.10923 4.1125 4.027 4.0622  3.82 4.11087
## CV residual           -0.104 -0.00711 0.0443 0.101 0.0572 -0.18 0.00383
##                           64
## Predicted              3.933
## cvpred                 3.976
## completion_percentage  3.842
## CV residual           -0.135
## 
## Sum of squares = 0.08    Mean square = 0.01    n = 8 
## 
## fold 5 
## Observations in test set: 7 
##                            1      3    25     30    42    49     61
## Predicted             4.0381  3.860 4.073 4.0847 4.008 4.203 4.0238
## cvpred                4.0105  3.959 4.025 4.0530 4.016 4.093 3.9795
## completion_percentage 4.0254  3.791 4.140 4.1092 4.057 4.197 4.0656
## CV residual           0.0148 -0.168 0.115 0.0563 0.041 0.104 0.0861
## 
## Sum of squares = 0.06    Mean square = 0.01    n = 7 
## 
## Overall (Sum over all 7 folds) 
##     ms 
## 0.0652