# Load the QB stats table (the file name indicates combine data is included)
qb_stats_w_combine <- read.csv(file = "../data/qb_stats_w_combine.csv")

# Names of the predictor columns; their order here is the order of the terms
# in the full model fitted below
predictors <- c(
  "height", "weight", "age",
  "c_avg_cmpp", "c_rate", "c_pct", "c_avg_inter", "c_avg_tds", "c_avg_yds",
  "c_numyrs", "c_avg_att",
  "X40", "wonderlic", "cone", "shuttle", "vert_leap", "broad_jump"
)
college_stats <- qb_stats_w_combine[predictors]

# Response variable, kept as a one-column data frame so that cbind() below
# carries the column name "wins" into the combined table
wins <- qb_stats_w_combine[, "wins", drop = FALSE]

# Combine response and predictors, drop every row containing an NA, then
# centre and scale all columns (the response included)
data.scaled.w_combine.for_wins <- data.frame(
  scale(
    na.omit(
      cbind(wins, college_stats)
    )
  )
)

# Full model: wins regressed on every predictor column
lm.scaled.w_combine.wins <- lm(
  formula = wins ~ .,
  data = data.scaled.w_combine.for_wins
)

# Stepwise AIC search starting from the full model; direction = "both" lets
# each step either drop a term or re-add a previously dropped one
step_reg.scaled.w_combine.wins <- stepAIC(
  lm.scaled.w_combine.wins,
  direction = "both"
)
## Start:  AIC=8.65
## wins ~ height + weight + age + c_avg_cmpp + c_rate + c_pct + 
##     c_avg_inter + c_avg_tds + c_avg_yds + c_numyrs + c_avg_att + 
##     X40 + wonderlic + cone + shuttle + vert_leap + broad_jump
## 
##               Df Sum of Sq  RSS   AIC
## - weight       1      0.06 19.4  6.77
## - height       1      0.12 19.5  6.89
## - cone         1      0.17 19.5  7.00
## - c_avg_inter  1      0.30 19.6  7.25
## - shuttle      1      0.40 19.7  7.45
## - c_avg_att    1      0.49 19.8  7.63
## - c_avg_cmpp   1      0.71 20.1  8.05
## - vert_leap    1      0.86 20.2  8.34
## - X40          1      0.97 20.3  8.56
## <none>                     19.3  8.65
## - c_pct        1      1.80 21.1 10.13
## - c_numyrs     1      1.84 21.2 10.20
## - broad_jump   1      2.59 21.9 11.56
## - age          1      2.91 22.3 12.13
## - c_avg_yds    1      2.96 22.3 12.20
## - c_avg_tds    1      3.11 22.4 12.46
## - c_rate       1      3.36 22.7 12.90
## - wonderlic    1      4.32 23.7 14.51
## 
## Step:  AIC=6.77
## wins ~ height + age + c_avg_cmpp + c_rate + c_pct + c_avg_inter + 
##     c_avg_tds + c_avg_yds + c_numyrs + c_avg_att + X40 + wonderlic + 
##     cone + shuttle + vert_leap + broad_jump
## 
##               Df Sum of Sq  RSS   AIC
## - cone         1      0.22 19.6  5.21
## - c_avg_inter  1      0.27 19.7  5.31
## - shuttle      1      0.34 19.7  5.45
## - height       1      0.49 19.9  5.74
## - c_avg_att    1      0.64 20.0  6.03
## - c_avg_cmpp   1      0.67 20.1  6.09
## - vert_leap    1      0.81 20.2  6.37
## - X40          1      0.93 20.3  6.59
## <none>                     19.4  6.77
## - c_pct        1      1.87 21.3  8.35
## + weight       1      0.06 19.3  8.65
## - c_numyrs     1      2.19 21.6  8.93
## - broad_jump   1      2.73 22.1  9.89
## - c_avg_yds    1      3.02 22.4 10.40
## - c_avg_tds    1      3.47 22.9 11.19
## - c_rate       1      3.51 22.9 11.25
## - age          1      4.20 23.6 12.40
## - wonderlic    1      5.69 25.1 14.79
## 
## Step:  AIC=5.21
## wins ~ height + age + c_avg_cmpp + c_rate + c_pct + c_avg_inter + 
##     c_avg_tds + c_avg_yds + c_numyrs + c_avg_att + X40 + wonderlic + 
##     shuttle + vert_leap + broad_jump
## 
##               Df Sum of Sq  RSS   AIC
## - c_avg_inter  1      0.42 20.0  4.03
## - height       1      0.42 20.1  4.04
## - c_avg_cmpp   1      0.56 20.2  4.30
## - c_avg_att    1      0.66 20.3  4.51
## - shuttle      1      0.69 20.3  4.56
## - vert_leap    1      0.70 20.3  4.58
## - X40          1      1.02 20.6  5.19
## <none>                     19.6  5.21
## - c_pct        1      1.67 21.3  6.40
## + cone         1      0.22 19.4  6.77
## + weight       1      0.11 19.5  7.00
## - c_numyrs     1      2.15 21.8  7.26
## - c_avg_yds    1      2.86 22.5  8.52
## - broad_jump   1      2.99 22.6  8.74
## - c_avg_tds    1      3.26 22.9  9.20
## - c_rate       1      3.29 22.9  9.26
## - age          1      4.59 24.2 11.41
## - wonderlic    1      5.61 25.2 13.02
## 
## Step:  AIC=4.03
## wins ~ height + age + c_avg_cmpp + c_rate + c_pct + c_avg_tds + 
##     c_avg_yds + c_numyrs + c_avg_att + X40 + wonderlic + shuttle + 
##     vert_leap + broad_jump
## 
##               Df Sum of Sq  RSS   AIC
## - c_avg_cmpp   1      0.19 20.2  2.40
## - height       1      0.40 20.4  2.81
## - shuttle      1      0.57 20.6  3.12
## - vert_leap    1      0.72 20.8  3.40
## - X40          1      0.80 20.8  3.56
## <none>                     20.0  4.03
## - c_pct        1      1.26 21.3  4.40
## + c_avg_inter  1      0.42 19.6  5.21
## + cone         1      0.36 19.7  5.31
## + weight       1      0.08 20.0  5.87
## - c_avg_yds    1      2.49 22.5  6.60
## - broad_jump   1      2.63 22.7  6.84
## - c_avg_tds    1      2.88 22.9  7.27
## - c_rate       1      2.96 23.0  7.40
## - c_numyrs     1      3.24 23.3  7.87
## - age          1      4.20 24.2  9.44
## - c_avg_att    1      4.64 24.7 10.16
## - wonderlic    1      5.22 25.3 11.06
## 
## Step:  AIC=2.4
## wins ~ height + age + c_rate + c_pct + c_avg_tds + c_avg_yds + 
##     c_numyrs + c_avg_att + X40 + wonderlic + shuttle + vert_leap + 
##     broad_jump
## 
##               Df Sum of Sq  RSS   AIC
## - height       1      0.32 20.6  1.02
## - shuttle      1      0.48 20.7  1.31
## - vert_leap    1      0.55 20.8  1.45
## - X40          1      0.85 21.1  2.01
## <none>                     20.2  2.40
## + c_avg_cmpp   1      0.19 20.0  4.03
## + cone         1      0.15 20.1  4.10
## + weight       1      0.05 20.2  4.30
## + c_avg_inter  1      0.05 20.2  4.30
## - broad_jump   1      2.48 22.7  4.92
## - c_pct        1      2.64 22.9  5.18
## - c_avg_tds    1      3.20 23.4  6.13
## - c_avg_yds    1      3.23 23.5  6.17
## - c_numyrs     1      3.80 24.0  7.11
## - age          1      4.05 24.3  7.52
## - wonderlic    1      5.05 25.3  9.09
## - c_rate       1      6.08 26.3 10.65
## - c_avg_att    1      7.16 27.4 12.22
## 
## Step:  AIC=1.02
## wins ~ age + c_rate + c_pct + c_avg_tds + c_avg_yds + c_numyrs + 
##     c_avg_att + X40 + wonderlic + shuttle + vert_leap + broad_jump
## 
##               Df Sum of Sq  RSS   AIC
## - shuttle      1      0.31 20.9 -0.40
## - vert_leap    1      0.45 21.0 -0.14
## <none>                     20.6  1.02
## - X40          1      1.21 21.8  1.24
## + height       1      0.32 20.2  2.40
## + weight       1      0.31 20.2  2.43
## + cone         1      0.13 20.4  2.76
## + c_avg_cmpp   1      0.11 20.4  2.81
## + c_avg_inter  1      0.07 20.5  2.87
## - c_pct        1      2.39 22.9  3.31
## - broad_jump   1      2.80 23.4  3.99
## - c_avg_yds    1      2.99 23.5  4.31
## - c_avg_tds    1      3.34 23.9  4.89
## - c_numyrs     1      3.60 24.2  5.32
## - age          1      4.56 25.1  6.84
## - wonderlic    1      4.73 25.3  7.09
## - c_rate       1      5.80 26.4  8.71
## - c_avg_att    1      6.89 27.4 10.29
## 
## Step:  AIC=-0.4
## wins ~ age + c_rate + c_pct + c_avg_tds + c_avg_yds + c_numyrs + 
##     c_avg_att + X40 + wonderlic + vert_leap + broad_jump
## 
##               Df Sum of Sq  RSS   AIC
## - vert_leap    1      0.28 21.1 -1.88
## - X40          1      0.91 21.8 -0.74
## <none>                     20.9 -0.40
## + cone         1      0.32 20.6  1.01
## + shuttle      1      0.31 20.6  1.02
## + height       1      0.16 20.7  1.31
## + weight       1      0.08 20.8  1.44
## + c_avg_inter  1      0.07 20.8  1.47
## + c_avg_cmpp   1      0.07 20.8  1.48
## - c_pct        1      2.42 23.3  1.87
## - broad_jump   1      2.55 23.4  2.09
## - c_avg_yds    1      2.68 23.5  2.31
## - c_numyrs     1      3.34 24.2  3.40
## - c_avg_tds    1      4.16 25.0  4.70
## - age          1      4.28 25.1  4.88
## - wonderlic    1      4.60 25.5  5.37
## - c_rate       1      5.64 26.5  6.93
## - c_avg_att    1      6.59 27.4  8.31
## 
## Step:  AIC=-1.88
## wins ~ age + c_rate + c_pct + c_avg_tds + c_avg_yds + c_numyrs + 
##     c_avg_att + X40 + wonderlic + broad_jump
## 
##               Df Sum of Sq  RSS   AIC
## <none>                     21.1 -1.88
## - X40          1      1.72 22.9 -0.83
## + vert_leap    1      0.28 20.9 -0.40
## + cone         1      0.23 20.9 -0.31
## + c_avg_inter  1      0.14 21.0 -0.14
## + shuttle      1      0.14 21.0 -0.14
## + height       1      0.13 21.0 -0.13
## + weight       1      0.07 21.1 -0.01
## + c_avg_cmpp   1      0.01 21.1  0.10
## - broad_jump   1      2.29 23.4  0.13
## - c_pct        1      2.39 23.5  0.29
## - c_avg_yds    1      2.52 23.7  0.52
## - c_numyrs     1      3.18 24.3  1.59
## - age          1      4.37 25.5  3.44
## - wonderlic    1      4.40 25.5  3.48
## - c_avg_tds    1      4.84 26.0  4.16
## - c_rate       1      5.67 26.8  5.38
## - c_avg_att    1      6.51 27.6  6.58
# Coefficient table and fit statistics for the model selected by the stepwise search
summary(step_reg.scaled.w_combine.wins)
## 
## Call:
## lm(formula = wins ~ age + c_rate + c_pct + c_avg_tds + c_avg_yds + 
##     c_numyrs + c_avg_att + X40 + wonderlic + broad_jump, data = data.scaled.w_combine.for_wins)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -1.598 -0.566  0.035  0.431  1.414 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  3.21e-16   1.39e-01    0.00   1.0000   
## age         -4.02e-01   1.67e-01   -2.41   0.0230 * 
## c_rate      -1.68e+00   6.14e-01   -2.74   0.0106 * 
## c_pct        8.94e-01   5.03e-01    1.78   0.0863 . 
## c_avg_tds    1.10e+00   4.35e-01    2.53   0.0172 * 
## c_avg_yds    1.91e+00   1.05e+00    1.83   0.0781 . 
## c_numyrs     4.06e-01   1.98e-01    2.05   0.0494 * 
## c_avg_att   -2.71e+00   9.25e-01   -2.94   0.0066 **
## X40          3.43e-01   2.27e-01    1.51   0.1423   
## wonderlic   -4.05e-01   1.68e-01   -2.41   0.0226 * 
## broad_jump   4.28e-01   2.45e-01    1.74   0.0922 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 0.869 on 28 degrees of freedom
## Multiple R-squared: 0.444,   Adjusted R-squared: 0.245 
## F-statistic: 2.23 on 10 and 28 DF,  p-value: 0.046
# Diagnostic plots for the selected model (this call rendered four plots in the recorded run)
plot(step_reg.scaled.w_combine.wins)

plot of chunk unnamed-chunk-1 plot of chunk unnamed-chunk-1 plot of chunk unnamed-chunk-1 plot of chunk unnamed-chunk-1

# Best-subsets search over all predictors; nbest = 10 keeps up to ten
# candidate models for each subset size.
leaps.scaled.w_combine.wins <- regsubsets(
  wins ~ .,
  data = data.scaled.w_combine.for_wins,
  nbest = 10
)

# Plot R-squared against subset size.
# legend = FALSE is required when this runs non-interactively (e.g. under
# knitr): by default subsets() asks for the legend position with locator(),
# which yields no coordinates on a non-interactive device and aborted the
# call with "invalid coordinate lengths".
# NOTE(review): per the car documentation, with the legend switched off the
# abbreviation key is returned as a data frame instead of being drawn --
# confirm against the installed car version.
subsets(leaps.scaled.w_combine.wins, statistic = "rsq", legend = FALSE)

plot of chunk unnamed-chunk-1

# Cross-validate the stepwise-selected model on the scaled data set.
# The fitted lm object is passed positionally as the model argument; m sets
# the number of folds.
# NOTE(review): the recorded output lists five folds but its final line reads
# "Sum over all 8 folds"; presumably a labelling quirk inside cv.lm -- confirm.
cv.lm(df = data.scaled.w_combine.for_wins, step_reg.scaled.w_combine.wins, m = 5)  # 5 fold cross-validation
## Analysis of Variance Table
## 
## Response: wins
##            Df Sum Sq Mean Sq F value Pr(>F)  
## age         1   1.65    1.65    2.18  0.151  
## c_rate      1   0.13    0.13    0.17  0.683  
## c_pct       1   0.46    0.46    0.60  0.444  
## c_avg_tds   1   0.31    0.31    0.41  0.525  
## c_avg_yds   1   3.66    3.66    4.85  0.036 *
## c_numyrs    1   0.03    0.03    0.03  0.854  
## c_avg_att   1   3.68    3.68    4.88  0.036 *
## X40         1   0.00    0.00    0.00  0.946  
## wonderlic   1   4.65    4.65    6.16  0.019 *
## broad_jump  1   2.29    2.29    3.04  0.092 .
## Residuals  28  21.14    0.75                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Warning:
## 
## As there is >1 explanatory variable, cross-validation predicted values for
## a fold are not a linear function of corresponding overall predicted
## values.  Lines that are shown for the different folds are approximate

plot of chunk unnamed-chunk-1

## 
## fold 1 
## Observations in test set: 7 
##                  3     21     24     40     42     52     61
## Predicted   -0.815 -0.954 -0.636 -0.618 -0.369 -0.103  1.266
## cvpred      -0.845 -1.251 -0.490 -0.655 -0.325 -0.162  1.597
## wins        -1.033 -0.151 -1.621  0.143 -0.739 -1.033  0.731
## CV residual -0.188  1.100 -1.131  0.798 -0.413 -0.871 -0.866
## 
## Sum of squares = 4.84    Mean square = 0.69    n = 7 
## 
## fold 2 
## Observations in test set: 8 
##                  6      18    25    37       43     50     55     63
## Predicted   -0.226 -0.0273 1.201 1.181 -0.11491 -0.681 -1.223 -0.249
## cvpred      -0.359 -0.0628 0.994 0.739 -0.14611 -0.932 -1.490  0.326
## wins         1.025  0.1432 1.613 1.613 -0.15075 -1.033 -0.739 -1.327
## CV residual  1.384  0.2060 0.619 0.874 -0.00464 -0.101  0.751 -1.652
## 
## Sum of squares = 6.41    Mean square = 0.8    n = 8 
## 
## fold 3 
## Observations in test set: 8 
##                  5       7    16     20    28     32      49     64
## Predicted    0.642  0.0539 0.667 -1.056 1.205 -0.175  0.0726  1.034
## cvpred      -0.184  0.5387 0.197 -0.335 1.162  0.161  0.3052  0.869
## wins         1.613 -0.1508 0.731 -1.327 1.613 -0.445 -0.4447  0.437
## CV residual  1.797 -0.6894 0.535 -0.991 0.451 -0.605 -0.7500 -0.432
## 
## Sum of squares = 6.29    Mean square = 0.79    n = 8 
## 
## fold 4 
## Observations in test set: 8 
##                 12      13     26     30      38     39     59     65
## Predicted    0.271 -0.0492  0.310 -0.266 -0.0219  0.217 -1.116  0.199
## cvpred       1.093  0.0156  0.208 -0.339  0.1159  0.755 -1.295 -0.121
## wins        -1.327  0.1432 -0.445  0.731 -0.7387 -0.739 -0.739  1.613
## CV residual -2.419  0.1276 -0.653  1.070 -0.8546 -1.494  0.557  1.734
## 
## Sum of squares = 13.7    Mean square = 1.71    n = 8 
## 
## fold 5 
## Observations in test set: 8 
##                   1      4     15     17      19     27     46    56
## Predicted    0.0153 -0.418 0.4022 -0.875  0.0176  0.161  0.597 0.478
## cvpred      -0.0991  0.336 0.0543 -0.570  0.6055 -0.901  0.859 0.389
## wins         0.4372 -1.033 0.4372 -0.445 -1.0326  1.319  0.731 1.613
## CV residual  0.5363 -1.369 0.3828  0.125 -1.6381  2.220 -0.128 1.224
## 
## Sum of squares = 11.4    Mean square = 1.43    n = 8 
## 
## Overall (Sum over all 8 folds) 
##  ms 
## 1.1