# Load the QB stats-with-combine table from disk.
qb_stats_w_combine <- read.csv(file = "../data/qb_stats_w_combine.csv")

# Candidate predictor columns: physical attributes, college (c_*) statistics
# and combine measurements.
predictors <- c(
  "height", "weight", "age",
  "c_avg_cmpp", "c_rate", "c_pct", "c_avg_inter", "c_avg_tds", "c_avg_yds",
  "c_numyrs", "c_avg_att",
  "X40", "wonderlic", "cone", "shuttle", "vert_leap", "broad_jump"
)
college_stats <- qb_stats_w_combine[predictors]

# Response variable, kept as a one-column data frame so the column name
# `completion_percentage` survives the cbind() below.
cpct <- qb_stats_w_combine[, "completion_percentage", drop = FALSE]

# Modelling data: bind response and predictors, drop every row that contains
# an NA, then apply log(x + 0.1) to all columns (the response included).
# NOTE(review): the 0.1 offset presumably guards against log(0) -- confirm.
data.log.w_combine.for_cpct <- data.frame(
  log(na.omit(cbind(cpct, college_stats)) + 0.1)
)

# Full linear model: the response regressed on every other column.
lm.log.w_combine.cpct <- lm(
  formula = completion_percentage ~ .,
  data = data.log.w_combine.for_cpct
)

# Stepwise model selection by AIC, considering both removals and additions at
# each step. stepAIC() comes from the MASS package, which is expected to be
# attached already (not visible in this part of the file).
step_reg.log.w_combine.cpct <- stepAIC(
  object = lm.log.w_combine.cpct,
  direction = "both"
)
## Start: AIC=-170.9
## completion_percentage ~ height + weight + age + c_avg_cmpp +
## c_rate + c_pct + c_avg_inter + c_avg_tds + c_avg_yds + c_numyrs +
## c_avg_att + X40 + wonderlic + cone + shuttle + vert_leap +
## broad_jump
##
## Df Sum of Sq RSS AIC
## - height 1 0.00004 0.164 -173
## - c_avg_yds 1 0.00007 0.164 -173
## - broad_jump 1 0.00011 0.164 -173
## - c_avg_inter 1 0.00012 0.164 -173
## - c_avg_tds 1 0.00018 0.164 -173
## - c_rate 1 0.00063 0.165 -173
## - wonderlic 1 0.00252 0.167 -172
## - age 1 0.00306 0.167 -172
## - weight 1 0.00378 0.168 -172
## - shuttle 1 0.00403 0.168 -172
## - c_numyrs 1 0.00703 0.171 -171
## <none> 0.164 -171
## - X40 1 0.00895 0.173 -171
## - c_avg_att 1 0.00985 0.174 -171
## - cone 1 0.01061 0.175 -170
## - c_pct 1 0.01171 0.176 -170
## - c_avg_cmpp 1 0.01339 0.177 -170
## - vert_leap 1 0.02265 0.187 -168
##
## Step: AIC=-172.9
## completion_percentage ~ weight + age + c_avg_cmpp + c_rate +
## c_pct + c_avg_inter + c_avg_tds + c_avg_yds + c_numyrs +
## c_avg_att + X40 + wonderlic + cone + shuttle + vert_leap +
## broad_jump
##
## Df Sum of Sq RSS AIC
## - c_avg_yds 1 0.00006 0.164 -175
## - broad_jump 1 0.00010 0.164 -175
## - c_avg_inter 1 0.00014 0.164 -175
## - c_avg_tds 1 0.00022 0.164 -175
## - c_rate 1 0.00059 0.165 -175
## - wonderlic 1 0.00273 0.167 -174
## - age 1 0.00320 0.167 -174
## - shuttle 1 0.00433 0.168 -174
## - c_numyrs 1 0.00702 0.171 -173
## - weight 1 0.00802 0.172 -173
## <none> 0.164 -173
## - X40 1 0.00935 0.173 -173
## - c_avg_att 1 0.00990 0.174 -173
## - c_pct 1 0.01170 0.176 -172
## - cone 1 0.01185 0.176 -172
## - c_avg_cmpp 1 0.01345 0.177 -172
## + height 1 0.00004 0.164 -171
## - vert_leap 1 0.02261 0.187 -170
##
## Step: AIC=-174.9
## completion_percentage ~ weight + age + c_avg_cmpp + c_rate +
## c_pct + c_avg_inter + c_avg_tds + c_numyrs + c_avg_att +
## X40 + wonderlic + cone + shuttle + vert_leap + broad_jump
##
## Df Sum of Sq RSS AIC
## - c_avg_inter 1 0.00008 0.164 -177
## - broad_jump 1 0.00017 0.164 -177
## - c_avg_tds 1 0.00187 0.166 -176
## - wonderlic 1 0.00268 0.167 -176
## - age 1 0.00332 0.168 -176
## - shuttle 1 0.00432 0.169 -176
## - c_rate 1 0.00654 0.171 -175
## - c_numyrs 1 0.00738 0.172 -175
## - weight 1 0.00802 0.172 -175
## <none> 0.164 -175
## - X40 1 0.01093 0.175 -174
## - cone 1 0.01180 0.176 -174
## - c_pct 1 0.01275 0.177 -174
## - c_avg_att 1 0.01638 0.180 -173
## - c_avg_cmpp 1 0.01711 0.181 -173
## + c_avg_yds 1 0.00006 0.164 -173
## + height 1 0.00003 0.164 -173
## - vert_leap 1 0.02292 0.187 -172
##
## Step: AIC=-176.9
## completion_percentage ~ weight + age + c_avg_cmpp + c_rate +
## c_pct + c_avg_tds + c_numyrs + c_avg_att + X40 + wonderlic +
## cone + shuttle + vert_leap + broad_jump
##
## Df Sum of Sq RSS AIC
## - broad_jump 1 0.00023 0.165 -179
## - c_avg_tds 1 0.00195 0.166 -178
## - wonderlic 1 0.00260 0.167 -178
## - age 1 0.00349 0.168 -178
## - shuttle 1 0.00424 0.169 -178
## - c_rate 1 0.00675 0.171 -177
## - weight 1 0.00794 0.172 -177
## - c_numyrs 1 0.00843 0.173 -177
## <none> 0.164 -177
## - X40 1 0.01186 0.176 -176
## - cone 1 0.01238 0.177 -176
## + c_avg_inter 1 0.00008 0.164 -175
## + height 1 0.00005 0.164 -175
## + c_avg_yds 1 0.00000 0.164 -175
## - vert_leap 1 0.02333 0.188 -174
## - c_pct 1 0.02366 0.188 -174
## - c_avg_att 1 0.03062 0.195 -172
## - c_avg_cmpp 1 0.03086 0.195 -172
##
## Step: AIC=-178.8
## completion_percentage ~ weight + age + c_avg_cmpp + c_rate +
## c_pct + c_avg_tds + c_numyrs + c_avg_att + X40 + wonderlic +
## cone + shuttle + vert_leap
##
## Df Sum of Sq RSS AIC
## - c_avg_tds 1 0.0018 0.166 -180
## - wonderlic 1 0.0026 0.167 -180
## - age 1 0.0034 0.168 -180
## - shuttle 1 0.0043 0.169 -180
## - c_rate 1 0.0070 0.171 -179
## - weight 1 0.0077 0.172 -179
## <none> 0.165 -179
## - c_numyrs 1 0.0094 0.174 -179
## - cone 1 0.0124 0.177 -178
## - X40 1 0.0125 0.177 -178
## + broad_jump 1 0.0002 0.164 -177
## + c_avg_inter 1 0.0001 0.164 -177
## + height 1 0.0000 0.164 -177
## + c_avg_yds 1 0.0000 0.164 -177
## - c_pct 1 0.0235 0.188 -176
## - c_avg_att 1 0.0304 0.195 -174
## - c_avg_cmpp 1 0.0306 0.195 -174
## - vert_leap 1 0.0321 0.197 -174
##
## Step: AIC=-180.4
## completion_percentage ~ weight + age + c_avg_cmpp + c_rate +
## c_pct + c_numyrs + c_avg_att + X40 + wonderlic + cone + shuttle +
## vert_leap
##
## Df Sum of Sq RSS AIC
## - wonderlic 1 0.0026 0.169 -182
## - shuttle 1 0.0032 0.170 -182
## - age 1 0.0037 0.170 -182
## - weight 1 0.0065 0.173 -181
## - c_numyrs 1 0.0089 0.175 -180
## <none> 0.166 -180
## - cone 1 0.0117 0.178 -180
## - X40 1 0.0117 0.178 -180
## + c_avg_tds 1 0.0018 0.165 -179
## + c_avg_yds 1 0.0013 0.165 -179
## + c_avg_inter 1 0.0002 0.166 -178
## + height 1 0.0002 0.166 -178
## + broad_jump 1 0.0001 0.166 -178
## - c_pct 1 0.0221 0.188 -178
## - c_avg_cmpp 1 0.0297 0.196 -176
## - c_avg_att 1 0.0300 0.196 -176
## - vert_leap 1 0.0305 0.197 -176
## - c_rate 1 0.0533 0.220 -172
##
## Step: AIC=-181.8
## completion_percentage ~ weight + age + c_avg_cmpp + c_rate +
## c_pct + c_numyrs + c_avg_att + X40 + cone + shuttle + vert_leap
##
## Df Sum of Sq RSS AIC
## - shuttle 1 0.0028 0.172 -183
## - age 1 0.0077 0.177 -182
## - c_numyrs 1 0.0081 0.177 -182
## - weight 1 0.0082 0.177 -182
## <none> 0.169 -182
## - X40 1 0.0146 0.183 -181
## + wonderlic 1 0.0026 0.166 -180
## + c_avg_tds 1 0.0018 0.167 -180
## + c_avg_yds 1 0.0011 0.168 -180
## + broad_jump 1 0.0001 0.169 -180
## + height 1 0.0001 0.169 -180
## + c_avg_inter 1 0.0000 0.169 -180
## - cone 1 0.0193 0.188 -180
## - c_pct 1 0.0211 0.190 -179
## - c_avg_cmpp 1 0.0284 0.197 -178
## - c_avg_att 1 0.0286 0.198 -178
## - vert_leap 1 0.0287 0.198 -178
## - c_rate 1 0.0507 0.220 -174
##
## Step: AIC=-183.2
## completion_percentage ~ weight + age + c_avg_cmpp + c_rate +
## c_pct + c_numyrs + c_avg_att + X40 + cone + vert_leap
##
## Df Sum of Sq RSS AIC
## - weight 1 0.0055 0.177 -184
## - age 1 0.0064 0.178 -184
## <none> 0.172 -183
## - c_numyrs 1 0.0100 0.182 -183
## + shuttle 1 0.0028 0.169 -182
## - cone 1 0.0165 0.188 -182
## + wonderlic 1 0.0022 0.170 -182
## + c_avg_tds 1 0.0008 0.171 -181
## + c_avg_yds 1 0.0004 0.171 -181
## + broad_jump 1 0.0002 0.172 -181
## + c_avg_inter 1 0.0000 0.172 -181
## + height 1 0.0000 0.172 -181
## - c_pct 1 0.0214 0.193 -181
## - X40 1 0.0226 0.194 -180
## - vert_leap 1 0.0265 0.198 -180
## - c_avg_cmpp 1 0.0290 0.201 -179
## - c_avg_att 1 0.0293 0.201 -179
## - c_rate 1 0.0560 0.228 -174
##
## Step: AIC=-184
## completion_percentage ~ age + c_avg_cmpp + c_rate + c_pct + c_numyrs +
## c_avg_att + X40 + cone + vert_leap
##
## Df Sum of Sq RSS AIC
## - age 1 0.0030 0.180 -185
## <none> 0.177 -184
## - c_numyrs 1 0.0119 0.189 -184
## + weight 1 0.0055 0.172 -183
## + wonderlic 1 0.0039 0.173 -183
## + height 1 0.0019 0.175 -182
## + c_avg_tds 1 0.0004 0.177 -182
## + c_avg_yds 1 0.0002 0.177 -182
## + shuttle 1 0.0001 0.177 -182
## + c_avg_inter 1 0.0001 0.177 -182
## + broad_jump 1 0.0000 0.177 -182
## - X40 1 0.0215 0.199 -182
## - vert_leap 1 0.0226 0.200 -181
## - cone 1 0.0230 0.200 -181
## - c_pct 1 0.0259 0.203 -181
## - c_avg_cmpp 1 0.0347 0.212 -179
## - c_avg_att 1 0.0352 0.212 -179
## - c_rate 1 0.0548 0.232 -176
##
## Step: AIC=-185.3
## completion_percentage ~ c_avg_cmpp + c_rate + c_pct + c_numyrs +
## c_avg_att + X40 + cone + vert_leap
##
## Df Sum of Sq RSS AIC
## <none> 0.180 -185
## - c_numyrs 1 0.0117 0.192 -185
## + wonderlic 1 0.0060 0.174 -185
## + age 1 0.0030 0.177 -184
## + weight 1 0.0021 0.178 -184
## + c_avg_tds 1 0.0008 0.179 -184
## + height 1 0.0006 0.180 -184
## + c_avg_yds 1 0.0004 0.180 -183
## + shuttle 1 0.0002 0.180 -183
## + c_avg_inter 1 0.0000 0.180 -183
## + broad_jump 1 0.0000 0.180 -183
## - X40 1 0.0218 0.202 -183
## - vert_leap 1 0.0221 0.202 -183
## - c_pct 1 0.0270 0.207 -182
## - cone 1 0.0282 0.208 -182
## - c_avg_cmpp 1 0.0359 0.216 -180
## - c_avg_att 1 0.0364 0.216 -180
## - c_rate 1 0.0573 0.237 -177
# Coefficient table and overall fit statistics for the stepwise-selected model.
summary(object = step_reg.log.w_combine.cpct)
##
## Call:
## lm(formula = completion_percentage ~ c_avg_cmpp + c_rate + c_pct +
## c_numyrs + c_avg_att + X40 + cone + vert_leap, data = data.log.w_combine.for_cpct)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.16827 -0.03139 -0.00799 0.05066 0.11271
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.8886 12.0750 2.81 0.0089 **
## c_avg_cmpp 6.5937 2.7422 2.40 0.0228 *
## c_rate -0.5702 0.1877 -3.04 0.0050 **
## c_pct -5.5621 2.6670 -2.09 0.0459 *
## c_numyrs 0.0975 0.0709 1.37 0.1798
## c_avg_att -6.6036 2.7301 -2.42 0.0221 *
## X40 -1.1154 0.5960 -1.87 0.0714 .
## cone 1.0250 0.4809 2.13 0.0417 *
## vert_leap -0.3484 0.1849 -1.88 0.0695 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0788 on 29 degrees of freedom
## Multiple R-squared: 0.464, Adjusted R-squared: 0.317
## F-statistic: 3.14 on 8 and 29 DF, p-value: 0.011
# Standard lm diagnostic plots for the stepwise-selected model.
plot(x = step_reg.log.w_combine.cpct)
## Warning: NaNs produced
## Warning: NaNs produced
# Best-subsets search over the same predictors, keeping the 10 best models of
# each size (regsubsets() is from the leaps package).
leaps.log.w_combine.cpct <- regsubsets(
  completion_percentage ~ .,
  data = data.log.w_combine.for_cpct,
  nbest = 10
)

# Plot R-squared against subset size (subsets() is from the car package).
# The legend position is given explicitly: by default subsets() asks for it
# with locator(), which returns nothing on a non-interactive graphics device
# and made legend() stop with "invalid coordinate lengths".
subsets(leaps.log.w_combine.cpct, statistic = "rsq", legend = "bottomright")
# 5-fold cross-validation of the stepwise-selected model (cv.lm() is from the
# DAAG package). The data frame is passed positionally because the first
# argument is named `df` in older DAAG releases and `data` in newer ones, so
# `df = ...` only works with the former.
cv.lm(data.log.w_combine.for_cpct, form.lm = step_reg.log.w_combine.cpct,
      m = 5)
## Analysis of Variance Table
##
## Response: completion_percentage
## Df Sum Sq Mean Sq F value Pr(>F)
## c_avg_cmpp 1 0.0004 0.0004 0.06 0.8107
## c_rate 1 0.0022 0.0022 0.35 0.5602
## c_pct 1 0.0620 0.0620 9.98 0.0037 **
## c_numyrs 1 0.0042 0.0042 0.68 0.4157
## c_avg_att 1 0.0281 0.0281 4.53 0.0420 *
## X40 1 0.0010 0.0010 0.17 0.6856
## cone 1 0.0362 0.0362 5.83 0.0223 *
## vert_leap 1 0.0221 0.0221 3.55 0.0695 .
## Residuals 29 0.1802 0.0062
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Warning:
##
## As there is >1 explanatory variable, cross-validation predicted values for
## a fold are not a linear function of corresponding overall predicted
## values. Lines that are shown for the different folds are approximate
##
## fold 1
## Observations in test set: 7
## 6 24 26 32 38 46 59
## Predicted 4.0618 4.0824 4.0420 4.0176 3.9484 3.9988 3.985
## cvpred 4.0684 4.0714 4.0578 4.0116 3.9692 4.0135 3.943
## completion_percentage 4.0639 4.0289 3.9627 4.0342 3.9160 3.9871 4.067
## CV residual -0.0045 -0.0425 -0.0951 0.0226 -0.0532 -0.0264 0.125
##
## Sum of squares = 0.03 Mean square = 0 n = 7
##
## fold 2
## Observations in test set: 8
## 7 19 21 27 37 43 55
## Predicted 4.101 4.0888 4.0262 4.0023 4.11 3.97233 4.03
## cvpred 4.097 4.0915 4.0528 3.9725 5.57 3.95735 4.06
## completion_percentage 3.968 3.9964 4.1352 4.0360 4.10 3.94932 3.94
## CV residual -0.129 -0.0951 0.0823 0.0635 -1.47 -0.00803 -0.12
## 65
## Predicted 4.105
## cvpred 4.048
## completion_percentage 4.159
## CV residual 0.111
##
## Sum of squares = 2.23 Mean square = 0.28 n = 8
##
## fold 3
## Observations in test set: 8
## 5 12 13 17 39 40 50
## Predicted 4.0093 3.9948 3.983 4.0769 4.0254 3.9676 4.0064
## cvpred 3.9790 3.9827 3.907 4.0666 4.0464 3.9612 4.0101
## completion_percentage 3.9927 4.0091 4.096 4.1651 4.0000 4.0466 3.9778
## CV residual 0.0137 0.0265 0.189 0.0985 -0.0464 0.0854 -0.0323
## 56
## Predicted 4.054
## cvpred 4.020
## completion_percentage 4.162
## CV residual 0.142
##
## Sum of squares = 0.08 Mean square = 0.01 n = 8
##
## fold 4
## Observations in test set: 8
## 4 15 18 20 28 52 63
## Predicted 4.011 4.12449 4.0760 3.933 4.0108 3.99 4.13289
## cvpred 4.036 4.11635 4.0682 3.927 4.0049 4.00 4.10705
## completion_percentage 3.932 4.10923 4.1125 4.027 4.0622 3.82 4.11087
## CV residual -0.104 -0.00711 0.0443 0.101 0.0572 -0.18 0.00383
## 64
## Predicted 3.933
## cvpred 3.976
## completion_percentage 3.842
## CV residual -0.135
##
## Sum of squares = 0.08 Mean square = 0.01 n = 8
##
## fold 5
## Observations in test set: 7
## 1 3 25 30 42 49 61
## Predicted 4.0381 3.860 4.073 4.0847 4.008 4.203 4.0238
## cvpred 4.0105 3.959 4.025 4.0530 4.016 4.093 3.9795
## completion_percentage 4.0254 3.791 4.140 4.1092 4.057 4.197 4.0656
## CV residual 0.0148 -0.168 0.115 0.0563 0.041 0.104 0.0861
##
## Sum of squares = 0.06 Mean square = 0.01 n = 7
##
## Overall (Sum over all 7 folds)
## ms
## 0.0652