# Load the quarterback data set from disk
qb_stats_w_combine <- read.csv("../data/qb_stats_w_combine.csv")

# Names of the candidate predictor columns
predictors <- c(
  "height", "weight", "age", "c_avg_cmpp", "c_rate", "c_pct",
  "c_avg_inter", "c_avg_tds", "c_avg_yds", "c_numyrs", "c_avg_att", "X40",
  "wonderlic", "cone", "shuttle", "vert_leap", "broad_jump"
)
college_stats <- qb_stats_w_combine[predictors]

# Response variable, kept as a one-column data frame so that cbind() below
# carries the column name "yds" through to the model data
yds <- qb_stats_w_combine[, "yds", drop = FALSE]

# Build the modelling data set: drop every row that has a missing value in the
# response or any predictor, then log-transform all columns. The 0.1 offset
# keeps zero entries finite under log().
data.log.w_combine.for_yds <- data.frame(
  log(na.omit(cbind(yds, college_stats)) + 0.1)
)
# Fit the full model: yds regressed on every other column of the
# log-transformed data set
lm.log.w_combine.yds <- lm(
  formula = yds ~ .,
  data = data.log.w_combine.for_yds
)
# Stepwise AIC search starting from the full model; direction = "both" lets a
# dropped term be re-added at a later step
step_reg.log.w_combine.yds <- stepAIC(
  object = lm.log.w_combine.yds,
  direction = "both"
)
## Start: AIC=-54.97
## yds ~ height + weight + age + c_avg_cmpp + c_rate + c_pct + c_avg_inter +
## c_avg_tds + c_avg_yds + c_numyrs + c_avg_att + X40 + wonderlic +
## cone + shuttle + vert_leap + broad_jump
##
## Df Sum of Sq RSS AIC
## - broad_jump 1 0.000 3.78 -57.0
## - X40 1 0.006 3.79 -56.9
## - weight 1 0.010 3.79 -56.9
## - vert_leap 1 0.024 3.81 -56.7
## - age 1 0.026 3.81 -56.7
## - height 1 0.033 3.82 -56.6
## - cone 1 0.061 3.85 -56.4
## - shuttle 1 0.081 3.86 -56.2
## - c_avg_inter 1 0.082 3.87 -56.1
## - c_avg_yds 1 0.098 3.88 -56.0
## - c_rate 1 0.129 3.91 -55.7
## <none> 3.78 -55.0
## - c_avg_tds 1 0.222 4.01 -54.7
## - wonderlic 1 0.249 4.03 -54.5
## - c_numyrs 1 0.253 4.04 -54.5
## - c_avg_att 1 0.657 4.44 -50.7
## - c_pct 1 0.671 4.46 -50.6
## - c_avg_cmpp 1 0.713 4.50 -50.2
##
## Step: AIC=-56.97
## yds ~ height + weight + age + c_avg_cmpp + c_rate + c_pct + c_avg_inter +
## c_avg_tds + c_avg_yds + c_numyrs + c_avg_att + X40 + wonderlic +
## cone + shuttle + vert_leap
##
## Df Sum of Sq RSS AIC
## - X40 1 0.006 3.79 -58.9
## - weight 1 0.009 3.79 -58.9
## - age 1 0.027 3.81 -58.7
## - height 1 0.032 3.82 -58.6
## - vert_leap 1 0.033 3.82 -58.6
## - cone 1 0.061 3.85 -58.4
## - shuttle 1 0.081 3.87 -58.1
## - c_avg_inter 1 0.089 3.87 -58.1
## - c_avg_yds 1 0.112 3.90 -57.8
## - c_rate 1 0.149 3.93 -57.5
## <none> 3.78 -57.0
## - wonderlic 1 0.250 4.03 -56.5
## - c_avg_tds 1 0.255 4.04 -56.4
## - c_numyrs 1 0.269 4.05 -56.3
## + broad_jump 1 0.000 3.78 -55.0
## - c_avg_att 1 0.664 4.45 -52.7
## - c_pct 1 0.671 4.46 -52.6
## - c_avg_cmpp 1 0.713 4.50 -52.2
##
## Step: AIC=-58.91
## yds ~ height + weight + age + c_avg_cmpp + c_rate + c_pct + c_avg_inter +
## c_avg_tds + c_avg_yds + c_numyrs + c_avg_att + wonderlic +
## cone + shuttle + vert_leap
##
## Df Sum of Sq RSS AIC
## - weight 1 0.013 3.80 -60.8
## - height 1 0.027 3.82 -60.6
## - vert_leap 1 0.028 3.82 -60.6
## - age 1 0.029 3.82 -60.6
## - cone 1 0.062 3.85 -60.3
## - shuttle 1 0.076 3.87 -60.1
## - c_avg_inter 1 0.083 3.87 -60.1
## - c_avg_yds 1 0.120 3.91 -59.7
## - c_rate 1 0.156 3.95 -59.3
## <none> 3.79 -58.9
## - wonderlic 1 0.263 4.05 -58.3
## - c_numyrs 1 0.263 4.05 -58.3
## - c_avg_tds 1 0.265 4.06 -58.3
## + X40 1 0.006 3.78 -57.0
## + broad_jump 1 0.000 3.79 -56.9
## - c_avg_att 1 0.658 4.45 -54.7
## - c_pct 1 0.666 4.46 -54.6
## - c_avg_cmpp 1 0.707 4.50 -54.2
##
## Step: AIC=-60.77
## yds ~ height + age + c_avg_cmpp + c_rate + c_pct + c_avg_inter +
## c_avg_tds + c_avg_yds + c_numyrs + c_avg_att + wonderlic +
## cone + shuttle + vert_leap
##
## Df Sum of Sq RSS AIC
## - vert_leap 1 0.020 3.82 -62.6
## - age 1 0.054 3.86 -62.2
## - cone 1 0.083 3.89 -61.9
## - c_avg_inter 1 0.087 3.89 -61.9
## - height 1 0.107 3.91 -61.7
## - shuttle 1 0.124 3.93 -61.5
## - c_avg_yds 1 0.133 3.94 -61.4
## - c_rate 1 0.182 3.99 -61.0
## <none> 3.80 -60.8
## - c_numyrs 1 0.300 4.10 -59.8
## - c_avg_tds 1 0.327 4.13 -59.6
## - wonderlic 1 0.379 4.18 -59.1
## + weight 1 0.013 3.79 -58.9
## + X40 1 0.010 3.79 -58.9
## + broad_jump 1 0.001 3.80 -58.8
## - c_pct 1 0.713 4.52 -56.1
## - c_avg_att 1 0.730 4.53 -55.9
## - c_avg_cmpp 1 0.776 4.58 -55.5
##
## Step: AIC=-62.57
## yds ~ height + age + c_avg_cmpp + c_rate + c_pct + c_avg_inter +
## c_avg_tds + c_avg_yds + c_numyrs + c_avg_att + wonderlic +
## cone + shuttle
##
## Df Sum of Sq RSS AIC
## - age 1 0.050 3.87 -64.1
## - cone 1 0.066 3.89 -63.9
## - c_avg_inter 1 0.083 3.91 -63.7
## - height 1 0.092 3.92 -63.7
## - c_avg_yds 1 0.119 3.94 -63.4
## - c_rate 1 0.169 3.99 -62.9
## <none> 3.82 -62.6
## - shuttle 1 0.207 4.03 -62.5
## - c_numyrs 1 0.301 4.12 -61.6
## - c_avg_tds 1 0.320 4.14 -61.4
## - wonderlic 1 0.370 4.19 -61.0
## + vert_leap 1 0.020 3.80 -60.8
## + weight 1 0.005 3.82 -60.6
## + broad_jump 1 0.004 3.82 -60.6
## + X40 1 0.000 3.82 -60.6
## - c_pct 1 0.707 4.53 -58.0
## - c_avg_att 1 0.716 4.54 -57.9
## - c_avg_cmpp 1 0.767 4.59 -57.4
##
## Step: AIC=-64.06
## yds ~ height + c_avg_cmpp + c_rate + c_pct + c_avg_inter + c_avg_tds +
## c_avg_yds + c_numyrs + c_avg_att + wonderlic + cone + shuttle
##
## Df Sum of Sq RSS AIC
## - cone 1 0.094 3.97 -65.1
## - c_avg_yds 1 0.109 3.98 -65.0
## - c_avg_inter 1 0.115 3.99 -64.9
## - height 1 0.123 4.00 -64.8
## - c_rate 1 0.157 4.03 -64.5
## <none> 3.87 -64.1
## - shuttle 1 0.226 4.10 -63.9
## - c_numyrs 1 0.303 4.18 -63.1
## - c_avg_tds 1 0.320 4.19 -63.0
## - wonderlic 1 0.321 4.20 -63.0
## + age 1 0.050 3.82 -62.6
## + weight 1 0.024 3.85 -62.3
## + vert_leap 1 0.016 3.86 -62.2
## + broad_jump 1 0.003 3.87 -62.1
## + X40 1 0.002 3.87 -62.1
## - c_pct 1 0.696 4.57 -59.6
## - c_avg_att 1 0.701 4.58 -59.6
## - c_avg_cmpp 1 0.752 4.63 -59.1
##
## Step: AIC=-65.13
## yds ~ height + c_avg_cmpp + c_rate + c_pct + c_avg_inter + c_avg_tds +
## c_avg_yds + c_numyrs + c_avg_att + wonderlic + shuttle
##
## Df Sum of Sq RSS AIC
## - c_avg_inter 1 0.076 4.04 -66.4
## - height 1 0.113 4.08 -66.0
## - c_avg_yds 1 0.122 4.09 -65.9
## - shuttle 1 0.133 4.10 -65.8
## - c_rate 1 0.165 4.13 -65.5
## <none> 3.97 -65.1
## - wonderlic 1 0.248 4.22 -64.8
## - c_numyrs 1 0.275 4.24 -64.5
## - c_avg_tds 1 0.316 4.28 -64.1
## + cone 1 0.094 3.87 -64.1
## + age 1 0.078 3.89 -63.9
## + weight 1 0.067 3.90 -63.8
## + X40 1 0.017 3.95 -63.3
## + broad_jump 1 0.003 3.96 -63.2
## + vert_leap 1 0.000 3.97 -63.1
## - c_pct 1 0.738 4.71 -60.5
## - c_avg_att 1 0.742 4.71 -60.4
## - c_avg_cmpp 1 0.796 4.76 -60.0
##
## Step: AIC=-66.39
## yds ~ height + c_avg_cmpp + c_rate + c_pct + c_avg_tds + c_avg_yds +
## c_numyrs + c_avg_att + wonderlic + shuttle
##
## Df Sum of Sq RSS AIC
## - shuttle 1 0.125 4.17 -67.2
## - height 1 0.144 4.19 -67.0
## <none> 4.04 -66.4
## - c_numyrs 1 0.271 4.32 -65.9
## + age 1 0.102 3.94 -65.4
## + c_avg_inter 1 0.076 3.97 -65.1
## + weight 1 0.073 3.97 -65.1
## - c_avg_yds 1 0.365 4.41 -65.0
## - wonderlic 1 0.370 4.41 -65.0
## + cone 1 0.055 3.99 -64.9
## + broad_jump 1 0.009 4.04 -64.5
## + X40 1 0.007 4.04 -64.5
## + vert_leap 1 0.001 4.04 -64.4
## - c_rate 1 0.510 4.55 -63.8
## - c_pct 1 0.663 4.71 -62.5
## - c_avg_tds 1 0.666 4.71 -62.4
## - c_avg_att 1 0.734 4.78 -61.9
## - c_avg_cmpp 1 0.740 4.78 -61.8
##
## Step: AIC=-67.2
## yds ~ height + c_avg_cmpp + c_rate + c_pct + c_avg_tds + c_avg_yds +
## c_numyrs + c_avg_att + wonderlic
##
## Df Sum of Sq RSS AIC
## <none> 4.17 -67.2
## - height 1 0.274 4.44 -66.7
## + shuttle 1 0.125 4.04 -66.4
## - c_numyrs 1 0.315 4.48 -66.4
## + weight 1 0.096 4.07 -66.1
## + age 1 0.091 4.08 -66.1
## + c_avg_inter 1 0.069 4.10 -65.8
## - c_avg_yds 1 0.383 4.55 -65.8
## + vert_leap 1 0.057 4.11 -65.7
## + X40 1 0.026 4.14 -65.4
## + broad_jump 1 0.010 4.16 -65.3
## + cone 1 0.000 4.17 -65.2
## - c_rate 1 0.529 4.70 -64.5
## - wonderlic 1 0.541 4.71 -64.4
## - c_avg_tds 1 0.657 4.83 -63.5
## - c_pct 1 0.714 4.88 -63.0
## - c_avg_att 1 0.782 4.95 -62.5
## - c_avg_cmpp 1 0.794 4.96 -62.4
# Coefficient table and fit statistics for the model kept by the stepwise search
summary(object = step_reg.log.w_combine.yds)
##
## Call:
## lm(formula = yds ~ height + c_avg_cmpp + c_rate + c_pct + c_avg_tds +
## c_avg_yds + c_numyrs + c_avg_att + wonderlic, data = data.log.w_combine.for_yds)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.7806 -0.1560 0.0142 0.2552 0.5404
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 228.627 100.921 2.27 0.031 *
## height 3.846 2.787 1.38 0.178
## c_avg_cmpp 46.257 19.686 2.35 0.026 *
## c_rate -12.795 6.668 -1.92 0.065 .
## c_pct -38.400 17.235 -2.23 0.034 *
## c_avg_tds 2.140 1.001 2.14 0.041 *
## c_avg_yds 6.109 3.745 1.63 0.114
## c_numyrs 0.541 0.365 1.48 0.149
## c_avg_att -54.328 23.299 -2.33 0.027 *
## wonderlic -0.486 0.251 -1.94 0.062 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.379 on 29 degrees of freedom
## Multiple R-squared: 0.326, Adjusted R-squared: 0.117
## F-statistic: 1.56 on 9 and 29 DF, p-value: 0.175
# Diagnostic plots for the model kept by the stepwise search
plot(x = step_reg.log.w_combine.yds)
## Warning: NaNs produced
## Warning: NaNs produced
# Best-subsets search over the same predictors, keeping the 10 best models of
# each subset size
leaps.log.w_combine.yds <- regsubsets(yds ~ ., data = data.log.w_combine.for_yds,
    nbest = 10)
# Plot R-squared against subset size.
# By default subsets() asks for a mouse click to position its legend. On a
# non-interactive graphics device no click is available, so legend() receives
# no coordinates and the call aborts with "invalid coordinate lengths".
# legend = FALSE skips the on-plot legend; the table of predictor
# abbreviations is returned (and printed here) instead.
subsets(leaps.log.w_combine.yds, statistic = "rsq", legend = FALSE)
# 5 fold cross-validation of the model kept by the stepwise search.
# The data set and model are passed positionally because the name of the first
# argument differs between DAAG releases (`df` in older ones, `data` in newer
# ones); the positional form works with both.
cv.lm(data.log.w_combine.for_yds, step_reg.log.w_combine.yds, m = 5)
## Analysis of Variance Table
##
## Response: yds
## Df Sum Sq Mean Sq F value Pr(>F)
## height 1 0.24 0.243 1.69 0.204
## c_avg_cmpp 1 0.00 0.002 0.01 0.917
## c_rate 1 0.22 0.216 1.50 0.230
## c_pct 1 0.04 0.036 0.25 0.619
## c_avg_tds 1 0.29 0.290 2.02 0.166
## c_avg_yds 1 0.01 0.010 0.07 0.792
## c_numyrs 1 0.01 0.006 0.04 0.841
## c_avg_att 1 0.67 0.671 4.67 0.039 *
## wonderlic 1 0.54 0.541 3.76 0.062 .
## Residuals 29 4.17 0.144
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Warning:
##
## As there is >1 explanatory variable, cross-validation predicted values for
## a fold are not a linear function of corresponding overall predicted
## values. Lines that are shown for the different folds are approximate
##
## fold 1
## Observations in test set: 7
## 3 21 24 40 42 52 61
## Predicted 7.289 7.568 7.910 7.632 7.73 7.705 7.404
## cvpred 7.332 7.706 8.028 7.677 7.88 7.856 7.094
## yds 7.133 7.503 7.388 7.828 7.58 7.055 7.555
## CV residual -0.199 -0.202 -0.641 0.152 -0.30 -0.802 0.461
##
## Sum of squares = 1.46 Mean square = 0.21 n = 7
##
## fold 2
## Observations in test set: 8
## 6 18 25 37 43 50 55 63
## Predicted 7.655 8.033 8.2497 8.02 7.402 7.430 7.269 7.789
## cvpred 7.577 8.099 8.3497 15.54 7.345 7.445 7.247 7.845
## yds 8.131 7.618 8.2782 8.00 7.139 7.726 6.977 7.682
## CV residual 0.554 -0.481 -0.0715 -7.54 -0.206 0.282 -0.269 -0.163
##
## Sum of squares = 57.6 Mean square = 7.21 n = 8
##
## fold 3
## Observations in test set: 8
## 5 7 16 20 28 32 49 64
## Predicted 7.843 7.56 7.881 7.40 7.98 7.665 7.891 8.02
## cvpred 7.500 8.10 7.705 7.81 8.32 7.962 7.997 8.18
## yds 8.383 7.09 8.046 7.48 8.12 7.685 7.715 7.46
## CV residual 0.883 -1.01 0.341 -0.33 -0.20 -0.277 -0.282 -0.72
##
## Sum of squares = 2.75 Mean square = 0.34 n = 8
##
## fold 4
## Observations in test set: 8
## 12 13 26 30 38 39 59 65
## Predicted 7.547 7.949 7.904 7.674 7.446 7.585 7.425 7.638
## cvpred 7.551 7.573 7.718 7.730 7.215 7.654 7.817 7.434
## yds 7.406 8.307 7.860 8.097 7.738 7.526 7.334 7.953
## CV residual -0.145 0.734 0.142 0.367 0.523 -0.129 -0.483 0.518
##
## Sum of squares = 1.51 Mean square = 0.19 n = 8
##
## fold 5
## Observations in test set: 8
## 1 4 15 17 19 27 46 56
## Predicted 7.743 7.55 7.893 7.587 7.631 7.913 7.787 7.578
## cvpred 7.520 7.94 7.816 7.657 7.833 7.404 8.104 7.716
## yds 8.251 6.77 7.965 7.806 7.525 8.239 7.801 8.045
## CV residual 0.731 -1.16 0.149 0.149 -0.308 0.835 -0.303 0.329
##
## Sum of squares = 2.92 Mean square = 0.37 n = 8
##
## Overall (Sum over all 8 folds)
## ms
## 1.7