# Fetch data.
# NOTE(review): the path is relative, so this presumably expects the working
# directory to sit next to the `data` folder -- confirm before running.
qb_stats <- read.csv("../data/qb_stats.csv")

# Candidate predictor columns: physical attributes plus the `c_`-prefixed
# (college) statistics.
predictors <- c(
  "height", "weight", "age", "c_avg_cmpp", "c_rate", "c_pct",
  "c_avg_inter", "c_avg_tds", "c_avg_yds", "c_numyrs", "c_avg_att"
)
college_stats <- qb_stats[, predictors]

# Set the response variable. Single-bracket indexing keeps it as a one-column
# data frame, so cbind() below carries the `tds` column name through.
tds <- qb_stats["tds"]

# Generate the clean data set: bind response and predictors, drop every row
# that has an NA in any column, then take log(x + 0.1) of every column
# (response included). The 0.1 offset keeps zero values finite: a raw 0
# becomes log(0.1), roughly -2.30.
data.log.no_combine.for_tds <- data.frame(
  log(na.omit(cbind(tds, college_stats)) + 0.1)
)

# Generate the full linear model: log-tds regressed on all log-predictors.
lm.log.no_combine.tds <- lm(
  formula = tds ~ .,
  data = data.log.no_combine.for_tds
)

# Find the optimum linear regression model for tds by stepwise AIC search,
# allowing terms to be both dropped and re-added at each step. The call is
# namespace-qualified so it does not depend on MASS having been attached.
step_reg.log.no_combine.tds <- MASS::stepAIC(
  lm.log.no_combine.tds,
  direction = "both"
)
## Start: AIC=-154.6
## tds ~ height + weight + age + c_avg_cmpp + c_rate + c_pct + c_avg_inter +
## c_avg_tds + c_avg_yds + c_numyrs + c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_avg_yds 1 0.008 111 -157
## - height 1 0.014 111 -156
## - c_rate 1 0.016 111 -156
## - c_numyrs 1 0.052 111 -156
## - c_avg_tds 1 0.106 111 -156
## - c_avg_inter 1 0.262 111 -156
## - weight 1 0.639 111 -155
## - age 1 0.805 112 -155
## - c_pct 1 0.931 112 -155
## <none> 111 -155
## - c_avg_cmpp 1 0.990 112 -154
## - c_avg_att 1 1.015 112 -154
##
## Step: AIC=-156.6
## tds ~ height + weight + age + c_avg_cmpp + c_rate + c_pct + c_avg_inter +
## c_avg_tds + c_numyrs + c_avg_att
##
## Df Sum of Sq RSS AIC
## - height 1 0.013 111 -158
## - c_numyrs 1 0.046 111 -158
## - c_avg_tds 1 0.341 111 -158
## - c_rate 1 0.503 111 -158
## - weight 1 0.654 111 -157
## - age 1 0.808 112 -157
## - c_avg_inter 1 0.883 112 -157
## <none> 111 -157
## - c_pct 1 0.970 112 -156
## - c_avg_cmpp 1 1.005 112 -156
## - c_avg_att 1 1.009 112 -156
## + c_avg_yds 1 0.008 111 -155
##
## Step: AIC=-158.5
## tds ~ weight + age + c_avg_cmpp + c_rate + c_pct + c_avg_inter +
## c_avg_tds + c_numyrs + c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_numyrs 1 0.042 111 -160
## - c_avg_tds 1 0.345 111 -160
## - c_rate 1 0.511 111 -159
## - age 1 0.796 112 -159
## - weight 1 0.875 112 -159
## - c_avg_inter 1 0.876 112 -159
## <none> 111 -158
## - c_pct 1 0.956 112 -158
## - c_avg_cmpp 1 0.991 112 -158
## - c_avg_att 1 0.996 112 -158
## + height 1 0.013 111 -157
## + c_avg_yds 1 0.008 111 -156
##
## Step: AIC=-160.4
## tds ~ weight + age + c_avg_cmpp + c_rate + c_pct + c_avg_inter +
## c_avg_tds + c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_avg_tds 1 0.410 111 -162
## - c_rate 1 0.591 111 -161
## - age 1 0.805 112 -161
## - weight 1 0.834 112 -161
## <none> 111 -160
## - c_avg_inter 1 0.977 112 -160
## - c_pct 1 1.122 112 -160
## - c_avg_cmpp 1 1.153 112 -160
## - c_avg_att 1 1.157 112 -160
## + c_numyrs 1 0.042 111 -158
## + height 1 0.009 111 -158
## + c_avg_yds 1 0.002 111 -158
##
## Step: AIC=-161.6
## tds ~ weight + age + c_avg_cmpp + c_rate + c_pct + c_avg_inter +
## c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_rate 1 0.185 111 -163
## - weight 1 0.753 112 -162
## - c_avg_inter 1 0.803 112 -162
## - c_pct 1 0.884 112 -162
## - c_avg_cmpp 1 0.936 112 -162
## - age 1 0.937 112 -162
## <none> 111 -162
## - c_avg_att 1 0.952 112 -162
## + c_avg_tds 1 0.410 111 -160
## + c_avg_yds 1 0.251 111 -160
## + c_numyrs 1 0.107 111 -160
## + height 1 0.010 111 -160
##
## Step: AIC=-163.2
## tds ~ weight + age + c_avg_cmpp + c_pct + c_avg_inter + c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_avg_inter 1 0.622 112 -164
## - weight 1 0.683 112 -164
## - c_pct 1 0.753 112 -164
## - c_avg_cmpp 1 0.818 112 -163
## - c_avg_att 1 0.831 112 -163
## - age 1 0.836 112 -163
## <none> 111 -163
## + c_avg_yds 1 0.340 111 -162
## + c_rate 1 0.185 111 -162
## + c_numyrs 1 0.124 111 -161
## + height 1 0.013 111 -161
## + c_avg_tds 1 0.004 111 -161
##
## Step: AIC=-163.9
## tds ~ weight + age + c_avg_cmpp + c_pct + c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_pct 1 0.360 112 -165
## - c_avg_cmpp 1 0.403 112 -165
## - c_avg_att 1 0.408 112 -165
## - weight 1 0.417 112 -165
## - age 1 0.805 113 -164
## <none> 112 -164
## + c_avg_inter 1 0.622 111 -163
## + c_avg_yds 1 0.223 112 -162
## + c_numyrs 1 0.207 112 -162
## + c_avg_tds 1 0.062 112 -162
## + c_rate 1 0.004 112 -162
## + height 1 0.002 112 -162
##
## Step: AIC=-165.1
## tds ~ weight + age + c_avg_cmpp + c_avg_att
##
## Df Sum of Sq RSS AIC
## - weight 1 0.668 113 -166
## - age 1 0.778 113 -166
## <none> 112 -165
## - c_avg_att 1 0.974 113 -165
## - c_avg_cmpp 1 1.010 113 -165
## + c_pct 1 0.360 112 -164
## + c_numyrs 1 0.293 112 -164
## + c_avg_inter 1 0.229 112 -164
## + c_avg_yds 1 0.117 112 -163
## + c_avg_tds 1 0.041 112 -163
## + c_rate 1 0.002 112 -163
## + height 1 0.001 112 -163
##
## Step: AIC=-165.7
## tds ~ age + c_avg_cmpp + c_avg_att
##
## Df Sum of Sq RSS AIC
## - age 1 0.530 114 -167
## <none> 113 -166
## + weight 1 0.668 112 -165
## + c_pct 1 0.610 112 -165
## - c_avg_att 1 1.359 114 -165
## - c_avg_cmpp 1 1.466 114 -165
## + height 1 0.300 113 -164
## + c_numyrs 1 0.133 113 -164
## + c_avg_yds 1 0.067 113 -164
## + c_avg_inter 1 0.032 113 -164
## + c_avg_tds 1 0.027 113 -164
## + c_rate 1 0.002 113 -164
##
## Step: AIC=-166.6
## tds ~ c_avg_cmpp + c_avg_att
##
## Df Sum of Sq RSS AIC
## <none> 114 -167
## + age 1 0.530 113 -166
## + c_pct 1 0.522 113 -166
## + weight 1 0.420 113 -166
## - c_avg_att 1 1.609 115 -165
## + height 1 0.249 113 -165
## - c_avg_cmpp 1 1.690 115 -165
## + c_numyrs 1 0.173 113 -165
## + c_avg_tds 1 0.086 114 -165
## + c_avg_inter 1 0.050 114 -165
## + c_avg_yds 1 0.040 114 -165
## + c_rate 1 0.003 114 -165
# Print the coefficient table and fit statistics of the model retained by the
# stepwise search.
summary(step_reg.log.no_combine.tds)
##
## Call:
## lm(formula = tds ~ c_avg_cmpp + c_avg_att, data = data.log.no_combine.for_tds)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.637 -0.274 0.080 0.430 1.185
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.146 0.601 5.24 3.6e-07 ***
## c_avg_cmpp 0.805 0.432 1.86 0.064 .
## c_avg_att -0.875 0.481 -1.82 0.071 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.698 on 233 degrees of freedom
## Multiple R-squared: 0.0148, Adjusted R-squared: 0.00637
## F-statistic: 1.75 on 2 and 233 DF, p-value: 0.175
# Standard lm diagnostic plots for the stepwise-selected model.
plot(step_reg.log.no_combine.tds)

# Best-subsets search over the same predictors, keeping up to 10 candidate
# models per subset size.
leaps.log.no_combine.tds <- leaps::regsubsets(
  tds ~ .,
  data = data.log.no_combine.for_tds,
  nbest = 10
)

# Plot R-squared against subset size.
# legend = FALSE: this call previously stopped with
# "Error: invalid coordinate lengths". By default the legend position is
# requested through a mouse click (locator()), which is unavailable when the
# script is run non-interactively (e.g. knitted).
# NOTE(review): with the legend suppressed, subsets() should return the table
# of predictor abbreviations instead of drawing it -- confirm against the
# installed car version.
car::subsets(leaps.log.no_combine.tds, statistic = "rsq", legend = FALSE)
# 5-fold cross-validation of the stepwise-selected model.
# The data frame is passed positionally rather than as `df =`.
# NOTE(review): older DAAG releases name the first argument `df`, newer ones
# appear to name it `data`; a positional first argument plus a named
# `form.lm` should work with both -- confirm against the installed version.
DAAG::cv.lm(
  data.log.no_combine.for_tds,
  form.lm = step_reg.log.no_combine.tds,
  m = 5
)
## Analysis of Variance Table
##
## Response: tds
## Df Sum Sq Mean Sq F value Pr(>F)
## c_avg_cmpp 1 0.1 0.100 0.21 0.650
## c_avg_att 1 1.6 1.609 3.30 0.071 .
## Residuals 233 113.6 0.487
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Warning:
##
## As there is >1 explanatory variable, cross-validation predicted values for
## a fold are not a linear function of corresponding overall predicted
## values. Lines that are shown for the different folds are approximate
##
## fold 1
## Observations in test set: 47
## 3 15 18 24 26 35 37 39 41 45
## Predicted 2.462 2.444 2.324 2.329 2.37 2.18 2.27 2.378 2.3775 2.3400
## cvpred 2.437 2.430 2.357 2.360 2.38 2.27 2.33 2.386 2.3905 2.3629
## tds 2.646 1.808 2.493 2.779 2.57 1.81 -2.30 2.493 2.4069 2.3125
## CV residual 0.209 -0.622 0.136 0.419 0.19 -0.46 -4.63 0.107 0.0164 -0.0503
## 52 56 62 70 71 75 78 80 85
## Predicted 2.383 2.31 2.149 2.362 2.285 2.3262 2.327 2.4073 2.311
## cvpred 2.394 2.35 2.253 2.379 2.330 2.3564 2.355 2.4041 2.345
## tds 2.839 1.81 1.960 1.960 3.262 2.4069 3.049 2.4932 2.646
## CV residual 0.445 -0.54 -0.293 -0.419 0.932 0.0505 0.694 0.0891 0.301
## 88 89 103 116 118 119 121 133 136
## Predicted 2.361 2.3238 2.28 2.351 2.255 2.32 2.256 2.2295 2.301
## cvpred 2.379 2.3541 2.32 2.373 2.315 2.35 2.310 2.2935 2.339
## tds 3.262 2.4069 2.71 1.629 2.407 2.21 1.629 2.2083 2.092
## CV residual 0.883 0.0529 0.39 -0.743 0.092 -0.14 -0.681 -0.0852 -0.248
## 139 149 152 159 165 166 170 173 175 188
## Predicted 2.512 2.318 2.353 2.216 2.39720 2.363 2.287 2.30 2.26 2.38
## cvpred 2.450 2.347 2.375 2.288 2.39697 2.380 2.331 2.34 2.31 2.38
## tds 2.715 0.742 2.092 2.573 2.40695 1.960 2.646 3.40 2.71 2.09
## CV residual 0.265 -1.605 -0.283 0.285 0.00998 -0.419 0.315 1.07 0.40 -0.29
## 193 198 205 211 220 224 225 230
## Predicted 2.2049 2.344 2.26745 2.224 2.215 2.177 2.293 2.1539
## cvpred 2.2809 2.360 2.31643 2.289 2.283 2.256 2.334 2.2436
## tds 2.2083 1.808 2.31254 1.629 1.960 2.092 2.779 0.0953
## CV residual -0.0726 -0.551 -0.00389 -0.659 -0.323 -0.165 0.445 -2.1483
## 239
## Predicted 2.247
## cvpred 2.297
## tds 1.808
## CV residual -0.489
##
## Sum of squares = 37 Mean square = 0.79 n = 47
##
## fold 2
## Observations in test set: 48
## 4 5 7 14 17 33 36 42 46
## Predicted 2.402 2.3097 2.486 2.435 2.29 2.10 2.364 2.3720 2.3246
## cvpred 2.375 2.2714 2.472 2.408 2.25 2.06 2.332 2.3382 2.2876
## tds 2.493 2.3125 3.049 2.896 2.57 2.21 1.629 2.3125 2.2083
## CV residual 0.118 0.0411 0.577 0.488 0.32 0.15 -0.702 -0.0256 -0.0793
## 47 65 66 69 73 77 99 100 102 114
## Predicted 2.281 2.332 2.32 2.398 2.405 2.295 2.48 2.360 2.35 2.231
## cvpred 2.245 2.294 2.29 2.370 2.379 2.256 2.46 2.328 2.32 2.187
## tds 1.960 2.839 2.78 2.208 2.573 1.629 2.71 2.715 3.05 2.896
## CV residual -0.284 0.545 0.49 -0.162 0.193 -0.627 0.25 0.386 0.73 0.709
## 122 126 128 131 138 141 144 147 153
## Predicted 2.187 2.354 2.419 2.305 2.301 2.312 2.317 2.340 2.270
## cvpred 2.139 2.325 2.394 2.277 2.261 2.275 2.284 2.308 2.228
## tds 2.779 2.092 2.208 2.407 3.140 2.407 3.096 2.092 1.960
## CV residual 0.639 -0.233 -0.186 0.129 0.879 0.132 0.811 -0.216 -0.268
## 156 158 161 163 164 171 172 174 176 178
## Predicted 2.214 2.395 2.32 2.304 2.280 2.297 2.355 2.26 2.22 2.18
## cvpred 2.172 2.371 2.29 2.266 2.240 2.257 2.328 2.22 2.19 2.14
## tds 2.779 2.646 1.96 3.001 2.896 2.646 1.629 3.34 2.65 3.14
## CV residual 0.607 0.275 -0.33 0.735 0.656 0.389 -0.699 1.11 0.46 1.00
## 183 184 187 191 199 201 204 207 213
## Predicted 2.245 2.234 2.2609 2.190 2.273 2.224 2.454 2.247 2.208
## cvpred 2.202 2.193 2.2214 2.146 2.235 2.180 2.432 2.205 2.168
## tds 2.493 2.313 2.2083 1.411 1.411 2.573 2.715 1.808 2.493
## CV residual 0.291 0.119 -0.0131 -0.735 -0.824 0.392 0.283 -0.397 0.325
## 214
## Predicted 2.247
## cvpred 2.207
## tds 1.960
## CV residual -0.247
##
## Sum of squares = 12.1 Mean square = 0.25 n = 48
##
## fold 3
## Observations in test set: 47
## 2 6 25 29 34 43 44 48 50
## Predicted 2.415 2.2551 1.989 2.322 2.308 2.422 2.25 2.198 2.341
## cvpred 2.468 2.2436 1.739 2.348 2.306 2.460 2.25 2.160 2.362
## tds 3.140 2.3125 2.646 2.092 1.960 2.313 1.13 1.629 2.896
## CV residual 0.672 0.0689 0.907 -0.256 -0.346 -0.147 -1.12 -0.531 0.534
## 51 55 59 60 64 76 79 86 87
## Predicted 2.324 2.340 2.314 2.2860 2.31 2.315 2.28 2.3187 2.364
## cvpred 2.339 2.365 2.329 2.2731 2.32 2.308 2.28 2.3412 2.368
## tds 2.896 1.808 2.646 2.2083 3.00 1.960 2.90 2.3125 1.808
## CV residual 0.556 -0.556 0.317 -0.0648 0.68 -0.348 0.62 -0.0286 -0.559
## 90 94 101 106 107 108 112 123 124 134
## Predicted 2.364 2.28 2.357 2.328 2.294 2.398 2.285 2.348 2.35 2.279
## cvpred 2.398 2.27 2.393 2.343 2.278 2.421 2.294 2.367 2.36 2.266
## tds 1.411 2.71 2.715 1.411 2.646 2.313 2.493 2.779 2.49 2.092
## CV residual -0.987 0.44 0.322 -0.932 0.368 -0.108 0.199 0.412 0.13 -0.175
## 137 148 157 177 179 180 181 185 192
## Predicted 2.322 2.404 2.204 2.175 2.274 2.177 2.336 2.203 2.227
## cvpred 2.311 2.448 2.092 2.100 2.245 2.110 2.336 2.179 2.194
## tds 1.960 2.092 2.646 2.407 1.808 2.896 2.573 1.960 2.573
## CV residual -0.351 -0.356 0.555 0.307 -0.436 0.786 0.237 -0.219 0.379
## 194 197 200 208 215 216 217 233 236
## Predicted 2.035 2.406 2.310 2.3141 2.083 2.216 2.2369 2.139 2.353
## cvpred 1.831 2.413 2.311 2.2914 1.965 2.173 2.1864 2.074 2.321
## tds 2.407 2.646 2.896 2.2083 1.411 2.950 2.2083 2.407 0.742
## CV residual 0.576 0.233 0.585 -0.0831 -0.554 0.777 0.0219 0.333 -1.579
## 238
## Predicted 2.3405
## cvpred 2.3141
## tds 2.4069
## CV residual 0.0928
##
## Sum of squares = 13.8 Mean square = 0.29 n = 47
##
## fold 4
## Observations in test set: 47
## 9 10 11 13 16 22 27 28 31 32
## Predicted 2.324 2.376 2.4330 2.314 2.36 2.380 2.348 2.295 2.3 2.435
## cvpred 2.334 2.397 2.4641 2.321 2.37 2.400 2.363 2.299 2.3 2.466
## tds 2.779 2.573 2.4932 1.808 1.13 3.336 2.573 2.407 3.0 2.208
## CV residual 0.445 0.176 0.0291 -0.513 -1.24 0.936 0.209 0.108 0.7 -0.258
## 38 49 53 58 63 67 74 84 92 93
## Predicted 2.359 2.4478 2.232 2.407 2.357 2.370 2.289 2.41 2.33 2.342
## cvpred 2.375 2.4824 2.224 2.432 2.372 2.389 2.291 2.43 2.35 2.356
## tds 3.096 0.0953 2.407 2.646 3.096 2.646 2.493 2.71 -2.30 3.049
## CV residual 0.721 -2.3871 0.183 0.214 0.723 0.257 0.202 0.28 -4.65 0.694
## 95 97 104 105 109 110 113 117 120
## Predicted 2.4299 2.233 2.329 2.218 2.30 2.252 2.301 2.29 2.382
## cvpred 2.4613 2.226 2.341 2.208 2.31 2.248 2.307 2.29 2.407
## tds 2.4932 1.808 2.646 2.573 2.90 2.715 1.808 1.13 1.808
## CV residual 0.0319 -0.418 0.305 0.364 0.59 0.467 -0.498 -1.16 -0.598
## 125 127 130 143 145 146 162 182 189
## Predicted 2.416 2.288 2.395 2.421 2.258 2.392 2.242 2.201 2.261
## cvpred 2.445 2.292 2.420 2.452 2.256 2.415 2.237 2.190 2.260
## tds 2.313 3.096 2.573 3.096 2.839 2.092 1.808 2.573 2.092
## CV residual -0.132 0.803 0.153 0.643 0.583 -0.323 -0.428 0.382 -0.168
## 195 196 209 212 223 226 228 232 237
## Predicted 2.276 2.1938 2.332 2.28 2.319 2.409 2.392 2.393 2.350
## cvpred 2.278 2.1783 2.346 2.29 2.330 2.437 2.418 2.419 2.368
## tds 2.092 2.0919 1.960 2.90 2.896 2.715 2.313 2.896 1.960
## CV residual -0.186 -0.0864 -0.386 0.61 0.566 0.278 -0.106 0.477 -0.408
##
## Sum of squares = 38.8 Mean square = 0.83 n = 47
##
## fold 5
## Observations in test set: 47
## 1 8 12 19 20 21 23 30 40
## Predicted 2.335 2.353 2.341 2.436 2.3083 2.290 2.602 2.19 2.3688
## cvpred 2.316 2.334 2.320 2.419 2.2927 2.276 2.617 2.19 2.3578
## tds 3.262 3.001 2.208 2.493 2.3125 2.092 3.049 3.37 2.4069
## CV residual 0.946 0.666 -0.112 0.074 0.0198 -0.184 0.432 1.18 0.0491
## 54 57 61 68 72 81 82 91 96
## Predicted 2.3693 2.229 2.30 2.282 2.399 2.274 2.29 2.251 2.3905
## cvpred 2.3400 2.228 2.29 2.280 2.387 2.268 2.29 2.248 2.3746
## tds 2.3125 2.950 2.09 2.493 2.896 2.092 1.13 0.742 2.4069
## CV residual -0.0275 0.722 -0.20 0.213 0.509 -0.176 -1.15 -1.506 0.0323
## 98 111 115 129 132 135 140 142 150
## Predicted 2.51 2.291 2.307 2.322 2.282 2.228 2.4139 2.3590 2.26
## cvpred 2.50 2.292 2.303 2.311 2.278 2.226 2.3937 2.3491 2.26
## tds 2.84 2.493 1.960 1.629 2.407 2.092 2.4069 2.3125 3.30
## CV residual 0.34 0.201 -0.343 -0.682 0.129 -0.134 0.0133 -0.0366 1.04
## 151 154 155 160 167 168 169 186 190
## Predicted 2.207 2.299 2.332 2.213 2.469 2.368 2.245 2.3545 2.191
## cvpred 2.201 2.349 2.311 2.226 2.461 2.351 2.250 2.3656 2.194
## tds 2.407 1.411 3.140 2.493 1.629 2.208 1.629 2.4069 1.960
## CV residual 0.206 -0.938 0.828 0.267 -0.832 -0.143 -0.621 0.0413 -0.233
## 202 203 206 210 218 219 222 227 229
## Predicted 2.229 2.27 2.315 2.297 2.288 2.18 1.955 2.272 2.2537
## cvpred 2.225 2.26 2.308 2.294 2.301 2.21 1.997 2.292 2.2789
## tds 1.960 2.95 1.960 3.049 2.092 2.09 1.411 2.715 2.3125
## CV residual -0.264 0.69 -0.348 0.756 -0.209 -0.12 -0.586 0.422 0.0337
## 235 240
## Predicted 2.325 2.328
## cvpred 2.316 2.357
## tds 1.808 2.646
## CV residual -0.508 0.289
##
## Sum of squares = 14.3 Mean square = 0.31 n = 47
##
## Overall (Sum over all 47 folds)
## ms
## 0.492