# Load the quarterback statistics table
qb_stats <- read.csv("../data/qb_stats.csv")

# Physical attributes and college-career columns used as predictors
predictors <- c(
  "height", "weight", "age",
  "c_avg_cmpp", "c_rate", "c_pct", "c_avg_inter",
  "c_avg_tds", "c_avg_yds", "c_numyrs", "c_avg_att"
)
college_stats <- qb_stats[predictors]

# Response variable, kept as a one-column data frame so it binds by name
tds <- qb_stats["tds"]

# Clean modelling data set: response first, then predictors; rows with any
# missing value are dropped and every column is centred and scaled.
data.scaled.no_combine.for_tds <- data.frame(
  scale(
    na.omit(
      cbind(tds, college_stats)
    )
  )
)
# Full model: regress tds on every other column of the scaled data set
lm.scaled.no_combine.tds <- lm(
  tds ~ .,
  data = data.scaled.no_combine.for_tds
)

# Stepwise search in both directions, starting from the full model, for the
# formula with the lowest AIC
step_reg.scaled.no_combine.tds <- stepAIC(
  lm.scaled.no_combine.tds,
  direction = "both"
)
## Start: AIC=8.2
## tds ~ height + weight + age + c_avg_cmpp + c_rate + c_pct + c_avg_inter +
## c_avg_tds + c_avg_yds + c_numyrs + c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_avg_inter 1 0.04 221 6.24
## - height 1 0.07 221 6.27
## - c_numyrs 1 0.12 221 6.33
## - c_avg_att 1 0.20 221 6.41
## - c_avg_tds 1 0.20 221 6.42
## - c_avg_cmpp 1 0.81 222 7.06
## - c_rate 1 1.07 222 7.34
## - c_pct 1 1.46 222 7.75
## - age 1 1.65 222 7.95
## <none> 221 8.20
## - weight 1 1.96 223 8.28
## - c_avg_yds 1 3.20 224 9.59
##
## Step: AIC=6.24
## tds ~ height + weight + age + c_avg_cmpp + c_rate + c_pct + c_avg_tds +
## c_avg_yds + c_numyrs + c_avg_att
##
## Df Sum of Sq RSS AIC
## - height 1 0.08 221 4.32
## - c_numyrs 1 0.10 221 4.35
## - c_avg_tds 1 0.20 221 4.46
## - c_avg_att 1 0.40 221 4.67
## - c_avg_cmpp 1 0.87 222 5.17
## - c_rate 1 1.06 222 5.38
## - c_pct 1 1.46 222 5.80
## - age 1 1.69 222 6.04
## <none> 221 6.24
## - weight 1 2.18 223 6.56
## - c_avg_yds 1 3.26 224 7.70
## + c_avg_inter 1 0.04 221 8.20
##
## Step: AIC=4.32
## tds ~ weight + age + c_avg_cmpp + c_rate + c_pct + c_avg_tds +
## c_avg_yds + c_numyrs + c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_numyrs 1 0.08 221 2.40
## - c_avg_tds 1 0.22 221 2.55
## - c_avg_att 1 0.39 221 2.74
## - c_avg_cmpp 1 0.87 222 3.25
## - c_rate 1 1.04 222 3.43
## - c_pct 1 1.42 222 3.83
## - age 1 1.64 222 4.07
## <none> 221 4.32
## - weight 1 2.85 224 5.35
## - c_avg_yds 1 3.22 224 5.74
## + height 1 0.08 221 6.24
## + c_avg_inter 1 0.05 221 6.27
##
## Step: AIC=2.4
## tds ~ weight + age + c_avg_cmpp + c_rate + c_pct + c_avg_tds +
## c_avg_yds + c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_avg_tds 1 0.21 221 0.63
## - c_avg_att 1 0.44 221 0.88
## - c_avg_cmpp 1 0.83 222 1.29
## - c_rate 1 1.04 222 1.52
## - c_pct 1 1.36 222 1.85
## - age 1 1.66 223 2.17
## <none> 221 2.40
## - weight 1 2.78 224 3.35
## - c_avg_yds 1 3.29 224 3.89
## + c_numyrs 1 0.08 221 4.32
## + height 1 0.05 221 4.35
## + c_avg_inter 1 0.02 221 4.38
##
## Step: AIC=0.63
## tds ~ weight + age + c_avg_cmpp + c_rate + c_pct + c_avg_yds +
## c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_avg_att 1 0.56 222 -0.773
## - c_avg_cmpp 1 0.69 222 -0.636
## - c_rate 1 0.84 222 -0.483
## - c_pct 1 1.15 222 -0.146
## - age 1 1.56 223 0.280
## <none> 221 0.626
## - weight 1 2.75 224 1.544
## + c_avg_tds 1 0.21 221 2.405
## + c_numyrs 1 0.07 221 2.552
## + height 1 0.07 221 2.556
## + c_avg_inter 1 0.02 221 2.601
## - c_avg_yds 1 4.07 225 2.934
##
## Step: AIC=-0.77
## tds ~ weight + age + c_avg_cmpp + c_rate + c_pct + c_avg_yds
##
## Df Sum of Sq RSS AIC
## - c_rate 1 0.52 222 -2.224
## - c_pct 1 1.62 223 -1.056
## - age 1 1.64 223 -1.030
## <none> 222 -0.773
## - weight 1 2.78 224 0.173
## - c_avg_cmpp 1 3.10 225 0.508
## + c_avg_att 1 0.56 221 0.626
## + c_avg_tds 1 0.33 221 0.876
## - c_avg_yds 1 3.51 225 0.934
## + c_avg_inter 1 0.25 221 0.965
## + c_numyrs 1 0.13 222 1.092
## + height 1 0.05 222 1.174
##
## Step: AIC=-2.22
## tds ~ weight + age + c_avg_cmpp + c_pct + c_avg_yds
##
## Df Sum of Sq RSS AIC
## - c_pct 1 1.64 224 -2.489
## <none> 222 -2.224
## - age 1 2.06 224 -2.044
## - weight 1 2.67 225 -1.404
## - c_avg_cmpp 1 3.21 225 -0.841
## + c_rate 1 0.52 222 -0.773
## + c_avg_att 1 0.24 222 -0.483
## + c_numyrs 1 0.12 222 -0.353
## + c_avg_tds 1 0.03 222 -0.254
## + height 1 0.03 222 -0.253
## + c_avg_inter 1 0.01 222 -0.234
## - c_avg_yds 1 3.90 226 -0.119
##
## Step: AIC=-2.49
## tds ~ weight + age + c_avg_cmpp + c_avg_yds
##
## Df Sum of Sq RSS AIC
## <none> 224 -2.489
## + c_pct 1 1.64 222 -2.224
## + c_avg_att 1 1.53 222 -2.107
## - c_avg_cmpp 1 2.32 226 -2.060
## - age 1 2.67 226 -1.693
## + c_rate 1 0.54 223 -1.056
## - c_avg_yds 1 3.43 227 -0.902
## + c_avg_inter 1 0.37 224 -0.884
## - weight 1 3.60 227 -0.725
## + c_avg_tds 1 0.17 224 -0.666
## + c_numyrs 1 0.02 224 -0.509
## + height 1 0.01 224 -0.498
# Coefficient table and fit statistics for the model kept by the stepwise
# AIC search above.
summary(step_reg.scaled.no_combine.tds)
##
## Call:
## lm(formula = tds ~ weight + age + c_avg_cmpp + c_avg_yds, data = data.scaled.no_combine.for_tds)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.208 -0.692 -0.169 0.651 3.065
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.33e-16 6.41e-02 0.00 1.000
## weight 1.31e-01 6.79e-02 1.93 0.055 .
## age 1.09e-01 6.57e-02 1.66 0.098 .
## c_avg_cmpp -4.61e-01 2.98e-01 -1.55 0.123
## c_avg_yds 5.59e-01 2.97e-01 1.88 0.061 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.984 on 231 degrees of freedom
## Multiple R-squared: 0.0475, Adjusted R-squared: 0.031
## F-statistic: 2.88 on 4 and 231 DF, p-value: 0.0235
# Plot the fitted stepwise model (for an lm fit this dispatches to the
# standard regression diagnostic plots).
plot(step_reg.scaled.no_combine.tds)
# Best-subsets search over the same predictors, as a cross-check on the
# stepwise result; nbest = 10 keeps up to ten candidate models per subset size.
leaps.scaled.no_combine.tds <- regsubsets(
  tds ~ .,
  data = data.scaled.no_combine.for_tds,
  nbest = 10
)

# Plot R-squared against subset size for the retained models.
# The call without a `legend` argument failed here with
# "Error: invalid coordinate lengths" -- most likely because the default
# legend placement waits for a mouse click via locator(), which returns
# nothing when the script is run non-interactively (knitr, Rscript).
# legend = FALSE skips the interactive legend; the call then returns the
# abbreviation key as a data frame, which is printed below the plot.
subsets(leaps.scaled.no_combine.tds, statistic = "rsq", legend = FALSE)
# Cross-validate the stepwise-selected model on the scaled data set. The call
# prints an ANOVA table, the per-fold predictions and CV residuals, and the
# overall cross-validated mean square.
# NOTE(review): cv.lm is presumably DAAG's; the `df =` argument name depends on
# the installed version -- confirm before upgrading the package.
cv.lm(df = data.scaled.no_combine.for_tds, step_reg.scaled.no_combine.tds, m = 5) # 5 fold cross-validation
## Analysis of Variance Table
##
## Response: tds
## Df Sum Sq Mean Sq F value Pr(>F)
## weight 1 4.0 4.00 4.12 0.043 *
## age 1 2.2 2.16 2.23 0.137
## c_avg_cmpp 1 1.6 1.58 1.63 0.203
## c_avg_yds 1 3.4 3.43 3.54 0.061 .
## Residuals 231 223.8 0.97
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Warning:
##
## As there is >1 explanatory variable, cross-validation predicted values for
## a fold are not a linear function of corresponding overall predicted
## values. Lines that are shown for the different folds are approximate
##
## fold 1
## Observations in test set: 47
## 3 15 18 24 26 35 37 39
## Predicted 0.2349 -0.18168 0.0327 -0.0831 0.490 0.246 -0.304 0.1392
## cvpred 0.2898 -0.00784 0.1076 0.0119 0.395 0.241 -0.125 0.1246
## tds 0.3776 -0.97268 0.0401 0.7152 0.209 -0.973 -1.985 0.0401
## CV residual 0.0879 -0.96483 -0.0676 0.7034 -0.187 -1.214 -1.860 -0.0846
## 41 45 52 56 62 70 71 75 78
## Predicted 0.279 0.0782 0.296 0.143 -0.0663 0.0798 0.140 0.638 0.422
## cvpred 0.290 0.0938 0.313 0.128 -0.0264 0.1564 0.126 0.540 0.325
## tds -0.129 -0.2975 0.884 -0.973 -0.8039 -0.8039 2.403 -0.129 1.559
## CV residual -0.419 -0.3913 0.571 -1.101 -0.7775 -0.9603 2.277 -0.669 1.234
## 80 85 88 89 103 116 118
## Predicted 0.00346 -0.0465 0.143 0.0371 -0.0792 -0.094850 0.101
## cvpred 0.08129 0.0321 0.158 0.0608 -0.0565 -0.000817 0.138
## tds 0.04005 0.3776 2.403 -0.1287 0.5464 -1.141463 -0.129
## CV residual -0.04124 0.3455 2.245 -0.1896 0.6029 -1.140645 -0.267
## 119 121 133 136 139 149 152 159
## Predicted -0.0553 0.196 -0.280 -0.00256 -0.1104 -0.1175 -0.0352 -0.247
## cvpred 0.0389 0.199 -0.213 0.00370 -0.0773 -0.0903 0.0347 -0.157
## tds -0.4663 -1.141 -0.466 -0.63510 0.5464 -1.6478 -0.6351 0.209
## CV residual -0.5053 -1.340 -0.253 -0.63880 0.6237 -1.5575 -0.6698 0.366
## 165 166 170 173 175 188 193 198
## Predicted -0.1677 -0.1058 -0.162 0.0129 -0.1165 -0.200 -0.243 -0.04440
## cvpred -0.0444 -0.0172 -0.102 0.0289 -0.0948 -0.159 -0.138 0.00551
## tds -0.1287 -0.8039 0.378 3.0782 0.5464 -0.635 -0.466 -0.97268
## CV residual -0.0844 -0.7867 0.480 3.0494 0.6412 -0.476 -0.328 -0.97819
## 205 211 220 224 225 230 239
## Predicted -0.128 -0.0635 -0.356 -0.295 -0.162 -0.252 -0.507
## cvpred -0.075 -0.0827 -0.245 -0.251 -0.111 -0.227 -0.415
## tds -0.298 -1.1415 -0.804 -0.635 0.715 -1.817 -0.973
## CV residual -0.223 -1.0588 -0.559 -0.384 0.826 -1.589 -0.558
##
## Sum of squares = 46.8 Mean square = 1 n = 47
##
## fold 2
## Observations in test set: 48
## 4 5 7 14 17 33 36 42
## Predicted -0.1434 0.344 0.158 0.348 0.1402 -0.152 0.0101 0.0483
## cvpred -0.1375 0.258 0.109 0.267 0.0969 -0.172 -0.0115 0.0290
## tds 0.0401 -0.298 1.559 1.053 0.2088 -0.466 -1.1415 -0.2975
## CV residual 0.1775 -0.556 1.450 0.785 0.1120 -0.294 -1.1300 -0.3265
## 46 47 65 66 69 73 77 99
## Predicted -0.0269 -0.0542 -0.202 0.0688 -0.0829 0.1496 0.0237 0.281
## cvpred -0.0239 -0.1124 -0.163 -0.0172 -0.1078 0.0963 -0.0244 0.141
## tds -0.4663 -0.8039 0.884 0.7152 -0.4663 0.2088 -1.1415 0.546
## CV residual -0.4424 -0.6915 1.047 0.7324 -0.3585 0.1126 -1.1171 0.405
## 100 102 114 122 126 128 131 138
## Predicted 0.156 0.218 -0.0045 0.0193 -0.0546 -0.0767 -0.297 0.118
## cvpred 0.101 0.105 -0.0224 -0.0316 -0.0984 -0.1023 -0.306 0.055
## tds 0.546 1.559 1.0528 0.7152 -0.6351 -0.4663 -0.129 1.897
## CV residual 0.446 1.454 1.0752 0.7468 -0.5367 -0.3640 0.177 1.842
## 141 144 147 153 156 158 161 163
## Predicted -0.1899 0.06407 -0.315 -0.274 0.06616 -0.289 -0.0691 -0.0509
## cvpred -0.2132 -0.00455 -0.290 -0.244 0.00745 -0.332 -0.1354 -0.0533
## tds -0.1287 1.72793 -0.635 -0.804 0.71520 0.378 -0.8039 1.3904
## CV residual 0.0844 1.73248 -0.345 -0.560 0.70775 0.710 -0.6685 1.4436
## 164 171 172 174 176 178 183 184
## Predicted 0.1497 0.1088 0.0274 0.2280 -0.284 -0.260 0.2828 0.1351
## cvpred 0.0785 0.0266 -0.0539 0.0691 -0.293 -0.266 0.1806 0.0549
## tds 1.0528 0.3776 -1.1415 2.7407 0.378 1.897 0.0401 -0.2975
## CV residual 0.9742 0.3510 -1.0876 2.6716 0.671 2.162 -0.1405 -0.3524
## 187 191 199 201 204 207 213 214
## Predicted -0.242 -0.0565 -0.28 0.0472 0.0405 -0.267 -0.3550 -0.226
## cvpred -0.255 -0.1282 -0.30 -0.0290 -0.0457 -0.247 -0.3836 -0.276
## tds -0.466 -1.3103 -1.31 0.2088 0.5464 -0.973 0.0401 -0.804
## CV residual -0.211 -1.1820 -1.01 0.2379 0.5921 -0.726 0.4236 -0.528
##
## Sum of squares = 41.8 Mean square = 0.87 n = 48
##
## fold 3
## Observations in test set: 47
## 2 6 25 29 34 43 44 48
## Predicted 0.303 0.0805 -0.147 0.163 -0.0559 0.111 0.290 0.0895
## cvpred 0.411 0.0995 -0.143 0.143 -0.0933 0.166 0.407 0.1511
## tds 1.897 -0.2975 0.378 -0.635 -0.8039 -0.298 -1.479 -1.1415
## CV residual 1.485 -0.3971 0.521 -0.778 -0.7106 -0.464 -1.886 -1.2925
## 50 51 55 59 60 64 76 79
## Predicted 0.0211 0.207 -0.0450 -0.0284 0.0252 0.535 -0.0157 -0.0316
## cvpred -0.0439 0.257 -0.0395 -0.0682 0.0493 0.641 -0.0351 0.0179
## tds 1.0528 1.053 -0.9727 0.3776 -0.4663 1.390 -0.8039 1.0528
## CV residual 1.0967 0.795 -0.9332 0.4458 -0.5156 0.749 -0.7688 1.0349
## 86 87 90 94 101 106 107 108
## Predicted 0.172 0.164 0.0967 0.0256 0.575 0.288 0.0388 -0.129
## cvpred 0.214 0.167 0.1203 0.1284 0.865 0.467 0.1083 -0.102
## tds -0.298 -0.973 -1.3103 0.5464 0.546 -1.310 0.3776 -0.298
## CV residual -0.511 -1.139 -1.4306 0.4180 -0.319 -1.777 0.2693 -0.195
## 112 123 124 134 137 148 157 177
## Predicted 0.4582 -0.0841 -0.1029 -0.0624 -0.153 0.122 -0.0563 -0.0338
## cvpred 0.5785 -0.0629 -0.0661 -0.0406 -0.156 0.184 -0.0709 0.0302
## tds 0.0401 0.7152 0.0401 -0.6351 -0.804 -0.635 0.3776 -0.1287
## CV residual -0.5384 0.7781 0.1061 -0.5945 -0.648 -0.819 0.4485 -0.1590
## 179 180 181 185 192 194 197 200
## Predicted -0.296 -0.1171 0.0569 0.658 -0.0460 -0.1445 -0.0053 -0.0629
## cvpred -0.234 -0.0712 0.1364 1.058 0.0274 -0.1490 -0.0157 -0.0689
## tds -0.973 1.0528 0.2088 -0.804 0.2088 -0.1287 0.3776 1.0528
## CV residual -0.739 1.1240 0.0724 -1.862 0.1815 0.0202 0.3933 1.1217
## 208 215 216 217 233 236 238
## Predicted -0.224 -0.181 -0.301 -0.205 -0.09500 -0.234 -0.2364
## cvpred -0.227 -0.185 -0.351 -0.172 -0.00905 -0.203 -0.1813
## tds -0.466 -1.310 1.222 -0.466 -0.12874 -1.648 -0.1287
## CV residual -0.239 -1.125 1.572 -0.294 -0.11968 -1.445 0.0526
##
## Sum of squares = 37 Mean square = 0.79 n = 47
##
## fold 4
## Observations in test set: 47
## 9 10 11 13 16 22 27 28
## Predicted 0.0305 -0.0732 0.3283 0.110 -0.101 0.200 -0.012894 0.0697
## cvpred -0.0195 -0.0885 0.3188 0.136 -0.142 0.184 -0.000759 0.0427
## tds 0.7152 0.2088 0.0401 -0.973 -1.479 2.741 0.208839 -0.1287
## CV residual 0.7347 0.2974 -0.2787 -1.108 -1.337 2.556 0.209598 -0.1714
## 31 32 38 49 53 58 63 67 74
## Predicted 0.0404 0.0220 0.255 -0.185 -0.1432 0.249 0.492 0.0410 0.4807
## cvpred 0.0238 0.0644 0.250 -0.268 -0.1657 0.253 0.479 0.0145 0.5340
## tds 1.3904 -0.4663 1.728 -1.817 -0.1287 0.378 1.728 0.3776 0.0401
## CV residual 1.3666 -0.5307 1.478 -1.548 0.0369 0.125 1.249 0.3632 -0.4939
## 84 92 93 95 97 104 105 109
## Predicted -0.288 0.223 0.0950 0.1042 0.0236 0.241 0.1050 0.0215
## cvpred -0.323 0.213 0.0708 0.1132 0.0252 0.250 0.1121 -0.0358
## tds 0.546 -1.985 1.5591 0.0401 -0.9727 0.378 0.2088 1.0528
## CV residual 0.870 -2.199 1.4883 -0.0732 -0.9979 0.127 0.0968 1.0885
## 110 113 117 120 125 127 130 143
## Predicted 0.212 -0.0093 0.120 -0.0439 0.110 0.224 0.2621 -0.0490
## cvpred 0.168 -0.0603 0.125 -0.0426 0.109 0.254 0.2689 -0.0729
## tds 0.546 -0.9727 -1.479 -0.9727 -0.298 1.728 0.2088 1.7279
## CV residual 0.379 -0.9124 -1.604 -0.9300 -0.406 1.473 -0.0601 1.8008
## 145 146 162 182 189 195 196 209
## Predicted -0.133 -0.0771 0.0587 -0.274 -0.4575 -0.149 -0.207 -0.145
## cvpred -0.186 -0.0801 0.0227 -0.362 -0.5670 -0.209 -0.276 -0.151
## tds 0.884 -0.6351 -0.9727 0.209 -0.6351 -0.635 -0.635 -0.804
## CV residual 1.070 -0.5550 -0.9954 0.571 -0.0681 -0.426 -0.359 -0.653
## 212 223 226 228 232 237
## Predicted -0.214 -0.372 -0.177 -0.27224 -0.611 -0.169
## cvpred -0.243 -0.458 -0.220 -0.30474 -0.722 -0.224
## tds 1.053 1.053 0.546 -0.29752 1.053 -0.804
## CV residual 1.296 1.511 0.766 0.00722 1.775 -0.580
##
## Sum of squares = 50.6 Mean square = 1.08 n = 47
##
## fold 5
## Observations in test set: 47
## 1 8 12 19 20 21 23 30
## Predicted -0.00162 0.0356 0.381 -0.0636 0.0436 0.282 -0.0278 0.401
## cvpred 0.00413 0.0317 0.414 -0.0717 0.0011 0.277 -0.0340 0.459
## tds 2.40308 1.3904 -0.466 0.0401 -0.2975 -0.635 1.5591 2.909
## CV residual 2.39895 1.3586 -0.880 0.1118 -0.2986 -0.913 1.5932 2.450
## 40 54 57 61 68 72 81 82
## Predicted 0.311 0.201 0.316 0.231 0.01923 -0.0524 0.168 0.416
## cvpred 0.317 0.197 0.357 0.259 0.03028 -0.0671 0.239 0.502
## tds -0.129 -0.298 1.222 -0.635 0.04005 1.0528 -0.635 -1.479
## CV residual -0.445 -0.494 0.865 -0.894 0.00977 1.1199 -0.874 -1.981
## 91 96 98 111 115 129 132 135
## Predicted 0.476 -0.0974 0.0412 -0.0191 0.0895 0.102 -0.205 0.0816
## cvpred 0.546 -0.1169 0.0292 0.0257 0.1077 0.104 -0.229 0.1683
## tds -1.648 -0.1287 0.8840 0.0401 -0.8039 -1.141 -0.129 -0.6351
## CV residual -2.194 -0.0118 0.8548 0.0144 -0.9116 -1.245 0.100 -0.8034
## 140 142 150 151 154 155 160 167
## Predicted 0.0403 -0.104 0.138 0.134 -0.255 0.118 0.1186 -0.0755
## cvpred 0.0532 -0.104 0.156 0.142 -0.256 0.111 0.1318 -0.0779
## tds -0.1287 -0.298 2.572 -0.129 -1.310 1.897 0.0401 -1.1415
## CV residual -0.1819 -0.194 2.416 -0.271 -1.055 1.786 -0.0918 -1.0635
## 168 169 186 190 202 203 206 210
## Predicted -0.05554 -0.286 -0.2106 0.0807 -0.134 0.236 -0.161 -0.00560
## cvpred -0.00739 -0.292 -0.1953 0.0731 -0.148 0.278 -0.162 -0.00714
## tds -0.46631 -1.141 -0.1287 -0.8039 -0.804 1.222 -0.804 1.55914
## CV residual -0.45892 -0.850 0.0666 -0.8770 -0.656 0.943 -0.642 1.56628
## 218 219 222 227 229 235 240
## Predicted 0.0545 -0.452 -0.48 -0.299 -0.29245 -0.262 -0.506
## cvpred 0.0603 -0.460 -0.54 -0.304 -0.29901 -0.286 -0.496
## tds -0.6351 -0.635 -1.31 0.546 -0.29752 -0.973 0.378
## CV residual -0.6954 -0.175 -0.77 0.850 0.00148 -0.687 0.874
##
## Sum of squares = 54.8 Mean square = 1.16 n = 47
##
## Overall (Sum over all 47 folds)
## ms
## 0.979