# Load the quarterback statistics
qb_stats <- read.csv("../data/qb_stats.csv")
# Names of the physical and college-career predictor columns
predictors <- c(
  "height", "weight", "age",
  "c_avg_cmpp", "c_rate", "c_pct", "c_avg_inter", "c_avg_tds", "c_avg_yds",
  "c_numyrs", "c_avg_att"
)
college_stats <- qb_stats[predictors]
# Response variable, kept as a one-column data frame so it can be cbind()-ed
ints <- qb_stats["ints"]
# Clean data set: bind response and predictors, drop rows with any NA, then
# standardize every column (the response included) with scale()
data.scaled.no_combine.for_ints <- data.frame(
  scale(
    na.omit(cbind(ints, college_stats))
  )
)
# Full linear model: ints regressed on every predictor
lm.scaled.no_combine.ints <- lm(
  formula = ints ~ .,
  data = data.scaled.no_combine.for_ints
)
# Stepwise AIC search (terms may be dropped or re-added) starting from the
# full model
step_reg.scaled.no_combine.ints <- stepAIC(
  lm.scaled.no_combine.ints,
  direction = "both"
)
## Start: AIC=1.67
## ints ~ height + weight + age + c_avg_cmpp + c_rate + c_pct +
## c_avg_inter + c_avg_tds + c_avg_yds + c_numyrs + c_avg_att
##
## Df Sum of Sq RSS AIC
## - weight 1 0.00 214 -0.33
## - c_avg_tds 1 0.01 214 -0.31
## - c_pct 1 0.03 214 -0.29
## - c_numyrs 1 0.05 214 -0.27
## - c_avg_cmpp 1 0.05 214 -0.27
## - c_rate 1 0.15 214 -0.16
## - c_avg_yds 1 0.43 214 0.15
## - height 1 0.58 214 0.30
## - c_avg_att 1 0.71 214 0.45
## <none> 214 1.67
## - c_avg_inter 1 4.92 219 5.02
## - age 1 5.07 219 5.18
##
## Step: AIC=-0.33
## ints ~ height + age + c_avg_cmpp + c_rate + c_pct + c_avg_inter +
## c_avg_tds + c_avg_yds + c_numyrs + c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_avg_tds 1 0.01 214 -2.31
## - c_pct 1 0.03 214 -2.29
## - c_numyrs 1 0.05 214 -2.27
## - c_avg_cmpp 1 0.05 214 -2.27
## - c_rate 1 0.15 214 -2.16
## - c_avg_yds 1 0.43 214 -1.85
## - c_avg_att 1 0.71 214 -1.55
## - height 1 0.89 215 -1.35
## <none> 214 -0.33
## + weight 1 0.00 214 1.67
## - c_avg_inter 1 5.15 219 3.26
## - age 1 5.24 219 3.36
##
## Step: AIC=-2.31
## ints ~ height + age + c_avg_cmpp + c_rate + c_pct + c_avg_inter +
## c_avg_yds + c_numyrs + c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_pct 1 0.02 214 -4.29
## - c_numyrs 1 0.05 214 -4.26
## - c_avg_cmpp 1 0.07 214 -4.24
## - c_rate 1 0.14 214 -4.16
## - c_avg_yds 1 0.42 214 -3.85
## - c_avg_att 1 0.70 214 -3.55
## - height 1 0.90 215 -3.32
## <none> 214 -2.31
## + c_avg_tds 1 0.01 214 -0.33
## + weight 1 0.00 214 -0.31
## - c_avg_inter 1 5.15 219 1.28
## - age 1 5.24 219 1.38
##
## Step: AIC=-4.29
## ints ~ height + age + c_avg_cmpp + c_rate + c_avg_inter + c_avg_yds +
## c_numyrs + c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_numyrs 1 0.04 214 -6.24
## - c_rate 1 0.16 214 -6.11
## - c_avg_cmpp 1 0.28 214 -5.98
## - c_avg_att 1 0.70 214 -5.52
## - c_avg_yds 1 0.79 214 -5.42
## - height 1 0.88 215 -5.32
## <none> 214 -4.29
## + c_pct 1 0.02 214 -2.31
## + c_avg_tds 1 0.00 214 -2.29
## + weight 1 0.00 214 -2.29
## - c_avg_inter 1 5.59 219 -0.22
## - age 1 5.65 219 -0.15
##
## Step: AIC=-6.24
## ints ~ height + age + c_avg_cmpp + c_rate + c_avg_inter + c_avg_yds +
## c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_rate 1 0.18 214 -8.05
## - c_avg_cmpp 1 0.30 214 -7.92
## - c_avg_att 1 0.66 214 -7.52
## - c_avg_yds 1 0.76 214 -7.41
## - height 1 0.89 215 -7.27
## <none> 214 -6.24
## + c_numyrs 1 0.04 214 -4.29
## + c_pct 1 0.01 214 -4.26
## + c_avg_tds 1 0.01 214 -4.25
## + weight 1 0.00 214 -4.24
## - age 1 5.68 220 -2.08
## - c_avg_inter 1 5.70 220 -2.06
##
## Step: AIC=-8.05
## ints ~ height + age + c_avg_cmpp + c_avg_inter + c_avg_yds +
## c_avg_att
##
## Df Sum of Sq RSS AIC
## - c_avg_cmpp 1 0.22 214 -9.80
## - height 1 0.90 215 -9.06
## - c_avg_att 1 1.46 215 -8.44
## <none> 214 -8.05
## - c_avg_yds 1 2.27 216 -7.57
## + c_rate 1 0.18 214 -6.24
## + c_pct 1 0.06 214 -6.12
## + c_numyrs 1 0.06 214 -6.11
## + c_avg_tds 1 0.00 214 -6.05
## + weight 1 0.00 214 -6.05
## - c_avg_inter 1 5.54 220 -4.03
## - age 1 5.81 220 -3.75
##
## Step: AIC=-9.8
## ints ~ height + age + c_avg_inter + c_avg_yds + c_avg_att
##
## Df Sum of Sq RSS AIC
## - height 1 0.82 215 -10.90
## <none> 214 -9.80
## - c_avg_yds 1 2.11 216 -9.50
## + c_avg_cmpp 1 0.22 214 -8.05
## + c_rate 1 0.10 214 -7.92
## + c_numyrs 1 0.08 214 -7.89
## + c_pct 1 0.01 214 -7.81
## + c_avg_tds 1 0.00 214 -7.80
## + weight 1 0.00 214 -7.80
## - age 1 6.08 220 -5.22
## - c_avg_att 1 6.15 220 -5.14
## - c_avg_inter 1 8.73 223 -2.41
##
## Step: AIC=-10.9
## ints ~ age + c_avg_inter + c_avg_yds + c_avg_att
##
## Df Sum of Sq RSS AIC
## <none> 215 -10.90
## - c_avg_yds 1 2.11 217 -10.61
## + height 1 0.82 214 -9.80
## + weight 1 0.34 215 -9.28
## + c_avg_cmpp 1 0.14 215 -9.06
## + c_rate 1 0.12 215 -9.04
## + c_numyrs 1 0.09 215 -9.00
## + c_avg_tds 1 0.00 215 -8.91
## + c_pct 1 0.00 215 -8.90
## - c_avg_att 1 5.91 221 -6.53
## - age 1 6.19 221 -6.23
## - c_avg_inter 1 8.13 223 -4.18
# Coefficient table and fit statistics for the model selected by stepAIC
summary(step_reg.scaled.no_combine.ints)
##
## Call:
## lm(formula = ints ~ age + c_avg_inter + c_avg_yds + c_avg_att,
## data = data.scaled.no_combine.for_ints)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.2899 -0.7650 0.0146 0.6994 2.4294
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.81e-16 6.31e-02 0.00 1.0000
## age -1.64e-01 6.38e-02 -2.57 0.0107 *
## c_avg_inter 2.65e-01 8.98e-02 2.95 0.0035 **
## c_avg_yds 4.06e-01 2.70e-01 1.50 0.1342
## c_avg_att -7.37e-01 2.93e-01 -2.51 0.0126 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.967 on 230 degrees of freedom
## Multiple R-squared: 0.0812, Adjusted R-squared: 0.0652
## F-statistic: 5.08 on 4 and 230 DF, p-value: 0.000609
# Diagnostic plots for the stepwise-selected linear model
plot(step_reg.scaled.no_combine.ints)
# Best-subsets search over all predictors, recording up to 10 candidate
# models per subset size (nbest = 10)
leaps.scaled.no_combine.ints <- regsubsets(
  ints ~ .,
  data = data.scaled.no_combine.for_ints,
  nbest = 10
)
# Plot R-squared against subset size.
# legend = FALSE: by default subsets() waits for a mouse click to position its
# legend. On a non-interactive device (e.g. a knitted report) no click is
# available, legend() receives empty coordinates, and the call aborts with
# "Error: invalid coordinate lengths" -- which is how this line used to fail.
subsets(leaps.scaled.no_combine.ints, statistic = "rsq", legend = FALSE)
# 5-fold cross-validation of the stepwise-selected model.
# The data frame and the fitted model are passed positionally on purpose: the
# first parameter of cv.lm() is called `df` in older DAAG releases and `data`
# in newer ones, so naming it would tie the script to one DAAG version.
cv.lm(
  data.scaled.no_combine.for_ints,
  step_reg.scaled.no_combine.ints,
  m = 5
)
## Analysis of Variance Table
##
## Response: ints
## Df Sum Sq Mean Sq F value Pr(>F)
## age 1 4.3 4.25 4.55 0.0340 *
## c_avg_inter 1 0.0 0.04 0.04 0.8418
## c_avg_yds 1 8.8 8.80 9.42 0.0024 **
## c_avg_att 1 5.9 5.91 6.32 0.0126 *
## Residuals 230 215.0 0.93
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Warning:
##
## As there is >1 explanatory variable, cross-validation predicted values for
## a fold are not a linear function of corresponding overall predicted
## values. Lines that are shown for the different folds are approximate
##
## fold 1
## Observations in test set: 47
## 3 7 36 39 41 45 46 59
## Predicted -0.449 0.343 -0.179 0.2657 -0.3339 -0.0203 -0.393 -0.125
## cvpred -0.706 0.246 -0.249 0.1880 -0.4012 -0.1371 -0.390 -0.124
## ints 0.986 0.986 -0.486 0.1449 -0.0653 -1.1163 0.145 -0.696
## CV residual 1.691 0.740 -0.236 -0.0431 0.3359 -0.9792 0.535 -0.572
## 71 75 78 84 87 94 96 99
## Predicted -0.0146 -0.302 0.0597 -0.202 -0.241 0.0156 -0.0172 -0.585
## cvpred -0.1329 -0.580 -0.0684 -0.140 -0.455 -0.0282 -0.0972 -1.052
## ints 2.0366 1.406 0.1449 0.145 -1.326 0.5653 -0.6959 0.775
## CV residual 2.1696 1.987 0.2133 0.285 -0.872 0.5935 -0.5986 1.827
## 100 108 110 113 119 128 135 143
## Predicted 0.311 0.0617 0.153 -0.159 -0.124 0.1183 0.0878 0.2284
## cvpred 0.239 -0.0158 0.198 -0.206 -0.267 0.0309 -0.0558 0.0468
## ints 0.565 -0.0653 0.565 -0.696 1.196 1.1959 -0.4857 2.0366
## CV residual 0.326 -0.0495 0.367 -0.490 1.463 1.1650 -0.4298 1.9898
## 148 153 158 160 161 166 174 178
## Predicted 0.0669 -0.0824 -0.042 -0.268 0.1277 -0.0536 0.2379 0.0907
## cvpred -0.0673 -0.0323 -0.227 -0.460 -0.0533 -0.0728 -0.0203 0.0951
## ints 0.1449 0.9857 2.247 0.355 0.3551 0.3551 2.6672 2.4570
## CV residual 0.2122 1.0179 2.474 0.815 0.4084 0.4279 2.6876 2.3619
## 179 188 192 193 201 203 209 211 220
## Predicted 0.316 0.521 0.217 -0.0252 0.1291 0.302 -0.247 0.427 0.0246
## cvpred 0.265 0.436 0.179 -0.1440 0.0347 0.241 -0.462 0.384 -0.0892
## ints 0.355 0.565 0.565 -0.4857 0.7755 0.775 -0.275 -0.906 -0.0653
## CV residual 0.090 0.129 0.386 -0.3417 0.7407 0.534 0.187 -1.290 0.0239
## 221 222 232 235 237 240
## Predicted 0.2084 0.0283 0.529 0.261 0.545 0.0799
## cvpred 0.0372 0.1184 0.524 0.192 0.439 -0.1481
## ints 1.6163 0.3551 0.986 -0.696 0.145 0.7755
## CV residual 1.5791 0.2367 0.462 -0.888 -0.294 0.9235
##
## Sum of squares = 55.2 Mean square = 1.17 n = 47
##
## fold 2
## Observations in test set: 47
## 4 5 14 17 33 42 43 44
## Predicted -0.0798 -0.411 -0.0427 0.168 0.2380 -0.446 -0.0179 -0.322
## cvpred -0.0498 -0.435 0.0337 0.251 0.3367 -0.487 0.0302 -0.342
## ints 0.1449 -1.957 0.5653 1.616 -0.0653 0.986 -1.3265 0.145
## CV residual 0.1947 -1.522 0.5316 1.365 -0.4020 1.473 -1.3566 0.487
## 48 50 51 55 60 66 69 73
## Predicted 0.105 -0.908 -0.174 -0.235 -0.292 0.370 0.0272 -0.3691
## cvpred 0.124 -1.049 -0.177 -0.235 -0.318 0.514 0.0952 -0.4369
## ints -0.906 -0.486 1.196 -0.696 -0.906 -0.906 0.5653 -0.4857
## CV residual -1.030 0.563 1.373 -0.461 -0.588 -1.420 0.4700 -0.0487
## 80 101 102 106 107 112 114 123
## Predicted -0.286 0.1535 -0.4915 0.0384 0.0515 -0.216 -0.203 0.231
## cvpred -0.304 0.2053 -0.5845 0.1125 0.1115 -0.233 -0.209 0.296
## ints -1.116 0.1449 -0.4857 1.1959 -0.6959 -0.906 0.145 0.986
## CV residual -0.812 -0.0604 0.0988 1.0834 -0.8074 -0.674 0.354 0.689
## 126 129 133 138 139 145 150 155
## Predicted 0.108 0.152 0.233 -0.1888 0.0332 0.0898 -0.1003 -0.217
## cvpred 0.168 0.217 0.308 -0.1833 0.0787 0.1236 -0.0841 -0.240
## ints -1.116 -0.696 -0.275 -0.0653 -0.6959 2.2468 -0.0653 -0.906
## CV residual -1.284 -0.913 -0.584 0.1180 -0.7745 2.1232 0.0189 -0.666
## 159 163 164 169 176 177 182 184 185
## Predicted -0.0449 0.522 -0.0689 0.185 0.359 0.202 0.517 0.414 0.689
## cvpred -0.0372 0.627 -0.0796 0.261 0.480 0.278 0.661 0.478 0.854
## ints -1.1163 -1.326 1.8265 -1.116 0.986 0.775 0.775 1.826 -0.906
## CV residual -1.0790 -1.954 1.9061 -1.377 0.506 0.497 0.114 1.348 -1.760
## 189 199 210 213 233 239
## Predicted 0.339 0.351 0.0554 0.255 0.731 0.519
## cvpred 0.439 0.446 0.0783 0.356 0.876 0.661
## ints -1.747 -0.486 0.5653 0.775 2.037 -1.537
## CV residual -2.186 -0.932 0.4870 0.419 1.161 -2.198
##
## Sum of squares = 55.1 Mean square = 1.17 n = 47
##
## fold 3
## Observations in test set: 47
## 2 6 11 25 27 29 31 32
## Predicted -0.138 -0.2275 -0.0942 0.3030 -0.412 -0.424 -0.248 -0.405
## cvpred -0.118 -0.2004 -0.0979 0.2180 -0.354 -0.333 -0.215 -0.365
## ints 1.196 -0.2755 -1.3265 -0.0653 -0.906 -0.906 0.355 -0.696
## CV residual 1.314 -0.0751 -1.2286 -0.2833 -0.552 -0.573 0.570 -0.331
## 47 52 53 58 62 63 64 67
## Predicted -0.250 -0.1240 0.0666 -0.160 -0.0835 -0.473 -0.259 -0.03696
## cvpred -0.237 -0.0832 0.1044 -0.161 -0.0607 -0.382 -0.102 -0.00533
## ints -1.326 -0.2755 -0.4857 0.775 -0.6959 -1.326 0.775 -1.32646
## CV residual -1.090 -0.1923 -0.5900 0.936 -0.6352 -0.944 0.877 -1.32113
## 76 79 85 86 89 90 92 93
## Predicted -0.1160 -0.140 -0.318 -0.354 -0.0411 -0.403 0.0868 -0.0820
## cvpred -0.0957 -0.125 -0.269 -0.208 -0.0272 -0.321 0.1239 -0.0792
## ints -0.9061 0.145 -1.326 -1.326 -1.3265 -1.326 -0.6959 0.9857
## CV residual -0.8104 0.270 -1.058 -1.119 -1.2993 -1.005 -0.8198 1.0649
## 109 120 121 127 136 146 147 152
## Predicted 0.00986 0.1303 -0.431 0.0211 0.387 -0.1202 -0.0610 0.238
## cvpred 0.01700 0.1004 -0.370 0.0528 0.393 -0.0901 -0.0645 0.280
## ints 1.82645 0.1449 -1.747 0.1449 -1.116 1.1959 0.9857 -1.116
## CV residual 1.80946 0.0445 -1.377 0.0921 -1.509 1.2860 1.0501 -1.396
## 165 175 186 191 195 202 206 212 215
## Predicted -0.250 0.291 0.238 0.0215 0.2221 0.3421 0.213 0.219 0.175
## cvpred -0.185 0.291 0.189 0.0624 0.1922 0.4083 0.234 0.262 0.189
## ints 0.775 -0.696 0.986 -1.3265 -0.0653 -0.0653 0.986 0.986 0.565
## CV residual 0.961 -0.987 0.797 -1.3888 -0.2575 -0.4736 0.752 0.724 0.377
## 223 224 225 227 229 230
## Predicted 0.422 0.221 0.004757 0.484 0.199 0.333
## cvpred 0.378 0.179 0.000433 0.421 0.171 0.284
## ints 0.565 0.986 1.616259 2.457 0.355 -1.957
## CV residual 0.187 0.806 1.615826 2.036 0.184 -2.241
##
## Sum of squares = 48.9 Mean square = 1.04 n = 47
##
## fold 4
## Observations in test set: 47
## 8 9 10 13 16 20 22 23
## Predicted -0.290 -0.244 -0.0156 -0.6300 -0.333 -0.135 -0.379 0.0946
## cvpred -0.328 -0.246 -0.0719 -0.7275 -0.429 -0.292 -0.379 0.1871
## ints 0.145 -0.696 0.1449 -0.6959 -0.696 1.196 0.145 -0.2755
## CV residual 0.473 -0.450 0.2168 0.0316 -0.267 1.487 0.524 -0.4625
## 28 38 49 57 61 65 68 72
## Predicted -0.313 -0.5999 0.117 -0.113 0.00467 -0.376 -0.123 0.0196
## cvpred -0.317 -0.6467 0.101 -0.103 0.02995 -0.508 -0.142 0.0221
## ints -1.957 -0.6959 -0.275 0.775 -0.06529 0.775 0.775 -0.0653
## CV residual -1.640 -0.0491 -0.376 0.879 -0.09525 1.283 0.917 -0.0874
## 74 77 81 83 95 97 98 104
## Predicted -0.478 -0.124 0.138 0.00315 -0.0813 -0.194 -0.10012 0.178
## cvpred -0.434 -0.162 0.120 -0.02701 -0.0279 -0.202 0.00447 0.178
## ints -0.275 -1.116 -0.696 0.77548 0.1449 -0.486 -0.48568 1.406
## CV residual 0.159 -0.955 -0.816 0.80249 0.1728 -0.284 -0.49015 1.228
## 105 115 117 122 124 125 131
## Predicted 0.0689 0.0910 -0.374 -0.000329 -0.0554 -0.00804 -0.0831
## cvpred 0.0140 0.0892 -0.390 -0.061464 -0.0658 0.08043 -0.0419
## ints 0.1449 0.9857 -0.696 0.775482 0.1449 -1.32646 0.3551
## CV residual 0.1309 0.8965 -0.305 0.836946 0.2107 -1.40689 0.3970
## 137 144 149 156 157 162 171 187
## Predicted -0.0125 0.0434 0.367 0.0902 -0.0982 0.511 0.386 0.04333
## cvpred 0.0197 0.0840 0.385 0.0952 0.0330 0.465 0.359 0.00684
## ints -0.0653 0.9857 -0.486 0.9857 0.3551 -0.275 1.406 0.35509
## CV residual -0.0850 0.9017 -0.871 0.8905 0.3221 -0.740 1.047 0.34825
## 194 196 197 204 214 218 228 231
## Predicted 0.00333 0.3074 0.0311 -0.1422 0.0614 0.183 0.164 0.237
## cvpred 0.11375 0.1702 0.1202 -0.0234 0.0677 0.280 0.228 0.281
## ints -1.32646 0.1449 -0.4857 -1.7468 0.1449 0.986 -1.116 0.986
## CV residual -1.44021 -0.0253 -0.6059 -1.7235 0.0772 0.706 -1.344 0.704
##
## Sum of squares = 29.7 Mean square = 0.63 n = 47
##
## fold 5
## Observations in test set: 47
## 1 12 15 18 19 21 24 26
## Predicted -0.422 -0.602 -0.179 -0.362 0.195 -0.529 -0.267 0.0135
## cvpred -0.324 -0.476 -0.128 -0.249 0.207 -0.396 -0.171 0.0410
## ints -0.486 -0.906 -0.696 0.355 1.616 -1.116 -0.275 -0.9061
## CV residual -0.162 -0.430 -0.568 0.604 1.409 -0.720 -0.104 -0.9471
## 30 34 35 37 40 54 56 70
## Predicted -0.05459 -0.0362 -0.504 -0.709 -0.553 -0.728 -0.0567 -0.202
## cvpred 0.00302 0.0398 -0.349 -0.518 -0.376 -0.591 -0.0206 -0.106
## ints 1.40606 -0.9061 -1.326 -2.588 -2.377 -0.486 -0.6959 -0.906
## CV residual 1.40304 -0.9458 -0.978 -2.070 -2.001 0.106 -0.6753 -0.800
## 82 91 103 111 116 118 132 134
## Predicted -0.0581 0.0898 0.0013 0.224 0.0375 -0.0365 0.0307 0.2057
## cvpred -0.0263 0.0963 0.0654 0.225 0.0777 0.0341 0.0887 0.2333
## ints -1.3265 0.5653 -1.1163 0.986 -0.6959 -0.9061 -0.0653 -0.0653
## CV residual -1.3001 0.4690 -1.1817 0.760 -0.7736 -0.9401 -0.1540 -0.2986
## 141 142 151 154 167 168 170 172 173
## Predicted -0.0416 0.0745 -0.569 0.112 -0.263 0.206 0.216 0.249 0.147
## cvpred 0.0255 0.1133 -0.409 0.164 -0.145 0.193 0.235 0.256 0.187
## ints 0.3551 0.1449 0.565 0.355 -0.906 -1.116 0.565 0.145 1.406
## CV residual 0.3296 0.0316 0.974 0.192 -0.761 -1.309 0.331 -0.112 1.219
## 180 181 183 190 198 200 205 207
## Predicted 0.266 0.227 0.0867 0.1991 -0.0176 -0.0791 0.291 0.310
## cvpred 0.301 0.258 0.1500 0.2653 0.0672 0.0069 0.321 0.329
## ints 0.986 1.196 -1.3265 0.3551 -1.1163 0.7755 -0.906 -0.696
## CV residual 0.685 0.937 -1.4765 0.0898 -1.1835 0.7686 -1.227 -1.025
## 208 217 219 226 236 238
## Predicted 0.222 0.141 0.703 0.367 0.307 0.611
## cvpred 0.255 0.182 0.665 0.364 0.314 0.565
## ints -0.696 0.355 1.406 -0.906 1.196 -0.275
## CV residual -0.951 0.173 0.741 -1.270 0.882 -0.841
##
## Sum of squares = 40.7 Mean square = 0.87 n = 47
##
## Overall (Sum over all 47 folds)
## ms
## 0.977