library(dplyr)
library(Metrics)
library(MLmetrics)
library(leaps)
library(car)
library(MASS)
library(tidyverse)
library(caret)
train_raw <- read.csv("https://raw.githubusercontent.com/akarimhammoud/Data_621/main/Assignment_1/data/moneyball-training-data.csv")
evaluation_raw <- read.csv("https://raw.githubusercontent.com/akarimhammoud/Data_621/main/Assignment_1/data/moneyball-evaluation-data.csv")
# Density plots of each raw variable
train_raw %>%
  gather(variable, value, TARGET_WINS:TEAM_FIELDING_DP) %>%
  ggplot(., aes(value)) +
  geom_density(fill = "#3A8B63", color = "#3A8B63") +
  facet_wrap(~variable, scales = "free", ncol = 4) +
  labs(x = element_blank(), y = element_blank())
# Log-transform every column to inspect how the distributions change
# (zero values become -Inf and are dropped from the density plots)
train_log <- log(train_raw)
train_log %>%
  gather(variable, value, TARGET_WINS:TEAM_FIELDING_DP) %>%
  ggplot(., aes(value)) +
  geom_density(fill = "#3A8B63", color = "#3A8B63") +
  facet_wrap(~variable, scales = "free", ncol = 4) +
  labs(x = element_blank(), y = element_blank())
# Add log-transformed versions of selected skewed predictors
train_raw$TEAM_BASERUN_CS_log <- log(train_raw$TEAM_BASERUN_CS)
train_raw$TEAM_BASERUN_SB_log <- log(train_raw$TEAM_BASERUN_SB)
train_raw$TEAM_BATTING_3B_log <- log(train_raw$TEAM_BATTING_3B)
train_raw$TEAM_BATTING_H_log <- log(train_raw$TEAM_BATTING_H)
train_raw$TEAM_PITCHING_BB_log <- log(train_raw$TEAM_PITCHING_BB)
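One caveat on these raw log transforms: if any of these counts are zero, log() returns -Inf, which lm() cannot handle. A minimal sketch of a guard (an illustration, not part of the original analysis) using base R's log1p():
# A sketch: log1p(x) = log(1 + x) maps zero counts to zero instead of -Inf
train_raw$TEAM_BASERUN_CS_log1p <- log1p(train_raw$TEAM_BASERUN_CS)
summary(train_raw$TEAM_BASERUN_CS_log1p)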
# Review NA counts
colSums(is.na(train_raw))
## INDEX TARGET_WINS TEAM_BATTING_H
## 0 0 0
## TEAM_BATTING_2B TEAM_BATTING_3B TEAM_BATTING_HR
## 0 0 0
## TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## 0 102 131
## TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H
## 772 2085 0
## TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO
## 0 0 102
## TEAM_FIELDING_E TEAM_FIELDING_DP TEAM_BASERUN_CS_log
## 0 286 772
## TEAM_BASERUN_SB_log TEAM_BATTING_3B_log TEAM_BATTING_H_log
## 131 0 0
## TEAM_PITCHING_BB_log
## 0
# Remove NAs
train_no_na <- na.omit(train_raw)
evaluation_no_na <- na.omit(evaluation_raw)
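Note that na.omit() performs listwise deletion, so every row missing any value is dropped; since TEAM_BATTING_HBP alone is missing in 2,085 rows, only a small subset of complete cases survives. A sketch of one alternative, assuming we preferred to keep more rows, would drop that mostly-missing column before removing incomplete rows:
# A sketch (not the approach used here): drop the mostly-missing
# TEAM_BATTING_HBP column first, then remove incomplete rows
train_alt <- train_raw %>%
  dplyr::select(-TEAM_BATTING_HBP) %>%
  na.omit()
nrow(train_alt)  # retains far more rows than na.omit() on the full data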
# Confirm NAs were removed
colSums(is.na(train_no_na))
## INDEX TARGET_WINS TEAM_BATTING_H
## 0 0 0
## TEAM_BATTING_2B TEAM_BATTING_3B TEAM_BATTING_HR
## 0 0 0
## TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## 0 0 0
## TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H
## 0 0 0
## TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO
## 0 0 0
## TEAM_FIELDING_E TEAM_FIELDING_DP TEAM_BASERUN_CS_log
## 0 0 0
## TEAM_BASERUN_SB_log TEAM_BATTING_3B_log TEAM_BATTING_H_log
## 0 0 0
## TEAM_PITCHING_BB_log
## 0
colSums(is.na(evaluation_no_na))
## INDEX TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 0 0 0 0
## TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## 0 0 0 0
## TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## 0 0 0 0
## TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 0 0 0 0
# Split the complete cases into 75% training and 25% test sets
# (note: no seed is set before this call, so the split is not reproducible)
train_all <- train_no_na %>% dplyr::sample_frac(.75)
test <- dplyr::anti_join(train_no_na, train_all, by = 'INDEX')
# Restrict the training set to the candidate predictors (including log versions)
train <- train_all %>% dplyr::select(TARGET_WINS, TEAM_BATTING_2B, TEAM_BATTING_BB,
                                     TEAM_FIELDING_DP, TEAM_BASERUN_CS_log, TEAM_BASERUN_SB_log,
                                     TEAM_BATTING_3B_log, TEAM_BATTING_H_log, TEAM_PITCHING_BB_log)
# Source: http://www.sthda.com/english/articles/37-model-selection-essentials-in-r/154-stepwise-regression-essentials-in-r/
full.model <- lm(TARGET_WINS ~ ., data = train)
# Stepwise selection in both directions, minimizing AIC
step.model <- stepAIC(full.model, direction = "both", trace = FALSE)
summary(step.model)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_FIELDING_DP + TEAM_BATTING_H_log +
## TEAM_PITCHING_BB_log, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.7211 -6.3104 -0.6636 5.4361 26.1548
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -844.44517 110.37862 -7.650 3.02e-12 ***
## TEAM_FIELDING_DP -0.14480 0.04214 -3.436 0.000779 ***
## TEAM_BATTING_H_log 100.19739 15.04360 6.660 5.86e-10 ***
## TEAM_PITCHING_BB_log 34.36205 5.51378 6.232 5.16e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.139 on 139 degrees of freedom
## Multiple R-squared: 0.45, Adjusted R-squared: 0.4381
## F-statistic: 37.91 on 3 and 139 DF, p-value: < 2.2e-16
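To confirm that the stepwise result improves on the full model, the two AIC values can be compared directly; a quick sketch using base R's AIC():
# A sketch: lower AIC indicates a better fit-complexity trade-off
AIC(full.model, step.model)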
# Set seed for reproducibility
set.seed(212)
# Set up 10-fold cross-validation
train.control <- trainControl(method = "cv", number = 10)
# Train backward-selection models with subset sizes from 1 to 8
leapBackward <- train(TARGET_WINS ~ ., data = train,
                      method = "leapBackward",
                      tuneGrid = data.frame(nvmax = 1:8),
                      trControl = train.control)
leapBackward$results
## nvmax RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 1 11.418784 0.1567622 9.646430 1.128261 0.09809412 1.238872
## 2 2 9.571039 0.4128120 7.817567 0.805867 0.12537906 1.013979
## 3 3 9.204949 0.4371151 7.330719 1.163279 0.15343859 1.206055
## 4 4 9.207068 0.4316075 7.369890 1.132334 0.14489412 1.161031
## 5 5 9.351063 0.4182177 7.458009 1.232524 0.14458205 1.222477
## 6 6 9.393950 0.4132687 7.483133 1.215765 0.13940467 1.215068
## 7 7 9.400719 0.4125055 7.502143 1.231552 0.13979147 1.224542
## 8 8 9.395844 0.4126798 7.496143 1.224118 0.13959971 1.214276
# Set seed for reproducibility
set.seed(212)
# Set up 10-fold cross-validation
train.control <- trainControl(method = "cv", number = 10)
# Train forward-selection models with subset sizes from 1 to 8
leapForward.model <- train(TARGET_WINS ~ ., data = train,
                           method = "leapForward",
                           tuneGrid = data.frame(nvmax = 1:8),
                           trControl = train.control)
leapForward.model$results
## nvmax RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 1 11.433324 0.1551810 9.656371 1.125869 0.09853328 1.234042
## 2 2 9.581236 0.4105748 7.825211 0.800466 0.12466907 1.015640
## 3 3 9.203149 0.4370308 7.338801 1.144175 0.15000705 1.203480
## 4 4 9.156756 0.4399732 7.357571 1.153591 0.14766024 1.181754
## 5 5 9.345103 0.4189496 7.462387 1.217403 0.14068710 1.218021
## 6 6 9.392496 0.4133375 7.482773 1.217006 0.13946952 1.215527
## 7 7 9.400719 0.4125055 7.502143 1.231552 0.13979147 1.224542
## 8 8 9.395844 0.4126798 7.496143 1.224118 0.13959971 1.214276
# Set seed for reproducibility
set.seed(212)
# Set up 10-fold cross-validation
train.control <- trainControl(method = "cv", number = 10)
# Train stepwise (sequential) selection models with subset sizes from 1 to 8
leapSeq.model <- train(TARGET_WINS ~ ., data = train,
                       method = "leapSeq",
                       tuneGrid = data.frame(nvmax = 1:8),
                       trControl = train.control)
leapSeq.model$results
## nvmax RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 1 11.433324 0.1551810 9.656371 1.1258688 0.09853328 1.234042
## 2 2 9.584091 0.4107088 7.824346 0.8028206 0.12486219 1.015516
## 3 3 9.214034 0.4358553 7.328418 1.1585758 0.15227136 1.198312
## 4 4 9.164681 0.4386858 7.355588 1.1660375 0.15051225 1.178818
## 5 5 9.265945 0.4376608 7.334581 1.3388883 0.15898456 1.307325
## 6 6 9.736159 0.3761968 7.866973 1.4263203 0.18607494 1.402325
## 7 7 9.353708 0.4169747 7.463643 1.2032888 0.13563009 1.226695
## 8 8 9.395844 0.4126798 7.496143 1.2241179 0.13959971 1.214276
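Rather than reading the best subset size off the results tables, caret stores the nvmax value that minimized cross-validated RMSE in each fit's bestTune element; a quick sketch:
# A sketch: retrieve the winning subset size from each search
leapBackward$bestTune       # backward selection
leapForward.model$bestTune  # forward selection
leapSeq.model$bestTune      # sequential selection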
summary(leapForward.model$finalModel)
## Subset selection object
## 8 Variables (and intercept)
## Forced in Forced out
## TEAM_BATTING_2B FALSE FALSE
## TEAM_BATTING_BB FALSE FALSE
## TEAM_FIELDING_DP FALSE FALSE
## TEAM_BASERUN_CS_log FALSE FALSE
## TEAM_BASERUN_SB_log FALSE FALSE
## TEAM_BATTING_3B_log FALSE FALSE
## TEAM_BATTING_H_log FALSE FALSE
## TEAM_PITCHING_BB_log FALSE FALSE
## 1 subsets of each size up to 4
## Selection Algorithm: forward
## TEAM_BATTING_2B TEAM_BATTING_BB TEAM_FIELDING_DP TEAM_BASERUN_CS_log
## 1 ( 1 ) " " " " " " " "
## 2 ( 1 ) " " "*" " " " "
## 3 ( 1 ) " " "*" "*" " "
## 4 ( 1 ) " " "*" "*" " "
## TEAM_BASERUN_SB_log TEAM_BATTING_3B_log TEAM_BATTING_H_log
## 1 ( 1 ) " " " " "*"
## 2 ( 1 ) " " " " "*"
## 3 ( 1 ) " " " " "*"
## 4 ( 1 ) " " "*" "*"
## TEAM_PITCHING_BB_log
## 1 ( 1 ) " "
## 2 ( 1 ) " "
## 3 ( 1 ) " "
## 4 ( 1 ) " "
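Note that the cross-validated forward search favored a four-variable subset (TEAM_BATTING_BB, TEAM_FIELDING_DP, TEAM_BATTING_3B_log, and TEAM_BATTING_H_log), while the model fit below uses all eight candidates. As a sketch, the coefficients of just that subset can be extracted from the underlying leaps::regsubsets object with coef():
# A sketch: coefficients of the best four-variable subset
coef(leapForward.model$finalModel, id = 4)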
# Fit a comparison model using all eight candidate predictors
automated_forward_model <- lm(TARGET_WINS ~ TEAM_BATTING_2B + TEAM_BATTING_BB + TEAM_FIELDING_DP +
                                TEAM_BASERUN_CS_log + TEAM_BASERUN_SB_log + TEAM_BATTING_3B_log +
                                TEAM_BATTING_H_log + TEAM_PITCHING_BB_log, data = train)
# View summary
summary(automated_forward_model)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_2B + TEAM_BATTING_BB +
## TEAM_FIELDING_DP + TEAM_BASERUN_CS_log + TEAM_BASERUN_SB_log +
## TEAM_BATTING_3B_log + TEAM_BATTING_H_log + TEAM_PITCHING_BB_log,
## data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.4828 -6.4307 -0.3264 5.5139 25.2982
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -892.15522 354.56965 -2.516 0.01304 *
## TEAM_BATTING_2B -0.02572 0.03669 -0.701 0.48449
## TEAM_BATTING_BB 0.00463 0.11325 0.041 0.96745
## TEAM_FIELDING_DP -0.13765 0.04375 -3.147 0.00204 **
## TEAM_BASERUN_CS_log -0.13465 3.23281 -0.042 0.96684
## TEAM_BASERUN_SB_log 0.04798 3.20419 0.015 0.98808
## TEAM_BATTING_3B_log -3.65456 2.86501 -1.276 0.20431
## TEAM_BATTING_H_log 112.62799 19.56881 5.755 5.61e-08 ***
## TEAM_PITCHING_BB_log 30.17486 61.16897 0.493 0.62261
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.233 on 134 degrees of freedom
## Multiple R-squared: 0.4588, Adjusted R-squared: 0.4265
## F-statistic: 14.2 on 8 and 134 DF, p-value: 7.598e-15
# Fit the three-predictor model
automated_aic_model <- lm(TARGET_WINS ~ TEAM_BATTING_BB + TEAM_FIELDING_DP + TEAM_BATTING_H_log,
                          data = train)
# View summary
summary(automated_aic_model)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_BB + TEAM_FIELDING_DP +
## TEAM_BATTING_H_log, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.8317 -6.1951 -0.7919 5.4379 25.2246
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -664.86037 109.34383 -6.080 1.09e-08 ***
## TEAM_BATTING_BB 0.06353 0.01021 6.225 5.34e-09 ***
## TEAM_FIELDING_DP -0.14433 0.04216 -3.424 0.000813 ***
## TEAM_BATTING_H_log 100.45957 15.04121 6.679 5.33e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.141 on 139 degrees of freedom
## Multiple R-squared: 0.4497, Adjusted R-squared: 0.4378
## F-statistic: 37.87 on 3 and 139 DF, p-value: < 2.2e-16
# Make predictions on the test set
automated_model_predictions <- predict(automated_aic_model, test)
# RMSE between actuals and predictions
rmse(test$TARGET_WINS, automated_model_predictions)
## [1] 10.39413
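Because RMSE penalizes large errors more heavily than small ones, mean absolute error is a useful complementary check; a sketch using Metrics::mae():
# A sketch: mean absolute error on the same test-set predictions
mae(test$TARGET_WINS, automated_model_predictions)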
# Fit the backward-elimination model (the duplicate TEAM_BASERUN_SB term in the
# original formula has been removed; lm() ignores duplicates, so the fit is unchanged)
backward_model <- lm(TARGET_WINS ~ TEAM_BASERUN_SB + TEAM_BATTING_HR + TEAM_BATTING_BB +
                       TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP, data = train_all)
# View summary
summary(backward_model)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BASERUN_SB + TEAM_BATTING_HR +
## TEAM_BATTING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E +
## TEAM_FIELDING_DP, data = train_all)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.0352 -6.6163 0.2394 4.4358 21.4412
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 102.318031 11.662155 8.774 6.37e-15 ***
## TEAM_BASERUN_SB 0.036583 0.025051 1.460 0.146497
## TEAM_BATTING_HR 0.103014 0.027202 3.787 0.000228 ***
## TEAM_BATTING_BB 0.068761 0.010764 6.388 2.48e-09 ***
## TEAM_PITCHING_SO -0.039388 0.007718 -5.103 1.10e-06 ***
## TEAM_FIELDING_E -0.167432 0.046873 -3.572 0.000491 ***
## TEAM_FIELDING_DP -0.137666 0.040598 -3.391 0.000912 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.575 on 136 degrees of freedom
## Multiple R-squared: 0.5263, Adjusted R-squared: 0.5054
## F-statistic: 25.18 on 6 and 136 DF, p-value: < 2.2e-16
# Make predictions on the test set
backward_model_predictions <- predict(backward_model, test)
# RMSE between actuals and predictions
rmse(test$TARGET_WINS, backward_model_predictions)
## [1] 8.809609
# Fit the forward-selection model
forward_model <- lm(TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_FIELDING_E + TEAM_FIELDING_DP +
                      TEAM_PITCHING_HR + TEAM_PITCHING_SO, data = train_all)
# View summary
summary(forward_model)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB +
## TEAM_FIELDING_E + TEAM_FIELDING_DP + TEAM_PITCHING_HR + TEAM_PITCHING_SO,
## data = train_all)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.2848 -6.2226 0.3854 4.2928 22.9671
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 51.762312 21.870949 2.367 0.019357 *
## TEAM_BATTING_H 0.034081 0.011436 2.980 0.003414 **
## TEAM_BATTING_BB 0.067016 0.010537 6.360 2.84e-09 ***
## TEAM_FIELDING_E -0.136248 0.046100 -2.955 0.003681 **
## TEAM_FIELDING_DP -0.146971 0.039051 -3.764 0.000248 ***
## TEAM_PITCHING_HR 0.061382 0.028474 2.156 0.032866 *
## TEAM_PITCHING_SO -0.030010 0.008316 -3.609 0.000432 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.373 on 136 degrees of freedom
## Multiple R-squared: 0.5483, Adjusted R-squared: 0.5283
## F-statistic: 27.51 on 6 and 136 DF, p-value: < 2.2e-16
# Make predictions on the test set
forward_model_predictions <- predict(forward_model, test)
# RMSE between actuals and predictions
rmse(test$TARGET_WINS, forward_model_predictions)
## [1] 9.050247
# Assumption: no multicollinearity (VIF under 5)
vif(forward_model)
## TEAM_BATTING_H TEAM_BATTING_BB TEAM_FIELDING_E TEAM_FIELDING_DP
## 1.541029 1.304365 1.236201 1.026892
## TEAM_PITCHING_HR TEAM_PITCHING_SO
## 1.563715 1.647976
# Assumption: mean of residuals is zero
mean(residuals(forward_model))
## [1] -5.071943e-17
# Assumption: homoscedasticity of residuals (diagnostic plots)
par(mfrow = c(2, 2))  # display all four diagnostic plots in one panel
plot(forward_model)
# Assumption: no autocorrelation of residuals
acf(residuals(forward_model), lag.max = 20)
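The visual checks above can be supplemented with formal tests from the already-loaded car package; a sketch, assuming the usual 0.05 significance threshold:
# A sketch: ncvTest() tests for non-constant error variance
# (homoscedasticity); durbinWatsonTest() tests for first-order
# autocorrelation of the residuals
ncvTest(forward_model)
durbinWatsonTest(forward_model)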
Before fully evaluating the candidate models, we first verified that every individual predictor had a p-value below 0.05, the cutoff for a 95% confidence level, and that each model's F-statistic was likewise significant at that level.
The two primary statistics used to choose our final model were adjusted R-squared and root mean square error (RMSE). Adjusted R-squared, like R-squared, measures the share of variation in the dependent variable explained by the independent variables, but applies a correction so that only predictors with genuine explanatory power raise the statistic. RMSE was even more central to model selection: it is the standard deviation of the residuals, effectively a measure of accuracy in the same units as the response variable. To ensure the model generalizes to unobserved data, we calculated RMSE on the held-out test set.
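For reference, the test-set RMSE reported above is simply the square root of the mean squared prediction error; a one-line sketch of what Metrics::rmse() computes:
# A sketch: RMSE computed by hand for the forward-selection model
sqrt(mean((test$TARGET_WINS - forward_model_predictions)^2))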
Our two top models, forward selection and backward elimination, produced comparable test-set RMSEs (9.05 and 8.81, respectively). We chose the forward selection model for its higher adjusted R-squared (0.53 vs. 0.51). Since both top-performing models use six predictors, parsimony did not distinguish them.
Lastly, we verified that the forward selection model meets the OLS regression assumptions: no significant multicollinearity, a mean of residuals equal to zero, homoscedasticity of residuals, and no significant autocorrelation. We judged all assumptions to be met, though we note a slight pattern in the residuals vs. fitted plot (the homoscedasticity check above) that may indicate a mild nonlinear trend.
Bhandari, Aniruddha. "Key Difference between R-squared and Adjusted R-squared for Regression Analysis." Analytics Vidhya, 2020. https://www.analyticsvidhya.com/blog/2020/07/difference-between-r-squared-and-adjusted-r-squared/
Glen, Stephanie. "RMSE: Root Mean Square Error." StatisticsHowTo.com. https://www.statisticshowto.com/probability-and-statistics/regression-analysis/rmse-root-mean-square-error/
Gupta, Aryansh. "Linear Regression Assumptions and Diagnostics in R." RPubs. https://rpubs.com/aryn999/LinearRegressionAssumptionsAndDiagnosticsInR
Kim, Bommae. "Understanding Diagnostic Plots for Linear Regression Analysis." University of Virginia Library. https://data.library.virginia.edu/diagnostic-plots/