library(dplyr)
library(Metrics)
library(MLmetrics)
library(leaps)
library(car)
library(MASS)

library(tidyverse)
library(caret)
library(leaps)

Load the Data

train_raw <-read.csv("https://raw.githubusercontent.com/akarimhammoud/Data_621/main/Assignment_1/data/moneyball-training-data.csv")

evaluation_raw <-read.csv("https://raw.githubusercontent.com/akarimhammoud/Data_621/main/Assignment_1/data/moneyball-evaluation-data.csv")

Variable Distributions

train_raw %>%
  gather(variable, value, TARGET_WINS:TEAM_FIELDING_DP) %>%
  ggplot(., aes(value)) + 
  geom_density(fill = "#3A8B63", color="#3A8B63") + 
  facet_wrap(~variable, scales ="free", ncol = 4) +
  labs(x = element_blank(), y = element_blank())

Log Variable Distributions

train_log <- log(train_raw)

train_log %>%
  gather(variable, value, TARGET_WINS:TEAM_FIELDING_DP) %>%
  ggplot(., aes(value)) + 
  geom_density(fill = "#3A8B63", color="#3A8B63") + 
  facet_wrap(~variable, scales ="free", ncol = 4) +
  labs(x = element_blank(), y = element_blank())

Add log transformed variables to training set, for the variables that appear more normal when log transformed

train_raw$TEAM_BASERUN_CS_log <- log(train_raw$TEAM_BASERUN_CS)
train_raw$TEAM_BASERUN_SB_log <- log(train_raw$TEAM_BASERUN_SB)
train_raw$TEAM_BATTING_3B_log <- log(train_raw$TEAM_BATTING_3B)
train_raw$TEAM_BATTING_H_log <- log(train_raw$TEAM_BATTING_H)
train_raw$TEAM_PITCHING_BB_log <- log(train_raw$TEAM_PITCHING_BB)

Remove NAs

# Review NA counts
colSums(is.na(train_raw))
##                INDEX          TARGET_WINS       TEAM_BATTING_H 
##                    0                    0                    0 
##      TEAM_BATTING_2B      TEAM_BATTING_3B      TEAM_BATTING_HR 
##                    0                    0                    0 
##      TEAM_BATTING_BB      TEAM_BATTING_SO      TEAM_BASERUN_SB 
##                    0                  102                  131 
##      TEAM_BASERUN_CS     TEAM_BATTING_HBP      TEAM_PITCHING_H 
##                  772                 2085                    0 
##     TEAM_PITCHING_HR     TEAM_PITCHING_BB     TEAM_PITCHING_SO 
##                    0                    0                  102 
##      TEAM_FIELDING_E     TEAM_FIELDING_DP  TEAM_BASERUN_CS_log 
##                    0                  286                  772 
##  TEAM_BASERUN_SB_log  TEAM_BATTING_3B_log   TEAM_BATTING_H_log 
##                  131                    0                    0 
## TEAM_PITCHING_BB_log 
##                    0
# Remove NAs
train_no_na <- na.omit(train_raw) 
evaluation_no_na <- na.omit(evaluation_raw) 

# Confirm NAs were removed
colSums(is.na(train_no_na))
##                INDEX          TARGET_WINS       TEAM_BATTING_H 
##                    0                    0                    0 
##      TEAM_BATTING_2B      TEAM_BATTING_3B      TEAM_BATTING_HR 
##                    0                    0                    0 
##      TEAM_BATTING_BB      TEAM_BATTING_SO      TEAM_BASERUN_SB 
##                    0                    0                    0 
##      TEAM_BASERUN_CS     TEAM_BATTING_HBP      TEAM_PITCHING_H 
##                    0                    0                    0 
##     TEAM_PITCHING_HR     TEAM_PITCHING_BB     TEAM_PITCHING_SO 
##                    0                    0                    0 
##      TEAM_FIELDING_E     TEAM_FIELDING_DP  TEAM_BASERUN_CS_log 
##                    0                    0                    0 
##  TEAM_BASERUN_SB_log  TEAM_BATTING_3B_log   TEAM_BATTING_H_log 
##                    0                    0                    0 
## TEAM_PITCHING_BB_log 
##                    0
colSums(is.na(evaluation_no_na))
##            INDEX   TEAM_BATTING_H  TEAM_BATTING_2B  TEAM_BATTING_3B 
##                0                0                0                0 
##  TEAM_BATTING_HR  TEAM_BATTING_BB  TEAM_BATTING_SO  TEAM_BASERUN_SB 
##                0                0                0                0 
##  TEAM_BASERUN_CS TEAM_BATTING_HBP  TEAM_PITCHING_H TEAM_PITCHING_HR 
##                0                0                0                0 
## TEAM_PITCHING_BB TEAM_PITCHING_SO  TEAM_FIELDING_E TEAM_FIELDING_DP 
##                0                0                0                0

Create Train, Test Split

train_all <- train_no_na %>% dplyr::sample_frac(.75)
test  <- dplyr::anti_join(train_no_na, train_all, by = 'INDEX')

Exclude variables that do not meet normality assumption

train <- train_all %>% dplyr::select(TARGET_WINS, TEAM_BATTING_2B, TEAM_BATTING_BB, 
                                             TEAM_FIELDING_DP, TEAM_BASERUN_CS_log, TEAM_BASERUN_SB_log, 
                                             TEAM_BATTING_3B_log, TEAM_BATTING_H_log, TEAM_PITCHING_BB_log)

Automated AIC

# Source: http://www.sthda.com/english/articles/37-model-selection-essentials-in-r/154-stepwise-regression-essentials-in-r/

full.model <- lm(TARGET_WINS ~., data = train)
# Stepwise regression model
step.model <- stepAIC(full.model, direction = "both", 
                      trace = FALSE)
summary(step.model)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_FIELDING_DP + TEAM_BATTING_H_log + 
##     TEAM_PITCHING_BB_log, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.7211  -6.3104  -0.6636   5.4361  26.1548 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -844.44517  110.37862  -7.650 3.02e-12 ***
## TEAM_FIELDING_DP       -0.14480    0.04214  -3.436 0.000779 ***
## TEAM_BATTING_H_log    100.19739   15.04360   6.660 5.86e-10 ***
## TEAM_PITCHING_BB_log   34.36205    5.51378   6.232 5.16e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.139 on 139 degrees of freedom
## Multiple R-squared:   0.45,  Adjusted R-squared:  0.4381 
## F-statistic: 37.91 on 3 and 139 DF,  p-value: < 2.2e-16

Automated leapBackward

# Set seed for reproducibility
set.seed(212)
# Set up repeated k-fold cross-validation
train.control <- trainControl(method = "cv", number = 10)
# Train the model
leapBackward <- train(TARGET_WINS ~., data = train,
                    method = "leapBackward", 
                    tuneGrid = data.frame(nvmax = 1:8),
                    trControl = train.control
                    )
leapBackward$results
##   nvmax      RMSE  Rsquared      MAE   RMSESD RsquaredSD    MAESD
## 1     1 11.418784 0.1567622 9.646430 1.128261 0.09809412 1.238872
## 2     2  9.571039 0.4128120 7.817567 0.805867 0.12537906 1.013979
## 3     3  9.204949 0.4371151 7.330719 1.163279 0.15343859 1.206055
## 4     4  9.207068 0.4316075 7.369890 1.132334 0.14489412 1.161031
## 5     5  9.351063 0.4182177 7.458009 1.232524 0.14458205 1.222477
## 6     6  9.393950 0.4132687 7.483133 1.215765 0.13940467 1.215068
## 7     7  9.400719 0.4125055 7.502143 1.231552 0.13979147 1.224542
## 8     8  9.395844 0.4126798 7.496143 1.224118 0.13959971 1.214276

Automated leapForward

# Set seed for reproducibility
set.seed(212)
# Set up repeated k-fold cross-validation
train.control <- trainControl(method = "cv", number = 10)
# Train the model
leapForward.model <- train(TARGET_WINS ~., data = train,
                    method = "leapForward", 
                    tuneGrid = data.frame(nvmax = 1:8),
                    trControl = train.control
                    )
leapForward.model$results
##   nvmax      RMSE  Rsquared      MAE   RMSESD RsquaredSD    MAESD
## 1     1 11.433324 0.1551810 9.656371 1.125869 0.09853328 1.234042
## 2     2  9.581236 0.4105748 7.825211 0.800466 0.12466907 1.015640
## 3     3  9.203149 0.4370308 7.338801 1.144175 0.15000705 1.203480
## 4     4  9.156756 0.4399732 7.357571 1.153591 0.14766024 1.181754
## 5     5  9.345103 0.4189496 7.462387 1.217403 0.14068710 1.218021
## 6     6  9.392496 0.4133375 7.482773 1.217006 0.13946952 1.215527
## 7     7  9.400719 0.4125055 7.502143 1.231552 0.13979147 1.224542
## 8     8  9.395844 0.4126798 7.496143 1.224118 0.13959971 1.214276

Automated leapSeq

# Set seed for reproducibility
set.seed(212)
# Set up repeated k-fold cross-validation
train.control <- trainControl(method = "cv", number = 10)
# Train the model
leapSeq.model <- train(TARGET_WINS ~., data = train,
                    method = "leapSeq", 
                    tuneGrid = data.frame(nvmax = 1:8),
                    trControl = train.control
                    )
leapSeq.model$results
##   nvmax      RMSE  Rsquared      MAE    RMSESD RsquaredSD    MAESD
## 1     1 11.433324 0.1551810 9.656371 1.1258688 0.09853328 1.234042
## 2     2  9.584091 0.4107088 7.824346 0.8028206 0.12486219 1.015516
## 3     3  9.214034 0.4358553 7.328418 1.1585758 0.15227136 1.198312
## 4     4  9.164681 0.4386858 7.355588 1.1660375 0.15051225 1.178818
## 5     5  9.265945 0.4376608 7.334581 1.3388883 0.15898456 1.307325
## 6     6  9.736159 0.3761968 7.866973 1.4263203 0.18607494 1.402325
## 7     7  9.353708 0.4169747 7.463643 1.2032888 0.13563009 1.226695
## 8     8  9.395844 0.4126798 7.496143 1.2241179 0.13959971 1.214276

Best model out of the automated: leapForward, leapBackward, leapSeq

summary(leapForward.model$finalModel)
## Subset selection object
## 8 Variables  (and intercept)
##                      Forced in Forced out
## TEAM_BATTING_2B          FALSE      FALSE
## TEAM_BATTING_BB          FALSE      FALSE
## TEAM_FIELDING_DP         FALSE      FALSE
## TEAM_BASERUN_CS_log      FALSE      FALSE
## TEAM_BASERUN_SB_log      FALSE      FALSE
## TEAM_BATTING_3B_log      FALSE      FALSE
## TEAM_BATTING_H_log       FALSE      FALSE
## TEAM_PITCHING_BB_log     FALSE      FALSE
## 1 subsets of each size up to 4
## Selection Algorithm: forward
##          TEAM_BATTING_2B TEAM_BATTING_BB TEAM_FIELDING_DP TEAM_BASERUN_CS_log
## 1  ( 1 ) " "             " "             " "              " "                
## 2  ( 1 ) " "             "*"             " "              " "                
## 3  ( 1 ) " "             "*"             "*"              " "                
## 4  ( 1 ) " "             "*"             "*"              " "                
##          TEAM_BASERUN_SB_log TEAM_BATTING_3B_log TEAM_BATTING_H_log
## 1  ( 1 ) " "                 " "                 "*"               
## 2  ( 1 ) " "                 " "                 "*"               
## 3  ( 1 ) " "                 " "                 "*"               
## 4  ( 1 ) " "                 "*"                 "*"               
##          TEAM_PITCHING_BB_log
## 1  ( 1 ) " "                 
## 2  ( 1 ) " "                 
## 3  ( 1 ) " "                 
## 4  ( 1 ) " "

Automated model Leap Forward

# Fit model
automated_forward_model <- lm(formula = TARGET_WINS ~ TEAM_BATTING_2B + TEAM_BATTING_BB + TEAM_FIELDING_DP 
                              + TEAM_BASERUN_CS_log + TEAM_BASERUN_SB_log + TEAM_BATTING_3B_log 
                              + TEAM_BATTING_H_log + TEAM_PITCHING_BB_log, data = train)
# View summary
summary(automated_forward_model)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_2B + TEAM_BATTING_BB + 
##     TEAM_FIELDING_DP + TEAM_BASERUN_CS_log + TEAM_BASERUN_SB_log + 
##     TEAM_BATTING_3B_log + TEAM_BATTING_H_log + TEAM_PITCHING_BB_log, 
##     data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20.4828  -6.4307  -0.3264   5.5139  25.2982 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -892.15522  354.56965  -2.516  0.01304 *  
## TEAM_BATTING_2B        -0.02572    0.03669  -0.701  0.48449    
## TEAM_BATTING_BB         0.00463    0.11325   0.041  0.96745    
## TEAM_FIELDING_DP       -0.13765    0.04375  -3.147  0.00204 ** 
## TEAM_BASERUN_CS_log    -0.13465    3.23281  -0.042  0.96684    
## TEAM_BASERUN_SB_log     0.04798    3.20419   0.015  0.98808    
## TEAM_BATTING_3B_log    -3.65456    2.86501  -1.276  0.20431    
## TEAM_BATTING_H_log    112.62799   19.56881   5.755 5.61e-08 ***
## TEAM_PITCHING_BB_log   30.17486   61.16897   0.493  0.62261    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.233 on 134 degrees of freedom
## Multiple R-squared:  0.4588, Adjusted R-squared:  0.4265 
## F-statistic:  14.2 on 8 and 134 DF,  p-value: 7.598e-15

Automated model Step AIC

# Fit model
automated_aic_model <- lm(formula = TARGET_WINS ~ TEAM_BATTING_BB + TEAM_FIELDING_DP + TEAM_BATTING_H_log
                      , data = train)
# View summary
summary(automated_aic_model)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_BB + TEAM_FIELDING_DP + 
##     TEAM_BATTING_H_log, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.8317  -6.1951  -0.7919   5.4379  25.2246 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -664.86037  109.34383  -6.080 1.09e-08 ***
## TEAM_BATTING_BB       0.06353    0.01021   6.225 5.34e-09 ***
## TEAM_FIELDING_DP     -0.14433    0.04216  -3.424 0.000813 ***
## TEAM_BATTING_H_log  100.45957   15.04121   6.679 5.33e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.141 on 139 degrees of freedom
## Multiple R-squared:  0.4497, Adjusted R-squared:  0.4378 
## F-statistic: 37.87 on 3 and 139 DF,  p-value: < 2.2e-16
# Make predictions on test set
automated_model_predictions = predict(automated_aic_model, test)

# Obtain RMSE between actuals and predicted
rmse(test$TARGET_WINS, automated_model_predictions)
## [1] 10.39413

ORIGINAL MODELS

Model 3: Backward Elimination Model

# Fit model
backward_model <- lm(TARGET_WINS ~ TEAM_BASERUN_SB + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BASERUN_SB 
                     + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP, data = train_all)
# View summary
summary(backward_model)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BASERUN_SB + TEAM_BATTING_HR + 
##     TEAM_BATTING_BB + TEAM_BASERUN_SB + TEAM_PITCHING_SO + TEAM_FIELDING_E + 
##     TEAM_FIELDING_DP, data = train_all)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -23.0352  -6.6163   0.2394   4.4358  21.4412 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      102.318031  11.662155   8.774 6.37e-15 ***
## TEAM_BASERUN_SB    0.036583   0.025051   1.460 0.146497    
## TEAM_BATTING_HR    0.103014   0.027202   3.787 0.000228 ***
## TEAM_BATTING_BB    0.068761   0.010764   6.388 2.48e-09 ***
## TEAM_PITCHING_SO  -0.039388   0.007718  -5.103 1.10e-06 ***
## TEAM_FIELDING_E   -0.167432   0.046873  -3.572 0.000491 ***
## TEAM_FIELDING_DP  -0.137666   0.040598  -3.391 0.000912 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.575 on 136 degrees of freedom
## Multiple R-squared:  0.5263, Adjusted R-squared:  0.5054 
## F-statistic: 25.18 on 6 and 136 DF,  p-value: < 2.2e-16
# Make predictions on test set
backward_model_predictions = predict(backward_model, test)

# Obtain RMSE between actuals and predicted
rmse(test$TARGET_WINS, backward_model_predictions)
## [1] 8.809609

Model 4: Forward Selection Model

# Fit model
foward_model <- lm(TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_FIELDING_E + TEAM_FIELDING_DP
                   + TEAM_PITCHING_HR + TEAM_PITCHING_SO, data = train_all)
# View summary
summary(foward_model)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_BB + 
##     TEAM_FIELDING_E + TEAM_FIELDING_DP + TEAM_PITCHING_HR + TEAM_PITCHING_SO, 
##     data = train_all)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -21.2848  -6.2226   0.3854   4.2928  22.9671 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      51.762312  21.870949   2.367 0.019357 *  
## TEAM_BATTING_H    0.034081   0.011436   2.980 0.003414 ** 
## TEAM_BATTING_BB   0.067016   0.010537   6.360 2.84e-09 ***
## TEAM_FIELDING_E  -0.136248   0.046100  -2.955 0.003681 ** 
## TEAM_FIELDING_DP -0.146971   0.039051  -3.764 0.000248 ***
## TEAM_PITCHING_HR  0.061382   0.028474   2.156 0.032866 *  
## TEAM_PITCHING_SO -0.030010   0.008316  -3.609 0.000432 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.373 on 136 degrees of freedom
## Multiple R-squared:  0.5483, Adjusted R-squared:  0.5283 
## F-statistic: 27.51 on 6 and 136 DF,  p-value: < 2.2e-16
# Make predictions on test set
forward_model_predictions = predict(foward_model, test)

# Obtain RMSE between actuals and predicted
rmse(test$TARGET_WINS, forward_model_predictions)
## [1] 9.050247

Verifying OLS Regression Assumptions

# Assumption: No Multicollinearity (VIF under 5)
vif(foward_model)
##   TEAM_BATTING_H  TEAM_BATTING_BB  TEAM_FIELDING_E TEAM_FIELDING_DP 
##         1.541029         1.304365         1.236201         1.026892 
## TEAM_PITCHING_HR TEAM_PITCHING_SO 
##         1.563715         1.647976
# Assumption: Mean of residuals is zero
mean(residuals(foward_model))
## [1] -5.071943e-17
# Assumption: Homoscedasticity of residuals
plot(foward_model)

# Assumption: No auto-correlation
acf(residuals(foward_model), lags=20)

Model Selection

First, before fully evaluating models we validated that all individual predictors had p-values below 0.05, the cutoff for a 95% confidence level. Additionally, we validated that the models F-statistics were also significant at a 95% confidence level.

Then, the two primary statistics used to choose our final model were adjusted R-squared and root mean square error (RMSE). Adjusted R-squared helped guide model selection since, like R-squared, adjusted R-squared measures the amount of variation in the dependent variable explained by the independent variables, except with a correction to ensure only independent variables with predictive power raise the statistic. RMSE was perhaps even more crucial to model selection as it is the measure of the standard deviation of the residuals, essentially a measure of accuracy in the same units as the response variable. To ensure the model can generalize to unobserved data, we calculated the RMSE on our test set.

Both of our top models–forward selection and backward elimination–saw a RMSE of 8.5. Therefore, we chose the forward selection model due to its slightly higher adjusted R-squared–0.54 vs 0.53. Additionally, since both top performing models included six predictors, parsimony was not a consideration.

Lastly, we verified the forward selection model meets OLS regression assumptions. These included: no significant multicollinearity, the mean of residuals is zero, homoscedasticity of residuals, and no significant auto-correlation. We deemed all assumptions had been met, but note, there is a slight trend in the residuals vs fitted plot (Assumption: Homoscedasticity of residuals) which may indicate a small nonlinear trend.

References

Bhandari, Aniruddha, “Key Difference between R-squared and Adjusted R-squared for Regression Analysis”, Analytics Vidhya, 2020 https://www.analyticsvidhya.com/blog/2020/07/difference-between-r-squared-and-adjusted-r-squared/

Glen., Stephanie “RMSE: Root Mean Square Error”, StatisticsHowTo.com https://www.statisticshowto.com/probability-and-statistics/regression-analysis/rmse-root-mean-square-error/

Gupta, Aryansh, “Linear Regression Assumptions and Diagnostics in R”, RPubs, https://rpubs.com/aryn999/LinearRegressionAssumptionsAndDiagnosticsInR

Kim, Bommae, “Understanding Diagnostic Plots for Linear Regression Analysis”, University of Virginia Library, https://data.library.virginia.edu/diagnostic-plots/