1 Setup

# Set working directory and path to data
# NOTE(review): absolute, machine-specific path -- as written the script only
# runs on a machine that has this folder. The relative file names used by the
# read.csv() calls further down are resolved against this directory.
# The variable `cd` is deleted again by the rm(list = ls()) call that follows;
# only the setwd() side effect persists.
cd <- "/Users/arvindsharma/Dropbox/WCAS/Econometrics/"
setwd(cd)

# Clear the workspace
# Resets the interactive session before the analysis starts. The order matters
# only in that rm() runs before gc(), so the memory of removed objects can be
# reclaimed.
  rm(list = ls()) # Clear environment - remove all objects (not files on disk) from the workspace
  invisible(gc()) # Run garbage collection; invisible() hides the memory table gc() returns
  cat("\f")       # Emit a form feed, which RStudio interprets as "clear the console"
  graphics.off()  # Close all open graphics devices (clears all graphs)


# Prepare needed libraries
packages <- c("stargazer",
              "psych",
              "tidyverse",
              "visdat",
              "MASS",    # backward AIC
              "leaps"    # best subset
              )

# Install every package in `packages` that is not available yet, then attach
# all of them. Startup banners are suppressed to keep the output clean.
suppressPackageStartupMessages({

  # Iterate over the names themselves: 1:length(x) yields c(1, 0) for an empty
  # vector and would try to load packages[1] (NA) and packages[0].
  for (pkg in packages) {
    # system.file() returns "" when the package is not installed. This avoids
    # rebuilding the full installed.packages() table on every iteration, which
    # its own documentation advises against for single-package checks.
    if (!nzchar(system.file(package = pkg))) {
      install.packages(pkg,
                       repos = "https://cran.rstudio.com/", # HTTPS instead of plain HTTP
                       dependencies = TRUE
                       )
    }
    library(pkg, character.only = TRUE)
  }

})


# Load the training sample (file name is resolved against the working directory)
df_train <- read.csv(file = "moneyball-training-data.csv")
# df_eval     <- read.csv("moneyball-evaluation-data.csv")

# INDEX is removed so that it is not picked up as a predictor by `~ .` formulas
df_train[["INDEX"]] <- NULL

# Print a compact overview of df_train: row/column counts plus each column's
# type and first few values
glimpse(df_train)
## Rows: 2,276
## Columns: 16
## $ TARGET_WINS      <int> 39, 70, 86, 70, 82, 75, 80, 85, 86, 76, 78, 68, 72, 7…
## $ TEAM_BATTING_H   <int> 1445, 1339, 1377, 1387, 1297, 1279, 1244, 1273, 1391,…
## $ TEAM_BATTING_2B  <int> 194, 219, 232, 209, 186, 200, 179, 171, 197, 213, 179…
## $ TEAM_BATTING_3B  <int> 39, 22, 35, 38, 27, 36, 54, 37, 40, 18, 27, 31, 41, 2…
## $ TEAM_BATTING_HR  <int> 13, 190, 137, 96, 102, 92, 122, 115, 114, 96, 82, 95,…
## $ TEAM_BATTING_BB  <int> 143, 685, 602, 451, 472, 443, 525, 456, 447, 441, 374…
## $ TEAM_BATTING_SO  <int> 842, 1075, 917, 922, 920, 973, 1062, 1027, 922, 827, …
## $ TEAM_BASERUN_SB  <int> NA, 37, 46, 43, 49, 107, 80, 40, 69, 72, 60, 119, 221…
## $ TEAM_BASERUN_CS  <int> NA, 28, 27, 30, 39, 59, 54, 36, 27, 34, 39, 79, 109, …
## $ TEAM_BATTING_HBP <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ TEAM_PITCHING_H  <int> 9364, 1347, 1377, 1396, 1297, 1279, 1244, 1281, 1391,…
## $ TEAM_PITCHING_HR <int> 84, 191, 137, 97, 102, 92, 122, 116, 114, 96, 86, 95,…
## $ TEAM_PITCHING_BB <int> 927, 689, 602, 454, 472, 443, 525, 459, 447, 441, 391…
## $ TEAM_PITCHING_SO <int> 5456, 1082, 917, 928, 920, 973, 1062, 1033, 922, 827,…
## $ TEAM_FIELDING_E  <int> 1011, 193, 175, 164, 138, 123, 136, 112, 127, 131, 11…
## $ TEAM_FIELDING_DP <int> NA, 155, 153, 156, 168, 149, 186, 136, 169, 159, 141,…
# Visual overview of the data frame (column types and where values are missing)
vis_dat(df_train)

# Drop the hit-by-pitch column: it is NA in every row shown by glimpse() above
# (presumably mostly missing -- TODO confirm against the vis_dat plot)
df_train[["TEAM_BATTING_HBP"]] <- NULL

2 Model

2.1 Backward Elimination

The trace argument controls how much output the stepAIC() function prints.

# MODEL 3: backward elimination by AIC -- start from the full model and drop
# one predictor at a time for as long as the AIC keeps improving.

library(MASS)
?stepAIC

# stepAIC() requires every candidate model to be fitted on the same rows.
# lm() drops incomplete rows separately for each model, so removing a predictor
# that contains NAs can change the sample and make stepAIC() stop with
# "number of rows in use has changed: remove missing values?".
# Fitting on complete cases up front avoids that; with `TARGET_WINS ~ .` the
# starting model uses exactly these rows anyway, so the results are unchanged.
model3_backward <- stepAIC(object = lm(formula = TARGET_WINS ~ .,
                                       data = na.omit(df_train)),
                           direction = "backward"
                           )
## Start:  AIC=6723.18
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
##     TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB + 
##     TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB + 
##     TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP
## 
##                    Df Sum of Sq    RSS    AIC
## - TEAM_PITCHING_BB  1       0.8 134324 6721.2
## - TEAM_PITCHING_HR  1       7.2 134330 6721.3
## - TEAM_BATTING_SO   1      55.2 134378 6721.8
## - TEAM_BATTING_H    1      56.5 134380 6721.8
## - TEAM_BATTING_HR   1      68.5 134392 6721.9
## - TEAM_BATTING_BB   1      81.0 134404 6722.1
## - TEAM_PITCHING_H   1      98.0 134421 6722.3
## <none>                          134323 6723.2
## - TEAM_PITCHING_SO  1     264.1 134587 6724.1
## - TEAM_BASERUN_CS   1     746.8 135070 6729.4
## - TEAM_BASERUN_SB   1    1557.8 135881 6738.3
## - TEAM_BATTING_3B   1    4838.9 139162 6773.8
## - TEAM_BATTING_2B   1    5166.3 139489 6777.3
## - TEAM_FIELDING_DP  1    6742.5 141066 6794.0
## - TEAM_FIELDING_E   1   22427.4 156751 6950.6
## 
## Step:  AIC=6721.19
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
##     TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB + 
##     TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_SO + 
##     TEAM_FIELDING_E + TEAM_FIELDING_DP
## 
##                    Df Sum of Sq    RSS    AIC
## - TEAM_PITCHING_HR  1       6.4 134330 6719.3
## - TEAM_BATTING_SO   1      56.2 134380 6719.8
## - TEAM_BATTING_HR   1      77.9 134402 6720.1
## - TEAM_BATTING_H    1     147.2 134471 6720.8
## <none>                          134324 6721.2
## - TEAM_PITCHING_H   1     197.5 134521 6721.4
## - TEAM_PITCHING_SO  1     266.3 134590 6722.1
## - TEAM_BASERUN_CS   1     746.5 135070 6727.4
## - TEAM_BASERUN_SB   1    1564.2 135888 6736.4
## - TEAM_BATTING_3B   1    4840.8 139165 6771.8
## - TEAM_BATTING_2B   1    5175.9 139500 6775.4
## - TEAM_FIELDING_DP  1    6744.6 141069 6792.0
## - TEAM_BATTING_BB   1   12568.9 146893 6852.1
## - TEAM_FIELDING_E   1   22491.7 156816 6949.2
## 
## Step:  AIC=6719.26
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
##     TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB + 
##     TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E + 
##     TEAM_FIELDING_DP
## 
##                    Df Sum of Sq    RSS    AIC
## - TEAM_BATTING_SO   1      51.2 134382 6717.8
## - TEAM_BATTING_H    1     144.7 134475 6718.9
## <none>                          134330 6719.3
## - TEAM_PITCHING_H   1     202.0 134532 6719.5
## - TEAM_PITCHING_SO  1     298.0 134628 6720.6
## - TEAM_BASERUN_CS   1     742.6 135073 6725.5
## - TEAM_BASERUN_SB   1    1570.4 135901 6734.5
## - TEAM_BATTING_3B   1    4842.6 139173 6769.9
## - TEAM_BATTING_2B   1    5198.7 139529 6773.7
## - TEAM_FIELDING_DP  1    6744.4 141075 6790.1
## - TEAM_BATTING_HR   1    9780.8 144111 6821.7
## - TEAM_BATTING_BB   1   12606.9 146937 6850.6
## - TEAM_FIELDING_E   1   22525.1 156855 6947.6
## 
## Step:  AIC=6717.83
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
##     TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BASERUN_SB + TEAM_BASERUN_CS + 
##     TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP
## 
##                    Df Sum of Sq    RSS    AIC
## <none>                          134382 6717.8
## - TEAM_BASERUN_CS   1     737.6 135119 6724.0
## - TEAM_PITCHING_H   1    1355.1 135737 6730.7
## - TEAM_BASERUN_SB   1    1575.6 135957 6733.2
## - TEAM_BATTING_H    1    1740.1 136122 6734.9
## - TEAM_BATTING_3B   1    4849.8 139231 6768.5
## - TEAM_BATTING_2B   1    5148.1 139530 6771.7
## - TEAM_FIELDING_DP  1    6779.2 141161 6789.0
## - TEAM_PITCHING_SO  1    7395.1 141777 6795.4
## - TEAM_BATTING_HR   1    9785.1 144167 6820.3
## - TEAM_BATTING_BB   1   12619.7 147001 6849.2
## - TEAM_FIELDING_E   1   22552.0 156934 6946.4

2.2 Forward Selection

If you have a very large set of candidate predictors from which you wish to extract a few (i.e., if you’re on a fishing expedition), you should generally go forward. If, on the other hand, you have a modest-sized set of potential variables from which you wish to eliminate a few (i.e., if you’re fine-tuning some prior selection of variables), you should generally go backward. If you’re on a fishing expedition, you should still be careful not to cast too wide a net, selecting variables that are only accidentally related to your dependent variable.

  • The number of rows in use must stay the same for every model that stepAIC() compares. If the data contain missing values, the call can fail with:
Error in `stepAIC()`:
! number of rows in use has changed: remove missing values?
# Keep complete cases only, so every candidate model is fitted on the same rows
df_train_clean <- na.omit(df_train)
?stepAIC

# Smallest candidate model: intercept only
null <- lm(formula = TARGET_WINS ~ 1,
           data = df_train_clean)

# Largest candidate model: every remaining column as a predictor
full <- lm(formula = TARGET_WINS ~ .,
           data = df_train_clean)

# Forward search: start from `null` and add one term at a time, never going
# beyond the terms of `full`
model3_forward <- stepAIC(
  object    = null,
  scope     = list(lower = null, upper = full),
  direction = "forward",
  data      = df_train_clean
)
## Start:  AIC=7553.15
## TARGET_WINS ~ 1
## 
##                    Df Sum of Sq    RSS    AIC
## + TEAM_BATTING_H    1   30855.6 208419 7350.0
## + TEAM_BATTING_BB   1   29025.8 210249 7363.0
## + TEAM_PITCHING_BB  1   20814.1 218461 7419.9
## + TEAM_BATTING_HR   1   19226.3 220049 7430.7
## + TEAM_PITCHING_HR  1   18588.4 220687 7435.0
## + TEAM_FIELDING_E   1   15498.8 223776 7455.6
## + TEAM_PITCHING_H   1   11174.3 228101 7484.1
## + TEAM_BATTING_2B   1    8984.1 230291 7498.3
## + TEAM_BASERUN_SB   1    3465.8 235809 7533.5
## + TEAM_BATTING_3B   1    1637.7 237637 7544.9
## + TEAM_PITCHING_SO  1    1065.7 238209 7548.5
## + TEAM_BATTING_SO   1     946.7 238328 7549.3
## + TEAM_FIELDING_DP  1     586.2 238689 7551.5
## <none>                          239275 7553.1
## + TEAM_BASERUN_CS   1      30.0 239245 7555.0
## 
## Step:  AIC=7349.99
## TARGET_WINS ~ TEAM_BATTING_H
## 
##                    Df Sum of Sq    RSS    AIC
## + TEAM_FIELDING_E   1   21574.4 186845 7189.6
## + TEAM_BATTING_BB   1   20362.1 188057 7199.2
## + TEAM_BATTING_HR   1   13968.1 194451 7248.9
## + TEAM_PITCHING_HR  1   12856.0 195563 7257.4
## + TEAM_PITCHING_BB  1   12822.0 195597 7257.6
## + TEAM_BASERUN_SB   1    3336.3 205083 7328.0
## + TEAM_FIELDING_DP  1    2589.1 205830 7333.4
## + TEAM_BATTING_3B   1    1414.1 207005 7341.9
## + TEAM_BATTING_SO   1    1291.2 207128 7342.8
## + TEAM_BATTING_2B   1    1056.8 207363 7344.4
## + TEAM_PITCHING_SO  1     719.9 207699 7346.8
## + TEAM_PITCHING_H   1     434.3 207985 7348.9
## <none>                          208419 7350.0
## + TEAM_BASERUN_CS   1      63.2 208356 7351.5
## 
## Step:  AIC=7189.61
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_FIELDING_E
## 
##                    Df Sum of Sq    RSS    AIC
## + TEAM_BATTING_BB   1   13797.0 173048 7077.6
## + TEAM_PITCHING_BB  1    9209.2 177636 7116.5
## + TEAM_BATTING_2B   1    9022.5 177822 7118.1
## + TEAM_BATTING_3B   1    6058.3 180787 7142.6
## + TEAM_BATTING_SO   1    5324.0 181521 7148.7
## + TEAM_PITCHING_SO  1    5261.2 181584 7149.2
## + TEAM_FIELDING_DP  1    4313.6 182531 7156.9
## + TEAM_BASERUN_CS   1    3703.1 183142 7161.9
## + TEAM_BASERUN_SB   1    3123.1 183722 7166.6
## + TEAM_BATTING_HR   1     936.8 185908 7184.1
## + TEAM_PITCHING_HR  1     744.5 186101 7185.7
## <none>                          186845 7189.6
## + TEAM_PITCHING_H   1      69.5 186775 7191.1
## 
## Step:  AIC=7077.62
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_FIELDING_E + TEAM_BATTING_BB
## 
##                    Df Sum of Sq    RSS    AIC
## + TEAM_BATTING_2B   1   10172.4 162876 6989.6
## + TEAM_BATTING_3B   1    7906.2 165142 7010.1
## + TEAM_FIELDING_DP  1    6589.2 166459 7021.9
## + TEAM_BASERUN_CS   1    6276.0 166772 7024.7
## + TEAM_PITCHING_SO  1    5016.4 168032 7035.9
## + TEAM_BATTING_SO   1    4811.3 168237 7037.7
## + TEAM_BASERUN_SB   1    4357.0 168691 7041.7
## <none>                          173048 7077.6
## + TEAM_PITCHING_BB  1     156.1 172892 7078.3
## + TEAM_PITCHING_H   1     149.5 172898 7078.3
## + TEAM_BATTING_HR   1      41.2 173007 7079.3
## + TEAM_PITCHING_HR  1       5.4 173043 7079.6
## 
## Step:  AIC=6989.59
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_FIELDING_E + TEAM_BATTING_BB + 
##     TEAM_BATTING_2B
## 
##                    Df Sum of Sq    RSS    AIC
## + TEAM_FIELDING_DP  1    8981.8 153894 6907.3
## + TEAM_BASERUN_CS   1    6186.6 156689 6934.0
## + TEAM_BASERUN_SB   1    5666.2 157209 6939.0
## + TEAM_BATTING_3B   1    5333.1 157542 6942.1
## + TEAM_PITCHING_SO  1    1739.9 161136 6975.6
## + TEAM_BATTING_SO   1    1444.5 161431 6978.4
## + TEAM_BATTING_HR   1     260.3 162615 6989.2
## <none>                          162876 6989.6
## + TEAM_PITCHING_HR  1     134.9 162741 6990.4
## + TEAM_PITCHING_BB  1     131.7 162744 6990.4
## + TEAM_PITCHING_H   1     121.0 162755 6990.5
## 
## Step:  AIC=6907.3
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_FIELDING_E + TEAM_BATTING_BB + 
##     TEAM_BATTING_2B + TEAM_FIELDING_DP
## 
##                    Df Sum of Sq    RSS    AIC
## + TEAM_BATTING_3B   1    5275.4 148618 6857.5
## + TEAM_BASERUN_CS   1    3887.1 150007 6871.3
## + TEAM_PITCHING_SO  1    2984.9 150909 6880.2
## + TEAM_BATTING_SO   1    2921.9 150972 6880.8
## + TEAM_BASERUN_SB   1    2870.7 151023 6881.3
## <none>                          153894 6907.3
## + TEAM_BATTING_HR   1     178.3 153715 6907.6
## + TEAM_PITCHING_HR  1      97.3 153797 6908.4
## + TEAM_PITCHING_BB  1      60.7 153833 6908.7
## + TEAM_PITCHING_H   1      58.4 153835 6908.7
## 
## Step:  AIC=6857.47
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_FIELDING_E + TEAM_BATTING_BB + 
##     TEAM_BATTING_2B + TEAM_FIELDING_DP + TEAM_BATTING_3B
## 
##                    Df Sum of Sq    RSS    AIC
## + TEAM_BASERUN_CS   1   2582.26 146036 6833.4
## + TEAM_BATTING_HR   1   2339.03 146279 6835.9
## + TEAM_BASERUN_SB   1   2265.84 146353 6836.6
## + TEAM_PITCHING_HR  1   1748.98 146869 6841.9
## + TEAM_PITCHING_SO  1   1181.18 147437 6847.6
## + TEAM_BATTING_SO   1    906.37 147712 6850.4
## <none>                          148618 6857.5
## + TEAM_PITCHING_BB  1    159.04 148459 6857.9
## + TEAM_PITCHING_H   1    149.32 148469 6858.0
## 
## Step:  AIC=6833.42
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_FIELDING_E + TEAM_BATTING_BB + 
##     TEAM_BATTING_2B + TEAM_FIELDING_DP + TEAM_BATTING_3B + TEAM_BASERUN_CS
## 
##                    Df Sum of Sq    RSS    AIC
## + TEAM_BATTING_HR   1    3817.8 142218 6796.1
## + TEAM_PITCHING_HR  1    2850.5 143186 6806.1
## + TEAM_PITCHING_SO  1    1336.1 144700 6821.8
## + TEAM_BATTING_SO   1     952.1 145084 6825.7
## + TEAM_BASERUN_SB   1     256.7 145779 6832.8
## + TEAM_PITCHING_BB  1     227.0 145809 6833.1
## + TEAM_PITCHING_H   1     224.4 145812 6833.1
## <none>                          146036 6833.4
## 
## Step:  AIC=6796.06
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_FIELDING_E + TEAM_BATTING_BB + 
##     TEAM_BATTING_2B + TEAM_FIELDING_DP + TEAM_BATTING_3B + TEAM_BASERUN_CS + 
##     TEAM_BATTING_HR
## 
##                    Df Sum of Sq    RSS    AIC
## + TEAM_BATTING_SO   1    5769.0 136449 6736.5
## + TEAM_PITCHING_SO  1    5287.1 136931 6741.8
## + TEAM_BASERUN_SB   1     324.9 141893 6794.7
## <none>                          142218 6796.1
## + TEAM_PITCHING_HR  1     169.7 142049 6796.3
## + TEAM_PITCHING_BB  1     126.1 142092 6796.7
## + TEAM_PITCHING_H   1     121.9 142096 6796.8
## 
## Step:  AIC=6736.52
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_FIELDING_E + TEAM_BATTING_BB + 
##     TEAM_BATTING_2B + TEAM_FIELDING_DP + TEAM_BATTING_3B + TEAM_BASERUN_CS + 
##     TEAM_BATTING_HR + TEAM_BATTING_SO
## 
##                    Df Sum of Sq    RSS    AIC
## + TEAM_BASERUN_SB   1   1532.68 134917 6721.7
## + TEAM_PITCHING_SO  1    369.01 136080 6734.5
## + TEAM_PITCHING_HR  1    303.24 136146 6735.2
## + TEAM_PITCHING_BB  1    296.10 136153 6735.3
## + TEAM_PITCHING_H   1    279.71 136170 6735.5
## <none>                          136449 6736.5
## 
## Step:  AIC=6721.73
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_FIELDING_E + TEAM_BATTING_BB + 
##     TEAM_BATTING_2B + TEAM_FIELDING_DP + TEAM_BATTING_3B + TEAM_BASERUN_CS + 
##     TEAM_BATTING_HR + TEAM_BATTING_SO + TEAM_BASERUN_SB
## 
##                    Df Sum of Sq    RSS    AIC
## + TEAM_PITCHING_SO  1    384.34 134532 6719.5
## + TEAM_PITCHING_HR  1    325.77 134591 6720.1
## + TEAM_PITCHING_BB  1    299.45 134617 6720.4
## + TEAM_PITCHING_H   1    288.33 134628 6720.6
## <none>                          134917 6721.7
## 
## Step:  AIC=6719.49
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_FIELDING_E + TEAM_BATTING_BB + 
##     TEAM_BATTING_2B + TEAM_FIELDING_DP + TEAM_BATTING_3B + TEAM_BASERUN_CS + 
##     TEAM_BATTING_HR + TEAM_BATTING_SO + TEAM_BASERUN_SB + TEAM_PITCHING_SO
## 
##                    Df Sum of Sq    RSS    AIC
## + TEAM_PITCHING_H   1   201.975 134330 6719.3
## <none>                          134532 6719.5
## + TEAM_PITCHING_BB  1   110.059 134422 6720.3
## + TEAM_PITCHING_HR  1    10.869 134521 6721.4
## 
## Step:  AIC=6719.26
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_FIELDING_E + TEAM_BATTING_BB + 
##     TEAM_BATTING_2B + TEAM_FIELDING_DP + TEAM_BATTING_3B + TEAM_BASERUN_CS + 
##     TEAM_BATTING_HR + TEAM_BATTING_SO + TEAM_BASERUN_SB + TEAM_PITCHING_SO + 
##     TEAM_PITCHING_H
## 
##                    Df Sum of Sq    RSS    AIC
## <none>                          134330 6719.3
## + TEAM_PITCHING_HR  1    6.4028 134324 6721.2
## + TEAM_PITCHING_BB  1    0.0352 134330 6721.3

2.3 Stepwise - both forward and backward

Stepwise regression is a combination of both backward elimination and forward selection methods. Stepwise method is a modification of the forward selection approach and differs in that variables already in the model do not necessarily stay. As in forward selection, stepwise regression adds one variable to the model at a time. After a variable is added, however, stepwise regression checks all the variables already included again to see whether there is a need to delete any variable that does not provide an improvement to the model based on a certain criterion.

The function stepAIC() can also be used to conduct stepwise selection in both directions by setting direction = "both".

# Stepwise selection in both directions, starting from the full model: at each
# step a predictor may be dropped, or a previously dropped one added back.
# The model is fitted on complete cases because stepAIC() stops with
# "number of rows in use has changed" when candidate models end up using
# different rows (lm() drops NA rows separately for each model). With
# `TARGET_WINS ~ .` the starting model uses exactly these rows anyway, so the
# results are unchanged.
model3_both <- stepAIC(object = lm(formula = TARGET_WINS ~ .,
                                   data = na.omit(df_train)),
                       direction = "both"
                       )
## Start:  AIC=6723.18
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
##     TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB + 
##     TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB + 
##     TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP
## 
##                    Df Sum of Sq    RSS    AIC
## - TEAM_PITCHING_BB  1       0.8 134324 6721.2
## - TEAM_PITCHING_HR  1       7.2 134330 6721.3
## - TEAM_BATTING_SO   1      55.2 134378 6721.8
## - TEAM_BATTING_H    1      56.5 134380 6721.8
## - TEAM_BATTING_HR   1      68.5 134392 6721.9
## - TEAM_BATTING_BB   1      81.0 134404 6722.1
## - TEAM_PITCHING_H   1      98.0 134421 6722.3
## <none>                          134323 6723.2
## - TEAM_PITCHING_SO  1     264.1 134587 6724.1
## - TEAM_BASERUN_CS   1     746.8 135070 6729.4
## - TEAM_BASERUN_SB   1    1557.8 135881 6738.3
## - TEAM_BATTING_3B   1    4838.9 139162 6773.8
## - TEAM_BATTING_2B   1    5166.3 139489 6777.3
## - TEAM_FIELDING_DP  1    6742.5 141066 6794.0
## - TEAM_FIELDING_E   1   22427.4 156751 6950.6
## 
## Step:  AIC=6721.19
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
##     TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB + 
##     TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_SO + 
##     TEAM_FIELDING_E + TEAM_FIELDING_DP
## 
##                    Df Sum of Sq    RSS    AIC
## - TEAM_PITCHING_HR  1       6.4 134330 6719.3
## - TEAM_BATTING_SO   1      56.2 134380 6719.8
## - TEAM_BATTING_HR   1      77.9 134402 6720.1
## - TEAM_BATTING_H    1     147.2 134471 6720.8
## <none>                          134324 6721.2
## - TEAM_PITCHING_H   1     197.5 134521 6721.4
## - TEAM_PITCHING_SO  1     266.3 134590 6722.1
## + TEAM_PITCHING_BB  1       0.8 134323 6723.2
## - TEAM_BASERUN_CS   1     746.5 135070 6727.4
## - TEAM_BASERUN_SB   1    1564.2 135888 6736.4
## - TEAM_BATTING_3B   1    4840.8 139165 6771.8
## - TEAM_BATTING_2B   1    5175.9 139500 6775.4
## - TEAM_FIELDING_DP  1    6744.6 141069 6792.0
## - TEAM_BATTING_BB   1   12568.9 146893 6852.1
## - TEAM_FIELDING_E   1   22491.7 156816 6949.2
## 
## Step:  AIC=6719.26
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
##     TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB + 
##     TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E + 
##     TEAM_FIELDING_DP
## 
##                    Df Sum of Sq    RSS    AIC
## - TEAM_BATTING_SO   1      51.2 134382 6717.8
## - TEAM_BATTING_H    1     144.7 134475 6718.9
## <none>                          134330 6719.3
## - TEAM_PITCHING_H   1     202.0 134532 6719.5
## - TEAM_PITCHING_SO  1     298.0 134628 6720.6
## + TEAM_PITCHING_HR  1       6.4 134324 6721.2
## + TEAM_PITCHING_BB  1       0.0 134330 6721.3
## - TEAM_BASERUN_CS   1     742.6 135073 6725.5
## - TEAM_BASERUN_SB   1    1570.4 135901 6734.5
## - TEAM_BATTING_3B   1    4842.6 139173 6769.9
## - TEAM_BATTING_2B   1    5198.7 139529 6773.7
## - TEAM_FIELDING_DP  1    6744.4 141075 6790.1
## - TEAM_BATTING_HR   1    9780.8 144111 6821.7
## - TEAM_BATTING_BB   1   12606.9 146937 6850.6
## - TEAM_FIELDING_E   1   22525.1 156855 6947.6
## 
## Step:  AIC=6717.83
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
##     TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BASERUN_SB + TEAM_BASERUN_CS + 
##     TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP
## 
##                    Df Sum of Sq    RSS    AIC
## <none>                          134382 6717.8
## + TEAM_BATTING_SO   1      51.2 134330 6719.3
## + TEAM_PITCHING_BB  1       2.7 134379 6719.8
## + TEAM_PITCHING_HR  1       1.5 134380 6719.8
## - TEAM_BASERUN_CS   1     737.6 135119 6724.0
## - TEAM_PITCHING_H   1    1355.1 135737 6730.7
## - TEAM_BASERUN_SB   1    1575.6 135957 6733.2
## - TEAM_BATTING_H    1    1740.1 136122 6734.9
## - TEAM_BATTING_3B   1    4849.8 139231 6768.5
## - TEAM_BATTING_2B   1    5148.1 139530 6771.7
## - TEAM_FIELDING_DP  1    6779.2 141161 6789.0
## - TEAM_PITCHING_SO  1    7395.1 141777 6795.4
## - TEAM_BATTING_HR   1    9785.1 144167 6820.3
## - TEAM_BATTING_BB   1   12619.7 147001 6849.2
## - TEAM_FIELDING_E   1   22552.0 156934 6946.4

3 Presenting Models

?stargazer
# Side-by-side plain-text regression table, one column per selection method
stargazer(model3_backward,
          model3_forward,
          model3_both,
          column.labels = c("Backward", "Forward", "Stepwise"),
          type = "text")
## 
## ===================================================================================================
##                                                   Dependent variable:                              
##                     -------------------------------------------------------------------------------
##                                                       TARGET_WINS                                  
##                              Backward                   Forward                   Stepwise         
##                                (1)                        (2)                       (3)            
## ---------------------------------------------------------------------------------------------------
## TEAM_BATTING_H               0.026***                    0.017                    0.026***         
##                              (0.006)                    (0.013)                   (0.006)          
##                                                                                                    
## TEAM_BATTING_2B             -0.070***                  -0.071***                 -0.070***         
##                              (0.009)                    (0.009)                   (0.009)          
##                                                                                                    
## TEAM_BATTING_3B              0.162***                  0.162***                   0.162***         
##                              (0.022)                    (0.022)                   (0.022)          
##                                                                                                    
## TEAM_BATTING_HR              0.098***                  0.098***                   0.098***         
##                              (0.009)                    (0.009)                   (0.009)          
##                                                                                                    
## TEAM_BATTING_SO                                          0.015                                     
##                                                         (0.021)                                    
##                                                                                                    
## TEAM_BATTING_BB              0.039***                  0.039***                   0.039***         
##                              (0.003)                    (0.003)                   (0.003)          
##                                                                                                    
## TEAM_BASERUN_SB              0.036***                  0.036***                   0.036***         
##                              (0.009)                    (0.009)                   (0.009)          
##                                                                                                    
## TEAM_BASERUN_CS              0.052***                  0.052***                   0.052***         
##                              (0.018)                    (0.018)                   (0.018)          
##                                                                                                    
## TEAM_PITCHING_H              0.009***                    0.018                    0.009***         
##                              (0.002)                    (0.012)                   (0.002)          
##                                                                                                    
## TEAM_PITCHING_SO            -0.021***                   -0.035*                  -0.021***         
##                              (0.002)                    (0.020)                   (0.002)          
##                                                                                                    
## TEAM_FIELDING_E             -0.156***                  -0.156***                 -0.156***         
##                              (0.010)                    (0.010)                   (0.010)          
##                                                                                                    
## TEAM_FIELDING_DP            -0.113***                  -0.113***                 -0.113***         
##                              (0.013)                    (0.013)                   (0.013)          
##                                                                                                    
## Constant                    58.446***                  57.876***                 58.446***         
##                              (6.589)                    (6.634)                   (6.589)          
##                                                                                                    
## ---------------------------------------------------------------------------------------------------
## Observations                  1,486                      1,486                     1,486           
## R2                            0.438                      0.439                     0.438           
## Adjusted R2                   0.434                      0.434                     0.434           
## Residual Std. Error     9.548 (df = 1474)          9.550 (df = 1473)         9.548 (df = 1474)     
## F Statistic         104.596*** (df = 11; 1474) 95.898*** (df = 12; 1473) 104.596*** (df = 11; 1474)
## ===================================================================================================
## Note:                                                                   *p<0.1; **p<0.05; ***p<0.01

3.1 Best Subset

  • Cannot have missing values in data.
# Keep only rows without any missing value
df_train_clean <- na.omit(object = df_train)
# Column indices of the predictor part (first column excluded): 14 independent vars
head(x = col(df_train_clean[, -1]))
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## [1,]    1    2    3    4    5    6    7    8    9    10    11    12    13    14
## [2,]    1    2    3    4    5    6    7    8    9    10    11    12    13    14
## [3,]    1    2    3    4    5    6    7    8    9    10    11    12    13    14
## [4,]    1    2    3    4    5    6    7    8    9    10    11    12    13    14
## [5,]    1    2    3    4    5    6    7    8    9    10    11    12    13    14
## [6,]    1    2    3    4    5    6    7    8    9    10    11    12    13    14
library(leaps)
?regsubsets()

# Best-subset search on the complete-case data
subset_model <- regsubsets(
  y = df_train_clean[[1]],       # first column: TARGET_WINS, the outcome
  x = df_train_clean[, -1],      # every other column is a candidate predictor
  nvmax = 15,                    # upper limit on subset size - change
  nbest = 1                      # best subsets to record per size - keep
)

# Summarise the subset search and show which predictors enter at each size
model_summary <- summary(object = subset_model)
print(x = model_summary)
## Subset selection object
## 14 Variables  (and intercept)
##                  Forced in Forced out
## TEAM_BATTING_H       FALSE      FALSE
## TEAM_BATTING_2B      FALSE      FALSE
## TEAM_BATTING_3B      FALSE      FALSE
## TEAM_BATTING_HR      FALSE      FALSE
## TEAM_BATTING_BB      FALSE      FALSE
## TEAM_BATTING_SO      FALSE      FALSE
## TEAM_BASERUN_SB      FALSE      FALSE
## TEAM_BASERUN_CS      FALSE      FALSE
## TEAM_PITCHING_H      FALSE      FALSE
## TEAM_PITCHING_HR     FALSE      FALSE
## TEAM_PITCHING_BB     FALSE      FALSE
## TEAM_PITCHING_SO     FALSE      FALSE
## TEAM_FIELDING_E      FALSE      FALSE
## TEAM_FIELDING_DP     FALSE      FALSE
## 1 subsets of each size up to 14
## Selection Algorithm: exhaustive
##           TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B TEAM_BATTING_HR
## 1  ( 1 )  "*"            " "             " "             " "            
## 2  ( 1 )  "*"            " "             " "             " "            
## 3  ( 1 )  "*"            " "             " "             " "            
## 4  ( 1 )  "*"            "*"             " "             " "            
## 5  ( 1 )  "*"            "*"             " "             " "            
## 6  ( 1 )  " "            " "             "*"             "*"            
## 7  ( 1 )  " "            " "             "*"             "*"            
## 8  ( 1 )  " "            "*"             "*"             "*"            
## 9  ( 1 )  "*"            "*"             "*"             " "            
## 10  ( 1 ) "*"            "*"             "*"             " "            
## 11  ( 1 ) "*"            "*"             "*"             "*"            
## 12  ( 1 ) "*"            "*"             "*"             "*"            
## 13  ( 1 ) "*"            "*"             "*"             "*"            
## 14  ( 1 ) "*"            "*"             "*"             "*"            
##           TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS
## 1  ( 1 )  " "             " "             " "             " "            
## 2  ( 1 )  " "             " "             " "             " "            
## 3  ( 1 )  "*"             " "             " "             " "            
## 4  ( 1 )  "*"             " "             " "             " "            
## 5  ( 1 )  "*"             " "             " "             " "            
## 6  ( 1 )  " "             " "             "*"             " "            
## 7  ( 1 )  " "             " "             "*"             " "            
## 8  ( 1 )  " "             " "             "*"             " "            
## 9  ( 1 )  "*"             " "             "*"             " "            
## 10  ( 1 ) "*"             " "             "*"             "*"            
## 11  ( 1 ) "*"             " "             "*"             "*"            
## 12  ( 1 ) "*"             "*"             "*"             "*"            
## 13  ( 1 ) "*"             "*"             "*"             "*"            
## 14  ( 1 ) "*"             "*"             "*"             "*"            
##           TEAM_PITCHING_H TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO
## 1  ( 1 )  " "             " "              " "              " "             
## 2  ( 1 )  " "             " "              " "              " "             
## 3  ( 1 )  " "             " "              " "              " "             
## 4  ( 1 )  " "             " "              " "              " "             
## 5  ( 1 )  " "             " "              " "              " "             
## 6  ( 1 )  " "             " "              "*"              "*"             
## 7  ( 1 )  " "             " "              "*"              "*"             
## 8  ( 1 )  " "             " "              "*"              "*"             
## 9  ( 1 )  " "             "*"              " "              "*"             
## 10  ( 1 ) " "             "*"              " "              "*"             
## 11  ( 1 ) "*"             " "              " "              "*"             
## 12  ( 1 ) "*"             " "              " "              "*"             
## 13  ( 1 ) "*"             "*"              " "              "*"             
## 14  ( 1 ) "*"             "*"              "*"              "*"             
##           TEAM_FIELDING_E TEAM_FIELDING_DP
## 1  ( 1 )  " "             " "             
## 2  ( 1 )  "*"             " "             
## 3  ( 1 )  "*"             " "             
## 4  ( 1 )  "*"             " "             
## 5  ( 1 )  "*"             "*"             
## 6  ( 1 )  "*"             " "             
## 7  ( 1 )  "*"             "*"             
## 8  ( 1 )  "*"             "*"             
## 9  ( 1 )  "*"             "*"             
## 10  ( 1 ) "*"             "*"             
## 11  ( 1 ) "*"             "*"             
## 12  ( 1 ) "*"             "*"             
## 13  ( 1 ) "*"             "*"             
## 14  ( 1 ) "*"             "*"
# Combined selection table, one row per model size: the predictor-inclusion
# indicators from `model_summary$which`, followed by the fit statistics
# (rsq, adjr2, cp, bic, rss) rounded to three decimals.
cbind(
  model_summary$which,
  round(
    # Bind the named statistic vectors column-wise; the list names become
    # the column names of the resulting matrix.
    do.call(cbind, model_summary[c("rsq", "adjr2", "cp", "bic", "rss")]),
    digits = 3
  )
)
##    (Intercept) TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B TEAM_BATTING_HR
## 1            1              1               0               0               0
## 2            1              1               0               0               0
## 3            1              1               0               0               0
## 4            1              1               1               0               0
## 5            1              1               1               0               0
## 6            1              0               0               1               1
## 7            1              0               0               1               1
## 8            1              0               1               1               1
## 9            1              1               1               1               0
## 10           1              1               1               1               0
## 11           1              1               1               1               1
## 12           1              1               1               1               1
## 13           1              1               1               1               1
## 14           1              1               1               1               1
##    TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS
## 1                0               0               0               0
## 2                0               0               0               0
## 3                1               0               0               0
## 4                1               0               0               0
## 5                1               0               0               0
## 6                0               0               1               0
## 7                0               0               1               0
## 8                0               0               1               0
## 9                1               0               1               0
## 10               1               0               1               1
## 11               1               0               1               1
## 12               1               1               1               1
## 13               1               1               1               1
## 14               1               1               1               1
##    TEAM_PITCHING_H TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO
## 1                0                0                0                0
## 2                0                0                0                0
## 3                0                0                0                0
## 4                0                0                0                0
## 5                0                0                0                0
## 6                0                0                1                1
## 7                0                0                1                1
## 8                0                0                1                1
## 9                0                1                0                1
## 10               0                1                0                1
## 11               1                0                0                1
## 12               1                0                0                1
## 13               1                1                0                1
## 14               1                1                1                1
##    TEAM_FIELDING_E TEAM_FIELDING_DP   rsq adjr2      cp      bic      rss
## 1                0                0 0.129 0.128 800.442 -190.551 208419.3
## 2                1                0 0.219 0.218 566.177 -345.627 186845.0
## 3                1                0 0.277 0.275 417.084 -452.315 173048.0
## 4                1                0 0.319 0.317 307.683 -535.036 162875.6
## 5                1                1 0.357 0.355 211.322 -612.024 153893.8
## 6                1                0 0.385 0.382 140.120 -670.708 147209.4
## 7                1                1 0.409 0.407  77.673 -724.030 141324.5
## 8                1                1 0.414 0.411  67.789 -728.181 140239.3
## 9                1                1 0.434 0.431  16.266 -773.588 135351.9
## 10               1                1 0.438 0.434   9.869 -774.727 134585.1
## 11               1                1 0.438 0.434   9.640 -769.672 134381.6
## 12               1                1 0.439 0.434  11.079 -762.935 134330.4
## 13               1                1 0.439 0.434  13.009 -755.702 134324.0
## 14               1                1 0.439 0.433  15.000 -748.407 134323.2