# Session bootstrap: point R at the project folder, then reset the session.
# NOTE(review): the path below is machine-specific; adjust it before running
# this script on another computer.
project_dir <- "/Users/arvindsharma/Dropbox/WCAS/Econometrics/"
setwd(dir = project_dir)
# Remove every object from the workspace (this also removes `project_dir`).
rm(list = ls())
# Trigger garbage collection; invisible() suppresses the printed memory table.
invisible(gc())
# Send a form-feed character to clear the console.
cat("\f")
# Close all open graphics devices.
graphics.off()
# Prepare needed libraries
# Every package named here is installed from CRAN when it is missing and is
# then attached. Packages are attached in the order listed, so MASS (attached
# after tidyverse) masks dplyr::select(); use dplyr::select() explicitly if
# that verb is ever needed.
packages <- c(
  "stargazer",
  "psych",
  "tidyverse",
  "visdat",
  "MASS", # backward AIC
  "leaps" # best subset
)
suppressPackageStartupMessages({
  # Iterate over the names themselves: unlike 1:length(packages), this loop
  # simply does nothing when the vector is empty.
  for (pkg in packages) {
    # requireNamespace() tests for one package only; installed.packages()
    # would rescan the whole library on every iteration.
    if (!requireNamespace(pkg, quietly = TRUE)) {
      install.packages(
        pkg,
        repos = "https://cran.rstudio.com/", # HTTPS instead of plain HTTP
        dependencies = TRUE
      )
    }
    # character.only = TRUE because the package name is held in a variable.
    library(pkg, character.only = TRUE)
  }
})
# Load the training data; the file is looked up in the working directory.
df_train <- read.csv(file = "moneyball-training-data.csv")
# df_eval <- read.csv("moneyball-evaluation-data.csv")
# Drop INDEX so that the `TARGET_WINS ~ .` formulas used later do not pick it
# up as a predictor.
df_train[["INDEX"]] <- NULL
# Print a compact column-by-column preview (type and first values).
glimpse(df_train)
## Rows: 2,276
## Columns: 16
## $ TARGET_WINS <int> 39, 70, 86, 70, 82, 75, 80, 85, 86, 76, 78, 68, 72, 7…
## $ TEAM_BATTING_H <int> 1445, 1339, 1377, 1387, 1297, 1279, 1244, 1273, 1391,…
## $ TEAM_BATTING_2B <int> 194, 219, 232, 209, 186, 200, 179, 171, 197, 213, 179…
## $ TEAM_BATTING_3B <int> 39, 22, 35, 38, 27, 36, 54, 37, 40, 18, 27, 31, 41, 2…
## $ TEAM_BATTING_HR <int> 13, 190, 137, 96, 102, 92, 122, 115, 114, 96, 82, 95,…
## $ TEAM_BATTING_BB <int> 143, 685, 602, 451, 472, 443, 525, 456, 447, 441, 374…
## $ TEAM_BATTING_SO <int> 842, 1075, 917, 922, 920, 973, 1062, 1027, 922, 827, …
## $ TEAM_BASERUN_SB <int> NA, 37, 46, 43, 49, 107, 80, 40, 69, 72, 60, 119, 221…
## $ TEAM_BASERUN_CS <int> NA, 28, 27, 30, 39, 59, 54, 36, 27, 34, 39, 79, 109, …
## $ TEAM_BATTING_HBP <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ TEAM_PITCHING_H <int> 9364, 1347, 1377, 1396, 1297, 1279, 1244, 1281, 1391,…
## $ TEAM_PITCHING_HR <int> 84, 191, 137, 97, 102, 92, 122, 116, 114, 96, 86, 95,…
## $ TEAM_PITCHING_BB <int> 927, 689, 602, 454, 472, 443, 525, 459, 447, 441, 391…
## $ TEAM_PITCHING_SO <int> 5456, 1082, 917, 928, 920, 973, 1062, 1033, 922, 827,…
## $ TEAM_FIELDING_E <int> 1011, 193, 175, 164, 138, 123, 136, 112, 127, 131, 11…
## $ TEAM_FIELDING_DP <int> NA, 155, 153, 156, 168, 149, 186, 136, 169, 159, 141,…
# Plot an overview of the data frame with visdat.
vis_dat(df_train)
# Drop TEAM_BATTING_HBP: the glimpse() preview shows only NA values in its
# leading rows.
df_train[["TEAM_BATTING_HBP"]] <- NULL
The trace argument helps control the output of the stepAIC() function.
# MODEL 3: Step AIC...Backward selection... drop variable systematically...
# Starts from the full model (TARGET_WINS regressed on every remaining column)
# and removes one predictor per step for as long as the AIC keeps decreasing.
library(MASS)
?stepAIC
# stepAIC() needs every candidate model to be fitted to the same rows;
# otherwise it stops with "number of rows in use has changed: remove missing
# values?". Fitting on complete cases makes that explicit instead of relying
# on lm() silently dropping rows that contain NA.
model3_backward <- stepAIC(
  object = lm(formula = TARGET_WINS ~ ., data = na.omit(df_train)),
  direction = "backward"
)
## Start: AIC=6723.18
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
## TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB +
## TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB +
## TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP
##
## Df Sum of Sq RSS AIC
## - TEAM_PITCHING_BB 1 0.8 134324 6721.2
## - TEAM_PITCHING_HR 1 7.2 134330 6721.3
## - TEAM_BATTING_SO 1 55.2 134378 6721.8
## - TEAM_BATTING_H 1 56.5 134380 6721.8
## - TEAM_BATTING_HR 1 68.5 134392 6721.9
## - TEAM_BATTING_BB 1 81.0 134404 6722.1
## - TEAM_PITCHING_H 1 98.0 134421 6722.3
## <none> 134323 6723.2
## - TEAM_PITCHING_SO 1 264.1 134587 6724.1
## - TEAM_BASERUN_CS 1 746.8 135070 6729.4
## - TEAM_BASERUN_SB 1 1557.8 135881 6738.3
## - TEAM_BATTING_3B 1 4838.9 139162 6773.8
## - TEAM_BATTING_2B 1 5166.3 139489 6777.3
## - TEAM_FIELDING_DP 1 6742.5 141066 6794.0
## - TEAM_FIELDING_E 1 22427.4 156751 6950.6
##
## Step: AIC=6721.19
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
## TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB +
## TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_SO +
## TEAM_FIELDING_E + TEAM_FIELDING_DP
##
## Df Sum of Sq RSS AIC
## - TEAM_PITCHING_HR 1 6.4 134330 6719.3
## - TEAM_BATTING_SO 1 56.2 134380 6719.8
## - TEAM_BATTING_HR 1 77.9 134402 6720.1
## - TEAM_BATTING_H 1 147.2 134471 6720.8
## <none> 134324 6721.2
## - TEAM_PITCHING_H 1 197.5 134521 6721.4
## - TEAM_PITCHING_SO 1 266.3 134590 6722.1
## - TEAM_BASERUN_CS 1 746.5 135070 6727.4
## - TEAM_BASERUN_SB 1 1564.2 135888 6736.4
## - TEAM_BATTING_3B 1 4840.8 139165 6771.8
## - TEAM_BATTING_2B 1 5175.9 139500 6775.4
## - TEAM_FIELDING_DP 1 6744.6 141069 6792.0
## - TEAM_BATTING_BB 1 12568.9 146893 6852.1
## - TEAM_FIELDING_E 1 22491.7 156816 6949.2
##
## Step: AIC=6719.26
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
## TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB +
## TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E +
## TEAM_FIELDING_DP
##
## Df Sum of Sq RSS AIC
## - TEAM_BATTING_SO 1 51.2 134382 6717.8
## - TEAM_BATTING_H 1 144.7 134475 6718.9
## <none> 134330 6719.3
## - TEAM_PITCHING_H 1 202.0 134532 6719.5
## - TEAM_PITCHING_SO 1 298.0 134628 6720.6
## - TEAM_BASERUN_CS 1 742.6 135073 6725.5
## - TEAM_BASERUN_SB 1 1570.4 135901 6734.5
## - TEAM_BATTING_3B 1 4842.6 139173 6769.9
## - TEAM_BATTING_2B 1 5198.7 139529 6773.7
## - TEAM_FIELDING_DP 1 6744.4 141075 6790.1
## - TEAM_BATTING_HR 1 9780.8 144111 6821.7
## - TEAM_BATTING_BB 1 12606.9 146937 6850.6
## - TEAM_FIELDING_E 1 22525.1 156855 6947.6
##
## Step: AIC=6717.83
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
## TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BASERUN_SB + TEAM_BASERUN_CS +
## TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP
##
## Df Sum of Sq RSS AIC
## <none> 134382 6717.8
## - TEAM_BASERUN_CS 1 737.6 135119 6724.0
## - TEAM_PITCHING_H 1 1355.1 135737 6730.7
## - TEAM_BASERUN_SB 1 1575.6 135957 6733.2
## - TEAM_BATTING_H 1 1740.1 136122 6734.9
## - TEAM_BATTING_3B 1 4849.8 139231 6768.5
## - TEAM_BATTING_2B 1 5148.1 139530 6771.7
## - TEAM_FIELDING_DP 1 6779.2 141161 6789.0
## - TEAM_PITCHING_SO 1 7395.1 141777 6795.4
## - TEAM_BATTING_HR 1 9785.1 144167 6820.3
## - TEAM_BATTING_BB 1 12619.7 147001 6849.2
## - TEAM_FIELDING_E 1 22552.0 156934 6946.4
If you have a very large set of candidate predictors from which you wish to extract a few—i.e., if you’re on a fishing expedition—you should generally go forward. If, on the other hand, you have a modest-sized set of potential variables from which you wish to eliminate a few—i.e., if you’re fine-tuning some prior selection of variables—you should generally go backward. Even if you’re on a fishing expedition, you should still be careful not to cast too wide a net, selecting variables that are only accidentally related to your dependent variable.
Error in `stepAIC()`:
! number of rows in use has changed: remove missing values?
# Remove rows with missing values so that every candidate model in the search
# is fitted to the same observations.
df_train_clean <- na.omit(df_train)
?stepAIC
# Intercept-only model: starting point and lower bound of the search.
null <- lm(formula = TARGET_WINS ~ 1, data = df_train_clean)
# Model with every predictor: upper bound of the search.
full <- lm(formula = TARGET_WINS ~ ., data = df_train_clean)
# Forward selection: add one predictor per step while the AIC keeps decreasing.
# `scope` is given as formulae, as documented for stepAIC(). No `data`
# argument is passed: stepAIC() has no such parameter (it would only be
# forwarded through `...`), and the data are taken from the fitted `null`
# model.
model3_forward <- stepAIC(
  object = null,
  scope = list(lower = formula(null), upper = formula(full)),
  direction = "forward"
)
## Start: AIC=7553.15
## TARGET_WINS ~ 1
##
## Df Sum of Sq RSS AIC
## + TEAM_BATTING_H 1 30855.6 208419 7350.0
## + TEAM_BATTING_BB 1 29025.8 210249 7363.0
## + TEAM_PITCHING_BB 1 20814.1 218461 7419.9
## + TEAM_BATTING_HR 1 19226.3 220049 7430.7
## + TEAM_PITCHING_HR 1 18588.4 220687 7435.0
## + TEAM_FIELDING_E 1 15498.8 223776 7455.6
## + TEAM_PITCHING_H 1 11174.3 228101 7484.1
## + TEAM_BATTING_2B 1 8984.1 230291 7498.3
## + TEAM_BASERUN_SB 1 3465.8 235809 7533.5
## + TEAM_BATTING_3B 1 1637.7 237637 7544.9
## + TEAM_PITCHING_SO 1 1065.7 238209 7548.5
## + TEAM_BATTING_SO 1 946.7 238328 7549.3
## + TEAM_FIELDING_DP 1 586.2 238689 7551.5
## <none> 239275 7553.1
## + TEAM_BASERUN_CS 1 30.0 239245 7555.0
##
## Step: AIC=7349.99
## TARGET_WINS ~ TEAM_BATTING_H
##
## Df Sum of Sq RSS AIC
## + TEAM_FIELDING_E 1 21574.4 186845 7189.6
## + TEAM_BATTING_BB 1 20362.1 188057 7199.2
## + TEAM_BATTING_HR 1 13968.1 194451 7248.9
## + TEAM_PITCHING_HR 1 12856.0 195563 7257.4
## + TEAM_PITCHING_BB 1 12822.0 195597 7257.6
## + TEAM_BASERUN_SB 1 3336.3 205083 7328.0
## + TEAM_FIELDING_DP 1 2589.1 205830 7333.4
## + TEAM_BATTING_3B 1 1414.1 207005 7341.9
## + TEAM_BATTING_SO 1 1291.2 207128 7342.8
## + TEAM_BATTING_2B 1 1056.8 207363 7344.4
## + TEAM_PITCHING_SO 1 719.9 207699 7346.8
## + TEAM_PITCHING_H 1 434.3 207985 7348.9
## <none> 208419 7350.0
## + TEAM_BASERUN_CS 1 63.2 208356 7351.5
##
## Step: AIC=7189.61
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_FIELDING_E
##
## Df Sum of Sq RSS AIC
## + TEAM_BATTING_BB 1 13797.0 173048 7077.6
## + TEAM_PITCHING_BB 1 9209.2 177636 7116.5
## + TEAM_BATTING_2B 1 9022.5 177822 7118.1
## + TEAM_BATTING_3B 1 6058.3 180787 7142.6
## + TEAM_BATTING_SO 1 5324.0 181521 7148.7
## + TEAM_PITCHING_SO 1 5261.2 181584 7149.2
## + TEAM_FIELDING_DP 1 4313.6 182531 7156.9
## + TEAM_BASERUN_CS 1 3703.1 183142 7161.9
## + TEAM_BASERUN_SB 1 3123.1 183722 7166.6
## + TEAM_BATTING_HR 1 936.8 185908 7184.1
## + TEAM_PITCHING_HR 1 744.5 186101 7185.7
## <none> 186845 7189.6
## + TEAM_PITCHING_H 1 69.5 186775 7191.1
##
## Step: AIC=7077.62
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_FIELDING_E + TEAM_BATTING_BB
##
## Df Sum of Sq RSS AIC
## + TEAM_BATTING_2B 1 10172.4 162876 6989.6
## + TEAM_BATTING_3B 1 7906.2 165142 7010.1
## + TEAM_FIELDING_DP 1 6589.2 166459 7021.9
## + TEAM_BASERUN_CS 1 6276.0 166772 7024.7
## + TEAM_PITCHING_SO 1 5016.4 168032 7035.9
## + TEAM_BATTING_SO 1 4811.3 168237 7037.7
## + TEAM_BASERUN_SB 1 4357.0 168691 7041.7
## <none> 173048 7077.6
## + TEAM_PITCHING_BB 1 156.1 172892 7078.3
## + TEAM_PITCHING_H 1 149.5 172898 7078.3
## + TEAM_BATTING_HR 1 41.2 173007 7079.3
## + TEAM_PITCHING_HR 1 5.4 173043 7079.6
##
## Step: AIC=6989.59
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_FIELDING_E + TEAM_BATTING_BB +
## TEAM_BATTING_2B
##
## Df Sum of Sq RSS AIC
## + TEAM_FIELDING_DP 1 8981.8 153894 6907.3
## + TEAM_BASERUN_CS 1 6186.6 156689 6934.0
## + TEAM_BASERUN_SB 1 5666.2 157209 6939.0
## + TEAM_BATTING_3B 1 5333.1 157542 6942.1
## + TEAM_PITCHING_SO 1 1739.9 161136 6975.6
## + TEAM_BATTING_SO 1 1444.5 161431 6978.4
## + TEAM_BATTING_HR 1 260.3 162615 6989.2
## <none> 162876 6989.6
## + TEAM_PITCHING_HR 1 134.9 162741 6990.4
## + TEAM_PITCHING_BB 1 131.7 162744 6990.4
## + TEAM_PITCHING_H 1 121.0 162755 6990.5
##
## Step: AIC=6907.3
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_FIELDING_E + TEAM_BATTING_BB +
## TEAM_BATTING_2B + TEAM_FIELDING_DP
##
## Df Sum of Sq RSS AIC
## + TEAM_BATTING_3B 1 5275.4 148618 6857.5
## + TEAM_BASERUN_CS 1 3887.1 150007 6871.3
## + TEAM_PITCHING_SO 1 2984.9 150909 6880.2
## + TEAM_BATTING_SO 1 2921.9 150972 6880.8
## + TEAM_BASERUN_SB 1 2870.7 151023 6881.3
## <none> 153894 6907.3
## + TEAM_BATTING_HR 1 178.3 153715 6907.6
## + TEAM_PITCHING_HR 1 97.3 153797 6908.4
## + TEAM_PITCHING_BB 1 60.7 153833 6908.7
## + TEAM_PITCHING_H 1 58.4 153835 6908.7
##
## Step: AIC=6857.47
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_FIELDING_E + TEAM_BATTING_BB +
## TEAM_BATTING_2B + TEAM_FIELDING_DP + TEAM_BATTING_3B
##
## Df Sum of Sq RSS AIC
## + TEAM_BASERUN_CS 1 2582.26 146036 6833.4
## + TEAM_BATTING_HR 1 2339.03 146279 6835.9
## + TEAM_BASERUN_SB 1 2265.84 146353 6836.6
## + TEAM_PITCHING_HR 1 1748.98 146869 6841.9
## + TEAM_PITCHING_SO 1 1181.18 147437 6847.6
## + TEAM_BATTING_SO 1 906.37 147712 6850.4
## <none> 148618 6857.5
## + TEAM_PITCHING_BB 1 159.04 148459 6857.9
## + TEAM_PITCHING_H 1 149.32 148469 6858.0
##
## Step: AIC=6833.42
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_FIELDING_E + TEAM_BATTING_BB +
## TEAM_BATTING_2B + TEAM_FIELDING_DP + TEAM_BATTING_3B + TEAM_BASERUN_CS
##
## Df Sum of Sq RSS AIC
## + TEAM_BATTING_HR 1 3817.8 142218 6796.1
## + TEAM_PITCHING_HR 1 2850.5 143186 6806.1
## + TEAM_PITCHING_SO 1 1336.1 144700 6821.8
## + TEAM_BATTING_SO 1 952.1 145084 6825.7
## + TEAM_BASERUN_SB 1 256.7 145779 6832.8
## + TEAM_PITCHING_BB 1 227.0 145809 6833.1
## + TEAM_PITCHING_H 1 224.4 145812 6833.1
## <none> 146036 6833.4
##
## Step: AIC=6796.06
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_FIELDING_E + TEAM_BATTING_BB +
## TEAM_BATTING_2B + TEAM_FIELDING_DP + TEAM_BATTING_3B + TEAM_BASERUN_CS +
## TEAM_BATTING_HR
##
## Df Sum of Sq RSS AIC
## + TEAM_BATTING_SO 1 5769.0 136449 6736.5
## + TEAM_PITCHING_SO 1 5287.1 136931 6741.8
## + TEAM_BASERUN_SB 1 324.9 141893 6794.7
## <none> 142218 6796.1
## + TEAM_PITCHING_HR 1 169.7 142049 6796.3
## + TEAM_PITCHING_BB 1 126.1 142092 6796.7
## + TEAM_PITCHING_H 1 121.9 142096 6796.8
##
## Step: AIC=6736.52
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_FIELDING_E + TEAM_BATTING_BB +
## TEAM_BATTING_2B + TEAM_FIELDING_DP + TEAM_BATTING_3B + TEAM_BASERUN_CS +
## TEAM_BATTING_HR + TEAM_BATTING_SO
##
## Df Sum of Sq RSS AIC
## + TEAM_BASERUN_SB 1 1532.68 134917 6721.7
## + TEAM_PITCHING_SO 1 369.01 136080 6734.5
## + TEAM_PITCHING_HR 1 303.24 136146 6735.2
## + TEAM_PITCHING_BB 1 296.10 136153 6735.3
## + TEAM_PITCHING_H 1 279.71 136170 6735.5
## <none> 136449 6736.5
##
## Step: AIC=6721.73
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_FIELDING_E + TEAM_BATTING_BB +
## TEAM_BATTING_2B + TEAM_FIELDING_DP + TEAM_BATTING_3B + TEAM_BASERUN_CS +
## TEAM_BATTING_HR + TEAM_BATTING_SO + TEAM_BASERUN_SB
##
## Df Sum of Sq RSS AIC
## + TEAM_PITCHING_SO 1 384.34 134532 6719.5
## + TEAM_PITCHING_HR 1 325.77 134591 6720.1
## + TEAM_PITCHING_BB 1 299.45 134617 6720.4
## + TEAM_PITCHING_H 1 288.33 134628 6720.6
## <none> 134917 6721.7
##
## Step: AIC=6719.49
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_FIELDING_E + TEAM_BATTING_BB +
## TEAM_BATTING_2B + TEAM_FIELDING_DP + TEAM_BATTING_3B + TEAM_BASERUN_CS +
## TEAM_BATTING_HR + TEAM_BATTING_SO + TEAM_BASERUN_SB + TEAM_PITCHING_SO
##
## Df Sum of Sq RSS AIC
## + TEAM_PITCHING_H 1 201.975 134330 6719.3
## <none> 134532 6719.5
## + TEAM_PITCHING_BB 1 110.059 134422 6720.3
## + TEAM_PITCHING_HR 1 10.869 134521 6721.4
##
## Step: AIC=6719.26
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_FIELDING_E + TEAM_BATTING_BB +
## TEAM_BATTING_2B + TEAM_FIELDING_DP + TEAM_BATTING_3B + TEAM_BASERUN_CS +
## TEAM_BATTING_HR + TEAM_BATTING_SO + TEAM_BASERUN_SB + TEAM_PITCHING_SO +
## TEAM_PITCHING_H
##
## Df Sum of Sq RSS AIC
## <none> 134330 6719.3
## + TEAM_PITCHING_HR 1 6.4028 134324 6721.2
## + TEAM_PITCHING_BB 1 0.0352 134330 6721.3
Stepwise regression is a combination of both backward elimination and forward selection methods. The stepwise method is a modification of the forward selection approach and differs in that variables already in the model do not necessarily stay. As in forward selection, stepwise regression adds one variable to the model at a time. After a variable is added, however, stepwise regression re-checks all the variables already included to see whether any of them should be deleted because it no longer improves the model according to a certain criterion.
The function stepAIC() can also be used to conduct forward selection and, with direction = "both", stepwise selection.
# Stepwise selection: starting from the full model, each step may either drop
# a predictor or re-add one that was dropped earlier, whichever lowers the AIC
# most. The model is fitted on the complete-case data (df_train_clean) so that
# every candidate uses the same rows; stepAIC() stops with "number of rows in
# use has changed" when missing values make the row count vary between models.
model3_both <- stepAIC(
  object = lm(formula = TARGET_WINS ~ ., data = df_train_clean),
  direction = "both"
)
## Start: AIC=6723.18
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
## TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB +
## TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB +
## TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP
##
## Df Sum of Sq RSS AIC
## - TEAM_PITCHING_BB 1 0.8 134324 6721.2
## - TEAM_PITCHING_HR 1 7.2 134330 6721.3
## - TEAM_BATTING_SO 1 55.2 134378 6721.8
## - TEAM_BATTING_H 1 56.5 134380 6721.8
## - TEAM_BATTING_HR 1 68.5 134392 6721.9
## - TEAM_BATTING_BB 1 81.0 134404 6722.1
## - TEAM_PITCHING_H 1 98.0 134421 6722.3
## <none> 134323 6723.2
## - TEAM_PITCHING_SO 1 264.1 134587 6724.1
## - TEAM_BASERUN_CS 1 746.8 135070 6729.4
## - TEAM_BASERUN_SB 1 1557.8 135881 6738.3
## - TEAM_BATTING_3B 1 4838.9 139162 6773.8
## - TEAM_BATTING_2B 1 5166.3 139489 6777.3
## - TEAM_FIELDING_DP 1 6742.5 141066 6794.0
## - TEAM_FIELDING_E 1 22427.4 156751 6950.6
##
## Step: AIC=6721.19
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
## TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB +
## TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_SO +
## TEAM_FIELDING_E + TEAM_FIELDING_DP
##
## Df Sum of Sq RSS AIC
## - TEAM_PITCHING_HR 1 6.4 134330 6719.3
## - TEAM_BATTING_SO 1 56.2 134380 6719.8
## - TEAM_BATTING_HR 1 77.9 134402 6720.1
## - TEAM_BATTING_H 1 147.2 134471 6720.8
## <none> 134324 6721.2
## - TEAM_PITCHING_H 1 197.5 134521 6721.4
## - TEAM_PITCHING_SO 1 266.3 134590 6722.1
## + TEAM_PITCHING_BB 1 0.8 134323 6723.2
## - TEAM_BASERUN_CS 1 746.5 135070 6727.4
## - TEAM_BASERUN_SB 1 1564.2 135888 6736.4
## - TEAM_BATTING_3B 1 4840.8 139165 6771.8
## - TEAM_BATTING_2B 1 5175.9 139500 6775.4
## - TEAM_FIELDING_DP 1 6744.6 141069 6792.0
## - TEAM_BATTING_BB 1 12568.9 146893 6852.1
## - TEAM_FIELDING_E 1 22491.7 156816 6949.2
##
## Step: AIC=6719.26
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
## TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB +
## TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E +
## TEAM_FIELDING_DP
##
## Df Sum of Sq RSS AIC
## - TEAM_BATTING_SO 1 51.2 134382 6717.8
## - TEAM_BATTING_H 1 144.7 134475 6718.9
## <none> 134330 6719.3
## - TEAM_PITCHING_H 1 202.0 134532 6719.5
## - TEAM_PITCHING_SO 1 298.0 134628 6720.6
## + TEAM_PITCHING_HR 1 6.4 134324 6721.2
## + TEAM_PITCHING_BB 1 0.0 134330 6721.3
## - TEAM_BASERUN_CS 1 742.6 135073 6725.5
## - TEAM_BASERUN_SB 1 1570.4 135901 6734.5
## - TEAM_BATTING_3B 1 4842.6 139173 6769.9
## - TEAM_BATTING_2B 1 5198.7 139529 6773.7
## - TEAM_FIELDING_DP 1 6744.4 141075 6790.1
## - TEAM_BATTING_HR 1 9780.8 144111 6821.7
## - TEAM_BATTING_BB 1 12606.9 146937 6850.6
## - TEAM_FIELDING_E 1 22525.1 156855 6947.6
##
## Step: AIC=6717.83
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
## TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BASERUN_SB + TEAM_BASERUN_CS +
## TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP
##
## Df Sum of Sq RSS AIC
## <none> 134382 6717.8
## + TEAM_BATTING_SO 1 51.2 134330 6719.3
## + TEAM_PITCHING_BB 1 2.7 134379 6719.8
## + TEAM_PITCHING_HR 1 1.5 134380 6719.8
## - TEAM_BASERUN_CS 1 737.6 135119 6724.0
## - TEAM_PITCHING_H 1 1355.1 135737 6730.7
## - TEAM_BASERUN_SB 1 1575.6 135957 6733.2
## - TEAM_BATTING_H 1 1740.1 136122 6734.9
## - TEAM_BATTING_3B 1 4849.8 139231 6768.5
## - TEAM_BATTING_2B 1 5148.1 139530 6771.7
## - TEAM_FIELDING_DP 1 6779.2 141161 6789.0
## - TEAM_PITCHING_SO 1 7395.1 141777 6795.4
## - TEAM_BATTING_HR 1 9785.1 144167 6820.3
## - TEAM_BATTING_BB 1 12619.7 147001 6849.2
## - TEAM_FIELDING_E 1 22552.0 156934 6946.4
?stargazer
# Side-by-side comparison of the three selected models, rendered as plain text
# on the console; the labels follow the order in which the models are passed.
stargazer(
  model3_backward,
  model3_forward,
  model3_both,
  column.labels = c("Backward", "Forward", "Stepwise"),
  type = "text"
)
##
## ===================================================================================================
## Dependent variable:
## -------------------------------------------------------------------------------
## TARGET_WINS
## Backward Forward Stepwise
## (1) (2) (3)
## ---------------------------------------------------------------------------------------------------
## TEAM_BATTING_H 0.026*** 0.017 0.026***
## (0.006) (0.013) (0.006)
##
## TEAM_BATTING_2B -0.070*** -0.071*** -0.070***
## (0.009) (0.009) (0.009)
##
## TEAM_BATTING_3B 0.162*** 0.162*** 0.162***
## (0.022) (0.022) (0.022)
##
## TEAM_BATTING_HR 0.098*** 0.098*** 0.098***
## (0.009) (0.009) (0.009)
##
## TEAM_BATTING_SO 0.015
## (0.021)
##
## TEAM_BATTING_BB 0.039*** 0.039*** 0.039***
## (0.003) (0.003) (0.003)
##
## TEAM_BASERUN_SB 0.036*** 0.036*** 0.036***
## (0.009) (0.009) (0.009)
##
## TEAM_BASERUN_CS 0.052*** 0.052*** 0.052***
## (0.018) (0.018) (0.018)
##
## TEAM_PITCHING_H 0.009*** 0.018 0.009***
## (0.002) (0.012) (0.002)
##
## TEAM_PITCHING_SO -0.021*** -0.035* -0.021***
## (0.002) (0.020) (0.002)
##
## TEAM_FIELDING_E -0.156*** -0.156*** -0.156***
## (0.010) (0.010) (0.010)
##
## TEAM_FIELDING_DP -0.113*** -0.113*** -0.113***
## (0.013) (0.013) (0.013)
##
## Constant 58.446*** 57.876*** 58.446***
## (6.589) (6.634) (6.589)
##
## ---------------------------------------------------------------------------------------------------
## Observations 1,486 1,486 1,486
## R2 0.438 0.439 0.438
## Adjusted R2 0.434 0.434 0.434
## Residual Std. Error 9.548 (df = 1474) 9.550 (df = 1473) 9.548 (df = 1474)
## F Statistic 104.596*** (df = 11; 1474) 95.898*** (df = 12; 1473) 104.596*** (df = 11; 1474)
## ===================================================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
# Keep complete cases only (rows without any missing value).
df_train_clean <- na.omit(df_train)
# Show the column indices of the predictor block (everything except the first
# column, TARGET_WINS): 14 independent vars.
head(col(df_train_clean[-1]))
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## [1,] 1 2 3 4 5 6 7 8 9 10 11 12 13 14
## [2,] 1 2 3 4 5 6 7 8 9 10 11 12 13 14
## [3,] 1 2 3 4 5 6 7 8 9 10 11 12 13 14
## [4,] 1 2 3 4 5 6 7 8 9 10 11 12 13 14
## [5,] 1 2 3 4 5 6 7 8 9 10 11 12 13 14
## [6,] 1 2 3 4 5 6 7 8 9 10 11 12 13 14
# Best-subset selection with leaps::regsubsets().
library(leaps)
?regsubsets()
# Search over predictor subsets using the complete-case data.
subset_model <- regsubsets(
  x = df_train_clean[-1],  # all columns except the first (TARGET_WINS)
  y = df_train_clean[[1]], # first column, TARGET_WINS, is the outcome
  nbest = 1,               # record only the best subset of each size - keep
  nvmax = 15               # maximum number of predictors - change
)
# Summarise the best model of each size and print the inclusion table.
model_summary <- summary(object = subset_model)
print(model_summary)
## Subset selection object
## 14 Variables (and intercept)
## Forced in Forced out
## TEAM_BATTING_H FALSE FALSE
## TEAM_BATTING_2B FALSE FALSE
## TEAM_BATTING_3B FALSE FALSE
## TEAM_BATTING_HR FALSE FALSE
## TEAM_BATTING_BB FALSE FALSE
## TEAM_BATTING_SO FALSE FALSE
## TEAM_BASERUN_SB FALSE FALSE
## TEAM_BASERUN_CS FALSE FALSE
## TEAM_PITCHING_H FALSE FALSE
## TEAM_PITCHING_HR FALSE FALSE
## TEAM_PITCHING_BB FALSE FALSE
## TEAM_PITCHING_SO FALSE FALSE
## TEAM_FIELDING_E FALSE FALSE
## TEAM_FIELDING_DP FALSE FALSE
## 1 subsets of each size up to 14
## Selection Algorithm: exhaustive
## TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B TEAM_BATTING_HR
## 1 ( 1 ) "*" " " " " " "
## 2 ( 1 ) "*" " " " " " "
## 3 ( 1 ) "*" " " " " " "
## 4 ( 1 ) "*" "*" " " " "
## 5 ( 1 ) "*" "*" " " " "
## 6 ( 1 ) " " " " "*" "*"
## 7 ( 1 ) " " " " "*" "*"
## 8 ( 1 ) " " "*" "*" "*"
## 9 ( 1 ) "*" "*" "*" " "
## 10 ( 1 ) "*" "*" "*" " "
## 11 ( 1 ) "*" "*" "*" "*"
## 12 ( 1 ) "*" "*" "*" "*"
## 13 ( 1 ) "*" "*" "*" "*"
## 14 ( 1 ) "*" "*" "*" "*"
## TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS
## 1 ( 1 ) " " " " " " " "
## 2 ( 1 ) " " " " " " " "
## 3 ( 1 ) "*" " " " " " "
## 4 ( 1 ) "*" " " " " " "
## 5 ( 1 ) "*" " " " " " "
## 6 ( 1 ) " " " " "*" " "
## 7 ( 1 ) " " " " "*" " "
## 8 ( 1 ) " " " " "*" " "
## 9 ( 1 ) "*" " " "*" " "
## 10 ( 1 ) "*" " " "*" "*"
## 11 ( 1 ) "*" " " "*" "*"
## 12 ( 1 ) "*" "*" "*" "*"
## 13 ( 1 ) "*" "*" "*" "*"
## 14 ( 1 ) "*" "*" "*" "*"
## TEAM_PITCHING_H TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO
## 1 ( 1 ) " " " " " " " "
## 2 ( 1 ) " " " " " " " "
## 3 ( 1 ) " " " " " " " "
## 4 ( 1 ) " " " " " " " "
## 5 ( 1 ) " " " " " " " "
## 6 ( 1 ) " " " " "*" "*"
## 7 ( 1 ) " " " " "*" "*"
## 8 ( 1 ) " " " " "*" "*"
## 9 ( 1 ) " " "*" " " "*"
## 10 ( 1 ) " " "*" " " "*"
## 11 ( 1 ) "*" " " " " "*"
## 12 ( 1 ) "*" " " " " "*"
## 13 ( 1 ) "*" "*" " " "*"
## 14 ( 1 ) "*" "*" "*" "*"
## TEAM_FIELDING_E TEAM_FIELDING_DP
## 1 ( 1 ) " " " "
## 2 ( 1 ) "*" " "
## 3 ( 1 ) "*" " "
## 4 ( 1 ) "*" " "
## 5 ( 1 ) "*" "*"
## 6 ( 1 ) "*" " "
## 7 ( 1 ) "*" "*"
## 8 ( 1 ) "*" "*"
## 9 ( 1 ) "*" "*"
## 10 ( 1 ) "*" "*"
## 11 ( 1 ) "*" "*"
## 12 ( 1 ) "*" "*"
## 13 ( 1 ) "*" "*"
## 14 ( 1 ) "*" "*"
# One row per subset size: the predictor inclusion indicators from
# model_summary$which, followed by the fit statistics rounded to 3 decimals.
cbind(
  model_summary$which,
  round(
    x = do.call(
      cbind,
      list(
        rsq = model_summary$rsq,
        adjr2 = model_summary$adjr2,
        cp = model_summary$cp,
        bic = model_summary$bic,
        rss = model_summary$rss
      )
    ),
    digits = 3
  )
)
## (Intercept) TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B TEAM_BATTING_HR
## 1 1 1 0 0 0
## 2 1 1 0 0 0
## 3 1 1 0 0 0
## 4 1 1 1 0 0
## 5 1 1 1 0 0
## 6 1 0 0 1 1
## 7 1 0 0 1 1
## 8 1 0 1 1 1
## 9 1 1 1 1 0
## 10 1 1 1 1 0
## 11 1 1 1 1 1
## 12 1 1 1 1 1
## 13 1 1 1 1 1
## 14 1 1 1 1 1
## TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS
## 1 0 0 0 0
## 2 0 0 0 0
## 3 1 0 0 0
## 4 1 0 0 0
## 5 1 0 0 0
## 6 0 0 1 0
## 7 0 0 1 0
## 8 0 0 1 0
## 9 1 0 1 0
## 10 1 0 1 1
## 11 1 0 1 1
## 12 1 1 1 1
## 13 1 1 1 1
## 14 1 1 1 1
## TEAM_PITCHING_H TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 1 1
## 7 0 0 1 1
## 8 0 0 1 1
## 9 0 1 0 1
## 10 0 1 0 1
## 11 1 0 0 1
## 12 1 0 0 1
## 13 1 1 0 1
## 14 1 1 1 1
## TEAM_FIELDING_E TEAM_FIELDING_DP rsq adjr2 cp bic rss
## 1 0 0 0.129 0.128 800.442 -190.551 208419.3
## 2 1 0 0.219 0.218 566.177 -345.627 186845.0
## 3 1 0 0.277 0.275 417.084 -452.315 173048.0
## 4 1 0 0.319 0.317 307.683 -535.036 162875.6
## 5 1 1 0.357 0.355 211.322 -612.024 153893.8
## 6 1 0 0.385 0.382 140.120 -670.708 147209.4
## 7 1 1 0.409 0.407 77.673 -724.030 141324.5
## 8 1 1 0.414 0.411 67.789 -728.181 140239.3
## 9 1 1 0.434 0.431 16.266 -773.588 135351.9
## 10 1 1 0.438 0.434 9.869 -774.727 134585.1
## 11 1 1 0.438 0.434 9.640 -769.672 134381.6
## 12 1 1 0.439 0.434 11.079 -762.935 134330.4
## 13 1 1 0.439 0.434 13.009 -755.702 134324.0
## 14 1 1 0.439 0.433 15.000 -748.407 134323.2