Moneyball.Training.Data

Clean the Data

rm(list=ls())
df <- read.csv("~/Downloads/moneyball-training-data-3.csv", header=TRUE)
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(visdat)


for(i in colnames(df)){
  df[,i][is.na(df[,i])] <- median(df[,i], na.rm=TRUE)
}

vis_miss(df)

Summary Stats on the Clean Data

clean_df <- as.data.frame(df)

library(stargazer)

Please cite as: 
 Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
 R package version 5.2.3. https://CRAN.R-project.org/package=stargazer 
stargazer(clean_df, type ="text", summary.stat = c("Mean", "Median", "Min", "Max"))

===================================================
Statistic          Mean    Median   Min     Max    
---------------------------------------------------
INDEX            1,268.464 1,270.5   1     2,535   
TARGET_WINS       80.791     82      0      146    
TEAM_BATTING_H   1,469.270  1,454   891    2,554   
TEAM_BATTING_2B   241.247    238    69      458    
TEAM_BATTING_3B   55.250     47      0      223    
TEAM_BATTING_HR   99.612     102     0      264    
TEAM_BATTING_BB   501.559    512     0      878    
TEAM_BATTING_SO   736.250    750     0     1,399   
TEAM_BASERUN_SB   123.394    101     0      697    
TEAM_BASERUN_CS   51.514     49      0      201    
TEAM_BATTING_HBP  58.114     58     29       95    
TEAM_PITCHING_H  1,779.210  1,518  1,137   30,132  
TEAM_PITCHING_HR  105.699    107     0      343    
TEAM_PITCHING_BB  553.008   536.5    0     3,645   
TEAM_PITCHING_SO  817.541  813.500 0.000 19,278.000
TEAM_FIELDING_E   246.481    159    65     1,898   
TEAM_FIELDING_DP  146.716    149    52      228    
---------------------------------------------------

Regression

kitchen_sink <- lm(data = clean_df, formula = TARGET_WINS ~ .)

Backward Selection

library(MASS)

Attaching package: 'MASS'
The following object is masked from 'package:dplyr':

    select
best_model <-
stepAIC(object = kitchen_sink, 
        direction = "backward")
Start:  AIC=11718.52
TARGET_WINS ~ INDEX + TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
    TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB + 
    TEAM_BASERUN_CS + TEAM_BATTING_HBP + TEAM_PITCHING_H + TEAM_PITCHING_HR + 
    TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP

                   Df Sum of Sq    RSS   AIC
- TEAM_PITCHING_BB  1       4.5 386150 11716
- TEAM_PITCHING_HR  1      48.5 386194 11717
- TEAM_BATTING_HBP  1      75.9 386221 11717
- TEAM_BASERUN_CS   1      80.3 386226 11717
- INDEX             1     247.3 386393 11718
<none>                          386146 11718
- TEAM_BATTING_BB   1     524.6 386670 11720
- TEAM_BATTING_HR   1     639.6 386785 11720
- TEAM_PITCHING_H   1     857.8 387003 11722
- TEAM_BATTING_2B   1     900.5 387046 11722
- TEAM_PITCHING_SO  1    1612.6 387758 11726
- TEAM_BATTING_SO   1    1843.8 387989 11727
- TEAM_BATTING_3B   1    2603.5 388749 11732
- TEAM_BASERUN_SB   1    5964.8 392110 11751
- TEAM_FIELDING_E   1   10905.3 397051 11780
- TEAM_FIELDING_DP  1   14891.9 401037 11803
- TEAM_BATTING_H    1   29812.4 415958 11886

Step:  AIC=11716.54
TARGET_WINS ~ INDEX + TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
    TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB + 
    TEAM_BASERUN_CS + TEAM_BATTING_HBP + TEAM_PITCHING_H + TEAM_PITCHING_HR + 
    TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP

                   Df Sum of Sq    RSS   AIC
- TEAM_BATTING_HBP  1      76.2 386226 11715
- TEAM_BASERUN_CS   1      81.8 386232 11715
- TEAM_PITCHING_HR  1      86.6 386237 11715
- INDEX             1     248.6 386399 11716
<none>                          386150 11716
- TEAM_BATTING_HR   1     765.5 386915 11719
- TEAM_BATTING_2B   1     898.9 387049 11720
- TEAM_PITCHING_H   1    1005.6 387156 11720
- TEAM_BATTING_BB   1    1786.7 387937 11725
- TEAM_BATTING_SO   1    2034.6 388185 11726
- TEAM_BATTING_3B   1    2603.2 388753 11730
- TEAM_PITCHING_SO  1    3247.1 389397 11734
- TEAM_BASERUN_SB   1    6158.6 392309 11751
- TEAM_FIELDING_E   1   10902.6 397053 11778
- TEAM_FIELDING_DP  1   14887.4 401037 11801
- TEAM_BATTING_H    1   29859.4 416009 11884

Step:  AIC=11714.99
TARGET_WINS ~ INDEX + TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
    TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB + 
    TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_SO + 
    TEAM_FIELDING_E + TEAM_FIELDING_DP

                   Df Sum of Sq    RSS   AIC
- TEAM_BASERUN_CS   1      84.6 386311 11714
- TEAM_PITCHING_HR  1      84.7 386311 11714
- INDEX             1     242.3 386469 11714
<none>                          386226 11715
- TEAM_BATTING_HR   1     770.5 386997 11718
- TEAM_BATTING_2B   1     886.8 387113 11718
- TEAM_PITCHING_H   1    1005.6 387232 11719
- TEAM_BATTING_BB   1    1795.4 388022 11724
- TEAM_BATTING_SO   1    2007.4 388234 11725
- TEAM_BATTING_3B   1    2598.2 388824 11728
- TEAM_PITCHING_SO  1    3238.8 389465 11732
- TEAM_BASERUN_SB   1    6148.1 392374 11749
- TEAM_FIELDING_E   1   10861.6 397088 11776
- TEAM_FIELDING_DP  1   14950.5 401177 11799
- TEAM_BATTING_H    1   29878.9 416105 11883

Step:  AIC=11713.49
TARGET_WINS ~ INDEX + TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
    TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB + 
    TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_SO + TEAM_FIELDING_E + 
    TEAM_FIELDING_DP

                   Df Sum of Sq    RSS   AIC
- TEAM_PITCHING_HR  1      81.2 386392 11712
- INDEX             1     242.5 386553 11713
<none>                          386311 11714
- TEAM_BATTING_HR   1     822.9 387134 11716
- TEAM_BATTING_2B   1     910.0 387221 11717
- TEAM_PITCHING_H   1    1033.2 387344 11718
- TEAM_BATTING_BB   1    1886.5 388197 11723
- TEAM_BATTING_SO   1    2008.6 388319 11723
- TEAM_BATTING_3B   1    2638.3 388949 11727
- TEAM_PITCHING_SO  1    3260.8 389572 11731
- TEAM_BASERUN_SB   1    6116.6 392427 11747
- TEAM_FIELDING_E   1   10972.2 397283 11775
- TEAM_FIELDING_DP  1   15008.4 401319 11798
- TEAM_BATTING_H    1   29835.2 416146 11881

Step:  AIC=11711.97
TARGET_WINS ~ INDEX + TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
    TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB + 
    TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP

                   Df Sum of Sq    RSS   AIC
- INDEX             1     247.7 386640 11711
<none>                          386392 11712
- TEAM_BATTING_2B   1     926.3 387318 11715
- TEAM_PITCHING_H   1     958.1 387350 11716
- TEAM_BATTING_BB   1    1879.4 388271 11721
- TEAM_BATTING_SO   1    1990.3 388382 11722
- TEAM_BATTING_3B   1    2823.2 389215 11726
- TEAM_PITCHING_SO  1    3318.3 389710 11729
- TEAM_BASERUN_SB   1    6121.3 392513 11746
- TEAM_BATTING_HR   1    8473.8 394866 11759
- TEAM_FIELDING_E   1   10892.6 397285 11773
- TEAM_FIELDING_DP  1   14974.3 401366 11796
- TEAM_BATTING_H    1   30426.1 416818 11882

Step:  AIC=11711.43
TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
    TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB + 
    TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP

                   Df Sum of Sq    RSS   AIC
<none>                          386640 11711
- TEAM_BATTING_2B   1     929.4 387569 11715
- TEAM_PITCHING_H   1    1001.0 387641 11715
- TEAM_BATTING_BB   1    1999.1 388639 11721
- TEAM_BATTING_SO   1    2060.9 388701 11722
- TEAM_BATTING_3B   1    2739.4 389379 11726
- TEAM_PITCHING_SO  1    3328.3 389968 11729
- TEAM_BASERUN_SB   1    5986.1 392626 11744
- TEAM_BATTING_HR   1    8364.1 395004 11758
- TEAM_FIELDING_E   1   10786.9 397427 11772
- TEAM_FIELDING_DP  1   15152.3 401792 11797
- TEAM_BATTING_H    1   30558.9 417199 11883

Insights

  • In the final regression we should keep the TEAM_BATTING_2B, TEAM_BATTING_BB, TEAM_BATTING_SO, TEAM_BATTING_3B, TEAM_PITCHING_SO, TEAM_BASERUN_SB, TEAM_BATTING_HR, TEAM_FIELDING_E, TEAM_FIELDING_DP, TEAM_BATTING_H