Moneyball Linear Regression

Importing the Data

# Start from an empty workspace, then load the training data from disk.
rm(list = ls())
money <- read.csv(file = "~/Downloads/moneyball-training-data.csv")

Cleaning the Data

# Logical NA mask for the first six rows (TRUE marks a missing value)
head(x = is.na(money), n = 6L)
     INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
[1,] FALSE       FALSE          FALSE           FALSE           FALSE
[2,] FALSE       FALSE          FALSE           FALSE           FALSE
[3,] FALSE       FALSE          FALSE           FALSE           FALSE
[4,] FALSE       FALSE          FALSE           FALSE           FALSE
[5,] FALSE       FALSE          FALSE           FALSE           FALSE
[6,] FALSE       FALSE          FALSE           FALSE           FALSE
     TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
[1,]           FALSE           FALSE           FALSE            TRUE
[2,]           FALSE           FALSE           FALSE           FALSE
[3,]           FALSE           FALSE           FALSE           FALSE
[4,]           FALSE           FALSE           FALSE           FALSE
[5,]           FALSE           FALSE           FALSE           FALSE
[6,]           FALSE           FALSE           FALSE           FALSE
     TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
[1,]            TRUE             TRUE           FALSE            FALSE
[2,]           FALSE             TRUE           FALSE            FALSE
[3,]           FALSE             TRUE           FALSE            FALSE
[4,]           FALSE             TRUE           FALSE            FALSE
[5,]           FALSE             TRUE           FALSE            FALSE
[6,]           FALSE             TRUE           FALSE            FALSE
     TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
[1,]            FALSE            FALSE           FALSE             TRUE
[2,]            FALSE            FALSE           FALSE            FALSE
[3,]            FALSE            FALSE           FALSE            FALSE
[4,]            FALSE            FALSE           FALSE            FALSE
[5,]            FALSE            FALSE           FALSE            FALSE
[6,]            FALSE            FALSE           FALSE            FALSE
# Number of missing values in each variable of the data set.
# head() is deliberately not applied: it would keep only the first six
# variables and hide the ones that actually contain missing values.
colSums(is.na(money))
          INDEX     TARGET_WINS  TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B 
              0               0               0               0               0 
TEAM_BATTING_HR 
              0 
# ?head  # interactive help lookup; commented out so the script runs unattended
head(money, n = 10) # first 10 entries
   INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
1      1          39           1445             194              39
2      2          70           1339             219              22
3      3          86           1377             232              35
4      4          70           1387             209              38
5      5          82           1297             186              27
6      6          75           1279             200              36
7      7          80           1244             179              54
8      8          85           1273             171              37
9     11          86           1391             197              40
10    12          76           1271             213              18
   TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
1               13             143             842              NA
2              190             685            1075              37
3              137             602             917              46
4               96             451             922              43
5              102             472             920              49
6               92             443             973             107
7              122             525            1062              80
8              115             456            1027              40
9              114             447             922              69
10              96             441             827              72
   TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
1               NA               NA            9364               84
2               28               NA            1347              191
3               27               NA            1377              137
4               30               NA            1396               97
5               39               NA            1297              102
6               59               NA            1279               92
7               54               NA            1244              122
8               36               NA            1281              116
9               27               NA            1391              114
10              34               NA            1271               96
   TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
1               927             5456            1011               NA
2               689             1082             193              155
3               602              917             175              153
4               454              928             164              156
5               472              920             138              168
6               443              973             123              149
7               525             1062             136              186
8               459             1033             112              136
9               447              922             127              169
10              441              827             131              159
#install.packages("visdat")

library(visdat)

# vis_miss(df)

# Visual overview of the raw data set (visdat)
vis_dat(x = money)

# Removing the variables with the most missing values.
# The column is named TEAM_BASERUN_CS in this data set; assigning NULL to a
# name that does not exist (TEAM_BATTING_CS) silently leaves the data unchanged.
# NOTE: dropping this column changes the predictors and complete rows available
# to the models fitted below, so their printed output must be regenerated.

money_clean <- money
money_clean$TEAM_BATTING_HBP <- NULL
money_clean$TEAM_BASERUN_CS <- NULL
vis_miss(money_clean)

clean_df <- money_clean

Ggplot

library(ggplot2)

# Reshape to long format (a `variable` and a `value` column) for faceted plotting
df_melted <- reshape2::melt(data = clean_df)
No id variables; using all as measure variables
# One histogram per variable, each panel with its own x-axis range
ggplot(data = df_melted, mapping = aes(x = value)) +
  geom_histogram() +
  facet_wrap(~variable, scales = "free_x")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Removed 1393 rows containing non-finite outside the scale range
(`stat_bin()`).

# Baseline model: TARGET_WINS on two batting predictors
reg1 <- lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B, data = clean_df)

# "Kitchen sink" model: TARGET_WINS regressed on every other column of clean_df
# NOTE(review): `.` also pulls in INDEX, which looks like a row identifier -
# confirm that it is meant to be a predictor.
kitchen_sink <- lm(formula = TARGET_WINS ~ ., data = clean_df)
library(stargazer)

Please cite as: 
 Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
 R package version 5.2.3. https://CRAN.R-project.org/package=stargazer 
# Creating a predicting equation

# Side-by-side plain-text table of both fitted models
stargazer(
  reg1,
  kitchen_sink,
  type = "text"
)

=======================================================================
                                    Dependent variable:                
                    ---------------------------------------------------
                                        TARGET_WINS                    
                               (1)                       (2)           
-----------------------------------------------------------------------
INDEX                                                  0.0003          
                                                      (0.0004)         
                                                                       
TEAM_BATTING_H              0.036***                    0.016          
                             (0.003)                   (0.020)         
                                                                       
TEAM_BATTING_2B             0.035***                  -0.070***        
                             (0.008)                   (0.009)         
                                                                       
TEAM_BATTING_3B                                       0.160***         
                                                       (0.022)         
                                                                       
TEAM_BATTING_HR                                         0.075          
                                                       (0.085)         
                                                                       
TEAM_BATTING_BB                                         0.043          
                                                       (0.046)         
                                                                       
TEAM_BATTING_SO                                         0.018          
                                                       (0.023)         
                                                                       
TEAM_BASERUN_SB                                       0.035***         
                                                       (0.009)         
                                                                       
TEAM_BASERUN_CS                                       0.054***         
                                                       (0.018)         
                                                                       
TEAM_PITCHING_H                                         0.019          
                                                       (0.018)         
                                                                       
TEAM_PITCHING_HR                                        0.022          
                                                       (0.082)         
                                                                       
TEAM_PITCHING_BB                                       -0.003          
                                                       (0.045)         
                                                                       
TEAM_PITCHING_SO                                       -0.038*         
                                                       (0.022)         
                                                                       
TEAM_FIELDING_E                                       -0.156***        
                                                       (0.010)         
                                                                       
TEAM_FIELDING_DP                                      -0.113***        
                                                       (0.013)         
                                                                       
Constant                    19.477***                 57.843***        
                             (3.102)                   (6.645)         
                                                                       
-----------------------------------------------------------------------
Observations                  2,276                     1,486          
R2                            0.158                     0.439          
Adjusted R2                   0.158                     0.433          
Residual Std. Error    14.457 (df = 2273)         9.557 (df = 1470)    
F Statistic         213.858*** (df = 2; 2273) 76.630*** (df = 15; 1470)
=======================================================================
Note:                                       *p<0.1; **p<0.05; ***p<0.01
library(MASS)
# ?stepAIC  # interactive help lookup; commented out so the script runs unattended

# Finding the best model: backward elimination by AIC, starting from the
# kitchen-sink fit and dropping one term per step.
best_model <-
  stepAIC(object = kitchen_sink,
          direction = "backward")
Start:  AIC=6724.68
TARGET_WINS ~ INDEX + TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
    TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB + 
    TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB + 
    TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP

                   Df Sum of Sq    RSS    AIC
- TEAM_PITCHING_BB  1       0.5 134278 6722.7
- TEAM_PITCHING_HR  1       6.6 134285 6722.8
- INDEX             1      45.3 134323 6723.2
- TEAM_BATTING_SO   1      52.0 134330 6723.3
- TEAM_BATTING_H    1      59.8 134338 6723.3
- TEAM_BATTING_HR   1      70.3 134348 6723.5
- TEAM_BATTING_BB   1      78.1 134356 6723.5
- TEAM_PITCHING_H   1      93.3 134371 6723.7
<none>                          134278 6724.7
- TEAM_PITCHING_SO  1     260.2 134538 6725.6
- TEAM_BASERUN_CS   1     780.9 135059 6731.3
- TEAM_BASERUN_SB   1    1430.9 135709 6738.4
- TEAM_BATTING_3B   1    4664.4 138942 6773.4
- TEAM_BATTING_2B   1    5147.7 139426 6778.6
- TEAM_FIELDING_DP  1    6780.8 141059 6795.9
- TEAM_FIELDING_E   1   22451.0 156729 6952.4

Step:  AIC=6722.69
TARGET_WINS ~ INDEX + TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
    TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB + 
    TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_SO + 
    TEAM_FIELDING_E + TEAM_FIELDING_DP

                   Df Sum of Sq    RSS    AIC
- TEAM_PITCHING_HR  1       6.2 134285 6720.8
- INDEX             1      45.6 134324 6721.2
- TEAM_BATTING_SO   1      52.7 134331 6721.3
- TEAM_BATTING_HR   1      78.9 134357 6721.6
- TEAM_BATTING_H    1     149.9 134428 6722.3
<none>                          134278 6722.7
- TEAM_PITCHING_H   1     193.8 134472 6722.8
- TEAM_PITCHING_SO  1     262.0 134540 6723.6
- TEAM_BASERUN_CS   1     780.8 135059 6729.3
- TEAM_BASERUN_SB   1    1435.3 135714 6736.5
- TEAM_BATTING_3B   1    4668.0 138946 6771.5
- TEAM_BATTING_2B   1    5156.1 139434 6776.7
- TEAM_FIELDING_DP  1    6782.9 141061 6793.9
- TEAM_BATTING_BB   1   12608.7 146887 6854.1
- TEAM_FIELDING_E   1   22517.9 156796 6951.1

Step:  AIC=6720.76
TARGET_WINS ~ INDEX + TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
    TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB + 
    TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E + 
    TEAM_FIELDING_DP

                   Df Sum of Sq    RSS    AIC
- INDEX             1      45.8 134330 6719.3
- TEAM_BATTING_SO   1      47.9 134332 6719.3
- TEAM_BATTING_H    1     147.4 134432 6720.4
<none>                          134285 6720.8
- TEAM_PITCHING_H   1     198.2 134483 6720.9
- TEAM_PITCHING_SO  1     293.6 134578 6722.0
- TEAM_BASERUN_CS   1     777.0 135062 6727.3
- TEAM_BASERUN_SB   1    1440.7 135725 6734.6
- TEAM_BATTING_3B   1    4669.4 138954 6769.5
- TEAM_BATTING_2B   1    5178.5 139463 6775.0
- TEAM_FIELDING_DP  1    6783.0 141067 6792.0
- TEAM_BATTING_HR   1    9801.3 144086 6823.4
- TEAM_BATTING_BB   1   12647.1 146932 6852.5
- TEAM_FIELDING_E   1   22551.2 156836 6949.4

Step:  AIC=6719.26
TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
    TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB + 
    TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E + 
    TEAM_FIELDING_DP

                   Df Sum of Sq    RSS    AIC
- TEAM_BATTING_SO   1      51.2 134382 6717.8
- TEAM_BATTING_H    1     144.7 134475 6718.9
<none>                          134330 6719.3
- TEAM_PITCHING_H   1     202.0 134532 6719.5
- TEAM_PITCHING_SO  1     298.0 134628 6720.6
- TEAM_BASERUN_CS   1     742.6 135073 6725.5
- TEAM_BASERUN_SB   1    1570.4 135901 6734.5
- TEAM_BATTING_3B   1    4842.6 139173 6769.9
- TEAM_BATTING_2B   1    5198.7 139529 6773.7
- TEAM_FIELDING_DP  1    6744.4 141075 6790.1
- TEAM_BATTING_HR   1    9780.8 144111 6821.7
- TEAM_BATTING_BB   1   12606.9 146937 6850.6
- TEAM_FIELDING_E   1   22525.1 156855 6947.6

Step:  AIC=6717.83
TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
    TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BASERUN_SB + TEAM_BASERUN_CS + 
    TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP

                   Df Sum of Sq    RSS    AIC
<none>                          134382 6717.8
- TEAM_BASERUN_CS   1     737.6 135119 6724.0
- TEAM_PITCHING_H   1    1355.1 135737 6730.7
- TEAM_BASERUN_SB   1    1575.6 135957 6733.2
- TEAM_BATTING_H    1    1740.1 136122 6734.9
- TEAM_BATTING_3B   1    4849.8 139231 6768.5
- TEAM_BATTING_2B   1    5148.1 139530 6771.7
- TEAM_FIELDING_DP  1    6779.2 141161 6789.0
- TEAM_PITCHING_SO  1    7395.1 141777 6795.4
- TEAM_BATTING_HR   1    9785.1 144167 6820.3
- TEAM_BATTING_BB   1   12619.7 147001 6849.2
- TEAM_FIELDING_E   1   22552.0 156934 6946.4
  1. In our final model, TARGET_WINS is the response variable, and we kept the following predictors: TEAM_BATTING_H, TEAM_BATTING_2B, TEAM_BATTING_3B, TEAM_BATTING_HR, TEAM_BATTING_BB, TEAM_BASERUN_SB, TEAM_BASERUN_CS, TEAM_PITCHING_H, TEAM_PITCHING_SO, TEAM_FIELDING_E, and TEAM_FIELDING_DP.
  2. For the most part, these variables make sense because they all positively impact the team's performance and its winning/scoring chances.