Set-Up

Load Packages

# Intentionally no `rm(list = ls())` here: it would delete the caller's objects
# when this document is rendered or sourced from an interactive session, yet it
# does not detach packages or reset options, so it never gave a clean slate.
# Run the analysis in a fresh R session instead.
library(ggplot2)
library(visdat)
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(reshape2)

Import Data

# Read the training data (path is relative to the working directory).
dfmoneyball <- read.csv(file = "moneyball-training-data.csv")

Visualize Missing Values

# Plot the raw data frame to see which columns contain missing values.
visdat::vis_dat(x = dfmoneyball)

Clean Data

# Drop the 11th column by position, then keep only the rows that have no
# missing value in any remaining column.
# NOTE(review): presumably column 11 is the mostly-missing column seen in the
# vis_dat plot; confirm, and consider dropping it by name instead.
dfmoneyball2 <- dfmoneyball[, -11]
dfmoneyballclean <- na.omit(object = dfmoneyball2)

Summary Statistics

Base Money Ball Statistics

# Preview the first six rows of the cleaned data.
head(x = dfmoneyballclean, n = 6L)
##   INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 2     2          70           1339             219              22
## 3     3          86           1377             232              35
## 4     4          70           1387             209              38
## 5     5          82           1297             186              27
## 6     6          75           1279             200              36
## 7     7          80           1244             179              54
##   TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## 2             190             685            1075              37
## 3             137             602             917              46
## 4              96             451             922              43
## 5             102             472             920              49
## 6              92             443             973             107
## 7             122             525            1062              80
##   TEAM_BASERUN_CS TEAM_PITCHING_H TEAM_PITCHING_HR TEAM_PITCHING_BB
## 2              28            1347              191              689
## 3              27            1377              137              602
## 4              30            1396               97              454
## 5              39            1297              102              472
## 6              59            1279               92              443
## 7              54            1244              122              525
##   TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 2             1082             193              155
## 3              917             175              153
## 4              928             164              156
## 5              920             138              168
## 6              973             123              149
## 7             1062             136              186

Summary Statistics Table

# Summary-statistics table (mean, standard deviation, min, max) for every
# column of the cleaned data, rendered as plain text.
# The observation count in the table note is computed from the data so it
# cannot drift from the real number of rows if the cleaning step changes.
# covariate.labels are applied by position, so their order must follow the
# column order of dfmoneyballclean.
stargazer(dfmoneyballclean, type = "text",
          title = "Money Ball Statistics",
          summary.stat = c("mean", "sd", "min", "max"),
          notes = paste(nrow(dfmoneyballclean), "Trials for all Stats"),
          covariate.labels = c("Index",
                               "Wins",
                               "Batting Base Hits ",
                               "Batting Doubles",
                               "Batting Triples",
                               "Batting Homeruns",
                               "Batting Walks",
                               "Batting Strikeouts",
                               "Baserunning Stolen Bases",
                               "Baserunning Caught Stealing",
                               "Pitching Hits Allowed",
                               "Pitching Homeruns Allowed",
                               "Pitching Walks Allowed",
                               "Pitching Strikeouts",
                               "Fielding Errors",
                               "Fielding Double Plays"))
## 
## Money Ball Statistics
## ==========================================================
## Statistic                     Mean    St. Dev.  Min   Max 
## ----------------------------------------------------------
## Index                       1,273.812 725.508    2   2,534
## Wins                         80.997    12.694   41    117 
## Batting Base Hits           1,452.157 104.336  1,137 1,786
## Batting Doubles              250.970   42.026   154   377 
## Batting Triples              42.905    18.649   11    129 
## Batting Homeruns             129.842   48.609   11    264 
## Batting Walks                541.888   80.567   309   878 
## Batting Strikeouts           841.743  200.355   326  1,399
## Baserunning Stolen Bases     95.858    44.345   18    314 
## Baserunning Caught Stealing  52.963    22.851   11    201 
## Pitching Hits Allowed       1,505.122 173.472  1,137 2,394
## Pitching Homeruns Allowed    134.069   50.902   12    343 
## Pitching Walks Allowed       561.570   97.347   325  1,090
## Pitching Strikeouts          869.253  211.859   345  1,781
## Fielding Errors              143.145   38.954   65    360 
## Fielding Double Plays        153.743   20.321   87    228 
## ----------------------------------------------------------
## 1486 Trials for all Stats

Graphs

# Reshape to long format (columns `variable` and `value`). No id variables are
# given, so every column is treated as a measure variable.
df_melted <- reshape2::melt(data = dfmoneyballclean)
## No id variables; using all as measure variables
# One histogram panel per variable, each with its own x-axis range.
# The bin count is left at geom_histogram()'s default.
ggplot(data = df_melted, mapping = aes(x = value)) +
  facet_wrap(facets = ~variable, scales = "free_x") +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Linear Regressions

First Regression (all data)

# Baseline model: TARGET_WINS regressed on every other column.
# INDEX is excluded by name rather than by column position, so the predictor
# set stays correct if the column order of the input ever changes.
# drop = FALSE guarantees the result is still a data frame.
predictor_cols <- setdiff(names(dfmoneyballclean), "INDEX")
dfmoneyballclean2 <- dfmoneyballclean[, predictor_cols, drop = FALSE]
moneyballregression <- lm(TARGET_WINS ~ ., data = dfmoneyballclean2)
summary(moneyballregression)
## 
## Call:
## lm(formula = TARGET_WINS ~ ., data = dfmoneyballclean2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -30.5627  -6.6932  -0.1328   6.5249  27.8525 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      57.912438   6.642839   8.718  < 2e-16 ***
## TEAM_BATTING_H    0.015434   0.019626   0.786   0.4318    
## TEAM_BATTING_2B  -0.070472   0.009369  -7.522 9.36e-14 ***
## TEAM_BATTING_3B   0.161551   0.022192   7.280 5.43e-13 ***
## TEAM_BATTING_HR   0.073952   0.085392   0.866   0.3866    
## TEAM_BATTING_BB   0.043765   0.046454   0.942   0.3463    
## TEAM_BATTING_SO   0.018250   0.023463   0.778   0.4368    
## TEAM_BASERUN_SB   0.035880   0.008687   4.130 3.83e-05 ***
## TEAM_BASERUN_CS   0.052124   0.018227   2.860   0.0043 ** 
## TEAM_PITCHING_H   0.019044   0.018381   1.036   0.3003    
## TEAM_PITCHING_HR  0.022997   0.082092   0.280   0.7794    
## TEAM_PITCHING_BB -0.004180   0.044692  -0.094   0.9255    
## TEAM_PITCHING_SO -0.038176   0.022447  -1.701   0.0892 .  
## TEAM_FIELDING_E  -0.155876   0.009946 -15.672  < 2e-16 ***
## TEAM_FIELDING_DP -0.112885   0.013137  -8.593  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.556 on 1471 degrees of freedom
## Multiple R-squared:  0.4386, Adjusted R-squared:  0.4333 
## F-statistic:  82.1 on 14 and 1471 DF,  p-value: < 2.2e-16

Variable Removal

Finding Useless Variables

library(MASS)
# Backward elimination starting from the full model; the trace and the model
# it ends on are printed, not stored.
MASS::stepAIC(moneyballregression, direction = "backward")
## Start:  AIC=6723.18
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
##     TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB + 
##     TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB + 
##     TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP
## 
##                    Df Sum of Sq    RSS    AIC
## - TEAM_PITCHING_BB  1       0.8 134324 6721.2
## - TEAM_PITCHING_HR  1       7.2 134330 6721.3
## - TEAM_BATTING_SO   1      55.2 134378 6721.8
## - TEAM_BATTING_H    1      56.5 134380 6721.8
## - TEAM_BATTING_HR   1      68.5 134392 6721.9
## - TEAM_BATTING_BB   1      81.0 134404 6722.1
## - TEAM_PITCHING_H   1      98.0 134421 6722.3
## <none>                          134323 6723.2
## - TEAM_PITCHING_SO  1     264.1 134587 6724.1
## - TEAM_BASERUN_CS   1     746.8 135070 6729.4
## - TEAM_BASERUN_SB   1    1557.8 135881 6738.3
## - TEAM_BATTING_3B   1    4838.9 139162 6773.8
## - TEAM_BATTING_2B   1    5166.3 139489 6777.3
## - TEAM_FIELDING_DP  1    6742.5 141066 6794.0
## - TEAM_FIELDING_E   1   22427.4 156751 6950.6
## 
## Step:  AIC=6721.19
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
##     TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB + 
##     TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_SO + 
##     TEAM_FIELDING_E + TEAM_FIELDING_DP
## 
##                    Df Sum of Sq    RSS    AIC
## - TEAM_PITCHING_HR  1       6.4 134330 6719.3
## - TEAM_BATTING_SO   1      56.2 134380 6719.8
## - TEAM_BATTING_HR   1      77.9 134402 6720.1
## - TEAM_BATTING_H    1     147.2 134471 6720.8
## <none>                          134324 6721.2
## - TEAM_PITCHING_H   1     197.5 134521 6721.4
## - TEAM_PITCHING_SO  1     266.3 134590 6722.1
## - TEAM_BASERUN_CS   1     746.5 135070 6727.4
## - TEAM_BASERUN_SB   1    1564.2 135888 6736.4
## - TEAM_BATTING_3B   1    4840.8 139165 6771.8
## - TEAM_BATTING_2B   1    5175.9 139500 6775.4
## - TEAM_FIELDING_DP  1    6744.6 141069 6792.0
## - TEAM_BATTING_BB   1   12568.9 146893 6852.1
## - TEAM_FIELDING_E   1   22491.7 156816 6949.2
## 
## Step:  AIC=6719.26
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
##     TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB + 
##     TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E + 
##     TEAM_FIELDING_DP
## 
##                    Df Sum of Sq    RSS    AIC
## - TEAM_BATTING_SO   1      51.2 134382 6717.8
## - TEAM_BATTING_H    1     144.7 134475 6718.9
## <none>                          134330 6719.3
## - TEAM_PITCHING_H   1     202.0 134532 6719.5
## - TEAM_PITCHING_SO  1     298.0 134628 6720.6
## - TEAM_BASERUN_CS   1     742.6 135073 6725.5
## - TEAM_BASERUN_SB   1    1570.4 135901 6734.5
## - TEAM_BATTING_3B   1    4842.6 139173 6769.9
## - TEAM_BATTING_2B   1    5198.7 139529 6773.7
## - TEAM_FIELDING_DP  1    6744.4 141075 6790.1
## - TEAM_BATTING_HR   1    9780.8 144111 6821.7
## - TEAM_BATTING_BB   1   12606.9 146937 6850.6
## - TEAM_FIELDING_E   1   22525.1 156855 6947.6
## 
## Step:  AIC=6717.83
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
##     TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BASERUN_SB + TEAM_BASERUN_CS + 
##     TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP
## 
##                    Df Sum of Sq    RSS    AIC
## <none>                          134382 6717.8
## - TEAM_BASERUN_CS   1     737.6 135119 6724.0
## - TEAM_PITCHING_H   1    1355.1 135737 6730.7
## - TEAM_BASERUN_SB   1    1575.6 135957 6733.2
## - TEAM_BATTING_H    1    1740.1 136122 6734.9
## - TEAM_BATTING_3B   1    4849.8 139231 6768.5
## - TEAM_BATTING_2B   1    5148.1 139530 6771.7
## - TEAM_FIELDING_DP  1    6779.2 141161 6789.0
## - TEAM_PITCHING_SO  1    7395.1 141777 6795.4
## - TEAM_BATTING_HR   1    9785.1 144167 6820.3
## - TEAM_BATTING_BB   1   12619.7 147001 6849.2
## - TEAM_FIELDING_E   1   22552.0 156934 6946.4
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + 
##     TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BASERUN_SB + 
##     TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E + 
##     TEAM_FIELDING_DP, data = dfmoneyballclean2)
## 
## Coefficients:
##      (Intercept)    TEAM_BATTING_H   TEAM_BATTING_2B   TEAM_BATTING_3B  
##         58.44606           0.02550          -0.06983           0.16162  
##  TEAM_BATTING_HR   TEAM_BATTING_BB   TEAM_BASERUN_SB   TEAM_BASERUN_CS  
##          0.09775           0.03948           0.03603           0.05177  
##  TEAM_PITCHING_H  TEAM_PITCHING_SO   TEAM_FIELDING_E  TEAM_FIELDING_DP  
##          0.00907          -0.02083          -0.15597          -0.11315

Removal of Variables

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)

# Build the predictor set for the final model by removing the two hit-count
# columns.
# NOTE(review): the backward stepAIC run kept TEAM_BATTING_H and
# TEAM_PITCHING_H and instead dropped TEAM_PITCHING_BB, TEAM_PITCHING_HR and
# TEAM_BATTING_SO. Confirm that this removal is intentional.
dfselected <- dplyr::select(
  dfmoneyballclean2,
  -TEAM_BATTING_H,
  -TEAM_PITCHING_H
)

Final Regression

# Refit the linear model on the reduced predictor set and report the fit.
finalregression <- lm(formula = TARGET_WINS ~ ., data = dfselected)
summary(object = finalregression)
## 
## Call:
## lm(formula = TARGET_WINS ~ ., data = dfselected)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -29.549  -6.714   0.261   6.518  32.134 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      95.800657   4.380426  21.870  < 2e-16 ***
## TEAM_BATTING_2B  -0.021020   0.006810  -3.087  0.00206 ** 
## TEAM_BATTING_3B   0.220307   0.021134  10.424  < 2e-16 ***
## TEAM_BATTING_HR   0.107772   0.085814   1.256  0.20936    
## TEAM_BATTING_BB   0.015782   0.031239   0.505  0.61348    
## TEAM_BATTING_SO  -0.006358   0.020565  -0.309  0.75725    
## TEAM_BASERUN_SB   0.044526   0.008771   5.076 4.34e-07 ***
## TEAM_BASERUN_CS   0.048784   0.018560   2.628  0.00867 ** 
## TEAM_PITCHING_HR  0.022182   0.082430   0.269  0.78789    
## TEAM_PITCHING_BB  0.021527   0.029861   0.721  0.47108    
## TEAM_PITCHING_SO -0.023125   0.019683  -1.175  0.24024    
## TEAM_FIELDING_E  -0.153743   0.010117 -15.196  < 2e-16 ***
## TEAM_FIELDING_DP -0.103948   0.013323  -7.802 1.14e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.733 on 1473 degrees of freedom
## Multiple R-squared:  0.4168, Adjusted R-squared:  0.412 
## F-statistic: 87.72 on 12 and 1473 DF,  p-value: < 2.2e-16