Imports

# Load the moneyball training data and attach the packages used below.
# Deliberately no rm(list = ls()): it wipes the caller's workspace without
# giving a truly clean session (attached packages and options persist).
data_path <- "D:/Johnny/Study/Others/R/Boston College Experience Data Analysis/Weekend Homework/moneyball-training-data.csv"
df <- read.csv(data_path)

# library() attaches one package per call. In library(stargazer, MASS) the
# second positional argument is matched to `help`, so MASS was never attached.
library(MASS)
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(visdat)

Cleaning Dataset

# Drop the 1st and 11th columns of the raw data in a single step
# (presumably the row INDEX and TEAM_BATTING_HBP -- confirm against the CSV
# header), then plot column types and missingness for the remaining columns.
dropped_positions <- c(1, 11)
df <- df[, -dropped_positions]
visdat::vis_dat(df)

# Remove the 9th remaining column (presumably TEAM_BASERUN_CS -- confirm
# against the CSV header).
df[[9]] <- NULL

# Replace every missing value with the median of the observed values in its
# own column. `df[] <-` keeps the data.frame structure and column names.
df[] <- lapply(df, function(values) {
  values[is.na(values)] <- median(values, na.rm = TRUE)
  values
})

# Re-plot to confirm that no missing cells remain.
visdat::vis_dat(df)

Checking data

# Plain-text table of summary statistics for every column of df.
stargazer(df, type = "text")
## 
## ===========================================================
## Statistic          N     Mean    St. Dev.   Min     Max    
## -----------------------------------------------------------
## TARGET_WINS      2,276  80.791    15.752     0      146    
## TEAM_BATTING_H   2,276 1,469.270  144.591   891    2,554   
## TEAM_BATTING_2B  2,276  241.247   46.801    69      458    
## TEAM_BATTING_3B  2,276  55.250    27.939     0      223    
## TEAM_BATTING_HR  2,276  99.612    60.547     0      264    
## TEAM_BATTING_BB  2,276  501.559   122.671    0      878    
## TEAM_BATTING_SO  2,276  736.250   242.909    0     1,399   
## TEAM_BASERUN_SB  2,276  123.394   85.406     0      697    
## TEAM_PITCHING_H  2,276 1,779.210 1,406.843 1,137   30,132  
## TEAM_PITCHING_HR 2,276  105.699   61.299     0      343    
## TEAM_PITCHING_BB 2,276  553.008   166.357    0     3,645   
## TEAM_PITCHING_SO 2,276  817.541   540.545  0.000 19,278.000
## TEAM_FIELDING_E  2,276  246.481   227.771   65     1,898   
## TEAM_FIELDING_DP 2,276  146.716   24.538    52      228    
## -----------------------------------------------------------
# Total number of missing cells left in df.
sum(is.na(df))
## [1] 0

Fitting Model

# Full model: regress TARGET_WINS on every other column of df.
model <- lm(formula = TARGET_WINS ~ ., data = df)

# Coefficient table, residual summary and fit statistics.
summary(model)
## 
## Call:
## lm(formula = TARGET_WINS ~ ., data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -49.827  -8.580   0.103   8.432  58.544 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      22.9775583  5.3046349   4.332 1.54e-05 ***
## TEAM_BATTING_H    0.0488787  0.0036941  13.232  < 2e-16 ***
## TEAM_BATTING_2B  -0.0212136  0.0091699  -2.313 0.020791 *  
## TEAM_BATTING_3B   0.0649302  0.0167897   3.867 0.000113 ***
## TEAM_BATTING_HR   0.0545602  0.0273630   1.994 0.046279 *  
## TEAM_BATTING_BB   0.0105502  0.0058352   1.808 0.070734 .  
## TEAM_BATTING_SO  -0.0084176  0.0025457  -3.307 0.000959 ***
## TEAM_BASERUN_SB   0.0247806  0.0042572   5.821 6.69e-09 ***
## TEAM_PITCHING_H  -0.0008598  0.0003668  -2.344 0.019147 *  
## TEAM_PITCHING_HR  0.0123395  0.0243703   0.506 0.612672    
## TEAM_PITCHING_BB  0.0008863  0.0041539   0.213 0.831065    
## TEAM_PITCHING_SO  0.0028087  0.0009218   3.047 0.002338 ** 
## TEAM_FIELDING_E  -0.0191590  0.0024016  -7.978 2.35e-15 ***
## TEAM_FIELDING_DP -0.1219877  0.0129372  -9.429  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.07 on 2262 degrees of freedom
## Multiple R-squared:  0.3152, Adjusted R-squared:  0.3113 
## F-statistic:  80.1 on 13 and 2262 DF,  p-value: < 2.2e-16

Optimization: Backward Selection

# stepAIC() lives in MASS.
library(MASS)

# Backward elimination starting from the full model: at each step drop the
# term whose removal lowers AIC the most, and stop when no removal helps.
model_back <- stepAIC(object = model, direction = "backward")
## Start:  AIC=11714.87
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
##     TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB + 
##     TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO + 
##     TEAM_FIELDING_E + TEAM_FIELDING_DP
## 
##                    Df Sum of Sq    RSS   AIC
## - TEAM_PITCHING_BB  1       7.8 386553 11713
## - TEAM_PITCHING_HR  1      43.8 386589 11713
## <none>                          386546 11715
## - TEAM_BATTING_BB   1     558.6 387104 11716
## - TEAM_BATTING_HR   1     679.4 387225 11717
## - TEAM_BATTING_2B   1     914.5 387460 11718
## - TEAM_PITCHING_H   1     939.2 387485 11718
## - TEAM_PITCHING_SO  1    1586.6 388132 11722
## - TEAM_BATTING_SO   1    1868.5 388414 11724
## - TEAM_BATTING_3B   1    2555.7 389101 11728
## - TEAM_BASERUN_SB   1    5790.1 392336 11747
## - TEAM_FIELDING_E   1   10875.8 397421 11776
## - TEAM_FIELDING_DP  1   15193.6 401739 11801
## - TEAM_BATTING_H    1   29917.9 416463 11882
## 
## Step:  AIC=11712.92
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
##     TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB + 
##     TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_SO + TEAM_FIELDING_E + 
##     TEAM_FIELDING_DP
## 
##                    Df Sum of Sq    RSS   AIC
## - TEAM_PITCHING_HR  1      86.4 386640 11711
## <none>                          386553 11713
## - TEAM_BATTING_HR   1     793.8 387347 11716
## - TEAM_BATTING_2B   1     912.6 387466 11716
## - TEAM_PITCHING_H   1    1080.6 387634 11717
## - TEAM_BATTING_BB   1    2005.6 388559 11723
## - TEAM_BATTING_SO   1    2079.5 388633 11723
## - TEAM_BATTING_3B   1    2555.4 389109 11726
## - TEAM_PITCHING_SO  1    3269.0 389822 11730
## - TEAM_BASERUN_SB   1    5983.2 392536 11746
## - TEAM_FIELDING_E   1   10870.9 397424 11774
## - TEAM_FIELDING_DP  1   15186.6 401740 11799
## - TEAM_BATTING_H    1   29953.0 416506 11881
## 
## Step:  AIC=11711.43
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
##     TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB + 
##     TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP
## 
##                    Df Sum of Sq    RSS   AIC
## <none>                          386640 11711
## - TEAM_BATTING_2B   1     929.4 387569 11715
## - TEAM_PITCHING_H   1    1001.0 387641 11715
## - TEAM_BATTING_BB   1    1999.1 388639 11721
## - TEAM_BATTING_SO   1    2060.9 388701 11722
## - TEAM_BATTING_3B   1    2739.4 389379 11726
## - TEAM_PITCHING_SO  1    3328.3 389968 11729
## - TEAM_BASERUN_SB   1    5986.1 392626 11744
## - TEAM_BATTING_HR   1    8364.1 395004 11758
## - TEAM_FIELDING_E   1   10786.9 397427 11772
## - TEAM_FIELDING_DP  1   15152.3 401792 11797
## - TEAM_BATTING_H    1   30558.9 417199 11883
# Coefficient table and fit statistics for the model kept by backward selection.
summary(object = model_back)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + 
##     TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + 
##     TEAM_BASERUN_SB + TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E + 
##     TEAM_FIELDING_DP, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -49.598  -8.593   0.085   8.445  58.582 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      22.3440443  5.2338369   4.269 2.04e-05 ***
## TEAM_BATTING_H    0.0490922  0.0036699  13.377  < 2e-16 ***
## TEAM_BATTING_2B  -0.0213744  0.0091626  -2.333 0.019746 *  
## TEAM_BATTING_3B   0.0665763  0.0166230   4.005 6.40e-05 ***
## TEAM_BATTING_HR   0.0674046  0.0096315   6.998 3.40e-12 ***
## TEAM_BATTING_BB   0.0115464  0.0033748   3.421 0.000634 ***
## TEAM_BATTING_SO  -0.0085211  0.0024529  -3.474 0.000523 ***
## TEAM_BASERUN_SB   0.0249207  0.0042092   5.920 3.70e-09 ***
## TEAM_PITCHING_H  -0.0007770  0.0003209  -2.421 0.015552 *  
## TEAM_PITCHING_SO  0.0029662  0.0006719   4.415 1.06e-05 ***
## TEAM_FIELDING_E  -0.0190100  0.0023919  -7.948 2.97e-15 ***
## TEAM_FIELDING_DP -0.1217894  0.0129296  -9.419  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.07 on 2264 degrees of freedom
## Multiple R-squared:  0.3151, Adjusted R-squared:  0.3117 
## F-statistic: 94.68 on 11 and 2264 DF,  p-value: < 2.2e-16
# Side-by-side regression table: (1) full model, (2) backward-selected model.
stargazer(model, model_back, type = "text")
## 
## =======================================================================
##                                     Dependent variable:                
##                     ---------------------------------------------------
##                                         TARGET_WINS                    
##                                (1)                       (2)           
## -----------------------------------------------------------------------
## TEAM_BATTING_H              0.049***                  0.049***         
##                              (0.004)                   (0.004)         
##                                                                        
## TEAM_BATTING_2B             -0.021**                  -0.021**         
##                              (0.009)                   (0.009)         
##                                                                        
## TEAM_BATTING_3B             0.065***                  0.067***         
##                              (0.017)                   (0.017)         
##                                                                        
## TEAM_BATTING_HR              0.055**                  0.067***         
##                              (0.027)                   (0.010)         
##                                                                        
## TEAM_BATTING_BB              0.011*                   0.012***         
##                              (0.006)                   (0.003)         
##                                                                        
## TEAM_BATTING_SO             -0.008***                 -0.009***        
##                              (0.003)                   (0.002)         
##                                                                        
## TEAM_BASERUN_SB             0.025***                  0.025***         
##                              (0.004)                   (0.004)         
##                                                                        
## TEAM_PITCHING_H             -0.001**                  -0.001**         
##                             (0.0004)                  (0.0003)         
##                                                                        
## TEAM_PITCHING_HR              0.012                                    
##                              (0.024)                                   
##                                                                        
## TEAM_PITCHING_BB              0.001                                    
##                              (0.004)                                   
##                                                                        
## TEAM_PITCHING_SO            0.003***                  0.003***         
##                              (0.001)                   (0.001)         
##                                                                        
## TEAM_FIELDING_E             -0.019***                 -0.019***        
##                              (0.002)                   (0.002)         
##                                                                        
## TEAM_FIELDING_DP            -0.122***                 -0.122***        
##                              (0.013)                   (0.013)         
##                                                                        
## Constant                    22.978***                 22.344***        
##                              (5.305)                   (5.234)         
##                                                                        
## -----------------------------------------------------------------------
## Observations                  2,276                     2,276          
## R2                            0.315                     0.315          
## Adjusted R2                   0.311                     0.312          
## Residual Std. Error    13.072 (df = 2262)        13.068 (df = 2264)    
## F Statistic         80.103*** (df = 13; 2262) 94.678*** (df = 11; 2264)
## =======================================================================
## Note:                                       *p<0.1; **p<0.05; ***p<0.01

Two variables (TEAM_PITCHING_HR and TEAM_PITCHING_BB) are removed. Given that they are the least significant predictors in the full model (the highest p-values, 0.61 and 0.83), I argue that this removal is reasonable.

Overall, the model performance is poor, with an adjusted R^2 of 0.312, which is an improvement of only about 0.001 (0.3113 to 0.3117) made through the removal of two variables. This is too much data and too many variables for a linear regression model, and more advanced models could improve the performance.