Imports
# Load the raw training data into `df`.
# NOTE(review): absolute, machine-specific path -- consider a project-relative
# path so the analysis can be reproduced on another machine.
df <- read.csv("D:/Johnny/Study/Others/R/Boston College Experience Data Analysis/Weekend Homework/moneyball-training-data.csv")
# library() attaches one package per call: its second positional argument is
# `help`, not another package, so each package gets its own call.
library(stargazer)
library(MASS)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(visdat)
Cleaning Dataset
# Drop the 1st and 11th columns of the raw data in one step, then plot the
# column types / missingness of what is left.
# NOTE(review): positions refer to the raw CSV column order -- confirm which
# columns these are and consider dropping them by name instead.
df <- df[, -c(1, 11)]
visdat::vis_dat(df)

# Drop the column currently at position 9 (position counted after the earlier
# column removals).
df <- df[, -9]
# Fill every missing value with the median of the observed values in the same
# column; `df[] <-` keeps the data frame structure and column names.
df[] <- lapply(df, function(column) {
  column[is.na(column)] <- median(column, na.rm = TRUE)
  column
})
# Re-plot to confirm no missing cells remain.
visdat::vis_dat(df)

Checking data
# Plain-text table of summary statistics (N, mean, sd, min, max) per column.
stargazer(df, type = "text")
##
## ===========================================================
## Statistic N Mean St. Dev. Min Max
## -----------------------------------------------------------
## TARGET_WINS 2,276 80.791 15.752 0 146
## TEAM_BATTING_H 2,276 1,469.270 144.591 891 2,554
## TEAM_BATTING_2B 2,276 241.247 46.801 69 458
## TEAM_BATTING_3B 2,276 55.250 27.939 0 223
## TEAM_BATTING_HR 2,276 99.612 60.547 0 264
## TEAM_BATTING_BB 2,276 501.559 122.671 0 878
## TEAM_BATTING_SO 2,276 736.250 242.909 0 1,399
## TEAM_BASERUN_SB 2,276 123.394 85.406 0 697
## TEAM_PITCHING_H 2,276 1,779.210 1,406.843 1,137 30,132
## TEAM_PITCHING_HR 2,276 105.699 61.299 0 343
## TEAM_PITCHING_BB 2,276 553.008 166.357 0 3,645
## TEAM_PITCHING_SO 2,276 817.541 540.545 0.000 19,278.000
## TEAM_FIELDING_E 2,276 246.481 227.771 65 1,898
## TEAM_FIELDING_DP 2,276 146.716 24.538 52 228
## -----------------------------------------------------------
# Total number of missing cells left in df.
sum(is.na(df))
## [1] 0
Fitting Model
# Full model: ordinary least squares of TARGET_WINS on every other column of df.
model <- lm(formula = TARGET_WINS ~ ., data = df)
summary(object = model)
##
## Call:
## lm(formula = TARGET_WINS ~ ., data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.827 -8.580 0.103 8.432 58.544
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.9775583 5.3046349 4.332 1.54e-05 ***
## TEAM_BATTING_H 0.0488787 0.0036941 13.232 < 2e-16 ***
## TEAM_BATTING_2B -0.0212136 0.0091699 -2.313 0.020791 *
## TEAM_BATTING_3B 0.0649302 0.0167897 3.867 0.000113 ***
## TEAM_BATTING_HR 0.0545602 0.0273630 1.994 0.046279 *
## TEAM_BATTING_BB 0.0105502 0.0058352 1.808 0.070734 .
## TEAM_BATTING_SO -0.0084176 0.0025457 -3.307 0.000959 ***
## TEAM_BASERUN_SB 0.0247806 0.0042572 5.821 6.69e-09 ***
## TEAM_PITCHING_H -0.0008598 0.0003668 -2.344 0.019147 *
## TEAM_PITCHING_HR 0.0123395 0.0243703 0.506 0.612672
## TEAM_PITCHING_BB 0.0008863 0.0041539 0.213 0.831065
## TEAM_PITCHING_SO 0.0028087 0.0009218 3.047 0.002338 **
## TEAM_FIELDING_E -0.0191590 0.0024016 -7.978 2.35e-15 ***
## TEAM_FIELDING_DP -0.1219877 0.0129372 -9.429 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.07 on 2262 degrees of freedom
## Multiple R-squared: 0.3152, Adjusted R-squared: 0.3113
## F-statistic: 80.1 on 13 and 2262 DF, p-value: < 2.2e-16
Optimization: Backward Selection
# Backward elimination starting from the full model: at each step stepAIC()
# drops the term whose removal gives the lowest AIC, and stops when no removal
# lowers it further. The search trace is printed as it runs.
library(MASS)
model_back <- stepAIC(object = model, direction = "backward")
## Start: AIC=11714.87
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
## TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB +
## TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO +
## TEAM_FIELDING_E + TEAM_FIELDING_DP
##
## Df Sum of Sq RSS AIC
## - TEAM_PITCHING_BB 1 7.8 386553 11713
## - TEAM_PITCHING_HR 1 43.8 386589 11713
## <none> 386546 11715
## - TEAM_BATTING_BB 1 558.6 387104 11716
## - TEAM_BATTING_HR 1 679.4 387225 11717
## - TEAM_BATTING_2B 1 914.5 387460 11718
## - TEAM_PITCHING_H 1 939.2 387485 11718
## - TEAM_PITCHING_SO 1 1586.6 388132 11722
## - TEAM_BATTING_SO 1 1868.5 388414 11724
## - TEAM_BATTING_3B 1 2555.7 389101 11728
## - TEAM_BASERUN_SB 1 5790.1 392336 11747
## - TEAM_FIELDING_E 1 10875.8 397421 11776
## - TEAM_FIELDING_DP 1 15193.6 401739 11801
## - TEAM_BATTING_H 1 29917.9 416463 11882
##
## Step: AIC=11712.92
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
## TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB +
## TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_SO + TEAM_FIELDING_E +
## TEAM_FIELDING_DP
##
## Df Sum of Sq RSS AIC
## - TEAM_PITCHING_HR 1 86.4 386640 11711
## <none> 386553 11713
## - TEAM_BATTING_HR 1 793.8 387347 11716
## - TEAM_BATTING_2B 1 912.6 387466 11716
## - TEAM_PITCHING_H 1 1080.6 387634 11717
## - TEAM_BATTING_BB 1 2005.6 388559 11723
## - TEAM_BATTING_SO 1 2079.5 388633 11723
## - TEAM_BATTING_3B 1 2555.4 389109 11726
## - TEAM_PITCHING_SO 1 3269.0 389822 11730
## - TEAM_BASERUN_SB 1 5983.2 392536 11746
## - TEAM_FIELDING_E 1 10870.9 397424 11774
## - TEAM_FIELDING_DP 1 15186.6 401740 11799
## - TEAM_BATTING_H 1 29953.0 416506 11881
##
## Step: AIC=11711.43
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
## TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB +
## TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP
##
## Df Sum of Sq RSS AIC
## <none> 386640 11711
## - TEAM_BATTING_2B 1 929.4 387569 11715
## - TEAM_PITCHING_H 1 1001.0 387641 11715
## - TEAM_BATTING_BB 1 1999.1 388639 11721
## - TEAM_BATTING_SO 1 2060.9 388701 11722
## - TEAM_BATTING_3B 1 2739.4 389379 11726
## - TEAM_PITCHING_SO 1 3328.3 389968 11729
## - TEAM_BASERUN_SB 1 5986.1 392626 11744
## - TEAM_BATTING_HR 1 8364.1 395004 11758
## - TEAM_FIELDING_E 1 10786.9 397427 11772
## - TEAM_FIELDING_DP 1 15152.3 401792 11797
## - TEAM_BATTING_H 1 30558.9 417199 11883
# Coefficients and fit statistics of the model kept by backward selection.
summary(object = model_back)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B +
## TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO +
## TEAM_BASERUN_SB + TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E +
## TEAM_FIELDING_DP, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.598 -8.593 0.085 8.445 58.582
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.3440443 5.2338369 4.269 2.04e-05 ***
## TEAM_BATTING_H 0.0490922 0.0036699 13.377 < 2e-16 ***
## TEAM_BATTING_2B -0.0213744 0.0091626 -2.333 0.019746 *
## TEAM_BATTING_3B 0.0665763 0.0166230 4.005 6.40e-05 ***
## TEAM_BATTING_HR 0.0674046 0.0096315 6.998 3.40e-12 ***
## TEAM_BATTING_BB 0.0115464 0.0033748 3.421 0.000634 ***
## TEAM_BATTING_SO -0.0085211 0.0024529 -3.474 0.000523 ***
## TEAM_BASERUN_SB 0.0249207 0.0042092 5.920 3.70e-09 ***
## TEAM_PITCHING_H -0.0007770 0.0003209 -2.421 0.015552 *
## TEAM_PITCHING_SO 0.0029662 0.0006719 4.415 1.06e-05 ***
## TEAM_FIELDING_E -0.0190100 0.0023919 -7.948 2.97e-15 ***
## TEAM_FIELDING_DP -0.1217894 0.0129296 -9.419 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.07 on 2264 degrees of freedom
## Multiple R-squared: 0.3151, Adjusted R-squared: 0.3117
## F-statistic: 94.68 on 11 and 2264 DF, p-value: < 2.2e-16
# Side-by-side text table: (1) full model, (2) backward-selected model.
stargazer(model, model_back, type = "text")
##
## =======================================================================
## Dependent variable:
## ---------------------------------------------------
## TARGET_WINS
## (1) (2)
## -----------------------------------------------------------------------
## TEAM_BATTING_H 0.049*** 0.049***
## (0.004) (0.004)
##
## TEAM_BATTING_2B -0.021** -0.021**
## (0.009) (0.009)
##
## TEAM_BATTING_3B 0.065*** 0.067***
## (0.017) (0.017)
##
## TEAM_BATTING_HR 0.055** 0.067***
## (0.027) (0.010)
##
## TEAM_BATTING_BB 0.011* 0.012***
## (0.006) (0.003)
##
## TEAM_BATTING_SO -0.008*** -0.009***
## (0.003) (0.002)
##
## TEAM_BASERUN_SB 0.025*** 0.025***
## (0.004) (0.004)
##
## TEAM_PITCHING_H -0.001** -0.001**
## (0.0004) (0.0003)
##
## TEAM_PITCHING_HR 0.012
## (0.024)
##
## TEAM_PITCHING_BB 0.001
## (0.004)
##
## TEAM_PITCHING_SO 0.003*** 0.003***
## (0.001) (0.001)
##
## TEAM_FIELDING_E -0.019*** -0.019***
## (0.002) (0.002)
##
## TEAM_FIELDING_DP -0.122*** -0.122***
## (0.013) (0.013)
##
## Constant 22.978*** 22.344***
## (5.305) (5.234)
##
## -----------------------------------------------------------------------
## Observations 2,276 2,276
## R2 0.315 0.315
## Adjusted R2 0.311 0.312
## Residual Std. Error 13.072 (df = 2262) 13.068 (df = 2264)
## F Statistic 80.103*** (df = 13; 2262) 94.678*** (df = 11; 2264)
## =======================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
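Two variables (TEAM_PITCHING_HR and TEAM_PITCHING_BB) are removed. Given that
they have the largest p-values in the full model (0.613 and 0.831), i.e. they are
the least statistically significant predictors, I argue that this removal is
reasonable.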
Overall the model performance is poor, with an adjusted R^2 of 0.312. Removing
the two variables improves it only marginally, from 0.3113 to 0.3117 (about
0.0004). A linear regression model appears too simple for this amount of data
and this many variables, and more advanced models could improve the performance.