Data wrangling steps to filter, mutate, and select variables, combine data frames, and change variable attributes.

#Teams dataframe from the Lahman Database
df1 <- Teams %>% 
  dplyr::filter(teamID == "OAK" & yearID <= 2011 & yearID >=1991) %>%
  mutate(AVG = H/AB, 
         OBP = (H + BB + HBP)/(AB + BB + HBP + SF),
         SLP = (BB + X2B*2 + X3B*3 + HR*4)/AB,
         RC = ((H + BB) * (BB + X2B*2 + X3B*3 + HR*4)) / (AB + BB),
         WP = W/G) %>%
  dplyr::select('yearID', 'teamID', 'R':'FP', 'AVG', 'SLP', 'OBP', 'RC', 'WP')

#Local dataframe with adjusted payroll info
df2 <- read.csv("Final_Oakland.csv")
df2 <- df2 %>%
  dplyr::select('Year', 'WHIP', 'Payroll.Adjusted.')

df_final <- right_join(df1, df2, by = c("yearID"="Year")) %>%
  relocate(WP, .after = Payroll.Adjusted.)
df_final$Payroll.Adjusted. <- as.numeric(gsub("[$,]", "", df_final$Payroll.Adjusted.))

str(df_final)
## 'data.frame':    21 obs. of  35 variables:
##  $ yearID           : int  1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 ...
##  $ teamID           : Factor w/ 149 levels "ALT","ANA","ARI",..: 96 96 96 96 96 96 96 96 96 96 ...
##  $ R                : int  760 745 715 549 730 861 764 804 893 947 ...
##  $ AB               : int  5410 5387 5543 3885 4916 5630 5589 5490 5519 5560 ...
##  $ H                : int  1342 1389 1408 1009 1296 1492 1451 1413 1430 1501 ...
##  $ X2B              : int  246 219 260 178 228 283 274 295 287 281 ...
##  $ X3B              : int  19 24 21 13 18 21 23 13 20 23 ...
##  $ HR               : int  159 142 158 113 169 243 197 149 235 239 ...
##  $ BB               : int  642 707 622 417 565 640 642 633 770 750 ...
##  $ SO               : int  981 831 1048 686 911 1114 1181 1122 1129 1159 ...
##  $ SB               : int  151 143 131 91 112 58 71 131 70 40 ...
##  $ CS               : int  64 59 59 39 46 35 36 47 37 15 ...
##  $ HBP              : int  50 49 33 18 45 58 49 55 71 52 ...
##  $ SF               : int  49 59 49 51 58 39 40 46 41 44 ...
##  $ RA               : int  776 672 846 589 761 900 946 866 846 813 ...
##  $ ER               : int  734 599 791 535 698 841 880 766 750 730 ...
##  $ ERA              : num  4.57 3.73 4.9 4.8 4.93 5.2 5.48 4.81 4.69 4.58 ...
##  $ CG               : int  14 8 8 12 8 7 2 12 6 7 ...
##  $ SHO              : int  10 9 2 9 4 5 1 4 5 11 ...
##  $ SV               : int  49 58 42 23 34 34 38 39 48 43 ...
##  $ IPouts           : int  4333 4341 4357 3010 3819 4369 4336 4302 4315 4306 ...
##  $ HA               : int  1425 1396 1551 979 1320 1638 1734 1555 1537 1535 ...
##  $ HRA              : int  155 129 157 128 153 205 197 179 160 158 ...
##  $ BBA              : int  655 601 680 510 556 644 642 529 569 615 ...
##  $ SOA              : int  892 843 864 732 890 884 953 922 967 963 ...
##  $ E                : int  107 125 111 88 102 103 122 141 122 134 ...
##  $ DP               : int  150 158 161 105 151 195 170 155 166 164 ...
##  $ FP               : num  0.982 0.979 0.982 0.979 0.981 0.984 0.98 0.977 0.98 0.978 ...
##  $ AVG              : num  0.248 0.258 0.254 0.26 0.264 ...
##  $ SLP              : num  0.338 0.331 0.331 0.325 0.356 ...
##  $ OBP              : num  0.331 0.346 0.33 0.33 0.341 ...
##  $ RC               : num  599 614 605 419 595 ...
##  $ WHIP             : num  1.44 1.38 1.54 1.48 1.47 ...
##  $ Payroll.Adjusted.: num  43238258 49868972 43061069 39190145 41317970 ...
##  $ WP               : num  0.519 0.593 0.42 0.447 0.465 ...



By comparing the model with traditional box stats and the modelw ith moneyball states, we can tell the effectiveness of moneybal stats in evaluating team performance (not player evaluation in this case). If moneyball stats can explain win percentage at a level similar to traditional box stats, we then can say moneyball stats are concise and effective in evaluating team performance.

#Prediction model with traditional box stats, with payroll as a control variable
options(scipen = 999)


Model_BoxStats <- lm(WP ~ AB+H+X2B+X3B+HR+BB+SO+SB+CS+HBP+SF+RA+ER+ERA+CG+SHO+SV+IPouts+Payroll.Adjusted., df_final)
summary(Model_BoxStats)
## 
## Call:
## lm(formula = WP ~ AB + H + X2B + X3B + HR + BB + SO + SB + CS + 
##     HBP + SF + RA + ER + ERA + CG + SHO + SV + IPouts + Payroll.Adjusted., 
##     data = df_final)
## 
## Residuals:
##          1          2          3          4          5          6          7 
##  0.0051325  0.0020362 -0.0070529 -0.0012918  0.0029370 -0.0024862  0.0029194 
##          8          9         10         11         12         13         14 
## -0.0032678 -0.0016504  0.0048051 -0.0016518 -0.0034884  0.0024902  0.0027654 
##         15         16         17         18         19         20         21 
##  0.0001864 -0.0035164 -0.0030473  0.0027589  0.0037681 -0.0036997  0.0013536 
## 
## Coefficients:
##                          Estimate      Std. Error t value Pr(>|t|)
## (Intercept)        0.684844060268 30.967393235416   0.022    0.986
## AB                -0.000326730543  0.001074194565  -0.304    0.812
## H                  0.000650082507  0.001119831966   0.581    0.665
## X2B               -0.000437249742  0.003273942443  -0.134    0.915
## X3B               -0.004389480855  0.006878599092  -0.638    0.638
## HR                 0.000918019126  0.001757361555   0.522    0.694
## BB                -0.000209127034  0.000639348555  -0.327    0.799
## SO                 0.000108919976  0.000671586679   0.162    0.898
## SB                 0.000149943562  0.002609393657   0.057    0.963
## CS                -0.000544747438  0.005208661508  -0.105    0.934
## HBP                0.000444402240  0.004169236679   0.107    0.932
## SF                -0.001114070484  0.004040306656  -0.276    0.829
## RA                -0.000508986011  0.001283620286  -0.397    0.760
## ER                -0.000033810789  0.040859637213  -0.001    0.999
## ERA               -0.020521823509  6.492930641425  -0.003    0.998
## CG                 0.001625979637  0.007958287958   0.204    0.872
## SHO               -0.000510380961  0.004197730252  -0.122    0.923
## SV                 0.004313157022  0.007425194550   0.581    0.665
## IPouts             0.000262062794  0.008521847055   0.031    0.980
## Payroll.Adjusted. -0.000000000236  0.000000001579  -0.149    0.906
## 
## Residual standard error: 0.01515 on 1 degrees of freedom
## Multiple R-squared:  0.9975, Adjusted R-squared:  0.951 
## F-statistic: 21.42 on 19 and 1 DF,  p-value: 0.1688
vif(Model_BoxStats) 
##                AB                 H               X2B               X3B 
##       14983.04798        1317.28566        1313.75428          88.49999 
##                HR                BB                SO                SB 
##         438.67634         243.00272         701.06103         988.10238 
##                CS               HBP                SF                RA 
##         544.64180         277.62608          88.55771        1388.76272 
##                ER               ERA                CG               SHO 
##     1303905.37537     1251498.35838          79.19433          32.01551 
##                SV            IPouts Payroll.Adjusted. 
##         306.03080      600323.33699          43.68295
#Refined model removing variables with high multi-collinearity
Model_BoxStats_refined <- lm(WP ~ H+X2B+X3B+HR+BB+SO+SB+CS+HBP+SF+RA+CG+SHO+SV+Payroll.Adjusted., df_final)
summary(Model_BoxStats_refined) 
## 
## Call:
## lm(formula = WP ~ H + X2B + X3B + HR + BB + SO + SB + CS + HBP + 
##     SF + RA + CG + SHO + SV + Payroll.Adjusted., data = df_final)
## 
## Residuals:
##          1          2          3          4          5          6          7 
##  0.0038515  0.0051103 -0.0082793  0.0026320 -0.0037443 -0.0032626  0.0094180 
##          8          9         10         11         12         13         14 
## -0.0029213 -0.0042194  0.0046156  0.0013924 -0.0023556  0.0024166  0.0049590 
##         15         16         17         18         19         20         21 
## -0.0039767 -0.0046332 -0.0031052  0.0006613  0.0050942 -0.0028266 -0.0008266 
## 
## Coefficients:
##                           Estimate       Std. Error t value Pr(>|t|)    
## (Intercept)        0.5501639755985  0.0598856003214   9.187 0.000256 ***
## H                  0.0004081738741  0.0000755681415   5.401 0.002939 ** 
## X2B               -0.0006598282030  0.0002390401121  -2.760 0.039819 *  
## X3B               -0.0063243619764  0.0012137747489  -5.210 0.003437 ** 
## HR                 0.0012043589836  0.0002259403542   5.330 0.003114 ** 
## BB                -0.0002092193140  0.0000976711637  -2.142 0.085093 .  
## SO                 0.0000686867486  0.0000569290630   1.207 0.281579    
## SB                 0.0005158646533  0.0002334442477   2.210 0.078124 .  
## CS                -0.0010945019654  0.0005124464553  -2.136 0.085768 .  
## HBP                0.0002732361996  0.0004642031846   0.589 0.581704    
## SF                -0.0006724667102  0.0003707516098  -1.814 0.129439    
## RA                -0.0008022772036  0.0000983463302  -8.158 0.000450 ***
## CG                 0.0008558867785  0.0010483008168   0.816 0.451373    
## SHO               -0.0006914502718  0.0010434339871  -0.663 0.536864    
## SV                 0.0048803568944  0.0007893908156   6.182 0.001614 ** 
## Payroll.Adjusted. -0.0000000008219  0.0000000002787  -2.950 0.031898 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.008914 on 5 degrees of freedom
## Multiple R-squared:  0.9958, Adjusted R-squared:  0.983 
## F-statistic:  78.2 on 15 and 5 DF,  p-value: 0.00006715
vif(Model_BoxStats_refined) 
##                 H               X2B               X3B                HR 
##         17.323073         20.224921          7.957793         20.940294 
##                BB                SO                SB                CS 
##         16.377276         14.547654         22.838222         15.224004 
##               HBP                SF                RA                CG 
##          9.938861          2.153461         23.542058          3.968258 
##               SHO                SV Payroll.Adjusted. 
##          5.712610          9.988650          3.926477
lm.beta(Model_BoxStats_refined) #standardized coefficients to show the relative importance of each predictor: ranging 0-1; larger value means more important, smaller value means less important; unit being same across different coefficients.
##                 H               X2B               X3B                HR 
##        0.65501455       -0.36168919       -0.42825945        0.71069877 
##                BB                SO                SB                CS 
##       -0.25257379        0.13408106        0.30769169       -0.24280885 
##               HBP                SF                RA                CG 
##        0.05406677       -0.07755115       -1.15324221        0.04738734 
##               SHO                SV Payroll.Adjusted. 
##       -0.04614719        0.56930492       -0.17029566
plot(Model_BoxStats_refined)

#Prediction model with Moneyball stats including batting average, slugging percentage, 
#on-base percentage, runs created, Walks and Hits per Inning Pitched, with payroll as a control variable
Model_MB <- lm(WP ~  AVG + SLP + OBP + RC + WHIP + Payroll.Adjusted., df_final)
summary(Model_MB) 
## 
## Call:
## lm(formula = WP ~ AVG + SLP + OBP + RC + WHIP + Payroll.Adjusted., 
##     data = df_final)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.073272 -0.015354  0.001224  0.017911  0.049050 
## 
## Coefficients:
##                           Estimate       Std. Error t value  Pr(>|t|)    
## (Intercept)        0.2125831139568  0.3959693903659   0.537    0.5998    
## AVG               -1.1446485581684  1.7517730802435  -0.653    0.5241    
## SLP                0.5777082896275  0.6704957730882   0.862    0.4034    
## OBP                3.4643761981074  1.8763753409680   1.846    0.0861 .  
## RC                -0.0000431887942  0.0002057544018  -0.210    0.8368    
## WHIP              -0.5356745159143  0.0852611149237  -6.283 0.0000201 ***
## Payroll.Adjusted.  0.0000000002344  0.0000000006500   0.361    0.7237    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.03554 on 14 degrees of freedom
## Multiple R-squared:  0.8111, Adjusted R-squared:  0.7302 
## F-statistic: 10.02 on 6 and 14 DF,  p-value: 0.0002153
vif(Model_MB)
##               AVG               SLP               OBP                RC 
##          2.524430          9.527752          7.349163          7.172675 
##              WHIP Payroll.Adjusted. 
##          1.373198          1.344296
lm.beta(Model_MB) #standardized coefficients to show the relative importance of each predictor: ranging 0-1; larger value means more important, smaller value means less important; unit being same across different coefficients.
##               AVG               SLP               OBP                RC 
##       -0.12059097        0.30892035        0.58138392       -0.06529821 
##              WHIP Payroll.Adjusted. 
##       -0.85517559        0.04857386
plot(Model_MB)

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.