Data wrangling steps to filter, mutate, and select
variables, combine data frames, and change variable
attributes.
#Teams dataframe from the Lahman Database
df1 <- Teams %>%
dplyr::filter(teamID == "OAK" & yearID <= 2011 & yearID >=1991) %>%
mutate(AVG = H/AB,
OBP = (H + BB + HBP)/(AB + BB + HBP + SF),
SLP = (BB + X2B*2 + X3B*3 + HR*4)/AB,
RC = ((H + BB) * (BB + X2B*2 + X3B*3 + HR*4)) / (AB + BB),
WP = W/G) %>%
dplyr::select('yearID', 'teamID', 'R':'FP', 'AVG', 'SLP', 'OBP', 'RC', 'WP')
#Local dataframe with adjusted payroll info
df2 <- read.csv("Final_Oakland.csv")
df2 <- df2 %>%
dplyr::select('Year', 'WHIP', 'Payroll.Adjusted.')
df_final <- right_join(df1, df2, by = c("yearID"="Year")) %>%
relocate(WP, .after = Payroll.Adjusted.)
df_final$Payroll.Adjusted. <- as.numeric(gsub("[$,]", "", df_final$Payroll.Adjusted.))
str(df_final)
## 'data.frame': 21 obs. of 35 variables:
## $ yearID : int 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 ...
## $ teamID : Factor w/ 149 levels "ALT","ANA","ARI",..: 96 96 96 96 96 96 96 96 96 96 ...
## $ R : int 760 745 715 549 730 861 764 804 893 947 ...
## $ AB : int 5410 5387 5543 3885 4916 5630 5589 5490 5519 5560 ...
## $ H : int 1342 1389 1408 1009 1296 1492 1451 1413 1430 1501 ...
## $ X2B : int 246 219 260 178 228 283 274 295 287 281 ...
## $ X3B : int 19 24 21 13 18 21 23 13 20 23 ...
## $ HR : int 159 142 158 113 169 243 197 149 235 239 ...
## $ BB : int 642 707 622 417 565 640 642 633 770 750 ...
## $ SO : int 981 831 1048 686 911 1114 1181 1122 1129 1159 ...
## $ SB : int 151 143 131 91 112 58 71 131 70 40 ...
## $ CS : int 64 59 59 39 46 35 36 47 37 15 ...
## $ HBP : int 50 49 33 18 45 58 49 55 71 52 ...
## $ SF : int 49 59 49 51 58 39 40 46 41 44 ...
## $ RA : int 776 672 846 589 761 900 946 866 846 813 ...
## $ ER : int 734 599 791 535 698 841 880 766 750 730 ...
## $ ERA : num 4.57 3.73 4.9 4.8 4.93 5.2 5.48 4.81 4.69 4.58 ...
## $ CG : int 14 8 8 12 8 7 2 12 6 7 ...
## $ SHO : int 10 9 2 9 4 5 1 4 5 11 ...
## $ SV : int 49 58 42 23 34 34 38 39 48 43 ...
## $ IPouts : int 4333 4341 4357 3010 3819 4369 4336 4302 4315 4306 ...
## $ HA : int 1425 1396 1551 979 1320 1638 1734 1555 1537 1535 ...
## $ HRA : int 155 129 157 128 153 205 197 179 160 158 ...
## $ BBA : int 655 601 680 510 556 644 642 529 569 615 ...
## $ SOA : int 892 843 864 732 890 884 953 922 967 963 ...
## $ E : int 107 125 111 88 102 103 122 141 122 134 ...
## $ DP : int 150 158 161 105 151 195 170 155 166 164 ...
## $ FP : num 0.982 0.979 0.982 0.979 0.981 0.984 0.98 0.977 0.98 0.978 ...
## $ AVG : num 0.248 0.258 0.254 0.26 0.264 ...
## $ SLP : num 0.338 0.331 0.331 0.325 0.356 ...
## $ OBP : num 0.331 0.346 0.33 0.33 0.341 ...
## $ RC : num 599 614 605 419 595 ...
## $ WHIP : num 1.44 1.38 1.54 1.48 1.47 ...
## $ Payroll.Adjusted.: num 43238258 49868972 43061069 39190145 41317970 ...
## $ WP : num 0.519 0.593 0.42 0.447 0.465 ...
By comparing the model with traditional box stats
and the modelw ith moneyball states, we can tell the effectiveness of
moneybal stats in evaluating team performance (not player evaluation in
this case). If moneyball stats can explain win percentage at a level
similar to traditional box stats, we then can say moneyball stats are
concise and effective in evaluating team performance.
#Prediction model with traditional box stats, with payroll as a control variable
options(scipen = 999)
Model_BoxStats <- lm(WP ~ AB+H+X2B+X3B+HR+BB+SO+SB+CS+HBP+SF+RA+ER+ERA+CG+SHO+SV+IPouts+Payroll.Adjusted., df_final)
summary(Model_BoxStats)
##
## Call:
## lm(formula = WP ~ AB + H + X2B + X3B + HR + BB + SO + SB + CS +
## HBP + SF + RA + ER + ERA + CG + SHO + SV + IPouts + Payroll.Adjusted.,
## data = df_final)
##
## Residuals:
## 1 2 3 4 5 6 7
## 0.0051325 0.0020362 -0.0070529 -0.0012918 0.0029370 -0.0024862 0.0029194
## 8 9 10 11 12 13 14
## -0.0032678 -0.0016504 0.0048051 -0.0016518 -0.0034884 0.0024902 0.0027654
## 15 16 17 18 19 20 21
## 0.0001864 -0.0035164 -0.0030473 0.0027589 0.0037681 -0.0036997 0.0013536
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.684844060268 30.967393235416 0.022 0.986
## AB -0.000326730543 0.001074194565 -0.304 0.812
## H 0.000650082507 0.001119831966 0.581 0.665
## X2B -0.000437249742 0.003273942443 -0.134 0.915
## X3B -0.004389480855 0.006878599092 -0.638 0.638
## HR 0.000918019126 0.001757361555 0.522 0.694
## BB -0.000209127034 0.000639348555 -0.327 0.799
## SO 0.000108919976 0.000671586679 0.162 0.898
## SB 0.000149943562 0.002609393657 0.057 0.963
## CS -0.000544747438 0.005208661508 -0.105 0.934
## HBP 0.000444402240 0.004169236679 0.107 0.932
## SF -0.001114070484 0.004040306656 -0.276 0.829
## RA -0.000508986011 0.001283620286 -0.397 0.760
## ER -0.000033810789 0.040859637213 -0.001 0.999
## ERA -0.020521823509 6.492930641425 -0.003 0.998
## CG 0.001625979637 0.007958287958 0.204 0.872
## SHO -0.000510380961 0.004197730252 -0.122 0.923
## SV 0.004313157022 0.007425194550 0.581 0.665
## IPouts 0.000262062794 0.008521847055 0.031 0.980
## Payroll.Adjusted. -0.000000000236 0.000000001579 -0.149 0.906
##
## Residual standard error: 0.01515 on 1 degrees of freedom
## Multiple R-squared: 0.9975, Adjusted R-squared: 0.951
## F-statistic: 21.42 on 19 and 1 DF, p-value: 0.1688
vif(Model_BoxStats)
## AB H X2B X3B
## 14983.04798 1317.28566 1313.75428 88.49999
## HR BB SO SB
## 438.67634 243.00272 701.06103 988.10238
## CS HBP SF RA
## 544.64180 277.62608 88.55771 1388.76272
## ER ERA CG SHO
## 1303905.37537 1251498.35838 79.19433 32.01551
## SV IPouts Payroll.Adjusted.
## 306.03080 600323.33699 43.68295
#Refined model removing variables with high multi-collinearity
Model_BoxStats_refined <- lm(WP ~ H+X2B+X3B+HR+BB+SO+SB+CS+HBP+SF+RA+CG+SHO+SV+Payroll.Adjusted., df_final)
summary(Model_BoxStats_refined)
##
## Call:
## lm(formula = WP ~ H + X2B + X3B + HR + BB + SO + SB + CS + HBP +
## SF + RA + CG + SHO + SV + Payroll.Adjusted., data = df_final)
##
## Residuals:
## 1 2 3 4 5 6 7
## 0.0038515 0.0051103 -0.0082793 0.0026320 -0.0037443 -0.0032626 0.0094180
## 8 9 10 11 12 13 14
## -0.0029213 -0.0042194 0.0046156 0.0013924 -0.0023556 0.0024166 0.0049590
## 15 16 17 18 19 20 21
## -0.0039767 -0.0046332 -0.0031052 0.0006613 0.0050942 -0.0028266 -0.0008266
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.5501639755985 0.0598856003214 9.187 0.000256 ***
## H 0.0004081738741 0.0000755681415 5.401 0.002939 **
## X2B -0.0006598282030 0.0002390401121 -2.760 0.039819 *
## X3B -0.0063243619764 0.0012137747489 -5.210 0.003437 **
## HR 0.0012043589836 0.0002259403542 5.330 0.003114 **
## BB -0.0002092193140 0.0000976711637 -2.142 0.085093 .
## SO 0.0000686867486 0.0000569290630 1.207 0.281579
## SB 0.0005158646533 0.0002334442477 2.210 0.078124 .
## CS -0.0010945019654 0.0005124464553 -2.136 0.085768 .
## HBP 0.0002732361996 0.0004642031846 0.589 0.581704
## SF -0.0006724667102 0.0003707516098 -1.814 0.129439
## RA -0.0008022772036 0.0000983463302 -8.158 0.000450 ***
## CG 0.0008558867785 0.0010483008168 0.816 0.451373
## SHO -0.0006914502718 0.0010434339871 -0.663 0.536864
## SV 0.0048803568944 0.0007893908156 6.182 0.001614 **
## Payroll.Adjusted. -0.0000000008219 0.0000000002787 -2.950 0.031898 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.008914 on 5 degrees of freedom
## Multiple R-squared: 0.9958, Adjusted R-squared: 0.983
## F-statistic: 78.2 on 15 and 5 DF, p-value: 0.00006715
vif(Model_BoxStats_refined)
## H X2B X3B HR
## 17.323073 20.224921 7.957793 20.940294
## BB SO SB CS
## 16.377276 14.547654 22.838222 15.224004
## HBP SF RA CG
## 9.938861 2.153461 23.542058 3.968258
## SHO SV Payroll.Adjusted.
## 5.712610 9.988650 3.926477
lm.beta(Model_BoxStats_refined) #standardized coefficients to show the relative importance of each predictor: ranging 0-1; larger value means more important, smaller value means less important; unit being same across different coefficients.
## H X2B X3B HR
## 0.65501455 -0.36168919 -0.42825945 0.71069877
## BB SO SB CS
## -0.25257379 0.13408106 0.30769169 -0.24280885
## HBP SF RA CG
## 0.05406677 -0.07755115 -1.15324221 0.04738734
## SHO SV Payroll.Adjusted.
## -0.04614719 0.56930492 -0.17029566
plot(Model_BoxStats_refined)
#Prediction model with Moneyball stats including batting average, slugging percentage,
#on-base percentage, runs created, Walks and Hits per Inning Pitched, with payroll as a control variable
Model_MB <- lm(WP ~ AVG + SLP + OBP + RC + WHIP + Payroll.Adjusted., df_final)
summary(Model_MB)
##
## Call:
## lm(formula = WP ~ AVG + SLP + OBP + RC + WHIP + Payroll.Adjusted.,
## data = df_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.073272 -0.015354 0.001224 0.017911 0.049050
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.2125831139568 0.3959693903659 0.537 0.5998
## AVG -1.1446485581684 1.7517730802435 -0.653 0.5241
## SLP 0.5777082896275 0.6704957730882 0.862 0.4034
## OBP 3.4643761981074 1.8763753409680 1.846 0.0861 .
## RC -0.0000431887942 0.0002057544018 -0.210 0.8368
## WHIP -0.5356745159143 0.0852611149237 -6.283 0.0000201 ***
## Payroll.Adjusted. 0.0000000002344 0.0000000006500 0.361 0.7237
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03554 on 14 degrees of freedom
## Multiple R-squared: 0.8111, Adjusted R-squared: 0.7302
## F-statistic: 10.02 on 6 and 14 DF, p-value: 0.0002153
vif(Model_MB)
## AVG SLP OBP RC
## 2.524430 9.527752 7.349163 7.172675
## WHIP Payroll.Adjusted.
## 1.373198 1.344296
lm.beta(Model_MB) #standardized coefficients to show the relative importance of each predictor: ranging 0-1; larger value means more important, smaller value means less important; unit being same across different coefficients.
## AVG SLP OBP RC
## -0.12059097 0.30892035 0.58138392 -0.06529821
## WHIP Payroll.Adjusted.
## -0.85517559 0.04857386
plot(Model_MB)
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.