This paper predicts the Oakland A’s wins in 2002 from 2001 data. The model uses very simple multiple regression, but it gives quite good results.
The model consists of 3 units.
library(ggplot2)
library(plotly)
library(dplyr)
# Load the full data set; keep only seasons before 2002 for model fitting.
baseball <- read.csv("baseball.csv")
moneyball <- filter(baseball, Year < 2002)
\[ RD = RS\ (\text{Runs Scored}) - RA\ (\text{Runs Allowed}) \]
# Run difference: runs scored minus runs allowed.
moneyball <- mutate(moneyball, RD = RS - RA)

# Interactive scatter plot of wins against run difference, coloured by Playoffs.
p <- ggplot(data = moneyball, mapping = aes(x = RD, y = W, colour = Playoffs)) +
  geom_point()
ggplotly(p)

# Simple linear regression of wins on run difference.
WinsReg <- lm(W ~ RD, data = moneyball)
summary(WinsReg)
##
## Call:
## lm(formula = W ~ RD, data = moneyball)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.2662 -2.6509 0.1234 2.9364 11.6570
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 80.881375 0.131157 616.67 <2e-16 ***
## RD 0.105766 0.001297 81.55 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.939 on 900 degrees of freedom
## Multiple R-squared: 0.8808, Adjusted R-squared: 0.8807
## F-statistic: 6651 on 1 and 900 DF, p-value: < 2.2e-16
\[ RS = OBP * x_1 + SLG * x_2 + BA * x_3 \]
# Runs scored regressed on OBP, SLG and BA.
RunsReg_1 <- lm(RS ~ OBP + SLG + BA, data = moneyball)
summary(RunsReg_1)
##
## Call:
## lm(formula = RS ~ OBP + SLG + BA, data = moneyball)
##
## Residuals:
## Min 1Q Median 3Q Max
## -70.941 -17.247 -0.621 16.754 90.998
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -788.46 19.70 -40.029 < 2e-16 ***
## OBP 2917.42 110.47 26.410 < 2e-16 ***
## SLG 1637.93 45.99 35.612 < 2e-16 ***
## BA -368.97 130.58 -2.826 0.00482 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24.69 on 898 degrees of freedom
## Multiple R-squared: 0.9302, Adjusted R-squared: 0.93
## F-statistic: 3989 on 3 and 898 DF, p-value: < 2.2e-16
# Runs scored regressed on OBP and SLG only (BA dropped).
RunsReg_2 <- lm(RS ~ OBP + SLG, data = moneyball)
summary(RunsReg_2)
##
## Call:
## lm(formula = RS ~ OBP + SLG, data = moneyball)
##
## Residuals:
## Min 1Q Median 3Q Max
## -70.838 -17.174 -1.108 16.770 90.036
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -804.63 18.92 -42.53 <2e-16 ***
## OBP 2737.77 90.68 30.19 <2e-16 ***
## SLG 1584.91 42.16 37.60 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24.79 on 899 degrees of freedom
## Multiple R-squared: 0.9296, Adjusted R-squared: 0.9294
## F-statistic: 5934 on 2 and 899 DF, p-value: < 2.2e-16
\[ RA = OOBP * x_1 + OSLG * x_2 \]
# Runs allowed regressed on the opponent statistics OOBP and OSLG.
RunsReg_3 <- lm(RA ~ OOBP + OSLG, data = moneyball)
summary(RunsReg_3)
##
## Call:
## lm(formula = RA ~ OOBP + OSLG, data = moneyball)
##
## Residuals:
## Min 1Q Median 3Q Max
## -82.397 -15.178 -0.129 17.679 60.955
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -837.38 60.26 -13.897 < 2e-16 ***
## OOBP 2913.60 291.97 9.979 4.46e-16 ***
## OSLG 1514.29 175.43 8.632 2.55e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 25.67 on 87 degrees of freedom
## (812 observations deleted due to missingness)
## Multiple R-squared: 0.9073, Adjusted R-squared: 0.9052
## F-statistic: 425.8 on 2 and 87 DF, p-value: < 2.2e-16
# Predict runs scored, runs allowed and wins from four team statistics.
#
# Arguments:
#   obp, slg   - values for the OBP and SLG predictors of RunsReg_2.
#   oobp, oslg - values for the OOBP and OSLG predictors of RunsReg_3.
#
# Uses three fitted models defined outside the function: RunsReg_2 predicts
# runs scored, RunsReg_3 predicts runs allowed, and WinsReg converts their
# difference into wins.
#
# Returns a numeric vector: predicted runs scored, runs allowed and wins,
# in that order.
PRW <- function(obp = 0, slg = 0, oobp = 0, oslg = 0) {
  team_stats <- data.frame(OBP = obp, SLG = slg, OOBP = oobp, OSLG = oslg)

  runs_scored <- predict(RunsReg_2, newdata = team_stats)
  runs_allowed <- predict(RunsReg_3, newdata = team_stats)

  # WinsReg was fitted on a predictor named RD, so the new data uses that name.
  run_diff <- data.frame(RD = runs_scored - runs_allowed)
  wins <- predict(WinsReg, newdata = run_diff)

  c(runs_scored, runs_allowed, wins)
}
# Oakland's 2001 statistics are the inputs for the 2002 prediction.
dat_2001 <- filter(moneyball, Year == 2001, Team == "OAK")
predict_2002 <- PRW(
  obp = dat_2001$OBP,
  slg = dat_2001$SLG,
  oobp = dat_2001$OOBP,
  oslg = dat_2001$OSLG
)

# Oakland's actual 2002 results come from the unfiltered data set.
dat_2002 <- baseball %>%
  filter(Year == 2002, Team == "OAK") %>%
  select(RS, RA, W)

# Stack prediction and actual values into one labelled table.
compare <- rbind(predict_2002, dat_2002)
rownames(compare) <- c("PREDICT", "REAL")
colnames(compare) <- c("Run Scored", "Run Allowed", "Win")
knitr::kable(compare)
| | Run Scored | Run Allowed | Win |
|---|---|---|---|
| PREDICT | 835.6778 | 635.4394 | 102.0597 |
| REAL | 800.0000 | 654.0000 | 103.0000 |