---
title: "Predicting the Baseball World Series Champion"
output: html_document
---
# Load the team-season data from the working directory and print a compact
# overview of its columns.
baseball <- read.csv(file = "baseball.csv")
str(object = baseball)
## 'data.frame': 1232 obs. of 15 variables:
## $ Team : chr "ARI" "ATL" "BAL" "BOS" ...
## $ League : chr "NL" "NL" "AL" "AL" ...
## $ Year : int 2012 2012 2012 2012 2012 2012 2012 2012 2012 2012 ...
## $ RS : int 734 700 712 734 613 748 669 667 758 726 ...
## $ RA : int 688 600 705 806 759 676 588 845 890 670 ...
## $ W : int 81 94 93 69 61 85 97 68 64 88 ...
## $ OBP : num 0.328 0.32 0.311 0.315 0.302 0.318 0.315 0.324 0.33 0.335 ...
## $ SLG : num 0.418 0.389 0.417 0.415 0.378 0.422 0.411 0.381 0.436 0.422 ...
## $ BA : num 0.259 0.247 0.247 0.26 0.24 0.255 0.251 0.251 0.274 0.268 ...
## $ Playoffs : int 0 1 1 0 0 0 1 0 0 1 ...
## $ RankSeason : int NA 4 5 NA NA NA 2 NA NA 6 ...
## $ RankPlayoffs: int NA 5 4 NA NA NA 4 NA NA 2 ...
## $ G : int 162 162 162 162 162 162 162 162 162 162 ...
## $ OOBP : num 0.317 0.306 0.315 0.331 0.335 0.319 0.305 0.336 0.357 0.314 ...
## $ OSLG : num 0.415 0.378 0.403 0.428 0.424 0.405 0.39 0.43 0.47 0.402 ...
# Number of rows in the full data set.
dim(baseball)[1L]
## [1] 1232
# Count of rows per season in the full data set.
table(baseball[["Year"]])
##
## 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1973 1974 1975 1976 1977 1978
## 20 20 20 20 20 20 20 24 24 24 24 24 24 24 26 26
## 1979 1980 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1996 1997
## 26 26 26 26 26 26 26 26 26 26 26 26 26 28 28 28
## 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012
## 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30
# Number of distinct seasons present in the data.
length(table(baseball[["Year"]]))
## [1] 47
# Keep only the rows flagged Playoffs == 1. which() discards rows where the
# comparison is NA, matching what subset() does.
baseball <- baseball[which(baseball$Playoffs == 1), , drop = FALSE]
# Number of rows remaining after the filter.
dim(baseball)[1L]
## [1] 244
# Count of rows per season in the current `baseball` data frame.
table(baseball[["Year"]])
##
## 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1973 1974 1975 1976 1977 1978
## 2 2 2 2 2 2 2 4 4 4 4 4 4 4 4 4
## 1979 1980 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1996 1997
## 4 4 4 4 4 4 4 4 4 4 4 4 4 4 8 8
## 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012
## 8 8 8 8 8 8 8 8 8 8 8 8 8 8 10
# Tabulate the per-season counts themselves: for each count value, how many
# seasons have that many rows.
table(table(baseball[["Year"]]))
##
## 2 4 8 10
## 7 23 16 1
# Per-season row counts, indexed by the season's year as a character label.
PlayoffTable <- table(baseball$Year)
# Attach to every row the count for that row's season. The lookup is done by
# name (as.character), not by position. as.integer() strips the table class,
# dim and names, so the new column is a plain integer vector rather than a
# 1-D `table` object embedded in the data frame.
baseball$NumCompetitors <- as.integer(PlayoffTable[as.character(baseball$Year)])
# Distribution of the new column across rows.
table(baseball$NumCompetitors)
##
## 2 4 8 10
## 14 92 128 10
# Binary outcome: 1 when RankPlayoffs equals 1, otherwise 0 (NA stays NA).
baseball[["WorldSeries"]] <- as.numeric(baseball[["RankPlayoffs"]] == 1)
# Class balance of the outcome.
table(baseball[["WorldSeries"]])
##
## 0 1
## 197 47
# Single-predictor logistic regression of WorldSeries on Year.
model1 <- glm(
  formula = WorldSeries ~ Year,
  family = "binomial",
  data = baseball
)
summary(model1)
##
## Call:
## glm(formula = WorldSeries ~ Year, family = "binomial", data = baseball)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 72.23602 22.64409 3.19 0.00142 **
## Year -0.03700 0.01138 -3.25 0.00115 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 239.12 on 243 degrees of freedom
## Residual deviance: 228.35 on 242 degrees of freedom
## AIC: 232.35
##
## Number of Fisher Scoring iterations: 4
# Single-predictor logistic regression of WorldSeries on RS.
model2 <- glm(
  formula = WorldSeries ~ RS,
  family = "binomial",
  data = baseball
)
summary(model2)
##
## Call:
## glm(formula = WorldSeries ~ RS, family = "binomial", data = baseball)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.661226 1.636494 0.404 0.686
## RS -0.002681 0.002098 -1.278 0.201
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 239.12 on 243 degrees of freedom
## Residual deviance: 237.45 on 242 degrees of freedom
## AIC: 241.45
##
## Number of Fisher Scoring iterations: 4
# Single-predictor logistic regression of WorldSeries on RA.
model3 <- glm(
  formula = WorldSeries ~ RA,
  family = "binomial",
  data = baseball
)
summary(model3)
##
## Call:
## glm(formula = WorldSeries ~ RA, family = "binomial", data = baseball)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.888174 1.483831 1.272 0.2032
## RA -0.005053 0.002273 -2.223 0.0262 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 239.12 on 243 degrees of freedom
## Residual deviance: 233.88 on 242 degrees of freedom
## AIC: 237.88
##
## Number of Fisher Scoring iterations: 4
# Single-predictor logistic regression of WorldSeries on W.
model4 <- glm(
  formula = WorldSeries ~ W,
  family = "binomial",
  data = baseball
)
summary(model4)
##
## Call:
## glm(formula = WorldSeries ~ W, family = "binomial", data = baseball)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.85568 2.87620 -2.384 0.0171 *
## W 0.05671 0.02988 1.898 0.0577 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 239.12 on 243 degrees of freedom
## Residual deviance: 235.51 on 242 degrees of freedom
## AIC: 239.51
##
## Number of Fisher Scoring iterations: 4
# Single-predictor logistic regression of WorldSeries on OBP.
model5 <- glm(
  formula = WorldSeries ~ OBP,
  family = "binomial",
  data = baseball
)
summary(model5)
##
## Call:
## glm(formula = WorldSeries ~ OBP, family = "binomial", data = baseball)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.741 3.989 0.687 0.492
## OBP -12.402 11.865 -1.045 0.296
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 239.12 on 243 degrees of freedom
## Residual deviance: 238.02 on 242 degrees of freedom
## AIC: 242.02
##
## Number of Fisher Scoring iterations: 4
# Single-predictor logistic regression of WorldSeries on SLG.
model6 <- glm(
  formula = WorldSeries ~ SLG,
  family = "binomial",
  data = baseball
)
summary(model6)
##
## Call:
## glm(formula = WorldSeries ~ SLG, family = "binomial", data = baseball)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.200 2.358 1.357 0.1748
## SLG -11.130 5.689 -1.956 0.0504 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 239.12 on 243 degrees of freedom
## Residual deviance: 235.23 on 242 degrees of freedom
## AIC: 239.23
##
## Number of Fisher Scoring iterations: 4
# Single-predictor logistic regression of WorldSeries on BA.
model7 <- glm(
  formula = WorldSeries ~ BA,
  family = "binomial",
  data = baseball
)
summary(model7)
##
## Call:
## glm(formula = WorldSeries ~ BA, family = "binomial", data = baseball)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.6392 3.8988 -0.164 0.870
## BA -2.9765 14.6123 -0.204 0.839
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 239.12 on 243 degrees of freedom
## Residual deviance: 239.08 on 242 degrees of freedom
## AIC: 243.08
##
## Number of Fisher Scoring iterations: 4
# Single-predictor logistic regression of WorldSeries on RankSeason.
model8 <- glm(
  formula = WorldSeries ~ RankSeason,
  family = "binomial",
  data = baseball
)
summary(model8)
##
## Call:
## glm(formula = WorldSeries ~ RankSeason, family = "binomial",
## data = baseball)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.8256 0.3268 -2.527 0.0115 *
## RankSeason -0.2069 0.1027 -2.016 0.0438 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 239.12 on 243 degrees of freedom
## Residual deviance: 234.75 on 242 degrees of freedom
## AIC: 238.75
##
## Number of Fisher Scoring iterations: 4
# Single-predictor logistic regression of WorldSeries on OOBP.
# NOTE: the summary output reports rows dropped for missing values, so this
# fit's deviance and AIC are computed on fewer rows than the other models.
model9 <- glm(
  formula = WorldSeries ~ OOBP,
  family = "binomial",
  data = baseball
)
summary(model9)
##
## Call:
## glm(formula = WorldSeries ~ OOBP, family = "binomial", data = baseball)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.9306 8.3728 -0.111 0.912
## OOBP -3.2233 26.0587 -0.124 0.902
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 84.926 on 113 degrees of freedom
## Residual deviance: 84.910 on 112 degrees of freedom
## (130 observations deleted due to missingness)
## AIC: 88.91
##
## Number of Fisher Scoring iterations: 4
# Single-predictor logistic regression of WorldSeries on OSLG.
# NOTE: the summary output reports rows dropped for missing values, so this
# fit's deviance and AIC are computed on fewer rows than the other models.
model10 <- glm(
  formula = WorldSeries ~ OSLG,
  family = "binomial",
  data = baseball
)
summary(model10)
##
## Call:
## glm(formula = WorldSeries ~ OSLG, family = "binomial", data = baseball)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.08725 6.07285 -0.014 0.989
## OSLG -4.65992 15.06881 -0.309 0.757
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 84.926 on 113 degrees of freedom
## Residual deviance: 84.830 on 112 degrees of freedom
## (130 observations deleted due to missingness)
## AIC: 88.83
##
## Number of Fisher Scoring iterations: 4
# Single-predictor logistic regression of WorldSeries on NumCompetitors.
model11 <- glm(
  formula = WorldSeries ~ NumCompetitors,
  family = "binomial",
  data = baseball
)
summary(model11)
##
## Call:
## glm(formula = WorldSeries ~ NumCompetitors, family = "binomial",
## data = baseball)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.03868 0.43750 0.088 0.929559
## NumCompetitors -0.25220 0.07422 -3.398 0.000678 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 239.12 on 243 degrees of freedom
## Residual deviance: 226.96 on 242 degrees of freedom
## AIC: 230.96
##
## Number of Fisher Scoring iterations: 4
# Single-predictor logistic regression of WorldSeries on League.
model12 <- glm(
  formula = WorldSeries ~ League,
  family = "binomial",
  data = baseball
)
summary(model12)
##
## Call:
## glm(formula = WorldSeries ~ League, family = "binomial", data = baseball)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.3558 0.2243 -6.045 1.5e-09 ***
## LeagueNL -0.1583 0.3252 -0.487 0.626
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 239.12 on 243 degrees of freedom
## Residual deviance: 238.88 on 242 degrees of freedom
## AIC: 242.88
##
## Number of Fisher Scoring iterations: 4
# Combined logistic regression using Year, RA, RankSeason and NumCompetitors
# together as predictors of WorldSeries.
LogModel <- glm(
  formula = WorldSeries ~ Year + RA + RankSeason + NumCompetitors,
  family = "binomial",
  data = baseball
)
summary(LogModel)
##
## Call:
## glm(formula = WorldSeries ~ Year + RA + RankSeason + NumCompetitors,
## family = "binomial", data = baseball)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 12.5874376 53.6474210 0.235 0.814
## Year -0.0061425 0.0274665 -0.224 0.823
## RA -0.0008238 0.0027391 -0.301 0.764
## RankSeason -0.0685046 0.1203459 -0.569 0.569
## NumCompetitors -0.1794264 0.1815933 -0.988 0.323
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 239.12 on 243 degrees of freedom
## Residual deviance: 226.37 on 239 degrees of freedom
## AIC: 236.37
##
## Number of Fisher Scoring iterations: 4
# Pairwise correlation matrix of the four predictors, to check how strongly
# they move together.
cor(baseball[, c("Year", "RA", "RankSeason", "NumCompetitors")])
## Year RA RankSeason NumCompetitors
## Year 1.0000000 0.4762422 0.3852191 0.9139548
## RA 0.4762422 1.0000000 0.3991413 0.5136769
## RankSeason 0.3852191 0.3991413 1.0000000 0.4247393
## NumCompetitors 0.9139548 0.5136769 0.4247393 1.0000000
# Two-predictor logistic regression of WorldSeries on Year and RA.
model13 <- glm(
  formula = WorldSeries ~ Year + RA,
  family = "binomial",
  data = baseball
)
summary(model13)
##
## Call:
## glm(formula = WorldSeries ~ Year + RA, family = "binomial", data = baseball)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 63.610741 25.654830 2.479 0.0132 *
## Year -0.032084 0.013323 -2.408 0.0160 *
## RA -0.001766 0.002585 -0.683 0.4945
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 239.12 on 243 degrees of freedom
## Residual deviance: 227.88 on 241 degrees of freedom
## AIC: 233.88
##
## Number of Fisher Scoring iterations: 4
# Two-predictor logistic regression of WorldSeries on Year and RankSeason.
model14 <- glm(
  formula = WorldSeries ~ Year + RankSeason,
  family = "binomial",
  data = baseball
)
summary(model14)
##
## Call:
## glm(formula = WorldSeries ~ Year + RankSeason, family = "binomial",
## data = baseball)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 63.64855 24.37063 2.612 0.00901 **
## Year -0.03254 0.01231 -2.643 0.00822 **
## RankSeason -0.10064 0.11352 -0.887 0.37534
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 239.12 on 243 degrees of freedom
## Residual deviance: 227.55 on 241 degrees of freedom
## AIC: 233.55
##
## Number of Fisher Scoring iterations: 4
# Two-predictor logistic regression of WorldSeries on Year and NumCompetitors.
model15 <- glm(
  formula = WorldSeries ~ Year + NumCompetitors,
  family = "binomial",
  data = baseball
)
summary(model15)
##
## Call:
## glm(formula = WorldSeries ~ Year + NumCompetitors, family = "binomial",
## data = baseball)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 13.350467 53.481896 0.250 0.803
## Year -0.006802 0.027328 -0.249 0.803
## NumCompetitors -0.212610 0.175520 -1.211 0.226
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 239.12 on 243 degrees of freedom
## Residual deviance: 226.90 on 241 degrees of freedom
## AIC: 232.9
##
## Number of Fisher Scoring iterations: 4
# Two-predictor logistic regression of WorldSeries on RA and RankSeason.
model16 <- glm(
  formula = WorldSeries ~ RA + RankSeason,
  family = "binomial",
  data = baseball
)
summary(model16)
##
## Call:
## glm(formula = WorldSeries ~ RA + RankSeason, family = "binomial",
## data = baseball)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.487461 1.506143 0.988 0.323
## RA -0.003815 0.002441 -1.563 0.118
## RankSeason -0.140824 0.110908 -1.270 0.204
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 239.12 on 243 degrees of freedom
## Residual deviance: 232.22 on 241 degrees of freedom
## AIC: 238.22
##
## Number of Fisher Scoring iterations: 4
# Two-predictor logistic regression of WorldSeries on RA and NumCompetitors.
model17 <- glm(
  formula = WorldSeries ~ RA + NumCompetitors,
  family = "binomial",
  data = baseball
)
summary(model17)
##
## Call:
## glm(formula = WorldSeries ~ RA + NumCompetitors, family = "binomial",
## data = baseball)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.716895 1.528736 0.469 0.63911
## RA -0.001233 0.002661 -0.463 0.64313
## NumCompetitors -0.229385 0.088399 -2.595 0.00946 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 239.12 on 243 degrees of freedom
## Residual deviance: 226.74 on 241 degrees of freedom
## AIC: 232.74
##
## Number of Fisher Scoring iterations: 4
# Two-predictor logistic regression of WorldSeries on RankSeason and
# NumCompetitors.
model18 <- glm(
  formula = WorldSeries ~ RankSeason + NumCompetitors,
  family = "binomial",
  data = baseball
)
summary(model18)
##
## Call:
## glm(formula = WorldSeries ~ RankSeason + NumCompetitors, family = "binomial",
## data = baseball)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.12277 0.45737 0.268 0.78837
## RankSeason -0.07697 0.11711 -0.657 0.51102
## NumCompetitors -0.22784 0.08201 -2.778 0.00546 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 239.12 on 243 degrees of freedom
## Residual deviance: 226.52 on 241 degrees of freedom
## AIC: 232.52
##
## Number of Fisher Scoring iterations: 4
In none of the two-variable models were both predictors statistically
significant. The best model by AIC was the single-variable model with
NumCompetitors as the only independent variable (AIC 230.96), which
supports the idea that postseason success may be heavily influenced by
randomness.