# Load and Explore Data
baseball = read.csv("baseball.csv")
str(baseball)
'data.frame': 1232 obs. of 15 variables:
$ Team : chr "ARI" "ATL" "BAL" "BOS" ...
$ League : chr "NL" "NL" "AL" "AL" ...
$ Year : int 2012 2012 2012 2012 2012 2012 2012 2012 2012 2012 ...
$ RS : int 734 700 712 734 613 748 669 667 758 726 ...
$ RA : int 688 600 705 806 759 676 588 845 890 670 ...
$ W : int 81 94 93 69 61 85 97 68 64 88 ...
$ OBP : num 0.328 0.32 0.311 0.315 0.302 0.318 0.315 0.324 0.33 0.335 ...
$ SLG : num 0.418 0.389 0.417 0.415 0.378 0.422 0.411 0.381 0.436 0.422 ...
$ BA : num 0.259 0.247 0.247 0.26 0.24 0.255 0.251 0.251 0.274 0.268 ...
$ Playoffs : int 0 1 1 0 0 0 1 0 0 1 ...
$ RankSeason : int NA 4 5 NA NA NA 2 NA NA 6 ...
$ RankPlayoffs: int NA 5 4 NA NA NA 4 NA NA 2 ...
$ G : int 162 162 162 162 162 162 162 162 162 162 ...
$ OOBP : num 0.317 0.306 0.315 0.331 0.335 0.319 0.305 0.336 0.357 0.314 ...
$ OSLG : num 0.415 0.378 0.403 0.428 0.424 0.405 0.39 0.43 0.47 0.402 ...
nrow(baseball)
[1] 1232
table(baseball$Year)
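# NOTE(review): the counts below sum to 244 and are identical to the table printed after the Playoffs == 1 subset, so this output appears to have been captured from the playoff-only data rather than from the full 1232-row data set. Re-run on the unsubsetted data to confirm.
1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1973 1974 1975 1976 1977 1978
2 2 2 2 2 2 2 4 4 4 4 4 4 4 4 4
1979 1980 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1996 1997
4 4 4 4 4 4 4 4 4 4 4 4 4 4 8 8
1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012
8 8 8 8 8 8 8 8 8 8 8 8 8 8 10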
length(table(baseball$Year))
[1] 47
## Subset to Teams That Made the Playoffs
baseball <- subset(baseball, Playoffs == 1)
nrow(baseball)
[1] 244
table(baseball$Year)
1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1973 1974 1975 1976 1977 1978
2 2 2 2 2 2 2 4 4 4 4 4 4 4 4 4
1979 1980 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1996 1997
4 4 4 4 4 4 4 4 4 4 4 4 4 4 8 8
1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012
8 8 8 8 8 8 8 8 8 8 8 8 8 8 10
table(table(baseball$Year))
2 4 8 10
7 23 16 1
# Add NumCompetitors Column (number of playoff teams in that team's year)
PlayoffTable <- table(baseball$Year)
baseball$NumCompetitors <- PlayoffTable[as.character(baseball$Year)]
table(baseball$NumCompetitors)
2 4 8 10
14 92 128 10
# Create WorldSeries Variable (1 if RankPlayoffs == 1, otherwise 0)
baseball$WorldSeries <- as.numeric(baseball$RankPlayoffs == 1)
table(baseball$WorldSeries)
0 1
197 47
# Bivariate Logistic Regressions
model1 <- glm(WorldSeries ~ Year, data=baseball, family="binomial")
summary(model1)
Call:
glm(formula = WorldSeries ~ Year, family = "binomial", data = baseball)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 72.23602 22.64409 3.19 0.00142 **
Year -0.03700 0.01138 -3.25 0.00115 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 239.12 on 243 degrees of freedom
Residual deviance: 228.35 on 242 degrees of freedom
AIC: 232.35
Number of Fisher Scoring iterations: 4
model2 <- glm(WorldSeries ~ RS, data=baseball, family="binomial")
summary(model2)
Call:
glm(formula = WorldSeries ~ RS, family = "binomial", data = baseball)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.661226 1.636494 0.404 0.686
RS -0.002681 0.002098 -1.278 0.201
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 239.12 on 243 degrees of freedom
Residual deviance: 237.45 on 242 degrees of freedom
AIC: 241.45
Number of Fisher Scoring iterations: 4
model3 <- glm(WorldSeries ~ RA, data=baseball, family="binomial")
summary(model3)
Call:
glm(formula = WorldSeries ~ RA, family = "binomial", data = baseball)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 1.888174 1.483831 1.272 0.2032
RA -0.005053 0.002273 -2.223 0.0262 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 239.12 on 243 degrees of freedom
Residual deviance: 233.88 on 242 degrees of freedom
AIC: 237.88
Number of Fisher Scoring iterations: 4
model4 <- glm(WorldSeries ~ W, data=baseball, family="binomial")
summary(model4)
Call:
glm(formula = WorldSeries ~ W, family = "binomial", data = baseball)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -6.85568 2.87620 -2.384 0.0171 *
W 0.05671 0.02988 1.898 0.0577 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 239.12 on 243 degrees of freedom
Residual deviance: 235.51 on 242 degrees of freedom
AIC: 239.51
Number of Fisher Scoring iterations: 4
model5 <- glm(WorldSeries ~ OBP, data=baseball, family="binomial")
summary(model5)
Call:
glm(formula = WorldSeries ~ OBP, family = "binomial", data = baseball)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 2.741 3.989 0.687 0.492
OBP -12.402 11.865 -1.045 0.296
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 239.12 on 243 degrees of freedom
Residual deviance: 238.02 on 242 degrees of freedom
AIC: 242.02
Number of Fisher Scoring iterations: 4
model6 <- glm(WorldSeries ~ SLG, data=baseball, family="binomial")
summary(model6)
Call:
glm(formula = WorldSeries ~ SLG, family = "binomial", data = baseball)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 3.200 2.358 1.357 0.1748
SLG -11.130 5.689 -1.956 0.0504 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 239.12 on 243 degrees of freedom
Residual deviance: 235.23 on 242 degrees of freedom
AIC: 239.23
Number of Fisher Scoring iterations: 4
model7 <- glm(WorldSeries ~ BA, data=baseball, family="binomial")
summary(model7)
Call:
glm(formula = WorldSeries ~ BA, family = "binomial", data = baseball)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.6392 3.8988 -0.164 0.870
BA -2.9765 14.6123 -0.204 0.839
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 239.12 on 243 degrees of freedom
Residual deviance: 239.08 on 242 degrees of freedom
AIC: 243.08
Number of Fisher Scoring iterations: 4
model8 <- glm(WorldSeries ~ RankSeason, data=baseball, family="binomial")
summary(model8)
Call:
glm(formula = WorldSeries ~ RankSeason, family = "binomial",
data = baseball)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.8256 0.3268 -2.527 0.0115 *
RankSeason -0.2069 0.1027 -2.016 0.0438 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 239.12 on 243 degrees of freedom
Residual deviance: 234.75 on 242 degrees of freedom
AIC: 238.75
Number of Fisher Scoring iterations: 4
model9 <- glm(WorldSeries ~ OOBP, data=baseball, family="binomial")
summary(model9)
Call:
glm(formula = WorldSeries ~ OOBP, family = "binomial", data = baseball)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.9306 8.3728 -0.111 0.912
OOBP -3.2233 26.0587 -0.124 0.902
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 84.926 on 113 degrees of freedom
Residual deviance: 84.910 on 112 degrees of freedom
(130 observations deleted due to missingness)
AIC: 88.91
Number of Fisher Scoring iterations: 4
model10 <- glm(WorldSeries ~ OSLG, data=baseball, family="binomial")
summary(model10)
Call:
glm(formula = WorldSeries ~ OSLG, family = "binomial", data = baseball)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.08725 6.07285 -0.014 0.989
OSLG -4.65992 15.06881 -0.309 0.757
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 84.926 on 113 degrees of freedom
Residual deviance: 84.830 on 112 degrees of freedom
(130 observations deleted due to missingness)
AIC: 88.83
Number of Fisher Scoring iterations: 4
model11 <- glm(WorldSeries ~ NumCompetitors, data=baseball, family="binomial")
summary(model11)
Call:
glm(formula = WorldSeries ~ NumCompetitors, family = "binomial",
data = baseball)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.03868 0.43750 0.088 0.929559
NumCompetitors -0.25220 0.07422 -3.398 0.000678 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 239.12 on 243 degrees of freedom
Residual deviance: 226.96 on 242 degrees of freedom
AIC: 230.96
Number of Fisher Scoring iterations: 4
model12 <- glm(WorldSeries ~ League, data=baseball, family="binomial")
summary(model12)
Call:
glm(formula = WorldSeries ~ League, family = "binomial", data = baseball)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.3558 0.2243 -6.045 1.5e-09 ***
LeagueNL -0.1583 0.3252 -0.487 0.626
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 239.12 on 243 degrees of freedom
Residual deviance: 238.88 on 242 degrees of freedom
AIC: 242.88
Number of Fisher Scoring iterations: 4
# Multivariate Model with All Predictors Significant (p < 0.05) in the Bivariate Models
LogModel <- glm(WorldSeries ~ Year + RA + RankSeason + NumCompetitors, data=baseball, family="binomial")
summary(LogModel)
Call:
glm(formula = WorldSeries ~ Year + RA + RankSeason + NumCompetitors,
family = "binomial", data = baseball)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 12.5874376 53.6474210 0.235 0.814
Year -0.0061425 0.0274665 -0.224 0.823
RA -0.0008238 0.0027391 -0.301 0.764
RankSeason -0.0685046 0.1203459 -0.569 0.569
NumCompetitors -0.1794264 0.1815933 -0.988 0.323
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 239.12 on 243 degrees of freedom
Residual deviance: 226.37 on 239 degrees of freedom
AIC: 236.37
Number of Fisher Scoring iterations: 4
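# Correlation Between Predictors
# (no predictor is significant in the combined model above, so check the predictors for multicollinearity)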
cor(baseball[c("Year", "RA", "RankSeason", "NumCompetitors")])
Year RA RankSeason NumCompetitors
Year 1.0000000 0.4762422 0.3852191 0.9139548
RA 0.4762422 1.0000000 0.3991413 0.5136769
RankSeason 0.3852191 0.3991413 1.0000000 0.4247393
NumCompetitors 0.9139548 0.5136769 0.4247393 1.0000000
## Two-Variable Models (Compare AIC)
model13 <- glm(WorldSeries ~ Year + RA, data=baseball, family="binomial")
summary(model13)
Call:
glm(formula = WorldSeries ~ Year + RA, family = "binomial", data = baseball)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 63.610741 25.654830 2.479 0.0132 *
Year -0.032084 0.013323 -2.408 0.0160 *
RA -0.001766 0.002585 -0.683 0.4945
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 239.12 on 243 degrees of freedom
Residual deviance: 227.88 on 241 degrees of freedom
AIC: 233.88
Number of Fisher Scoring iterations: 4
model14 <- glm(WorldSeries ~ Year + RankSeason, data=baseball, family="binomial")
summary(model14)
Call:
glm(formula = WorldSeries ~ Year + RankSeason, family = "binomial",
data = baseball)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 63.64855 24.37063 2.612 0.00901 **
Year -0.03254 0.01231 -2.643 0.00822 **
RankSeason -0.10064 0.11352 -0.887 0.37534
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 239.12 on 243 degrees of freedom
Residual deviance: 227.55 on 241 degrees of freedom
AIC: 233.55
Number of Fisher Scoring iterations: 4
model15 <- glm(WorldSeries ~ Year + NumCompetitors, data=baseball, family="binomial")
summary(model15)
Call:
glm(formula = WorldSeries ~ Year + NumCompetitors, family = "binomial",
data = baseball)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 13.350467 53.481896 0.250 0.803
Year -0.006802 0.027328 -0.249 0.803
NumCompetitors -0.212610 0.175520 -1.211 0.226
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 239.12 on 243 degrees of freedom
Residual deviance: 226.90 on 241 degrees of freedom
AIC: 232.9
Number of Fisher Scoring iterations: 4
model16 <- glm(WorldSeries ~ RA + RankSeason, data=baseball, family="binomial")
summary(model16)
Call:
glm(formula = WorldSeries ~ RA + RankSeason, family = "binomial",
data = baseball)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 1.487461 1.506143 0.988 0.323
RA -0.003815 0.002441 -1.563 0.118
RankSeason -0.140824 0.110908 -1.270 0.204
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 239.12 on 243 degrees of freedom
Residual deviance: 232.22 on 241 degrees of freedom
AIC: 238.22
Number of Fisher Scoring iterations: 4
model17 <- glm(WorldSeries ~ RA + NumCompetitors, data=baseball, family="binomial")
summary(model17)
Call:
glm(formula = WorldSeries ~ RA + NumCompetitors, family = "binomial",
data = baseball)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.716895 1.528736 0.469 0.63911
RA -0.001233 0.002661 -0.463 0.64313
NumCompetitors -0.229385 0.088399 -2.595 0.00946 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 239.12 on 243 degrees of freedom
Residual deviance: 226.74 on 241 degrees of freedom
AIC: 232.74
Number of Fisher Scoring iterations: 4
model18 <- glm(WorldSeries ~ RankSeason + NumCompetitors, data=baseball, family="binomial")
summary(model18)
Call:
glm(formula = WorldSeries ~ RankSeason + NumCompetitors, family = "binomial",
data = baseball)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.12277 0.45737 0.268 0.78837
RankSeason -0.07697 0.11711 -0.657 0.51102
NumCompetitors -0.22784 0.08201 -2.778 0.00546 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 239.12 on 243 degrees of freedom
Residual deviance: 226.52 on 241 degrees of freedom
AIC: 232.52
Number of Fisher Scoring iterations: 4
# Conclusion
# In none of the two-variable models were both predictors statistically significant. According to AIC, the best-fitting model was the simpler one containing only NumCompetitors as the independent variable (AIC 230.96), reinforcing the notion that postseason outcomes may be largely driven by chance.