baseball = read.csv("baseball.csv")
str(baseball)
'data.frame': 1232 obs. of 15 variables:
$ Team : chr "ARI" "ATL" "BAL" "BOS" ...
$ League : chr "NL" "NL" "AL" "AL" ...
$ Year : int 2012 2012 2012 2012 2012 2012 2012 2012 2012 2012 ...
$ RS : int 734 700 712 734 613 748 669 667 758 726 ...
$ RA : int 688 600 705 806 759 676 588 845 890 670 ...
$ W : int 81 94 93 69 61 85 97 68 64 88 ...
$ OBP : num 0.328 0.32 0.311 0.315 0.302 0.318 0.315 0.324 0.33 0.335 ...
$ SLG : num 0.418 0.389 0.417 0.415 0.378 0.422 0.411 0.381 0.436 0.422 ...
$ BA : num 0.259 0.247 0.247 0.26 0.24 0.255 0.251 0.251 0.274 0.268 ...
$ Playoffs : int 0 1 1 0 0 0 1 0 0 1 ...
$ RankSeason : int NA 4 5 NA NA NA 2 NA NA 6 ...
$ RankPlayoffs: int NA 5 4 NA NA NA 4 NA NA 2 ...
$ G : int 162 162 162 162 162 162 162 162 162 162 ...
$ OOBP : num 0.317 0.306 0.315 0.331 0.335 0.319 0.305 0.336 0.357 0.314 ...
$ OSLG : num 0.415 0.378 0.403 0.428 0.424 0.405 0.39 0.43 0.47 0.402 ...
nrow(baseball)
[1] 1232
#Though the dataset contains data from 1962 until 2012, we removed several years with shorter-than-usual seasons. Using the table() function, identify the total number of years included in this dataset.
table(baseball$Year)
1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1973
20 20 20 20 20 20 20 24 24 24 24
1974 1975 1976 1977 1978 1979 1980 1982 1983 1984 1985
24 24 24 26 26 26 26 26 26 26 26
1986 1987 1988 1989 1990 1991 1992 1993 1996 1997 1998
26 26 26 26 26 26 26 28 28 28 30
1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009
30 30 30 30 30 30 30 30 30 30 30
2010 2011 2012
30 30 30
length(table(baseball$Year))
[1] 47
baseball = subset(baseball, Playoffs == 1)
nrow(baseball)
[1] 244
table(baseball$Year)
1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1973
2 2 2 2 2 2 2 4 4 4 4
1974 1975 1976 1977 1978 1979 1980 1982 1983 1984 1985
4 4 4 4 4 4 4 4 4 4 4
1986 1987 1988 1989 1990 1991 1992 1993 1996 1997 1998
4 4 4 4 4 4 4 4 8 8 8
1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009
8 8 8 8 8 8 8 8 8 8 8
2010 2011 2012
8 8 10
table(table(baseball$Year))
2 4 8 10
7 23 16 1
PlayoffTable = table(baseball$Year)
PlayoffTable
1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1973
2 2 2 2 2 2 2 4 4 4 4
1974 1975 1976 1977 1978 1979 1980 1982 1983 1984 1985
4 4 4 4 4 4 4 4 4 4 4
1986 1987 1988 1989 1990 1991 1992 1993 1996 1997 1998
4 4 4 4 4 4 4 4 8 8 8
1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009
8 8 8 8 8 8 8 8 8 8 8
2010 2011 2012
8 8 10
str(names(PlayoffTable))
chr [1:47] "1962" "1963" "1964" "1965" "1966" "1967" ...
PlayoffTable[c("1990", "2001")]
1990 2001
4 8
baseball$NumCompetitors = PlayoffTable[as.character(baseball$Year)]
baseball$NumCompetitors
[1] 10 10 10 10 10 10 10 10 10 10 8 8 8 8 8 8 8 8
[19] 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8
[37] 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8
[55] 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8
[73] 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8
[91] 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8
[109] 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8
[127] 8 8 8 8 8 8 8 8 8 8 8 8 4 4 4 4 4 4
[145] 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
[163] 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
[181] 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
[199] 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
[217] 4 4 4 4 4 4 4 4 4 4 4 4 4 4 2 2 2 2
[235] 2 2 2 2 2 2 2 2 2 2
table(baseball$NumCompetitors)
2 4 8 10
14 92 128 10
baseball$WorldSeries = as.numeric(baseball$RankPlayoffs == 1)
table(baseball$WorldSeries)
0 1
197 47
#Which of the following variables is a significant predictor of the WorldSeries variable in a bivariate logistic regression model?
#Varibales to use as predictors for each bivariate model(Year, RS, RA, W, OBP, SLG, BA, RankSeason, OOBP,OSLG, NumCompetitors, League)
model1<-glm(WorldSeries~Year, data=baseball, family="binomial")
summary(model1)
Call:
glm(formula = WorldSeries ~ Year, family = "binomial", data = baseball)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 72.23602 22.64409 3.19 0.00142 **
Year -0.03700 0.01138 -3.25 0.00115 **
---
Signif. codes:
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 239.12 on 243 degrees of freedom
Residual deviance: 228.35 on 242 degrees of freedom
AIC: 232.35
Number of Fisher Scoring iterations: 4
#RS is not significant
model2<-glm(WorldSeries~RS, data=baseball, family="binomial")
summary(model2)
Call:
glm(formula = WorldSeries ~ RS, family = "binomial", data = baseball)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.661226 1.636494 0.404 0.686
RS -0.002681 0.002098 -1.278 0.201
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 239.12 on 243 degrees of freedom
Residual deviance: 237.45 on 242 degrees of freedom
AIC: 241.45
Number of Fisher Scoring iterations: 4
model3<-glm(WorldSeries~RA, data=baseball, family="binomial")
summary(model3)
Call:
glm(formula = WorldSeries ~ RA, family = "binomial", data = baseball)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 1.888174 1.483831 1.272 0.2032
RA -0.005053 0.002273 -2.223 0.0262 *
---
Signif. codes:
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 239.12 on 243 degrees of freedom
Residual deviance: 233.88 on 242 degrees of freedom
AIC: 237.88
Number of Fisher Scoring iterations: 4
model4<-glm(WorldSeries~W, data=baseball, family="binomial")
summary(model4)
Call:
glm(formula = WorldSeries ~ W, family = "binomial", data = baseball)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -6.85568 2.87620 -2.384 0.0171 *
W 0.05671 0.02988 1.898 0.0577 .
---
Signif. codes:
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 239.12 on 243 degrees of freedom
Residual deviance: 235.51 on 242 degrees of freedom
AIC: 239.51
Number of Fisher Scoring iterations: 4
model5<-glm(WorldSeries~OBP, data=baseball, family="binomial")
summary(model5)
Call:
glm(formula = WorldSeries ~ OBP, family = "binomial", data = baseball)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 2.741 3.989 0.687 0.492
OBP -12.402 11.865 -1.045 0.296
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 239.12 on 243 degrees of freedom
Residual deviance: 238.02 on 242 degrees of freedom
AIC: 242.02
Number of Fisher Scoring iterations: 4
model7<-glm(WorldSeries~BA, data=baseball, family="binomial")
summary(model7)
Call:
glm(formula = WorldSeries ~ BA, family = "binomial", data = baseball)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.6392 3.8988 -0.164 0.870
BA -2.9765 14.6123 -0.204 0.839
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 239.12 on 243 degrees of freedom
Residual deviance: 239.08 on 242 degrees of freedom
AIC: 243.08
Number of Fisher Scoring iterations: 4
model8<-glm(WorldSeries~RankSeason, data=baseball, family="binomial")
summary(model8)
Call:
glm(formula = WorldSeries ~ RankSeason, family = "binomial",
data = baseball)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.8256 0.3268 -2.527 0.0115 *
RankSeason -0.2069 0.1027 -2.016 0.0438 *
---
Signif. codes:
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 239.12 on 243 degrees of freedom
Residual deviance: 234.75 on 242 degrees of freedom
AIC: 238.75
Number of Fisher Scoring iterations: 4
model9<-glm(WorldSeries~OOBP, data=baseball, family="binomial")
summary(model9)
Call:
glm(formula = WorldSeries ~ OOBP, family = "binomial", data = baseball)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.9306 8.3728 -0.111 0.912
OOBP -3.2233 26.0587 -0.124 0.902
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 84.926 on 113 degrees of freedom
Residual deviance: 84.910 on 112 degrees of freedom
(130 observations deleted due to missingness)
AIC: 88.91
Number of Fisher Scoring iterations: 4
model10<-glm(WorldSeries~OSLG, data=baseball, family="binomial")
summary(model10)
Call:
glm(formula = WorldSeries ~ OSLG, family = "binomial", data = baseball)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.08725 6.07285 -0.014 0.989
OSLG -4.65992 15.06881 -0.309 0.757
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 84.926 on 113 degrees of freedom
Residual deviance: 84.830 on 112 degrees of freedom
(130 observations deleted due to missingness)
AIC: 88.83
Number of Fisher Scoring iterations: 4
model11<-glm(WorldSeries~NumCompetitors, data=baseball, family="binomial")
summary(model11)
Call:
glm(formula = WorldSeries ~ NumCompetitors, family = "binomial",
data = baseball)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.03868 0.43750 0.088 0.929559
NumCompetitors -0.25220 0.07422 -3.398 0.000678 ***
---
Signif. codes:
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 239.12 on 243 degrees of freedom
Residual deviance: 226.96 on 242 degrees of freedom
AIC: 230.96
Number of Fisher Scoring iterations: 4
model12<-glm(WorldSeries~League, data=baseball, family="binomial")
summary(model12)
Call:
glm(formula = WorldSeries ~ League, family = "binomial", data = baseball)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.3558 0.2243 -6.045 1.5e-09 ***
LeagueNL -0.1583 0.3252 -0.487 0.626
---
Signif. codes:
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 239.12 on 243 degrees of freedom
Residual deviance: 238.88 on 242 degrees of freedom
AIC: 242.88
Number of Fisher Scoring iterations: 4
LogModel = glm(WorldSeries ~ Year + RA + RankSeason + NumCompetitors, data=baseball, family=binomial)
summary(LogModel)
Call:
glm(formula = WorldSeries ~ Year + RA + RankSeason + NumCompetitors,
family = binomial, data = baseball)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 12.5874376 53.6474210 0.235 0.814
Year -0.0061425 0.0274665 -0.224 0.823
RA -0.0008238 0.0027391 -0.301 0.764
RankSeason -0.0685046 0.1203459 -0.569 0.569
NumCompetitors -0.1794264 0.1815933 -0.988 0.323
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 239.12 on 243 degrees of freedom
Residual deviance: 226.37 on 239 degrees of freedom
AIC: 236.37
Number of Fisher Scoring iterations: 4
cor(baseball[c("Year", "RA", "RankSeason", "NumCompetitors")])
Year RA RankSeason
Year 1.0000000 0.4762422 0.3852191
RA 0.4762422 1.0000000 0.3991413
RankSeason 0.3852191 0.3991413 1.0000000
NumCompetitors 0.9139548 0.5136769 0.4247393
NumCompetitors
Year 0.9139548
RA 0.5136769
RankSeason 0.4247393
NumCompetitors 1.0000000
model13 = glm(WorldSeries ~ Year + RA, data=baseball, family=binomial)
summary(model13)
Call:
glm(formula = WorldSeries ~ Year + RA, family = binomial, data = baseball)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 63.610741 25.654830 2.479 0.0132 *
Year -0.032084 0.013323 -2.408 0.0160 *
RA -0.001766 0.002585 -0.683 0.4945
---
Signif. codes:
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 239.12 on 243 degrees of freedom
Residual deviance: 227.88 on 241 degrees of freedom
AIC: 233.88
Number of Fisher Scoring iterations: 4
model14 = glm(WorldSeries ~ Year + RankSeason, data=baseball, family=binomial)
summary(model14)
Call:
glm(formula = WorldSeries ~ Year + RankSeason, family = binomial,
data = baseball)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 63.64855 24.37063 2.612 0.00901 **
Year -0.03254 0.01231 -2.643 0.00822 **
RankSeason -0.10064 0.11352 -0.887 0.37534
---
Signif. codes:
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 239.12 on 243 degrees of freedom
Residual deviance: 227.55 on 241 degrees of freedom
AIC: 233.55
Number of Fisher Scoring iterations: 4
model15 = glm(WorldSeries ~ Year + NumCompetitors, data=baseball, family=binomial)
summary(model15)
Call:
glm(formula = WorldSeries ~ Year + NumCompetitors, family = binomial,
data = baseball)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 13.350467 53.481896 0.250 0.803
Year -0.006802 0.027328 -0.249 0.803
NumCompetitors -0.212610 0.175520 -1.211 0.226
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 239.12 on 243 degrees of freedom
Residual deviance: 226.90 on 241 degrees of freedom
AIC: 232.9
Number of Fisher Scoring iterations: 4
model16 = glm(WorldSeries ~ RA + RankSeason, data=baseball, family=binomial)
summary(model16)
Call:
glm(formula = WorldSeries ~ RA + RankSeason, family = binomial,
data = baseball)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 1.487461 1.506143 0.988 0.323
RA -0.003815 0.002441 -1.563 0.118
RankSeason -0.140824 0.110908 -1.270 0.204
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 239.12 on 243 degrees of freedom
Residual deviance: 232.22 on 241 degrees of freedom
AIC: 238.22
Number of Fisher Scoring iterations: 4
model17 = glm(WorldSeries ~ RA + NumCompetitors, data=baseball, family=binomial)
summary(model17)
Call:
glm(formula = WorldSeries ~ RA + NumCompetitors, family = binomial,
data = baseball)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.716895 1.528736 0.469 0.63911
RA -0.001233 0.002661 -0.463 0.64313
NumCompetitors -0.229385 0.088399 -2.595 0.00946 **
---
Signif. codes:
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 239.12 on 243 degrees of freedom
Residual deviance: 226.74 on 241 degrees of freedom
AIC: 232.74
Number of Fisher Scoring iterations: 4
model18 = glm(WorldSeries ~ RankSeason + NumCompetitors, data=baseball, family=binomial)
summary(model18)
Call:
glm(formula = WorldSeries ~ RankSeason + NumCompetitors, family = binomial,
data = baseball)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.12277 0.45737 0.268 0.78837
RankSeason -0.07697 0.11711 -0.657 0.51102
NumCompetitors -0.22784 0.08201 -2.778 0.00546 **
---
Signif. codes:
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 239.12 on 243 degrees of freedom
Residual deviance: 226.52 on 241 degrees of freedom
AIC: 232.52
Number of Fisher Scoring iterations: 4
#none of the models had 2 variables that were significant