[ source files available on GitHub ]
Libraries needed for data processing and plotting:
library("dplyr")
library("magrittr")
library("ggplot2")
Source external script with my own handy functions definitions:
source("./scripts/my_defs_u2.R")
The content of this external file is included in the Appendix at the end of this report.
Read the dataset baseball.csv
.
baseball <- read.csv("data/baseball.csv")
str(baseball)
## 'data.frame': 1232 obs. of 15 variables:
## $ Team : Factor w/ 39 levels "ANA","ARI","ATL",..: 2 3 4 5 7 8 9 10 11 12 ...
## $ League : Factor w/ 2 levels "AL","NL": 2 2 1 1 2 1 2 1 2 1 ...
## $ Year : int 2012 2012 2012 2012 2012 2012 2012 2012 2012 2012 ...
## $ RS : int 734 700 712 734 613 748 669 667 758 726 ...
## $ RA : int 688 600 705 806 759 676 588 845 890 670 ...
## $ W : int 81 94 93 69 61 85 97 68 64 88 ...
## $ OBP : num 0.328 0.32 0.311 0.315 0.302 0.318 0.315 0.324 0.33 0.335 ...
## $ SLG : num 0.418 0.389 0.417 0.415 0.378 0.422 0.411 0.381 0.436 0.422 ...
## $ BA : num 0.259 0.247 0.247 0.26 0.24 0.255 0.251 0.251 0.274 0.268 ...
## $ Playoffs : int 0 1 1 0 0 0 1 0 0 1 ...
## $ RankSeason : int NA 4 5 NA NA NA 2 NA NA 6 ...
## $ RankPlayoffs: int NA 5 4 NA NA NA 4 NA NA 2 ...
## $ G : int 162 162 162 162 162 162 162 162 162 162 ...
## $ OOBP : num 0.317 0.306 0.315 0.331 0.335 0.319 0.305 0.336 0.357 0.314 ...
## $ OSLG : num 0.415 0.378 0.403 0.428 0.424 0.405 0.39 0.43 0.47 0.402 ...
Subsetting data for the 1996-2001 period:
moneyball_1996_2001 <- subset(baseball, Year < 2002 & Year >= 1996)
# str(moneyball_1996_2001)
ggplot(data = moneyball_1996_2001, aes(x = W, y = Team)) + theme_bw() +
scale_color_manual(values = c("grey", "red3")) +
geom_vline(xintercept = c(85.0, 95.0), col = "green2", linetype = "longdash") +
geom_point(aes(color = factor(Playoffs)), pch = 16, size = 3.0)
Subset to only include moneyball years
moneyball <- subset(baseball, Year < 2002)
str(moneyball)
## 'data.frame': 902 obs. of 15 variables:
## $ Team : Factor w/ 39 levels "ANA","ARI","ATL",..: 1 2 3 4 5 7 8 9 10 11 ...
## $ League : Factor w/ 2 levels "AL","NL": 1 2 2 1 1 2 1 2 1 2 ...
## $ Year : int 2001 2001 2001 2001 2001 2001 2001 2001 2001 2001 ...
## $ RS : int 691 818 729 687 772 777 798 735 897 923 ...
## $ RA : int 730 677 643 829 745 701 795 850 821 906 ...
## $ W : int 75 92 88 63 82 88 83 66 91 73 ...
## $ OBP : num 0.327 0.341 0.324 0.319 0.334 0.336 0.334 0.324 0.35 0.354 ...
## $ SLG : num 0.405 0.442 0.412 0.38 0.439 0.43 0.451 0.419 0.458 0.483 ...
## $ BA : num 0.261 0.267 0.26 0.248 0.266 0.261 0.268 0.262 0.278 0.292 ...
## $ Playoffs : int 0 1 1 0 0 0 0 0 1 0 ...
## $ RankSeason : int NA 5 7 NA NA NA NA NA 6 NA ...
## $ RankPlayoffs: int NA 1 3 NA NA NA NA NA 4 NA ...
## $ G : int 162 162 162 162 161 162 162 162 162 162 ...
## $ OOBP : num 0.331 0.311 0.314 0.337 0.329 0.321 0.334 0.341 0.341 0.35 ...
## $ OSLG : num 0.412 0.404 0.384 0.439 0.393 0.398 0.427 0.455 0.417 0.48 ...
Compute run difference and define new variable RD
added to the moneyball
data frame:
moneyball$RD <- moneyball$RS - moneyball$RA
# str(moneyball)
Scatterplot to check for linear relationship between RD
and wins (W
):
# plot(moneyball$RD, moneyball$W)
ggplot(data = moneyball, aes(x = RD, y = W)) + theme_bw() +
scale_color_manual(values = c("grey", "red3")) +
geom_hline(yintercept = c(85.0, 95.0), col = "green2", linetype = "longdash") +
geom_point(aes(color = factor(Playoffs)), alpha = 0.5, pch = 16, size = 3.0)
Given the clear correlation between RD
and W
, it makes sense to try first a linear regression with just RD
as predictor.
Wins_Reg <- lm(W ~ RD, data = moneyball)
printStats(Wins_Reg)
## MODEL : W ~ RD
## SUMMARY STATS: R^2 = 0.8808 (adj. = 0.8807)
## F-stats: 6650.993 on 1 and 900 DF, p-value: 0
##
## Estimate Std. Error Pr(>|t|) Signif
## (Intercept) 8.0881e+01 1.3116e-01 0.0000e+00 ***
## RD 1.0577e-01 1.2969e-03 0.0000e+00 ***
\[ W = 80.881 + 0.106*RD \]
Then one can compute the RD
needed to get \(W\ge95\), i.e. \(RD \ge 133.5\).
How does a team score more runs?
The A’s discovered that two baseball statistics were significantly more important than anything else
OBP
)
SLG
)
Most teams focused on Batting Average (BA
)
The A’s claimed that:
We use linear regression to verify which baseball stats are more important to predict runs.
We can use pitching statistics to predict runs scored:
OBP
)SLG
)OBP
, SLG
, BA
)RS_reg_1 <- lm(RS ~ OBP + SLG + BA, data = moneyball)
printStats(RS_reg_1)
## MODEL : RS ~ OBP + SLG + BA
## SUMMARY STATS: R^2 = 0.9302 (adj. = 0.9300)
## F-stats: 3989.210 on 3 and 898 DF, p-value: 0
##
## Estimate Std. Error Pr(>|t|) Signif
## (Intercept) -7.8846e+02 1.9697e+01 7.0020e-202 ***
## OBP 2.9174e+03 1.1047e+02 3.3560e-114 ***
## SLG 1.6379e+03 4.5994e+01 6.8440e-174 ***
## BA -3.6897e+02 1.3058e+02 4.8240e-03 **
If we take a look at the summary of our regression equation, we can see that all of our independent variables are significant, and our \(R^2\) 0.9302.
But if we look at our coefficients, we can see that the coefficient for batting average is negative.
This implies that, all else being equal, a team with a lower batting average will score more runs, which is a little counterintuitive.
What’s going on here is a case of multicollinearity.
These three hitting statistics are highly correlated, so it’s hard to interpret the coefficients of our model.
OBP
, SLG
)Let’s try removing batting average, the variable with the least significance, to see what happens to our model.
RS_reg_2 <- lm(RS ~ OBP + SLG, data = moneyball)
printStats(RS_reg_2)
## MODEL : RS ~ OBP + SLG
## SUMMARY STATS: R^2 = 0.9296 (adj. = 0.9294)
## F-stats: 5933.726 on 2 and 899 DF, p-value: 0
##
## Estimate Std. Error Pr(>|t|) Signif
## (Intercept) -8.0463e+02 1.8921e+01 1.9504e-217 ***
## OBP 2.7378e+03 9.0685e+01 8.2192e-139 ***
## SLG 1.5849e+03 4.2156e+01 1.2334e-186 ***
We get the linear regression model: \[ RS = -804.63 + 2737.77*OBP + 1584.91*SLG \] with and \(R^2\) = 0.9296.
So this model is simpler, with only two independent variables, and has about the same \(R^2\). Overall a better model.
We can see that on-base percentage has a larger coefficient than slugging percentage.
Since these variables are on about the same scale, this tells us that on-base percentage is probably worth more than slugging percentage.
So by using linear regression, we’re able to verify the claims made in Moneyball:
We can use pitching statistics to predict runs allowed:
OOBP
)OSLG
)RA_reg <- lm(RA ~ OOBP + OSLG, data = moneyball)
printStats(RA_reg)
## MODEL : RA ~ OOBP + OSLG
## SUMMARY STATS: R^2 = 0.9073 (adj. = 0.9052)
## F-stats: 425.823 on 2 and 87 DF, p-value: 1.162191e-45
##
## Estimate Std. Error Pr(>|t|) Signif
## (Intercept) -8.3738e+02 6.0255e+01 8.2873e-24 ***
## OOBP 2.9136e+03 2.9197e+02 4.4604e-16 ***
## OSLG 1.5143e+03 1.7543e+02 2.5454e-13 ***
We get the linear regression model: \[ RA = -837.38 + 2913.6*OOBP + 1514.29*OSLG \] with and \(R^2\) = 0.9073. Both variables are significant.
Can we predict how many games the 2002 Oakland A’s will win using our models?
At the beginning of the 2002 season, the Oakland A’s had 24 batters on their roster.
Using the 2001 regular season statistics for these players:
OBP
is 0.339SLG
is 0.430Our regression equation was: \[ RS = -804.63 + 2737.77*OBP + 1584.91*SLG \]
Based on this, our prediction for 2002 would then be 805.
At the beginning of the 2002 season, the Oakland A’s had 17 pitchers on their roster. Using the 2001 regular season statistics for these players: * Team OOBP
is 0.307 * Team OSLG
is 0.373
Our regression equation was \[ RA = -837.38 + 2913.6*OOBP + 1514.29*OSLG \]
Based on this, our prediction for 2002 would then be 621.9.
We can now make a prediction for how many games they will win.
Our regression equation to predict wins was: \[
W = 80.881 + 0.106*RD
\]
We predicted 805 runs scored (RS
) and 621.9 runs allowed (RA
), for a difference RD
of 183.1
We can plug in this RD
to predict that the A’s will win 100.2 games in 2002.
Paul DePodesta used a similar approach to make predictions.
Predictions closely match actual performance
The A’s set a League record by winning 20 games in a row.
Won one more game than the previous year, and made it to the playoffs.
Billy and Paul see their job as making sure the team makes it to the playoffs – after that all bets are off.
Why?
In other words, the playoffs suffer from the sample size problem.
There are not enough games to make any statistical claims.
Let’s see if we can verify this using our data set.
We can compute the correlation between whether or not the team wins the World Series (a binary variable) and the number of regular season wins, since we would expect teams with more wins to be more likely to win the World Series.
recent <- subset(baseball, Year >= 1994 & Year <= 2011)
recent[recent$RankPlayoffs == 1 & is.na(recent$RankPlayoffs) == FALSE, ]
## Team League Year RS RA W OBP SLG BA Playoffs RankSeason RankPlayoffs G OOBP OSLG
## 56 STL NL 2011 762 692 90 0.341 0.425 0.273 1 7 1 162 0.319 0.398
## 85 SFG NL 2010 697 583 92 0.321 0.408 0.257 1 5 1 162 0.313 0.370
## 109 NYY AL 2009 915 753 103 0.362 0.478 0.283 1 1 1 162 0.327 0.408
## 141 PHI NL 2008 799 680 92 0.332 0.438 0.255 1 4 1 162 0.329 0.410
## 154 BOS AL 2007 867 657 96 0.362 0.444 0.279 1 1 1 162 0.314 0.392
## 206 STL NL 2006 781 762 83 0.337 0.431 0.269 1 6 1 161 0.337 0.443
## 216 CHW AL 2005 741 645 99 0.322 0.425 0.262 1 2 1 162 0.310 0.397
## 245 BOS AL 2004 949 768 98 0.360 0.472 0.282 1 3 1 162 0.318 0.408
## 282 FLA NL 2003 751 692 91 0.333 0.421 0.266 1 5 1 162 0.325 0.396
## 301 ANA AL 2002 851 644 99 0.341 0.433 0.282 1 3 1 162 0.314 0.392
## 332 ARI NL 2001 818 677 92 0.341 0.442 0.267 1 5 1 162 0.311 0.404
## 380 NYY AL 2000 871 814 87 0.354 0.450 0.277 1 5 1 161 0.336 0.422
## 410 NYY AL 1999 900 731 98 0.366 0.453 0.282 1 3 1 162 0.329 0.400
## 440 NYY AL 1998 965 656 114 0.364 0.460 0.288 1 1 1 162 NA NA
## 461 FLA NL 1997 740 669 92 0.346 0.395 0.259 1 4 1 162 NA NA
## 497 NYY AL 1996 871 787 92 0.360 0.436 0.288 1 3 1 162 NA NA
recent$WSwin <- 0
recent[recent$RankPlayoffs == 1 & is.na(recent$RankPlayoffs) == FALSE, "WSwin"] <- 1
cor(recent$WSwin, recent$W)
## [1] 0.2253985
clean <- recent[is.na(recent$RankPlayoffs) == FALSE, ]
# cor(clean$W, clean$WSwin)
Additional locally defined functions, from the external file loaded at the beginning.
mypanel <- function(x, y, ...){
cvec.bg <- c(rgb(1.0, 0.8, 0.9) , rgb(0.9, 0.9, 1.0), rgb(0.8, 1.0, 0.8))
cvec.border <- c("red", "blue", rgb(0.0, 0.67, 0.0))
r <- abs(cor(x, y))
i.color <- 1+(r>0.7)+(r>0.8)
ll <- par("usr")
rect(ll[1], ll[3], ll[2], ll[4], border=cvec.border[i.color], lwd=3)
points(x, y, ... )
ok <- is.finite(x) & is.finite(y)
if( any(ok) ) { abline(lm(y[ok] ~ x[ok]), col="red", lty=2, ...) }
}
panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, color.bg=FALSE, ...) {
usr <- par("usr"); on.exit(par(usr))
par(usr = c(0, 1, 0, 1))
r <- abs(cor(x, y))
txt <- format(c(r, 0.123456789), digits = digits)[1]
txt <- paste0(prefix, txt)
if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
cvec.bg <- c(rgb(1.0, 0.8, 0.9) , rgb(0.9, 0.9, 1.0), rgb(0.8, 1.0, 0.8))
cvec.text <- c("red", "blue", rgb(0.0, 0.67, 0.0))
i.color <- 1+(r>0.4)+(r>0.6)
if( color.bg ) {
ll <- par("usr")
rect(ll[1], ll[3], ll[2], ll[4], col=cvec.bg[i.color])
text(0.5, 0.5, txt, cex = cex.cor * r, col=cvec.text[i.color])
} else {
text(0.5, 0.5, txt, cex = cex.cor * r, col=cvec.text[i.color])
}
}
printStats <- function (m, with.cx=TRUE) {
if (class(m) != "lm") stop("Not an object of class 'lm' ")
f <- summary(m)$fstatistic
p <- pf(f[1], f[2], f[3], lower.tail=FALSE)
attributes(p) <- NULL
fml <- as.character(formula(m))
cx <- summary(m)$coeff
stars <- rep(" ", nrow(cx))
stars[cx[,4] <= 0.1] <- "."
stars[cx[,4] <= 0.05] <- "*"
stars[cx[,4] <= 0.01] <- "**"
stars[cx[,4] <= 0.001] <- "***"
cat("MODEL : ", sprintf("%s", paste(fml[c(2,1,3)], sep=" ", collapse=" ")), "\n", sep="")
cat("SUMMARY STATS: ")
cat("R^2 = ",sprintf("%6.4f",summary(m)$r.squared),
" (adj. = ", sprintf("%6.4f",summary(m)$adj.r.squared), ")", sep="")
cat("\n")
cat(" ")
cat("F-stats: ", sprintf("%.3f",f[1]), " on ", f[2], " and ", f[3], " DF, p-value: ", p, "\n", sep="")
if( with.cx ) {
cat("\n")
print(cbind(format(cx[,c(1,2,4)], scientific=TRUE, justify="right", digits=5), Signif=stars),
quote=FALSE, print.gap=3)
}
}
printStats.cpt <- function (m, with.cx=TRUE) {
if (class(m) != "lm") stop("Not an object of class 'lm' ")
f <- summary(m)$fstatistic
p <- pf(f[1], f[2], f[3], lower.tail=FALSE)
attributes(p) <- NULL
fml <- as.character(formula(m))
cx <- summary(m)$coeff
stars <- rep(" ", nrow(cx))
stars[cx[,4] <= 0.1] <- "."
stars[cx[,4] <= 0.05] <- "*"
stars[cx[,4] <= 0.01] <- "**"
stars[cx[,4] <= 0.001] <- "***"
cat("MODEL : ", sprintf("%s", paste(fml[c(2,1,3)], sep=" ", collapse=" ")), "\n", sep="")
cat(" : ")
cat("adj. R^2 = ", sprintf("%6.4f",summary(m)$adj.r.squared),
" / F-stats: ", sprintf("%.3f",f[1]), " on ", f[2],",", f[3], " Df, p-value: ", p, "\n", sep="")
if( with.cx ) {
print(cbind(format(cx[,c(1,2,4)], scientific=TRUE, justify="right", digits=5), Signif=stars),
quote=FALSE, print.gap=3)
cat("\n")
}
}