[ source files available on GitHub ]

PRELIMINARIES

Libraries needed for data processing and plotting:

library("dplyr")
library("magrittr")
library("ggplot2")

Source external script with my own handy functions definitions:

source("./scripts/my_defs_u2.R")

The content of this external file is included in the Appendix at the end of this report.

LOADING THE DATA

Read the dataset baseball.csv.

baseball <- read.csv("data/baseball.csv")
str(baseball)
## 'data.frame':    1232 obs. of  15 variables:
##  $ Team        : Factor w/ 39 levels "ANA","ARI","ATL",..: 2 3 4 5 7 8 9 10 11 12 ...
##  $ League      : Factor w/ 2 levels "AL","NL": 2 2 1 1 2 1 2 1 2 1 ...
##  $ Year        : int  2012 2012 2012 2012 2012 2012 2012 2012 2012 2012 ...
##  $ RS          : int  734 700 712 734 613 748 669 667 758 726 ...
##  $ RA          : int  688 600 705 806 759 676 588 845 890 670 ...
##  $ W           : int  81 94 93 69 61 85 97 68 64 88 ...
##  $ OBP         : num  0.328 0.32 0.311 0.315 0.302 0.318 0.315 0.324 0.33 0.335 ...
##  $ SLG         : num  0.418 0.389 0.417 0.415 0.378 0.422 0.411 0.381 0.436 0.422 ...
##  $ BA          : num  0.259 0.247 0.247 0.26 0.24 0.255 0.251 0.251 0.274 0.268 ...
##  $ Playoffs    : int  0 1 1 0 0 0 1 0 0 1 ...
##  $ RankSeason  : int  NA 4 5 NA NA NA 2 NA NA 6 ...
##  $ RankPlayoffs: int  NA 5 4 NA NA NA 4 NA NA 2 ...
##  $ G           : int  162 162 162 162 162 162 162 162 162 162 ...
##  $ OOBP        : num  0.317 0.306 0.315 0.331 0.335 0.319 0.305 0.336 0.357 0.314 ...
##  $ OSLG        : num  0.415 0.378 0.403 0.428 0.424 0.405 0.39 0.43 0.47 0.402 ...

Part 2 : MAKING IT TO THE PLAYOFFS

Subsetting data for the 1996-2001 period:

moneyball_1996_2001 <- subset(baseball, Year < 2002 & Year >= 1996)
# str(moneyball_1996_2001)
ggplot(data = moneyball_1996_2001, aes(x = W, y = Team)) + theme_bw() + 
    scale_color_manual(values = c("grey", "red3")) + 
    geom_vline(xintercept = c(85.0, 95.0), col = "green2", linetype = "longdash") +
    geom_point(aes(color = factor(Playoffs)), pch = 16, size = 3.0)

How does a team win games?

  • They score more runs than their opponent
  • But how many more?
  • The A’s calculated that they needed to score 135 more runs than they allowed during the regular season to expect to win 95 games
  • Let’s see if we can verify this using linear regression

Subset to only include moneyball years

moneyball <- subset(baseball, Year < 2002)
str(moneyball)
## 'data.frame':    902 obs. of  15 variables:
##  $ Team        : Factor w/ 39 levels "ANA","ARI","ATL",..: 1 2 3 4 5 7 8 9 10 11 ...
##  $ League      : Factor w/ 2 levels "AL","NL": 1 2 2 1 1 2 1 2 1 2 ...
##  $ Year        : int  2001 2001 2001 2001 2001 2001 2001 2001 2001 2001 ...
##  $ RS          : int  691 818 729 687 772 777 798 735 897 923 ...
##  $ RA          : int  730 677 643 829 745 701 795 850 821 906 ...
##  $ W           : int  75 92 88 63 82 88 83 66 91 73 ...
##  $ OBP         : num  0.327 0.341 0.324 0.319 0.334 0.336 0.334 0.324 0.35 0.354 ...
##  $ SLG         : num  0.405 0.442 0.412 0.38 0.439 0.43 0.451 0.419 0.458 0.483 ...
##  $ BA          : num  0.261 0.267 0.26 0.248 0.266 0.261 0.268 0.262 0.278 0.292 ...
##  $ Playoffs    : int  0 1 1 0 0 0 0 0 1 0 ...
##  $ RankSeason  : int  NA 5 7 NA NA NA NA NA 6 NA ...
##  $ RankPlayoffs: int  NA 1 3 NA NA NA NA NA 4 NA ...
##  $ G           : int  162 162 162 162 161 162 162 162 162 162 ...
##  $ OOBP        : num  0.331 0.311 0.314 0.337 0.329 0.321 0.334 0.341 0.341 0.35 ...
##  $ OSLG        : num  0.412 0.404 0.384 0.439 0.393 0.398 0.427 0.455 0.417 0.48 ...

Compute run difference and define new variable RD added to the moneyball data frame:

moneyball$RD <- moneyball$RS - moneyball$RA
# str(moneyball)

Scatterplot to check for linear relationship between RD and wins (W):

# plot(moneyball$RD, moneyball$W)
ggplot(data = moneyball, aes(x = RD, y = W)) + theme_bw() + 
    scale_color_manual(values = c("grey", "red3")) + 
    geom_hline(yintercept = c(85.0, 95.0), col = "green2", linetype = "longdash") +
    geom_point(aes(color = factor(Playoffs)), alpha = 0.5, pch = 16, size = 3.0) 

Regression model to predict wins

Given the clear correlation between RD and W, it makes sense to try first a linear regression with just RD as predictor.

Wins_Reg <- lm(W ~ RD, data = moneyball)

printStats(Wins_Reg)
## MODEL        : W ~ RD
## SUMMARY STATS: R^2 = 0.8808  (adj. = 0.8807)
##                F-stats: 6650.993 on 1 and 900 DF,  p-value: 0
## 
##               Estimate     Std. Error   Pr(>|t|)     Signif
## (Intercept)   8.0881e+01   1.3116e-01   0.0000e+00   ***   
## RD            1.0577e-01   1.2969e-03   0.0000e+00   ***

\[ W = 80.881 + 0.106*RD \]

Then one can compute the RD needed to get \(W\ge95\), i.e. \(RD \ge 133.5\).

Part 3 : PREDICTING RUNS

prediction model flow

Scoring Runs

How does a team score more runs?
The A’s discovered that two baseball statistics were significantly more important than anything else

  • On-Base Percentage (OBP)
    • Percentage of time a player gets on base (including walks).
  • Slugging Percentage (SLG)
    • How far a player gets around the bases on his turn (measures power).

Most teams focused on Batting Average (BA)

  • Getting on base by hitting the ball.

The A’s claimed that:

  • On-Base Percentage was the most important.
  • Slugging Percentage was important.
  • Batting Average was overvalued.

We use linear regression to verify which baseball stats are more important to predict runs.

Predicting Runs Scored

We can use pitching statistics to predict runs scored:

  • On-Base Percentage (OBP)
  • Slugging Percentage (SLG)

Regression model with 3 predictors (OBP, SLG, BA)

RS_reg_1 <- lm(RS ~ OBP + SLG + BA, data = moneyball)

printStats(RS_reg_1)
## MODEL        : RS ~ OBP + SLG + BA
## SUMMARY STATS: R^2 = 0.9302  (adj. = 0.9300)
##                F-stats: 3989.210 on 3 and 898 DF,  p-value: 0
## 
##               Estimate       Std. Error     Pr(>|t|)       Signif
## (Intercept)    -7.8846e+02     1.9697e+01    7.0020e-202   ***   
## OBP             2.9174e+03     1.1047e+02    3.3560e-114   ***   
## SLG             1.6379e+03     4.5994e+01    6.8440e-174   ***   
## BA             -3.6897e+02     1.3058e+02     4.8240e-03   **

If we take a look at the summary of our regression equation, we can see that all of our independent variables are significant, and our \(R^2\) 0.9302.

But if we look at our coefficients, we can see that the coefficient for batting average is negative.
This implies that, all else being equal, a team with a lower batting average will score more runs, which is a little counterintuitive.

What’s going on here is a case of multicollinearity.
These three hitting statistics are highly correlated, so it’s hard to interpret the coefficients of our model.

Regression model with 2 predictors (OBP, SLG)

Let’s try removing batting average, the variable with the least significance, to see what happens to our model.

RS_reg_2 <- lm(RS ~ OBP + SLG, data = moneyball)

printStats(RS_reg_2)
## MODEL        : RS ~ OBP + SLG
## SUMMARY STATS: R^2 = 0.9296  (adj. = 0.9294)
##                F-stats: 5933.726 on 2 and 899 DF,  p-value: 0
## 
##               Estimate       Std. Error     Pr(>|t|)       Signif
## (Intercept)    -8.0463e+02     1.8921e+01    1.9504e-217   ***   
## OBP             2.7378e+03     9.0685e+01    8.2192e-139   ***   
## SLG             1.5849e+03     4.2156e+01    1.2334e-186   ***

We get the linear regression model: \[ RS = -804.63 + 2737.77*OBP + 1584.91*SLG \] with and \(R^2\) = 0.9296.

So this model is simpler, with only two independent variables, and has about the same \(R^2\). Overall a better model.

We can see that on-base percentage has a larger coefficient than slugging percentage.
Since these variables are on about the same scale, this tells us that on-base percentage is probably worth more than slugging percentage.

So by using linear regression, we’re able to verify the claims made in Moneyball:

  • that batting average is overvalued,
  • on-base percentage is the most important, and slugging
  • percentage is important for predicting runs scored.

Allowing Runs

We can use pitching statistics to predict runs allowed:

  • Opponents On-Base Percentage (OOBP)
  • Opponents Slugging Percentage (OSLG)
RA_reg <- lm(RA ~ OOBP + OSLG, data = moneyball)

printStats(RA_reg)
## MODEL        : RA ~ OOBP + OSLG
## SUMMARY STATS: R^2 = 0.9073  (adj. = 0.9052)
##                F-stats: 425.823 on 2 and 87 DF,  p-value: 1.162191e-45
## 
##               Estimate      Std. Error    Pr(>|t|)      Signif
## (Intercept)   -8.3738e+02    6.0255e+01    8.2873e-24   ***   
## OOBP           2.9136e+03    2.9197e+02    4.4604e-16   ***   
## OSLG           1.5143e+03    1.7543e+02    2.5454e-13   ***

We get the linear regression model: \[ RA = -837.38 + 2913.6*OOBP + 1514.29*OSLG \] with and \(R^2\) = 0.9073. Both variables are significant.

Part 4 : USING THE MODELS TO MAKE PREDICTIONS

Can we predict how many games the 2002 Oakland A’s will win using our models?

Predicting Runs Scored

At the beginning of the 2002 season, the Oakland A’s had 24 batters on their roster.
Using the 2001 regular season statistics for these players:

  • Team OBP is 0.339
  • Team SLG is 0.430

Our regression equation was: \[ RS = -804.63 + 2737.77*OBP + 1584.91*SLG \]

Based on this, our prediction for 2002 would then be 805.

Predicting Runs Allowed

At the beginning of the 2002 season, the Oakland A’s had 17 pitchers on their roster. Using the 2001 regular season statistics for these players: * Team OOBP is 0.307 * Team OSLG is 0.373

Our regression equation was \[ RA = -837.38 + 2913.6*OOBP + 1514.29*OSLG \]

Based on this, our prediction for 2002 would then be 621.9.

Predicting Wins

We can now make a prediction for how many games they will win.
Our regression equation to predict wins was: \[ W = 80.881 + 0.106*RD \]

We predicted 805 runs scored (RS) and 621.9 runs allowed (RA), for a difference RD of 183.1

We can plug in this RD to predict that the A’s will win 100.2 games in 2002.

How does this predictions compare with actual results for 2002?

Paul DePodesta used a similar approach to make predictions.
Predictions closely match actual performance

predictions for 2001

The A’s set a League record by winning 20 games in a row.
Won one more game than the previous year, and made it to the playoffs.

Part 5 : WINNING THE WORLD SERIES

Luck in the Playoffs

Billy and Paul see their job as making sure the team makes it to the playoffs – after that all bets are off.

  • The A’s made it to the playoffs in 2000, 2001, 2002, 2003.
  • But they didn’t win the World Series.

Why?

  • “Over a long season the luck evens out, and the skill shines through.
    But in a series of three out of five, or even four out of seven, anything can happen.”
    "

In other words, the playoffs suffer from the sample size problem.
There are not enough games to make any statistical claims.
Let’s see if we can verify this using our data set.

Is Playoff Performance Predictable?

We can compute the correlation between whether or not the team wins the World Series (a binary variable) and the number of regular season wins, since we would expect teams with more wins to be more likely to win the World Series.

  • Using data 1994-2011 (8 teams in the playoffs).
  • Correlation between winning the World Series and regular season wins is 0.03. Very low.
  • Winning regular season games gets you to the playoffs.
  • But in the playoffs, there are too few games for luck to even out.
  • Logistic regression can be used to predict whether or not a team will win the World Series.
recent <- subset(baseball, Year >= 1994 & Year <= 2011)

recent[recent$RankPlayoffs == 1 & is.na(recent$RankPlayoffs) == FALSE, ]
##     Team League Year  RS  RA   W   OBP   SLG    BA Playoffs RankSeason RankPlayoffs   G  OOBP  OSLG
## 56   STL     NL 2011 762 692  90 0.341 0.425 0.273        1          7            1 162 0.319 0.398
## 85   SFG     NL 2010 697 583  92 0.321 0.408 0.257        1          5            1 162 0.313 0.370
## 109  NYY     AL 2009 915 753 103 0.362 0.478 0.283        1          1            1 162 0.327 0.408
## 141  PHI     NL 2008 799 680  92 0.332 0.438 0.255        1          4            1 162 0.329 0.410
## 154  BOS     AL 2007 867 657  96 0.362 0.444 0.279        1          1            1 162 0.314 0.392
## 206  STL     NL 2006 781 762  83 0.337 0.431 0.269        1          6            1 161 0.337 0.443
## 216  CHW     AL 2005 741 645  99 0.322 0.425 0.262        1          2            1 162 0.310 0.397
## 245  BOS     AL 2004 949 768  98 0.360 0.472 0.282        1          3            1 162 0.318 0.408
## 282  FLA     NL 2003 751 692  91 0.333 0.421 0.266        1          5            1 162 0.325 0.396
## 301  ANA     AL 2002 851 644  99 0.341 0.433 0.282        1          3            1 162 0.314 0.392
## 332  ARI     NL 2001 818 677  92 0.341 0.442 0.267        1          5            1 162 0.311 0.404
## 380  NYY     AL 2000 871 814  87 0.354 0.450 0.277        1          5            1 161 0.336 0.422
## 410  NYY     AL 1999 900 731  98 0.366 0.453 0.282        1          3            1 162 0.329 0.400
## 440  NYY     AL 1998 965 656 114 0.364 0.460 0.288        1          1            1 162    NA    NA
## 461  FLA     NL 1997 740 669  92 0.346 0.395 0.259        1          4            1 162    NA    NA
## 497  NYY     AL 1996 871 787  92 0.360 0.436 0.288        1          3            1 162    NA    NA
recent$WSwin <- 0
recent[recent$RankPlayoffs == 1 & is.na(recent$RankPlayoffs) == FALSE, "WSwin"] <- 1
cor(recent$WSwin, recent$W)
## [1] 0.2253985

clean <- recent[is.na(recent$RankPlayoffs) == FALSE, ]
# cor(clean$W, clean$WSwin)

APPENDIX : external functions

Additional locally defined functions, from the external file loaded at the beginning.

mypanel <- function(x, y, ...){
   cvec.bg <- c(rgb(1.0, 0.8, 0.9) , rgb(0.9, 0.9, 1.0), rgb(0.8, 1.0, 0.8))
   cvec.border <- c("red", "blue", rgb(0.0, 0.67, 0.0))
   r <- abs(cor(x, y))
   i.color <- 1+(r>0.7)+(r>0.8)
   ll <- par("usr") 
   rect(ll[1], ll[3], ll[2], ll[4], border=cvec.border[i.color], lwd=3)
   points(x, y, ... ) 
   ok <- is.finite(x) & is.finite(y)
   if( any(ok) ) { abline(lm(y[ok] ~ x[ok]), col="red", lty=2, ...) }
}

panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, color.bg=FALSE, ...) {
    usr <- par("usr"); on.exit(par(usr))
    par(usr = c(0, 1, 0, 1))
    r <- abs(cor(x, y))
    txt <- format(c(r, 0.123456789), digits = digits)[1]
    txt <- paste0(prefix, txt)
    if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
    cvec.bg <- c(rgb(1.0, 0.8, 0.9) , rgb(0.9, 0.9, 1.0), rgb(0.8, 1.0, 0.8))
    cvec.text <- c("red", "blue", rgb(0.0, 0.67, 0.0))
    i.color <- 1+(r>0.4)+(r>0.6)
    if( color.bg ) {
       ll <- par("usr") 
       rect(ll[1], ll[3], ll[2], ll[4], col=cvec.bg[i.color])
       text(0.5, 0.5, txt, cex = cex.cor * r, col=cvec.text[i.color])
    } else {
       text(0.5, 0.5, txt, cex = cex.cor * r, col=cvec.text[i.color])
    }
}

printStats <- function (m, with.cx=TRUE) {
    if (class(m) != "lm") stop("Not an object of class 'lm' ")
    f <- summary(m)$fstatistic
    p <- pf(f[1], f[2], f[3], lower.tail=FALSE)
    attributes(p) <- NULL
    
    fml <- as.character(formula(m))
    cx <- summary(m)$coeff
    stars <- rep(" ", nrow(cx))
    stars[cx[,4] <= 0.1] <- "."
    stars[cx[,4] <= 0.05] <- "*"
    stars[cx[,4] <= 0.01] <- "**"
    stars[cx[,4] <= 0.001] <- "***"
    cat("MODEL        : ", sprintf("%s", paste(fml[c(2,1,3)], sep=" ", collapse=" ")), "\n", sep="")
    cat("SUMMARY STATS: ")
    cat("R^2 = ",sprintf("%6.4f",summary(m)$r.squared), 
        "  (adj. = ", sprintf("%6.4f",summary(m)$adj.r.squared), ")", sep="")
    cat("\n")
    cat("               ")
    cat("F-stats: ", sprintf("%.3f",f[1]), " on ", f[2], " and ", f[3], " DF,  p-value: ", p, "\n", sep="")
    if( with.cx ) {
        cat("\n")
        print(cbind(format(cx[,c(1,2,4)], scientific=TRUE, justify="right", digits=5), Signif=stars), 
              quote=FALSE, print.gap=3)
    }
}

printStats.cpt <- function (m, with.cx=TRUE) {
    if (class(m) != "lm") stop("Not an object of class 'lm' ")
    f <- summary(m)$fstatistic
    p <- pf(f[1], f[2], f[3], lower.tail=FALSE)
    attributes(p) <- NULL
    
    fml <- as.character(formula(m))
    cx <- summary(m)$coeff
    stars <- rep(" ", nrow(cx))
    stars[cx[,4] <= 0.1] <- "."
    stars[cx[,4] <= 0.05] <- "*"
    stars[cx[,4] <= 0.01] <- "**"
    stars[cx[,4] <= 0.001] <- "***"
    cat("MODEL : ", sprintf("%s", paste(fml[c(2,1,3)], sep=" ", collapse=" ")), "\n", sep="")
    cat("      : ")
    cat("adj. R^2 = ", sprintf("%6.4f",summary(m)$adj.r.squared), 
        " /  F-stats: ", sprintf("%.3f",f[1]), " on ", f[2],",", f[3], " Df,  p-value: ", p, "\n", sep="")
    if( with.cx ) {
        print(cbind(format(cx[,c(1,2,4)], scientific=TRUE, justify="right", digits=5), Signif=stars), 
              quote=FALSE, print.gap=3)
        cat("\n")
    }
}