# Read the team-season table and keep only the rows with Year before 2002.
baseball <- read.csv("baseball.csv")
moneyball <- subset(baseball, Year < 2002)

# RD is the per-row difference RS - RA.
moneyball$RD <- with(moneyball, RS - RA)

# Show the structure of the filtered data, including the new RD column.
str(moneyball)
## 'data.frame':    902 obs. of  16 variables:
##  $ Team        : Factor w/ 39 levels "ANA","ARI","ATL",..: 1 2 3 4 5 7 8 9 10 11 ...
##  $ League      : Factor w/ 2 levels "AL","NL": 1 2 2 1 1 2 1 2 1 2 ...
##  $ Year        : int  2001 2001 2001 2001 2001 2001 2001 2001 2001 2001 ...
##  $ RS          : int  691 818 729 687 772 777 798 735 897 923 ...
##  $ RA          : int  730 677 643 829 745 701 795 850 821 906 ...
##  $ W           : int  75 92 88 63 82 88 83 66 91 73 ...
##  $ OBP         : num  0.327 0.341 0.324 0.319 0.334 0.336 0.334 0.324 0.35 0.354 ...
##  $ SLG         : num  0.405 0.442 0.412 0.38 0.439 0.43 0.451 0.419 0.458 0.483 ...
##  $ BA          : num  0.261 0.267 0.26 0.248 0.266 0.261 0.268 0.262 0.278 0.292 ...
##  $ Playoffs    : int  0 1 1 0 0 0 0 0 1 0 ...
##  $ RankSeason  : int  NA 5 7 NA NA NA NA NA 6 NA ...
##  $ RankPlayoffs: int  NA 1 3 NA NA NA NA NA 4 NA ...
##  $ G           : int  162 162 162 162 161 162 162 162 162 162 ...
##  $ OOBP        : num  0.331 0.311 0.314 0.337 0.329 0.321 0.334 0.341 0.341 0.35 ...
##  $ OSLG        : num  0.412 0.404 0.384 0.439 0.393 0.398 0.427 0.455 0.417 0.48 ...
##  $ RD          : int  -39 141 86 -142 27 76 3 -115 76 17 ...

Model for runs scored (RS) using OBP and SLG

# Linear regression of RS on OBP and SLG, fitted on the moneyball data frame.
# The call is left exactly as written because summary() echoes it under
# "Call:" in its printed output.
fit.runs <- lm(RS ~ OBP + SLG, data=moneyball)
# Print the residual quantiles, coefficient table, and overall fit statistics.
summary(fit.runs)
## 
## Call:
## lm(formula = RS ~ OBP + SLG, data = moneyball)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -70.838 -17.174  -1.108  16.770  90.036 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -804.63      18.92  -42.53   <2e-16 ***
## OBP          2737.77      90.68   30.19   <2e-16 ***
## SLG          1584.91      42.16   37.60   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24.79 on 899 degrees of freedom
## Multiple R-squared:  0.9296, Adjusted R-squared:  0.9294 
## F-statistic:  5934 on 2 and 899 DF,  p-value: < 2.2e-16

The list of players:

# Candidate players, one element per player in each column: name, OBP, SLG,
# and salary (salary1000 is presumably in thousands of dollars -- confirm).
players <- data.frame(
  name = c("Eric", "Jeremy", "Frank", "Greg", "Carlos"),
  OBP = c(0.338, 0.391, 0.369, 0.313, 0.361),
  SLG = c(0.540, 0.450, 0.374, 0.447, 0.500),
  salary1000 = c(1400, 1065, 295, 800, 300)
)
print(players)
##     name   OBP   SLG salary1000
## 1   Eric 0.338 0.540       1400
## 2 Jeremy 0.391 0.450       1065
## 3  Frank 0.369 0.374        295
## 4   Greg 0.313 0.447        800
## 5 Carlos 0.361 0.500        300

Given a $1.5M budget, which two players should we choose?

# Enumerate every unordered pair of distinct players whose combined salary
# fits the budget, and print the pair together with the runs predicted by
# fit.runs for the pair's average OBP and average SLG.
budget1000 <- 1500  # same units as players$salary1000
n_players <- nrow(players)

# seq_len() yields an empty sequence for a 0-row frame, whereas 1:nrow()
# would iterate over c(1, 0) and index rows that do not exist.
for (i in seq_len(n_players)) {
  # j runs over i + 1, ..., n_players (empty when i is the last row), so each
  # pair is visited exactly once and a player is never paired with itself.
  for (j in seq_len(n_players - i) + i) {
    salary <- players$salary1000[i] + players$salary1000[j]
    if (salary <= budget1000) {
      obp <- (players$OBP[i] + players$OBP[j]) * 0.5
      slg <- (players$SLG[i] + players$SLG[j]) * 0.5
      p <- predict(fit.runs, data.frame(OBP = obp, SLG = slg))
      print(paste(players$name[i], players$name[j], p))
    }
  }
}
## [1] "Jeremy Frank 888.707132850523"
## [1] "Jeremy Carlos 977.60530290546"
## [1] "Frank Greg 779.556817073763"
## [1] "Frank Carlos 887.263327652974"
## [1] "Greg Carlos 868.4549871287"

In 2012 and 2013, there were 10 teams in the MLB playoffs: the six division winners (the team with the most wins in each baseball division) and four “wild card” teams. The playoffs start with games between the four wild card teams; the two teams that win proceed in the playoffs (8 teams remaining). Then, these teams are paired off and play a series of games. The four teams that win are then paired again and play to determine who will play in the World Series.

We can assign rankings to the teams as follows:

# Rank values assigned to the ten playoff teams, in table order.
teamRank <- c(1, 2, 3, 3, 4, 4, 4, 4, 5, 5)

# 2012 playoff teams with their rank and win totals. The rank column reuses
# teamRank rather than repeating the same literal vector.
t2012 <- data.frame(
  name = c(
    "SanFran", "Detroit", "NY", "StLouis", "Baltimore",
    "Oakland", "Washington", "Cincinnati", "Texas", "Atlanta"
  ),
  rank = teamRank,
  wins = c(94, 88, 95, 88, 93, 94, 98, 97, 93, 94)
)
print(t2012)
##          name rank wins
## 1     SanFran    1   94
## 2     Detroit    2   88
## 3          NY    3   95
## 4     StLouis    3   88
## 5   Baltimore    4   93
## 6     Oakland    4   94
## 7  Washington    4   98
## 8  Cincinnati    4   97
## 9       Texas    5   93
## 10    Atlanta    5   94
# 2013 playoff teams with their rank and win totals.
t2013 <- data.frame(
  name = c(
    "Boston", "StLouis", "LA", "Detroit", "TampaBay",
    "Oakland", "Pittsburgh", "Atlanta", "Cleveland", "Cincinnati"
  ),
  # Expands to 1, 2, 3, 3, 4, 4, 4, 4, 5, 5 (kept as doubles).
  rank = rep(c(1, 2, 3, 4, 5), times = c(1, 1, 2, 4, 2)),
  wins = c(97, 97, 92, 93, 92, 96, 94, 96, 92, 90)
)
print(t2013)
##          name rank wins
## 1      Boston    1   97
## 2     StLouis    2   97
## 3          LA    3   92
## 4     Detroit    3   93
## 5    TampaBay    4   92
## 6     Oakland    4   96
## 7  Pittsburgh    4   94
## 8     Atlanta    4   96
## 9   Cleveland    5   92
## 10 Cincinnati    5   90

What is the correlation between team rank and wins in 2012?

# Correlation between the rank and wins columns of the 2012 table.
with(t2012, cor(rank, wins))
## [1] 0.3477129

What is the correlation between team rank and wins in 2013?

# Correlation between the rank and wins columns of the 2013 table.
with(t2013, cor(rank, wins))
## [1] -0.6556945