# Load the data set from baseball.csv (path is relative to the working
# directory) and print a compact overview of its columns.
baseball <- read.csv(file = "baseball.csv")
str(baseball)
'data.frame': 1232 obs. of 15 variables:
$ Team : chr "ARI" "ATL" "BAL" "BOS" ...
$ League : chr "NL" "NL" "AL" "AL" ...
$ Year : int 2012 2012 2012 2012 2012 2012 2012 2012 2012 2012 ...
$ RS : int 734 700 712 734 613 748 669 667 758 726 ...
$ RA : int 688 600 705 806 759 676 588 845 890 670 ...
$ W : int 81 94 93 69 61 85 97 68 64 88 ...
$ OBP : num 0.328 0.32 0.311 0.315 0.302 0.318 0.315 0.324 0.33 0.335 ...
$ SLG : num 0.418 0.389 0.417 0.415 0.378 0.422 0.411 0.381 0.436 0.422 ...
$ BA : num 0.259 0.247 0.247 0.26 0.24 0.255 0.251 0.251 0.274 0.268 ...
$ Playoffs : int 0 1 1 0 0 0 1 0 0 1 ...
$ RankSeason : int NA 4 5 NA NA NA 2 NA NA 6 ...
$ RankPlayoffs: int NA 5 4 NA NA NA 4 NA NA 2 ...
$ G : int 162 162 162 162 162 162 162 162 162 162 ...
$ OOBP : num 0.317 0.306 0.315 0.331 0.335 0.319 0.305 0.336 0.357 0.314 ...
$ OSLG : num 0.415 0.378 0.403 0.428 0.424 0.405 0.39 0.43 0.47 0.402 ...
# Keep only the rows whose Year is earlier than 2002. which() discards any
# rows where the comparison is NA, matching how subset() treats them.
moneyball <- baseball[which(baseball$Year < 2002), , drop = FALSE]
str(moneyball)
'data.frame': 902 obs. of 15 variables:
$ Team : chr "ANA" "ARI" "ATL" "BAL" ...
$ League : chr "AL" "NL" "NL" "AL" ...
$ Year : int 2001 2001 2001 2001 2001 2001 2001 2001 2001 2001 ...
$ RS : int 691 818 729 687 772 777 798 735 897 923 ...
$ RA : int 730 677 643 829 745 701 795 850 821 906 ...
$ W : int 75 92 88 63 82 88 83 66 91 73 ...
$ OBP : num 0.327 0.341 0.324 0.319 0.334 0.336 0.334 0.324 0.35 0.354 ...
$ SLG : num 0.405 0.442 0.412 0.38 0.439 0.43 0.451 0.419 0.458 0.483 ...
$ BA : num 0.261 0.267 0.26 0.248 0.266 0.261 0.268 0.262 0.278 0.292 ...
$ Playoffs : int 0 1 1 0 0 0 0 0 1 0 ...
$ RankSeason : int NA 5 7 NA NA NA NA NA 6 NA ...
$ RankPlayoffs: int NA 1 3 NA NA NA NA NA 4 NA ...
$ G : int 162 162 162 162 161 162 162 162 162 162 ...
$ OOBP : num 0.331 0.311 0.314 0.337 0.329 0.321 0.334 0.341 0.341 0.35 ...
$ OSLG : num 0.412 0.404 0.384 0.439 0.393 0.398 0.427 0.455 0.417 0.48 ...
# Add the derived column RD = RS - RA (runs scored minus runs allowed),
# then re-inspect the data frame to confirm the new column is present.
moneyball$RD <- with(moneyball, RS - RA)
str(moneyball)
'data.frame': 902 obs. of 16 variables:
$ Team : chr "ANA" "ARI" "ATL" "BAL" ...
$ League : chr "AL" "NL" "NL" "AL" ...
$ Year : int 2001 2001 2001 2001 2001 2001 2001 2001 2001 2001 ...
$ RS : int 691 818 729 687 772 777 798 735 897 923 ...
$ RA : int 730 677 643 829 745 701 795 850 821 906 ...
$ W : int 75 92 88 63 82 88 83 66 91 73 ...
$ OBP : num 0.327 0.341 0.324 0.319 0.334 0.336 0.334 0.324 0.35 0.354 ...
$ SLG : num 0.405 0.442 0.412 0.38 0.439 0.43 0.451 0.419 0.458 0.483 ...
$ BA : num 0.261 0.267 0.26 0.248 0.266 0.261 0.268 0.262 0.278 0.292 ...
$ Playoffs : int 0 1 1 0 0 0 0 0 1 0 ...
$ RankSeason : int NA 5 7 NA NA NA NA NA 6 NA ...
$ RankPlayoffs: int NA 1 3 NA NA NA NA NA 4 NA ...
$ G : int 162 162 162 162 161 162 162 162 162 162 ...
$ OOBP : num 0.331 0.311 0.314 0.337 0.329 0.321 0.334 0.341 0.341 0.35 ...
$ OSLG : num 0.412 0.404 0.384 0.439 0.393 0.398 0.427 0.455 0.417 0.48 ...
$ RD : int -39 141 86 -142 27 76 3 -115 76 17 ...
# Simple linear regression of W on RD, fitted on the pre-2002 data;
# summary() prints the coefficient table and fit statistics.
WinsReg <- lm(formula = W ~ RD, data = moneyball)
summary(WinsReg)
Call:
lm(formula = W ~ RD, data = moneyball)
Residuals:
Min 1Q Median 3Q Max
-14.2662 -2.6509 0.1234 2.9364 11.6570
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 80.881375 0.131157 616.67 <2e-16 ***
RD 0.105766 0.001297 81.55 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3.939 on 900 degrees of freedom
Multiple R-squared: 0.8808, Adjusted R-squared: 0.8807
F-statistic: 6651 on 1 and 900 DF, p-value: < 2.2e-16
Exercise 1: If a baseball team’s OBP is 0.361, SLG is 0.409, and BA is 0.257, how many runs do we expect the team to score? Use the linear regression model constructed during the lecture (the one that uses OBP, SLG, and BA as independent variables) to find the number of runs we expect the team to score:
# Multiple linear regression of RS on OBP, SLG and BA, fitted on the
# pre-2002 data; summary() prints the coefficient table and fit statistics.
RunsReg <- lm(formula = RS ~ OBP + SLG + BA, data = moneyball)
summary(RunsReg)
Call:
lm(formula = RS ~ OBP + SLG + BA, data = moneyball)
Residuals:
Min 1Q Median 3Q Max
-70.941 -17.247 -0.621 16.754 90.998
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -788.46 19.70 -40.029 < 2e-16 ***
OBP 2917.42 110.47 26.410 < 2e-16 ***
SLG 1637.93 45.99 35.612 < 2e-16 ***
BA -368.97 130.58 -2.826 0.00482 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 24.69 on 898 degrees of freedom
Multiple R-squared: 0.9302, Adjusted R-squared: 0.93
F-statistic: 3989 on 3 and 898 DF, p-value: < 2.2e-16
# Expected runs scored for OBP = 0.361, SLG = 0.409 and BA = 0.257, using the
# rounded coefficients as printed by summary(RunsReg):
#   intercept + b_OBP * OBP + b_SLG * SLG + b_BA * BA
RunsScored <- -788.46 + 2917.42 * 0.361 + 1637.93 * 0.409 - 368.97 * 0.257
RunsScored
[1] 839.8167
That comes to approximately 840 runs scored.
Exercise 2: If a baseball team’s opponents’ OBP (OOBP) is 0.267 and opponents’ SLG (OSLG) is 0.392, how many runs do we expect the team to allow? Use the linear regression model discussed during the lecture (the one on the last slide of the previous video) to find the number of runs we expect the team to allow.
# Multiple linear regression of RA on OOBP and OSLG, fitted on the pre-2002
# data. Rows with missing values in the model variables are dropped by lm(),
# so the fit uses far fewer observations than the full data frame.
RunsAllowedReg <- lm(formula = RA ~ OOBP + OSLG, data = moneyball)
summary(RunsAllowedReg)
Call:
lm(formula = RA ~ OOBP + OSLG, data = moneyball)
Residuals:
Min 1Q Median 3Q Max
-82.397 -15.178 -0.129 17.679 60.955
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -837.38 60.26 -13.897 < 2e-16 ***
OOBP 2913.60 291.97 9.979 4.46e-16 ***
OSLG 1514.29 175.43 8.632 2.55e-13 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 25.67 on 87 degrees of freedom
(812 observations deleted due to missingness)
Multiple R-squared: 0.9073, Adjusted R-squared: 0.9052
F-statistic: 425.8 on 2 and 87 DF, p-value: < 2.2e-16
# Expected runs allowed for OOBP = 0.267 and OSLG = 0.392, using the rounded
# coefficients as printed by summary(RunsAllowedReg):
#   intercept + b_OOBP * OOBP + b_OSLG * OSLG
RunsAllowed <- -837.38 + 2913.60 * 0.267 + 1514.29 * 0.392
RunsAllowed
[1] 534.1529
We expect the team to allow around 534 runs.