# Load the team-season records from baseball.csv (looked up relative to the
# working directory) and show the structure of the resulting data frame.
baseball <- read.csv(file = "baseball.csv")
str(baseball)
'data.frame':   1232 obs. of  15 variables:
 $ Team        : chr  "ARI" "ATL" "BAL" "BOS" ...
 $ League      : chr  "NL" "NL" "AL" "AL" ...
 $ Year        : int  2012 2012 2012 2012 2012 2012 2012 2012 2012 2012 ...
 $ RS          : int  734 700 712 734 613 748 669 667 758 726 ...
 $ RA          : int  688 600 705 806 759 676 588 845 890 670 ...
 $ W           : int  81 94 93 69 61 85 97 68 64 88 ...
 $ OBP         : num  0.328 0.32 0.311 0.315 0.302 0.318 0.315 0.324 0.33 0.335 ...
 $ SLG         : num  0.418 0.389 0.417 0.415 0.378 0.422 0.411 0.381 0.436 0.422 ...
 $ BA          : num  0.259 0.247 0.247 0.26 0.24 0.255 0.251 0.251 0.274 0.268 ...
 $ Playoffs    : int  0 1 1 0 0 0 1 0 0 1 ...
 $ RankSeason  : int  NA 4 5 NA NA NA 2 NA NA 6 ...
 $ RankPlayoffs: int  NA 5 4 NA NA NA 4 NA NA 2 ...
 $ G           : int  162 162 162 162 162 162 162 162 162 162 ...
 $ OOBP        : num  0.317 0.306 0.315 0.331 0.335 0.319 0.305 0.336 0.357 0.314 ...
 $ OSLG        : num  0.415 0.378 0.403 0.428 0.424 0.405 0.39 0.43 0.47 0.402 ...
# Keep only the seasons before 2002 (the "moneyball" years).
# which() discards rows where the comparison is NA, matching subset().
moneyball <- baseball[which(baseball$Year < 2002), , drop = FALSE]
str(moneyball)
'data.frame':   902 obs. of  15 variables:
 $ Team        : chr  "ANA" "ARI" "ATL" "BAL" ...
 $ League      : chr  "AL" "NL" "NL" "AL" ...
 $ Year        : int  2001 2001 2001 2001 2001 2001 2001 2001 2001 2001 ...
 $ RS          : int  691 818 729 687 772 777 798 735 897 923 ...
 $ RA          : int  730 677 643 829 745 701 795 850 821 906 ...
 $ W           : int  75 92 88 63 82 88 83 66 91 73 ...
 $ OBP         : num  0.327 0.341 0.324 0.319 0.334 0.336 0.334 0.324 0.35 0.354 ...
 $ SLG         : num  0.405 0.442 0.412 0.38 0.439 0.43 0.451 0.419 0.458 0.483 ...
 $ BA          : num  0.261 0.267 0.26 0.248 0.266 0.261 0.268 0.262 0.278 0.292 ...
 $ Playoffs    : int  0 1 1 0 0 0 0 0 1 0 ...
 $ RankSeason  : int  NA 5 7 NA NA NA NA NA 6 NA ...
 $ RankPlayoffs: int  NA 1 3 NA NA NA NA NA 4 NA ...
 $ G           : int  162 162 162 162 161 162 162 162 162 162 ...
 $ OOBP        : num  0.331 0.311 0.314 0.337 0.329 0.321 0.334 0.341 0.341 0.35 ...
 $ OSLG        : num  0.412 0.404 0.384 0.439 0.393 0.398 0.427 0.455 0.417 0.48 ...
# Add the run difference column: runs scored (RS) minus runs allowed (RA).
moneyball[["RD"]] <- with(moneyball, RS - RA)
str(moneyball)
'data.frame':   902 obs. of  16 variables:
 $ Team        : chr  "ANA" "ARI" "ATL" "BAL" ...
 $ League      : chr  "AL" "NL" "NL" "AL" ...
 $ Year        : int  2001 2001 2001 2001 2001 2001 2001 2001 2001 2001 ...
 $ RS          : int  691 818 729 687 772 777 798 735 897 923 ...
 $ RA          : int  730 677 643 829 745 701 795 850 821 906 ...
 $ W           : int  75 92 88 63 82 88 83 66 91 73 ...
 $ OBP         : num  0.327 0.341 0.324 0.319 0.334 0.336 0.334 0.324 0.35 0.354 ...
 $ SLG         : num  0.405 0.442 0.412 0.38 0.439 0.43 0.451 0.419 0.458 0.483 ...
 $ BA          : num  0.261 0.267 0.26 0.248 0.266 0.261 0.268 0.262 0.278 0.292 ...
 $ Playoffs    : int  0 1 1 0 0 0 0 0 1 0 ...
 $ RankSeason  : int  NA 5 7 NA NA NA NA NA 6 NA ...
 $ RankPlayoffs: int  NA 1 3 NA NA NA NA NA 4 NA ...
 $ G           : int  162 162 162 162 161 162 162 162 162 162 ...
 $ OOBP        : num  0.331 0.311 0.314 0.337 0.329 0.321 0.334 0.341 0.341 0.35 ...
 $ OSLG        : num  0.412 0.404 0.384 0.439 0.393 0.398 0.427 0.455 0.417 0.48 ...
 $ RD          : int  -39 141 86 -142 27 76 3 -115 76 17 ...

Exercise 1: If a baseball team’s OBP is 0.361 and SLG is 0.409, how many runs do we expect the team to score?

Using the linear regression model constructed during the lecture (the one that uses OBP and SLG as independent variables), find the number of runs we expect the team to score:

# Linear regression of runs scored (RS) on on-base percentage (OBP) and
# slugging percentage (SLG), fitted on the moneyball data frame.
RunsReg <- lm(formula = RS ~ OBP + SLG, data = moneyball)
print(summary(RunsReg))

Call:
lm(formula = RS ~ OBP + SLG, data = moneyball)

Residuals:
    Min      1Q  Median      3Q     Max 
-70.838 -17.174  -1.108  16.770  90.036 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  -804.63      18.92  -42.53   <2e-16 ***
OBP          2737.77      90.68   30.19   <2e-16 ***
SLG          1584.91      42.16   37.60   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 24.79 on 899 degrees of freedom
Multiple R-squared:  0.9296,    Adjusted R-squared:  0.9294 
F-statistic:  5934 on 2 and 899 DF,  p-value: < 2.2e-16
# Plug OBP = 0.361 and SLG = 0.409 into the fitted equation, using the
# intercept and slopes as rounded in the RunsReg coefficient table.
runs_scored <- -804.63 + 2737.77 * 0.361 + 1584.91 * 0.409
print(runs_scored)
[1] 831.9332

# We expect the team to score between 831 and 832 runs.

Exercise 2

If a baseball team’s opponents’ OBP (OOBP) is 0.267 and opponents’ SLG (OSLG) is 0.392, how many runs do we expect the team to allow?

Using the linear regression model discussed during the lecture (the one on the last slide of the previous video), find the number of runs we expect the team to allow.

# Linear regression of runs allowed (RA) on opponents' on-base percentage
# (OOBP) and opponents' slugging percentage (OSLG), fitted on moneyball.
RunsAllowedReg <- lm(formula = RA ~ OOBP + OSLG, data = moneyball)
print(summary(RunsAllowedReg))

Call:
lm(formula = RA ~ OOBP + OSLG, data = moneyball)

Residuals:
    Min      1Q  Median      3Q     Max 
-82.397 -15.178  -0.129  17.679  60.955 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  -837.38      60.26 -13.897  < 2e-16 ***
OOBP         2913.60     291.97   9.979 4.46e-16 ***
OSLG         1514.29     175.43   8.632 2.55e-13 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 25.67 on 87 degrees of freedom
  (812 observations deleted due to missingness)
Multiple R-squared:  0.9073,    Adjusted R-squared:  0.9052 
F-statistic: 425.8 on 2 and 87 DF,  p-value: < 2.2e-16
# Plug OOBP = 0.267 and OSLG = 0.392 into the fitted equation, using the
# intercept and slopes as rounded in the RunsAllowedReg coefficient table.
runs_allowed <- -837.38 + 2913.60 * 0.267 + 1514.29 * 0.392
print(runs_allowed)
[1] 534.1529

# We expect the team to allow only between 534 and 535 runs.

LS0tCnRpdGxlOiAiSW4gQ2xhc3MgQWN0aXZpdHkgOCIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKYGBge3J9CiMgUmVhZCBpbiBkYXRhCmJhc2ViYWxsID0gcmVhZC5jc3YoImJhc2ViYWxsLmNzdiIpCnN0cihiYXNlYmFsbCkKYGBgCgpgYGB7cn0KIyBTdWJzZXQgdG8gb25seSBpbmNsdWRlIG1vbmV5YmFsbCB5ZWFycwptb25leWJhbGwgPSBzdWJzZXQoYmFzZWJhbGwsIFllYXIgPCAyMDAyKQpzdHIobW9uZXliYWxsKQpgYGAKCmBgYHtyfQojIENvbXB1dGUgUnVuIERpZmZlcmVuY2UKbW9uZXliYWxsJFJEID0gbW9uZXliYWxsJFJTIC0gbW9uZXliYWxsJFJBCnN0cihtb25leWJhbGwpCmBgYAoKRXhlcmNpc2UgMQpJZiBhIGJhc2ViYWxsIHRlYW3igJlzIE9CUCBpcyAwLjM2MSBhbmQgIFNMRyBpcyAwLjQwOSwgaG93IG1hbnkgcnVucyBkbyB3ZSBleHBlY3QgdGhlIHRlYW0gdG8gc2NvcmU/CgpVc2luZyB0aGUgbGluZWFyIHJlZ3Jlc3Npb24gbW9kZWwgY29uc3RydWN0ZWQgZHVyaW5nIHRoZSBsZWN0dXJlICh0aGUgb25lIHRoYXQgdXNlcyBPQlAgYW5kICBTTEcgYXMgaW5kZXBlbmRlbnQgdmFyaWFibGVzKSwgZmluZCB0aGUgbnVtYmVyIG9mIHJ1bnMgd2UgZXhwZWN0IHRoZSB0ZWFtIHRvIHNjb3JlOgoKCmBgYHtyfQpSdW5zUmVnID0gbG0oUlMgfiBPQlAgKyBTTEcsIGRhdGE9bW9uZXliYWxsKQpzdW1tYXJ5KFJ1bnNSZWcpCmBgYAoKCmBgYHtyfQpydW5zX3Njb3JlZD0tODA0LjYzKzI3MzcuNzcqKDAuMzYxKSsxNTg0LjkxKigwLjQwOSkKcnVuc19zY29yZWQKYGBgCgojV2UgZXhwZWN0ZWQgdGhlIHRlYW0gdG8gc2NvcmUgYmV0d2VlbiA4MzEgYW5kIDgzMiBydW5zCgpFeGVyY2lzZSAyCgoKSWYgYSBiYXNlYmFsbCB0ZWFt4oCZcyBvcHBvbmVudHMgT0JQIChPT0JQKSBpcyAwLjI2NyBhbmQgb3Bwb25lbnRzIFNMRyAoT1NMRykgaXMgMC4zOTIsIGhvdyBtYW55IHJ1bnMgZG8gd2UgZXhwZWN0IHRoZSB0ZWFtIHRvIGFsbG93PwoKVXNpbmcgdGhlIGxpbmVhciByZWdyZXNzaW9uIG1vZGVsIGRpc2N1c3NlZCBkdXJpbmcgdGhlIGxlY3R1cmUgKHRoZSBvbmUgb24gdGhlIGxhc3Qgc2xpZGUgb2YgdGhlIHByZXZpb3VzIHZpZGVvKSwgZmluZCB0aGUgbnVtYmVyIG9mIHJ1bnMgd2UgZXhwZWN0IHRoZSB0ZWFtIHRvIGFsbG93LiAKCmBgYHtyfQojIFJlZ3Jlc3Npb24gbW9kZWwgdG8gcHJlZGljdCBydW5zIGFsbG93ZWQKUnVuc0FsbG93ZWRSZWcgPSBsbShSQSB+IE9PQlAgKyBPU0xHLCBkYXRhPW1vbmV5YmFsbCkKc3VtbWFyeShSdW5zQWxsb3dlZFJlZykKYGBgCgpgYGB7cn0KcnVuc19hbGxvd2VkPS04MzcuMzgrMjkxMy42MCooMC4yNjcpKzE1MTQuMjkqKDAuMzkyKQpydW5zX2FsbG93ZWQKYGBgCgoKICNXZSBleHBlY3RlZCB0aGF0IHRlYW0gb25seSBhbGxvdyBiZXd0ZWVuIDUzNCBhbmQgNTM1IHJ1bnMuCiAKIAogCgoK