# Load the per-team, per-year statistics from the CSV in the working directory
baseball <- read.csv(file = "baseball.csv")
# Print a compact overview of every column (name, type, first few values)
str(object = baseball)
'data.frame': 1232 obs. of 15 variables:
$ Team : chr "ARI" "ATL" "BAL" "BOS" ...
$ League : chr "NL" "NL" "AL" "AL" ...
$ Year : int 2012 2012 2012 2012 2012 2012 2012 2012 2012 2012 ...
$ RS : int 734 700 712 734 613 748 669 667 758 726 ...
$ RA : int 688 600 705 806 759 676 588 845 890 670 ...
$ W : int 81 94 93 69 61 85 97 68 64 88 ...
$ OBP : num 0.328 0.32 0.311 0.315 0.302 0.318 0.315 0.324 0.33 0.335 ...
$ SLG : num 0.418 0.389 0.417 0.415 0.378 0.422 0.411 0.381 0.436 0.422 ...
$ BA : num 0.259 0.247 0.247 0.26 0.24 0.255 0.251 0.251 0.274 0.268 ...
$ Playoffs : int 0 1 1 0 0 0 1 0 0 1 ...
$ RankSeason : int NA 4 5 NA NA NA 2 NA NA 6 ...
$ RankPlayoffs: int NA 5 4 NA NA NA 4 NA NA 2 ...
$ G : int 162 162 162 162 162 162 162 162 162 162 ...
$ OOBP : num 0.317 0.306 0.315 0.331 0.335 0.319 0.305 0.336 0.357 0.314 ...
$ OSLG : num 0.415 0.378 0.403 0.428 0.424 0.405 0.39 0.43 0.47 0.402 ...
# Keep only the rows from seasons before 2002 (the "moneyball" years).
# which() drops rows whose Year is NA, and drop = FALSE always returns a
# data frame, matching what subset() does.
moneyball <- baseball[which(baseball$Year < 2002), , drop = FALSE]
str(object = moneyball)
'data.frame': 902 obs. of 15 variables:
$ Team : chr "ANA" "ARI" "ATL" "BAL" ...
$ League : chr "AL" "NL" "NL" "AL" ...
$ Year : int 2001 2001 2001 2001 2001 2001 2001 2001 2001 2001 ...
$ RS : int 691 818 729 687 772 777 798 735 897 923 ...
$ RA : int 730 677 643 829 745 701 795 850 821 906 ...
$ W : int 75 92 88 63 82 88 83 66 91 73 ...
$ OBP : num 0.327 0.341 0.324 0.319 0.334 0.336 0.334 0.324 0.35 0.354 ...
$ SLG : num 0.405 0.442 0.412 0.38 0.439 0.43 0.451 0.419 0.458 0.483 ...
$ BA : num 0.261 0.267 0.26 0.248 0.266 0.261 0.268 0.262 0.278 0.292 ...
$ Playoffs : int 0 1 1 0 0 0 0 0 1 0 ...
$ RankSeason : int NA 5 7 NA NA NA NA NA 6 NA ...
$ RankPlayoffs: int NA 1 3 NA NA NA NA NA 4 NA ...
$ G : int 162 162 162 162 161 162 162 162 162 162 ...
$ OOBP : num 0.331 0.311 0.314 0.337 0.329 0.321 0.334 0.341 0.341 0.35 ...
$ OSLG : num 0.412 0.404 0.384 0.439 0.393 0.398 0.427 0.455 0.417 0.48 ...
# Add the run difference column RD: runs scored (RS) minus runs allowed (RA)
moneyball[["RD"]] <- with(moneyball, RS - RA)
# Confirm the new column was appended
str(object = moneyball)
'data.frame': 902 obs. of 16 variables:
$ Team : chr "ANA" "ARI" "ATL" "BAL" ...
$ League : chr "AL" "NL" "NL" "AL" ...
$ Year : int 2001 2001 2001 2001 2001 2001 2001 2001 2001 2001 ...
$ RS : int 691 818 729 687 772 777 798 735 897 923 ...
$ RA : int 730 677 643 829 745 701 795 850 821 906 ...
$ W : int 75 92 88 63 82 88 83 66 91 73 ...
$ OBP : num 0.327 0.341 0.324 0.319 0.334 0.336 0.334 0.324 0.35 0.354 ...
$ SLG : num 0.405 0.442 0.412 0.38 0.439 0.43 0.451 0.419 0.458 0.483 ...
$ BA : num 0.261 0.267 0.26 0.248 0.266 0.261 0.268 0.262 0.278 0.292 ...
$ Playoffs : int 0 1 1 0 0 0 0 0 1 0 ...
$ RankSeason : int NA 5 7 NA NA NA NA NA 6 NA ...
$ RankPlayoffs: int NA 1 3 NA NA NA NA NA 4 NA ...
$ G : int 162 162 162 162 161 162 162 162 162 162 ...
$ OOBP : num 0.331 0.311 0.314 0.337 0.329 0.321 0.334 0.341 0.341 0.35 ...
$ OSLG : num 0.412 0.404 0.384 0.439 0.393 0.398 0.427 0.455 0.417 0.48 ...
$ RD : int -39 141 86 -142 27 76 3 -115 76 17 ...
# Plot wins (W) against run difference (RD) to check that the trend is linear
plot(x = moneyball$RD, y = moneyball$W)

# Fit a simple linear regression of wins (W) on run difference (RD)
WinsReg <- lm(formula = W ~ RD, data = moneyball)
# Show the coefficient table, residual quantiles and fit statistics
summary(object = WinsReg)
Call:
lm(formula = W ~ RD, data = moneyball)
Residuals:
Min 1Q Median 3Q Max
-14.2662 -2.6509 0.1234 2.9364 11.6570
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 80.881375 0.131157 616.67 <2e-16 ***
RD 0.105766 0.001297 81.55 <2e-16 ***
---
Signif. codes:
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3.939 on 900 degrees of freedom
Multiple R-squared: 0.8808, Adjusted R-squared: 0.8807
F-statistic: 6651 on 1 and 900 DF, p-value: < 2.2e-16
# Re-inspect the columns of moneyball before fitting the runs-scored model
str(object = moneyball)
'data.frame': 902 obs. of 16 variables:
$ Team : chr "ANA" "ARI" "ATL" "BAL" ...
$ League : chr "AL" "NL" "NL" "AL" ...
$ Year : int 2001 2001 2001 2001 2001 2001 2001 2001 2001 2001 ...
$ RS : int 691 818 729 687 772 777 798 735 897 923 ...
$ RA : int 730 677 643 829 745 701 795 850 821 906 ...
$ W : int 75 92 88 63 82 88 83 66 91 73 ...
$ OBP : num 0.327 0.341 0.324 0.319 0.334 0.336 0.334 0.324 0.35 0.354 ...
$ SLG : num 0.405 0.442 0.412 0.38 0.439 0.43 0.451 0.419 0.458 0.483 ...
$ BA : num 0.261 0.267 0.26 0.248 0.266 0.261 0.268 0.262 0.278 0.292 ...
$ Playoffs : int 0 1 1 0 0 0 0 0 1 0 ...
$ RankSeason : int NA 5 7 NA NA NA NA NA 6 NA ...
$ RankPlayoffs: int NA 1 3 NA NA NA NA NA 4 NA ...
$ G : int 162 162 162 162 161 162 162 162 162 162 ...
$ OOBP : num 0.331 0.311 0.314 0.337 0.329 0.321 0.334 0.341 0.341 0.35 ...
$ OSLG : num 0.412 0.404 0.384 0.439 0.393 0.398 0.427 0.455 0.417 0.48 ...
$ RD : int -39 141 86 -142 27 76 3 -115 76 17 ...
# Fit runs scored (RS) on three predictors: OBP, SLG and batting average (BA)
RunsReg <- lm(formula = RS ~ OBP + SLG + BA, data = moneyball)
summary(object = RunsReg)
Call:
lm(formula = RS ~ OBP + SLG + BA, data = moneyball)
Residuals:
Min 1Q Median 3Q Max
-70.941 -17.247 -0.621 16.754 90.998
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -788.46 19.70 -40.029 < 2e-16 ***
OBP 2917.42 110.47 26.410 < 2e-16 ***
SLG 1637.93 45.99 35.612 < 2e-16 ***
BA -368.97 130.58 -2.826 0.00482 **
---
Signif. codes:
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 24.69 on 898 degrees of freedom
Multiple R-squared: 0.9302, Adjusted R-squared: 0.93
F-statistic: 3989 on 3 and 898 DF, p-value: < 2.2e-16
# Refit the runs-scored model without batting average (BA).
# This overwrites the three-predictor RunsReg fitted earlier.
RunsReg <- lm(formula = RS ~ OBP + SLG, data = moneyball)
summary(object = RunsReg)
Call:
lm(formula = RS ~ OBP + SLG, data = moneyball)
Residuals:
Min 1Q Median 3Q Max
-70.838 -17.174 -1.108 16.770 90.036
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -804.63 18.92 -42.53 <2e-16 ***
OBP 2737.77 90.68 30.19 <2e-16 ***
SLG 1584.91 42.16 37.60 <2e-16 ***
---
Signif. codes:
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 24.79 on 899 degrees of freedom
Multiple R-squared: 0.9296, Adjusted R-squared: 0.9294
F-statistic: 5934 on 2 and 899 DF, p-value: < 2.2e-16
# Re-inspect the columns of moneyball before fitting the runs-allowed model
str(object = moneyball)
'data.frame': 902 obs. of 16 variables:
$ Team : chr "ANA" "ARI" "ATL" "BAL" ...
$ League : chr "AL" "NL" "NL" "AL" ...
$ Year : int 2001 2001 2001 2001 2001 2001 2001 2001 2001 2001 ...
$ RS : int 691 818 729 687 772 777 798 735 897 923 ...
$ RA : int 730 677 643 829 745 701 795 850 821 906 ...
$ W : int 75 92 88 63 82 88 83 66 91 73 ...
$ OBP : num 0.327 0.341 0.324 0.319 0.334 0.336 0.334 0.324 0.35 0.354 ...
$ SLG : num 0.405 0.442 0.412 0.38 0.439 0.43 0.451 0.419 0.458 0.483 ...
$ BA : num 0.261 0.267 0.26 0.248 0.266 0.261 0.268 0.262 0.278 0.292 ...
$ Playoffs : int 0 1 1 0 0 0 0 0 1 0 ...
$ RankSeason : int NA 5 7 NA NA NA NA NA 6 NA ...
$ RankPlayoffs: int NA 1 3 NA NA NA NA NA 4 NA ...
$ G : int 162 162 162 162 161 162 162 162 162 162 ...
$ OOBP : num 0.331 0.311 0.314 0.337 0.329 0.321 0.334 0.341 0.341 0.35 ...
$ OSLG : num 0.412 0.404 0.384 0.439 0.393 0.398 0.427 0.455 0.417 0.48 ...
$ RD : int -39 141 86 -142 27 76 3 -115 76 17 ...
# Fit runs allowed (RA) on the opponent statistics OOBP and OSLG.
# lm() drops rows where either predictor is NA.
RunsAllowedReg <- lm(formula = RA ~ OOBP + OSLG, data = moneyball)
summary(object = RunsAllowedReg)
Call:
lm(formula = RA ~ OOBP + OSLG, data = moneyball)
Residuals:
Min 1Q Median 3Q Max
-82.397 -15.178 -0.129 17.679 60.955
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -837.38 60.26 -13.897 < 2e-16 ***
OOBP 2913.60 291.97 9.979 4.46e-16 ***
OSLG 1514.29 175.43 8.632 2.55e-13 ***
---
Signif. codes:
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 25.67 on 87 degrees of freedom
(812 observations deleted due to missingness)
Multiple R-squared: 0.9073, Adjusted R-squared: 0.9052
F-statistic: 425.8 on 2 and 87 DF, p-value: < 2.2e-16
## Activity 8 Exercise 1
# If OBP = 0.361 and SLG = 0.409, how many runs do we expect the team to score?
# Fit RS on OBP and SLG using the pre-2002 `moneyball` subset, the same data
# the other models in this analysis are fit on. Fitting on the full `baseball`
# table would mix in the 2002+ seasons and shift the prediction.
model_RS <- lm(RS ~ OBP + SLG, data = moneyball)
# Predict RS for one hypothetical team with OBP = 0.361 and SLG = 0.409
exercise1 <- data.frame(OBP = 0.361, SLG = 0.409)
predicted_runs_ex1 <- predict(model_RS, newdata = exercise1)
# Report the prediction rounded to whole runs
print(paste("Predicted Runs Scored (Exercise 1):", round(predicted_runs_ex1)))
[1] "Predicted Runs Scored (Exercise 1): 832"
# Exercise 2: If OOBP = 0.267 and OSLG = 0.392, how many runs do we expect the
# team to allow?
# Fit RA on the opponent statistics OOBP and OSLG using the pre-2002
# `moneyball` subset, the same data the other models in this analysis are fit
# on. Fitting on the full `baseball` table would mix in the 2002+ seasons.
# lm() drops rows where OOBP or OSLG is NA.
model_RA <- lm(RA ~ OOBP + OSLG, data = moneyball)
# Predict RA for one hypothetical team with OOBP = 0.267 and OSLG = 0.392
exercise2 <- data.frame(OOBP = 0.267, OSLG = 0.392)
predicted_runs_allowed_ex2 <- predict(model_RA, newdata = exercise2)
# Report the prediction rounded to whole runs
print(paste("Predicted Runs Allowed (Exercise 2):", round(predicted_runs_allowed_ex2)))
[1] "Predicted Runs Allowed (Exercise 2): 534"
LS0tCnRpdGxlOiAiQWN0aXZpdHkgOCBQcmVkaWN0aW5nIHRoZSBOdW1iZXIgb2YgUnVucyIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKYGBge3J9CiMgUmVhZCBpbiBkYXRhCmJhc2ViYWxsID0gcmVhZC5jc3YoImJhc2ViYWxsLmNzdiIpCnN0cihiYXNlYmFsbCkKYGBgCmBgYHtyfQojIFN1YnNldCB0byBvbmx5IGluY2x1ZGUgbW9uZXliYWxsIHllYXJzCm1vbmV5YmFsbCA9IHN1YnNldChiYXNlYmFsbCwgWWVhciA8IDIwMDIpCnN0cihtb25leWJhbGwpCmBgYApgYGB7cn0KIyBDb21wdXRlIFJ1biBEaWZmZXJlbmNlCm1vbmV5YmFsbCRSRCA9IG1vbmV5YmFsbCRSUyAtIG1vbmV5YmFsbCRSQQpzdHIobW9uZXliYWxsKQpgYGAKYGBge3J9CiMgU2NhdHRlcnBsb3QgdG8gY2hlY2sgZm9yIGxpbmVhciByZWxhdGlvbnNoaXAKcGxvdChtb25leWJhbGwkUkQsIG1vbmV5YmFsbCRXKQpgYGAKYGBge3J9CiMgUmVncmVzc2lvbiBtb2RlbCB0byBwcmVkaWN0IHdpbnMKV2luc1JlZyA9IGxtKFcgfiBSRCwgZGF0YT1tb25leWJhbGwpCnN1bW1hcnkoV2luc1JlZykKYGBgCmBgYHtyfQpzdHIobW9uZXliYWxsKQpgYGAKYGBge3J9CiMgUmVncmVzc2lvbiBtb2RlbCB0byBwcmVkaWN0IHJ1bnMgc2NvcmVkClJ1bnNSZWcgPSBsbShSUyB+IE9CUCArIFNMRyArIEJBLCBkYXRhPW1vbmV5YmFsbCkKc3VtbWFyeShSdW5zUmVnKQpgYGAKYGBge3J9CiMgUmVncmVzc2lvbiBtb2RlbCB0byBwcmVkaWN0IHJ1bnMgc2NvcmVkIGFnYWluIGJ1dCByZW1vdmluZyB0aGUgYmF0dGluZyBhdmVyYWdlClJ1bnNSZWcgPSBsbShSUyB+IE9CUCArIFNMRywgZGF0YT1tb25leWJhbGwpCnN1bW1hcnkoUnVuc1JlZykKYGBgCmBgYHtyfQpzdHIobW9uZXliYWxsKQpgYGAKYGBge3J9CiMgUmVncmVzc2lvbiBtb2RlbCB0byBwcmVkaWN0IHJ1bnMgYWxsb3dlZApSdW5zQWxsb3dlZFJlZyA9IGxtKFJBIH4gT09CUCArIE9TTEcsIGRhdGE9bW9uZXliYWxsKQpzdW1tYXJ5KFJ1bnNBbGxvd2VkUmVnKQpgYGAKYGBge3J9CiMjQWN0aXZpdHkgOCBFeGVyY2lzZSAxCiMgSWYgT0JQID0gMC4zNjEgYW5kIFNMRyA9IDAuNDA5LCBob3cgbWFueSBydW5zIGRvIHdlIGV4cGVjdCB0aGUgdGVhbSB0byBzY29yZT8KICAKIyBCdWlsZGluZyB0aGUgcmVncmVzc2lvbiBtb2RlbCB0byBwcmVkaWN0IFJTIGZyb20gT0JQIGFuZCBTTEcKbW9kZWxfUlMgPC0gbG0oUlMgfiBPQlAgKyBTTEcsIGRhdGEgPSBiYXNlYmFsbCkKCiMgUHJlZGljdCBSUyB1c2luZyBPQlAgPSAwLjM2MSBhbmQgU0xHID0gMC40MDkKZXhlcmNpc2UxIDwtIGRhdGEuZnJhbWUoT0JQID0gMC4zNjEsIFNMRyA9IDAuNDA5KQpwcmVkaWN0ZWRfcnVuc19leDEgPC0gcHJlZGljdChtb2RlbF9SUywgbmV3ZGF0YSA9IGV4ZXJjaXNlMSkKcHJpbnQocGFzdGUoIlByZWRpY3RlZCBSdW5zIFNjb3JlZCAoRXhlcmNpc2UgMSk6Iiwgcm91bmQocHJlZGljdGVkX3J1bnNfZXgxKSkpCgpgYGAKYGBge3J9CiNFeGVyY2lzZSAyIElmIE9P
QlAgPSAwLjI2NyBhbmQgT1NMRyA9IDAuMzkyLCBob3cgbWFueSBydW5zIGRvIHdlIGV4cGVjdCB0aGUgdGVhbSB0byBhbGxvdz8KCiMgQnVpbGQgYSBtb2RlbCB0byBwcmVkaWN0IFJ1bnMgQWxsb3dlZCAoUkEpIGJhc2VkIG9uIE9wcG9uZW50IE9CUCBhbmQgU0xHCm1vZGVsX1JBIDwtIGxtKFJBIH4gT09CUCArIE9TTEcsIGRhdGEgPSBiYXNlYmFsbCkKCiMgUHJlZGljdCBSQSB1c2luZyBPT0JQID0gMC4yNjcgYW5kIE9TTEcgPSAwLjM5MgpleGVyY2lzZTIgPC0gZGF0YS5mcmFtZShPT0JQID0gMC4yNjcsIE9TTEcgPSAwLjM5MikKcHJlZGljdGVkX3J1bnNfYWxsb3dlZF9leDIgPC0gcHJlZGljdChtb2RlbF9SQSwgbmV3ZGF0YSA9IGV4ZXJjaXNlMikKcHJpbnQocGFzdGUoIlByZWRpY3RlZCBSdW5zIEFsbG93ZWQgKEV4ZXJjaXNlIDIpOiIsIHJvdW5kKHByZWRpY3RlZF9ydW5zX2FsbG93ZWRfZXgyKSkpCmBgYAoK