# Moneyball
# Load the team-season baseball data and inspect its structure.
# NOTE(review): this setwd() is hard-coded to one machine. It is kept because
# code further down the script may rely on the working directory; prefer
# launching R from the data directory (or an RStudio project) and removing it.
setwd("C:/Users/jzchen/Documents/Courses/Analytics Edge/Unit 2")
# stringsAsFactors is spelled out because its default changed from TRUE to
# FALSE in R 4.0.0. The str() output recorded below shows Team and League as
# factors, so TRUE is what reproduces that output on any R version.
baseball <- read.csv("baseball.csv", stringsAsFactors = TRUE)
str(baseball)
## 'data.frame': 1232 obs. of 15 variables:
## $ Team : Factor w/ 39 levels "ANA","ARI","ATL",..: 2 3 4 5 7 8 9 10 11 12 ...
## $ League : Factor w/ 2 levels "AL","NL": 2 2 1 1 2 1 2 1 2 1 ...
## $ Year : int 2012 2012 2012 2012 2012 2012 2012 2012 2012 2012 ...
## $ RS : int 734 700 712 734 613 748 669 667 758 726 ...
## $ RA : int 688 600 705 806 759 676 588 845 890 670 ...
## $ W : int 81 94 93 69 61 85 97 68 64 88 ...
## $ OBP : num 0.328 0.32 0.311 0.315 0.302 0.318 0.315 0.324 0.33 0.335 ...
## $ SLG : num 0.418 0.389 0.417 0.415 0.378 0.422 0.411 0.381 0.436 0.422 ...
## $ BA : num 0.259 0.247 0.247 0.26 0.24 0.255 0.251 0.251 0.274 0.268 ...
## $ Playoffs : int 0 1 1 0 0 0 1 0 0 1 ...
## $ RankSeason : int NA 4 5 NA NA NA 2 NA NA 6 ...
## $ RankPlayoffs: int NA 5 4 NA NA NA 4 NA NA 2 ...
## $ G : int 162 162 162 162 162 162 162 162 162 162 ...
## $ OOBP : num 0.317 0.306 0.315 0.331 0.335 0.319 0.305 0.336 0.357 0.314 ...
## $ OSLG : num 0.415 0.378 0.403 0.428 0.424 0.405 0.39 0.43 0.47 0.402 ...
# Keep only the seasons before 2002. which() drops rows whose comparison is
# NA, the same way subset() does.
moneyball <- baseball[which(baseball$Year < 2002), , drop = FALSE]
# Add the run differential RD: runs scored (RS) minus runs allowed (RA).
moneyball[["RD"]] <- moneyball[["RS"]] - moneyball[["RA"]]
# Scatter plot of wins (W) against run differential to eyeball the relationship.
plot(x = moneyball$RD, y = moneyball$W)

# Simple linear regression of wins (W) on run differential (RD).
WinsReg <- lm(
  formula = W ~ RD,
  data = moneyball
)
summary(object = WinsReg)
##
## Call:
## lm(formula = W ~ RD, data = moneyball)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.2662 -2.6509 0.1234 2.9364 11.6570
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 80.881375 0.131157 616.67 <2e-16 ***
## RD 0.105766 0.001297 81.55 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.939 on 900 degrees of freedom
## Multiple R-squared: 0.8808, Adjusted R-squared: 0.8807
## F-statistic: 6651 on 1 and 900 DF, p-value: < 2.2e-16
# Predicting runs scored (RS) from the three hitting statistics:
# on-base percentage (OBP), slugging percentage (SLG) and batting average (BA).
RunsReg <- lm(
  formula = RS ~ OBP + SLG + BA,
  data = moneyball
)
summary(object = RunsReg)
##
## Call:
## lm(formula = RS ~ OBP + SLG + BA, data = moneyball)
##
## Residuals:
## Min 1Q Median 3Q Max
## -70.941 -17.247 -0.621 16.754 90.998
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -788.46 19.70 -40.029 < 2e-16 ***
## OBP 2917.42 110.47 26.410 < 2e-16 ***
## SLG 1637.93 45.99 35.612 < 2e-16 ***
## BA -368.97 130.58 -2.826 0.00482 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24.69 on 898 degrees of freedom
## Multiple R-squared: 0.9302, Adjusted R-squared: 0.93
## F-statistic: 3989 on 3 and 898 DF, p-value: < 2.2e-16
# We can see that the coefficient for batting average is negative.
# This implies that, all else being equal, a team with a lower batting average
# will score more runs, which is counterintuitive.
# What is going on here is a case of multicollinearity:
# these three hitting statistics are highly correlated with one another,
# so it is hard to interpret the individual coefficients of our model.
# Let's try removing batting average, the least significant variable
# in the model, to see what happens.
# Refit the runs model without BA; this overwrites the previous RunsReg.
RunsReg <- lm(
  formula = RS ~ OBP + SLG,
  data = moneyball
)
summary(object = RunsReg)
##
## Call:
## lm(formula = RS ~ OBP + SLG, data = moneyball)
##
## Residuals:
## Min 1Q Median 3Q Max
## -70.838 -17.174 -1.108 16.770 90.036
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -804.63 18.92 -42.53 <2e-16 ***
## OBP 2737.77 90.68 30.19 <2e-16 ***
## SLG 1584.91 42.16 37.60 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24.79 on 899 degrees of freedom
## Multiple R-squared: 0.9296, Adjusted R-squared: 0.9294
## F-statistic: 5934 on 2 and 899 DF, p-value: < 2.2e-16