# Moneyball
# Load the baseball data set and show its structure.
#
# The course directory below is machine-specific. Switch to it only when it
# actually exists; otherwise keep the current working directory so the script
# still runs when launched from the folder that holds baseball.csv.
unit2_dir <- "C:/Users/jzchen/Documents/Courses/Analytics Edge/Unit 2"
if (dir.exists(unit2_dir)) {
  setwd(unit2_dir)
} else {
  message(
    "Directory '", unit2_dir, "' not found; reading baseball.csv from ",
    getwd()
  )
}
# stringsAsFactors = TRUE keeps Team and League as factors, matching the
# str() output recorded below; the read.csv() default became FALSE in R 4.0.0.
baseball <- read.csv("baseball.csv", stringsAsFactors = TRUE)
str(baseball)
## 'data.frame':    1232 obs. of  15 variables:
##  $ Team        : Factor w/ 39 levels "ANA","ARI","ATL",..: 2 3 4 5 7 8 9 10 11 12 ...
##  $ League      : Factor w/ 2 levels "AL","NL": 2 2 1 1 2 1 2 1 2 1 ...
##  $ Year        : int  2012 2012 2012 2012 2012 2012 2012 2012 2012 2012 ...
##  $ RS          : int  734 700 712 734 613 748 669 667 758 726 ...
##  $ RA          : int  688 600 705 806 759 676 588 845 890 670 ...
##  $ W           : int  81 94 93 69 61 85 97 68 64 88 ...
##  $ OBP         : num  0.328 0.32 0.311 0.315 0.302 0.318 0.315 0.324 0.33 0.335 ...
##  $ SLG         : num  0.418 0.389 0.417 0.415 0.378 0.422 0.411 0.381 0.436 0.422 ...
##  $ BA          : num  0.259 0.247 0.247 0.26 0.24 0.255 0.251 0.251 0.274 0.268 ...
##  $ Playoffs    : int  0 1 1 0 0 0 1 0 0 1 ...
##  $ RankSeason  : int  NA 4 5 NA NA NA 2 NA NA 6 ...
##  $ RankPlayoffs: int  NA 5 4 NA NA NA 4 NA NA 2 ...
##  $ G           : int  162 162 162 162 162 162 162 162 162 162 ...
##  $ OOBP        : num  0.317 0.306 0.315 0.331 0.335 0.319 0.305 0.336 0.357 0.314 ...
##  $ OSLG        : num  0.415 0.378 0.403 0.428 0.424 0.405 0.39 0.43 0.47 0.402 ...
# Keep only the observations from seasons before 2002. which() drops rows
# whose comparison is NA, the same way subset() does.
moneyball <- baseball[which(baseball[["Year"]] < 2002), , drop = FALSE]
# Add the derived column RD, computed as RS minus RA.
moneyball$RD <- with(moneyball, RS - RA)
# Scatter plot of wins (W) against RD to inspect their relationship.
plot(x = moneyball$RD, y = moneyball$W)

# Fit a simple linear regression of W on RD using the pre-2002 data, then
# print the model summary.
WinsReg <- lm(formula = W ~ RD, data = moneyball)
summary(object = WinsReg)
## 
## Call:
## lm(formula = W ~ RD, data = moneyball)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.2662  -2.6509   0.1234   2.9364  11.6570 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 80.881375   0.131157  616.67   <2e-16 ***
## RD           0.105766   0.001297   81.55   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.939 on 900 degrees of freedom
## Multiple R-squared:  0.8808, Adjusted R-squared:  0.8807 
## F-statistic:  6651 on 1 and 900 DF,  p-value: < 2.2e-16
# Predicting runs: regress RS on the three hitting statistics OBP, SLG and
# BA (batting average), then print the model summary.
RunsReg <- lm(formula = RS ~ OBP + SLG + BA, data = moneyball)
summary(object = RunsReg)
## 
## Call:
## lm(formula = RS ~ OBP + SLG + BA, data = moneyball)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -70.941 -17.247  -0.621  16.754  90.998 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -788.46      19.70 -40.029  < 2e-16 ***
## OBP          2917.42     110.47  26.410  < 2e-16 ***
## SLG          1637.93      45.99  35.612  < 2e-16 ***
## BA           -368.97     130.58  -2.826  0.00482 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24.69 on 898 degrees of freedom
## Multiple R-squared:  0.9302, Adjusted R-squared:   0.93 
## F-statistic:  3989 on 3 and 898 DF,  p-value: < 2.2e-16
# The coefficient for batting average (BA) in the model above is negative.
# Taken literally, that says a team with a lower batting average scores more
# runs when everything else is held equal, which is counterintuitive.
# The cause is multicollinearity: the three hitting statistics are highly
# correlated with one another, so the individual coefficients are hard to
# interpret.
# BA is the least significant of the three predictors, so refit the model
# without it and see how the fit changes.
RunsReg <- lm(formula = RS ~ OBP + SLG, data = moneyball)
summary(object = RunsReg)
## 
## Call:
## lm(formula = RS ~ OBP + SLG, data = moneyball)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -70.838 -17.174  -1.108  16.770  90.036 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -804.63      18.92  -42.53   <2e-16 ***
## OBP          2737.77      90.68   30.19   <2e-16 ***
## SLG          1584.91      42.16   37.60   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24.79 on 899 degrees of freedom
## Multiple R-squared:  0.9296, Adjusted R-squared:  0.9294 
## F-statistic:  5934 on 2 and 899 DF,  p-value: < 2.2e-16