# Print the current working directory; the relative path "baseball.csv"
# read below is resolved against it.
getwd()
[1] "/cloud/project"
# Load the team-season data from the CSV in the working directory
# and inspect its columns and types.
baseball <- read.csv(file = "baseball.csv")
str(baseball)
'data.frame':   1232 obs. of  15 variables:
 $ Team        : chr  "ARI" "ATL" "BAL" "BOS" ...
 $ League      : chr  "NL" "NL" "AL" "AL" ...
 $ Year        : int  2012 2012 2012 2012 2012 2012 2012 2012 2012 2012 ...
 $ RS          : int  734 700 712 734 613 748 669 667 758 726 ...
 $ RA          : int  688 600 705 806 759 676 588 845 890 670 ...
 $ W           : int  81 94 93 69 61 85 97 68 64 88 ...
 $ OBP         : num  0.328 0.32 0.311 0.315 0.302 0.318 0.315 0.324 0.33 0.335 ...
 $ SLG         : num  0.418 0.389 0.417 0.415 0.378 0.422 0.411 0.381 0.436 0.422 ...
 $ BA          : num  0.259 0.247 0.247 0.26 0.24 0.255 0.251 0.251 0.274 0.268 ...
 $ Playoffs    : int  0 1 1 0 0 0 1 0 0 1 ...
 $ RankSeason  : int  NA 4 5 NA NA NA 2 NA NA 6 ...
 $ RankPlayoffs: int  NA 5 4 NA NA NA 4 NA NA 2 ...
 $ G           : int  162 162 162 162 162 162 162 162 162 162 ...
 $ OOBP        : num  0.317 0.306 0.315 0.331 0.335 0.319 0.305 0.336 0.357 0.314 ...
 $ OSLG        : num  0.415 0.378 0.403 0.428 0.424 0.405 0.39 0.43 0.47 0.402 ...
# Keep only the "moneyball" seasons, i.e. rows with Year before 2002,
# then inspect the reduced data frame.
moneyball <- subset(baseball, subset = Year < 2002)
str(moneyball)
'data.frame':   902 obs. of  15 variables:
 $ Team        : chr  "ANA" "ARI" "ATL" "BAL" ...
 $ League      : chr  "AL" "NL" "NL" "AL" ...
 $ Year        : int  2001 2001 2001 2001 2001 2001 2001 2001 2001 2001 ...
 $ RS          : int  691 818 729 687 772 777 798 735 897 923 ...
 $ RA          : int  730 677 643 829 745 701 795 850 821 906 ...
 $ W           : int  75 92 88 63 82 88 83 66 91 73 ...
 $ OBP         : num  0.327 0.341 0.324 0.319 0.334 0.336 0.334 0.324 0.35 0.354 ...
 $ SLG         : num  0.405 0.442 0.412 0.38 0.439 0.43 0.451 0.419 0.458 0.483 ...
 $ BA          : num  0.261 0.267 0.26 0.248 0.266 0.261 0.268 0.262 0.278 0.292 ...
 $ Playoffs    : int  0 1 1 0 0 0 0 0 1 0 ...
 $ RankSeason  : int  NA 5 7 NA NA NA NA NA 6 NA ...
 $ RankPlayoffs: int  NA 1 3 NA NA NA NA NA 4 NA ...
 $ G           : int  162 162 162 162 161 162 162 162 162 162 ...
 $ OOBP        : num  0.331 0.311 0.314 0.337 0.329 0.321 0.334 0.341 0.341 0.35 ...
 $ OSLG        : num  0.412 0.404 0.384 0.439 0.393 0.398 0.427 0.455 0.417 0.48 ...
# Add the run difference column: runs scored (RS) minus runs allowed (RA).
moneyball$RD <- with(moneyball, RS - RA)
str(moneyball)
'data.frame':   902 obs. of  16 variables:
 $ Team        : chr  "ANA" "ARI" "ATL" "BAL" ...
 $ League      : chr  "AL" "NL" "NL" "AL" ...
 $ Year        : int  2001 2001 2001 2001 2001 2001 2001 2001 2001 2001 ...
 $ RS          : int  691 818 729 687 772 777 798 735 897 923 ...
 $ RA          : int  730 677 643 829 745 701 795 850 821 906 ...
 $ W           : int  75 92 88 63 82 88 83 66 91 73 ...
 $ OBP         : num  0.327 0.341 0.324 0.319 0.334 0.336 0.334 0.324 0.35 0.354 ...
 $ SLG         : num  0.405 0.442 0.412 0.38 0.439 0.43 0.451 0.419 0.458 0.483 ...
 $ BA          : num  0.261 0.267 0.26 0.248 0.266 0.261 0.268 0.262 0.278 0.292 ...
 $ Playoffs    : int  0 1 1 0 0 0 0 0 1 0 ...
 $ RankSeason  : int  NA 5 7 NA NA NA NA NA 6 NA ...
 $ RankPlayoffs: int  NA 1 3 NA NA NA NA NA 4 NA ...
 $ G           : int  162 162 162 162 161 162 162 162 162 162 ...
 $ OOBP        : num  0.331 0.311 0.314 0.337 0.329 0.321 0.334 0.341 0.341 0.35 ...
 $ OSLG        : num  0.412 0.404 0.384 0.439 0.393 0.398 0.427 0.455 0.417 0.48 ...
 $ RD          : int  -39 141 86 -142 27 76 3 -115 76 17 ...
# Scatterplot of wins (W, y-axis) against run difference (RD, x-axis) to check
# visually for a linear relationship before fitting the W ~ RD regression.
plot(moneyball$RD, moneyball$W)

In-Class activity 7

# Predicted wins for a team scoring 763 runs and allowing 614.
# 80.88 and 0.106 are the rounded intercept and RD slope of the
# W ~ RD regression (WinsReg) reported in this file.
NumberofWins <- 80.88 + 0.106 * (763 - 614)
NumberofWins
[1] 96.674

A team with a run difference of 149 is expected to win around 97 games.

# Simple linear regression of wins (W) on run difference (RD),
# followed by the coefficient table and fit statistics.
WinsReg <- lm(formula = W ~ RD, data = moneyball)
summary(WinsReg)

Call:
lm(formula = W ~ RD, data = moneyball)

Residuals:
     Min       1Q   Median 
-14.2662  -2.6509   0.1234 
      3Q      Max 
  2.9364  11.6570 

Coefficients:
             Estimate Std. Error
(Intercept) 80.881375   0.131157
RD           0.105766   0.001297
            t value Pr(>|t|)    
(Intercept)  616.67   <2e-16 ***
RD            81.55   <2e-16 ***
---
Signif. codes:  
  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’
  0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 3.939 on 900 degrees of freedom
Multiple R-squared:  0.8808,    Adjusted R-squared:  0.8807 
F-statistic:  6651 on 1 and 900 DF,  p-value: < 2.2e-16
# VIDEO 3

# Re-inspect moneyball (now including the derived RD column) before
# fitting the runs-scored models; this repeats the earlier str() call.
str(moneyball)
'data.frame':   902 obs. of  16 variables:
 $ Team        : chr  "ANA" "ARI" "ATL" "BAL" ...
 $ League      : chr  "AL" "NL" "NL" "AL" ...
 $ Year        : int  2001 2001 2001 2001 2001 2001 2001 2001 2001 2001 ...
 $ RS          : int  691 818 729 687 772 777 798 735 897 923 ...
 $ RA          : int  730 677 643 829 745 701 795 850 821 906 ...
 $ W           : int  75 92 88 63 82 88 83 66 91 73 ...
 $ OBP         : num  0.327 0.341 0.324 0.319 0.334 0.336 0.334 0.324 0.35 0.354 ...
 $ SLG         : num  0.405 0.442 0.412 0.38 0.439 0.43 0.451 0.419 0.458 0.483 ...
 $ BA          : num  0.261 0.267 0.26 0.248 0.266 0.261 0.268 0.262 0.278 0.292 ...
 $ Playoffs    : int  0 1 1 0 0 0 0 0 1 0 ...
 $ RankSeason  : int  NA 5 7 NA NA NA NA NA 6 NA ...
 $ RankPlayoffs: int  NA 1 3 NA NA NA NA NA 4 NA ...
 $ G           : int  162 162 162 162 161 162 162 162 162 162 ...
 $ OOBP        : num  0.331 0.311 0.314 0.337 0.329 0.321 0.334 0.341 0.341 0.35 ...
 $ OSLG        : num  0.412 0.404 0.384 0.439 0.393 0.398 0.427 0.455 0.417 0.48 ...
 $ RD          : int  -39 141 86 -142 27 76 3 -115 76 17 ...
# Multiple regression of runs scored (RS) on on-base percentage (OBP),
# slugging percentage (SLG) and batting average (BA).
RunsReg <- lm(formula = RS ~ OBP + SLG + BA, data = moneyball)
summary(RunsReg)

Call:
lm(formula = RS ~ OBP + SLG + BA, data = moneyball)

Residuals:
    Min      1Q  Median      3Q 
-70.941 -17.247  -0.621  16.754 
    Max 
 90.998 

Coefficients:
            Estimate Std. Error
(Intercept)  -788.46      19.70
OBP          2917.42     110.47
SLG          1637.93      45.99
BA           -368.97     130.58
            t value Pr(>|t|)    
(Intercept) -40.029  < 2e-16 ***
OBP          26.410  < 2e-16 ***
SLG          35.612  < 2e-16 ***
BA           -2.826  0.00482 ** 
---
Signif. codes:  
  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’
  0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 24.69 on 898 degrees of freedom
Multiple R-squared:  0.9302,    Adjusted R-squared:   0.93 
F-statistic:  3989 on 3 and 898 DF,  p-value: < 2.2e-16
# Refit the runs-scored model without batting average (BA), keeping only
# OBP and SLG. This overwrites the previous three-predictor RunsReg.
RunsReg <- lm(formula = RS ~ OBP + SLG, data = moneyball)
summary(RunsReg)

Call:
lm(formula = RS ~ OBP + SLG, data = moneyball)

Residuals:
    Min      1Q  Median      3Q 
-70.838 -17.174  -1.108  16.770 
    Max 
 90.036 

Coefficients:
            Estimate Std. Error
(Intercept)  -804.63      18.92
OBP          2737.77      90.68
SLG          1584.91      42.16
            t value Pr(>|t|)    
(Intercept)  -42.53   <2e-16 ***
OBP           30.19   <2e-16 ***
SLG           37.60   <2e-16 ***
---
Signif. codes:  
  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’
  0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 24.79 on 899 degrees of freedom
Multiple R-squared:  0.9296,    Adjusted R-squared:  0.9294 
F-statistic:  5934 on 2 and 899 DF,  p-value: < 2.2e-16

In-class activity 8, part A

# Predicted runs scored for a team with OBP = 0.361 and SLG = 0.409,
# using the coefficients of the RS ~ OBP + SLG model reported in this file.
ExpectedRuns <- -804.63 + 2737.77 * 0.361 + 1584.91 * 0.409
ExpectedRuns
[1] 831.9332

We expect the team to score around 832 runs.

# Variance inflation factors for the predictors of RunsReg (OBP and SLG).
# NOTE(review): vif() is not a base R function and no library() call is
# visible in this file -- presumably car::vif(); confirm and load the
# package explicitly so the notebook runs in a fresh session.
vif(RunsReg)
     OBP      SLG 
2.856197 2.856197 
# Correlation between batting average and on-base percentage.
cor(moneyball$BA, moneyball$OBP)
[1] 0.8540549
# Re-check the columns of moneyball; OOBP and OSLG are the predictors
# used by the runs-allowed model fitted next.
str(moneyball)
'data.frame':   902 obs. of  16 variables:
 $ Team        : chr  "ANA" "ARI" "ATL" "BAL" ...
 $ League      : chr  "AL" "NL" "NL" "AL" ...
 $ Year        : int  2001 2001 2001 2001 2001 2001 2001 2001 2001 2001 ...
 $ RS          : int  691 818 729 687 772 777 798 735 897 923 ...
 $ RA          : int  730 677 643 829 745 701 795 850 821 906 ...
 $ W           : int  75 92 88 63 82 88 83 66 91 73 ...
 $ OBP         : num  0.327 0.341 0.324 0.319 0.334 0.336 0.334 0.324 0.35 0.354 ...
 $ SLG         : num  0.405 0.442 0.412 0.38 0.439 0.43 0.451 0.419 0.458 0.483 ...
 $ BA          : num  0.261 0.267 0.26 0.248 0.266 0.261 0.268 0.262 0.278 0.292 ...
 $ Playoffs    : int  0 1 1 0 0 0 0 0 1 0 ...
 $ RankSeason  : int  NA 5 7 NA NA NA NA NA 6 NA ...
 $ RankPlayoffs: int  NA 1 3 NA NA NA NA NA 4 NA ...
 $ G           : int  162 162 162 162 161 162 162 162 162 162 ...
 $ OOBP        : num  0.331 0.311 0.314 0.337 0.329 0.321 0.334 0.341 0.341 0.35 ...
 $ OSLG        : num  0.412 0.404 0.384 0.439 0.393 0.398 0.427 0.455 0.417 0.48 ...
 $ RD          : int  -39 141 86 -142 27 76 3 -115 76 17 ...
# Multiple regression of runs allowed (RA) on opponents' on-base percentage
# (OOBP) and opponents' slugging percentage (OSLG).
# lm() drops rows with missing values in these columns, which is why the
# summary reports observations deleted due to missingness.
RunsAllowedReg <- lm(formula = RA ~ OOBP + OSLG, data = moneyball)
summary(RunsAllowedReg)

Call:
lm(formula = RA ~ OOBP + OSLG, data = moneyball)

Residuals:
    Min      1Q  Median      3Q 
-82.397 -15.178  -0.129  17.679 
    Max 
 60.955 

Coefficients:
            Estimate Std. Error
(Intercept)  -837.38      60.26
OOBP         2913.60     291.97
OSLG         1514.29     175.43
            t value Pr(>|t|)    
(Intercept) -13.897  < 2e-16 ***
OOBP          9.979 4.46e-16 ***
OSLG          8.632 2.55e-13 ***
---
Signif. codes:  
  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’
  0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 25.67 on 87 degrees of freedom
  (812 observations deleted due to missingness)
Multiple R-squared:  0.9073,    Adjusted R-squared:  0.9052 
F-statistic: 425.8 on 2 and 87 DF,  p-value: < 2.2e-16

In-class activity 8, part B

# Predicted runs allowed for a team with OOBP = 0.267 and OSLG = 0.392,
# using the coefficients of the RA ~ OOBP + OSLG model reported in this file.
ExpectedRunsAllowed <- -837.38 + 2913.60 * 0.267 + 1514.29 * 0.392
ExpectedRunsAllowed
[1] 534.1529

The team is expected to allow around 534 runs.

# Variance inflation factors for the predictors of RunsAllowedReg
# (OOBP and OSLG).
# NOTE(review): vif() is not a base R function and no library() call is
# visible in this file -- presumably car::vif(); confirm and load the
# package explicitly so the notebook runs in a fresh session.
vif(RunsAllowedReg)
    OOBP     OSLG 
2.735629 2.735629 
LS0tCnRpdGxlOiAiTW9kdWxlIDMgQWN0aXZpdHkgNyBQcmVkaWNpdGluZyB0aGUgbnVtYmVyIG9mIHdpbnMiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCmBgYHtyfQpnZXR3ZCgpCmBgYAoKYGBge3J9CiMgUmVhZCBpbiBkYXRhCmJhc2ViYWxsID0gcmVhZC5jc3YoImJhc2ViYWxsLmNzdiIpCnN0cihiYXNlYmFsbCkKCmBgYAoKCgpgYGB7cn0KIyBTdWJzZXQgdG8gb25seSBpbmNsdWRlIG1vbmV5YmFsbCB5ZWFycwptb25leWJhbGwgPSBzdWJzZXQoYmFzZWJhbGwsIFllYXIgPCAyMDAyKQpzdHIobW9uZXliYWxsKQpgYGAKCgoKYGBge3J9CiMgQ29tcHV0ZSBSdW4gRGlmZmVyZW5jZSAmIEFkZGluZyBhIG5ldyBjb2x1bW4gUkQ9UnVucyBEaWZmZXJlbmNlIAptb25leWJhbGwkUkQgPSBtb25leWJhbGwkUlMgLSBtb25leWJhbGwkUkEKc3RyKG1vbmV5YmFsbCkKYGBgCgoKCmBgYHtyfQojIFNjYXR0ZXJwbG90IHRvIGNoZWNrIGZvciBsaW5lYXIgcmVsYXRpb25zaGlwLCBDaGVja2luZyB0byBpcyBpZiB0aGVyZXMgYSBjb3JyZWxhdGlvbi5JbiBmYWN0IHRoZXJlIGlzIGEgY29ycmVsYXRpb24gYmV0d2VlbiBydW4gZGlmZmVyZW5jZSBhbmQgd2lucy4gSGlnaGVyIG51bWJlciBvZiBydW4gZGlmZmVyZW5jZSwgdGhlIGhpZ2hlciB0aGUgd2lucwpwbG90KG1vbmV5YmFsbCRSRCwgbW9uZXliYWxsJFcpCgpgYGAKCioqSW4tQ2xhc3MgYWN0aXZpdHkgNyoqCgpgYGB7cn0KTnVtYmVyb2ZXaW5zPTgwLjg4Ky4xMDYqKDc2My02MTQpCk51bWJlcm9mV2lucwpgYGAKQSB0ZWFtIHdpdGggYSBydW5zIGRpZmZlcmVuY2Ugb2YgMTQ5IGlzIGV4cGVjdGVkIHRvIHdpbiBhcm91bmQgOTcgZ2FtZXMKCmBgYHtyfQojIFJlZ3Jlc3Npb24gbW9kZWwgdG8gcHJlZGljdCB3aW5zLCBDcmVhdGluZyBhIHZhcmlhYmxlIExNIGFuZCB0aGVuIHJ1bm5pbmcgYSBzdW1tYXJ5IHRvIHNlZSB0aGUgbmV3IFZhcmlhYmxlIGFuZCBjaGVjayBvdXQgdGhlIGRldGFpbHMuUCB2YWx1ZSBmb3IgcmQgaXMgc2lnbmZpY2FudCwgcnVucyBkaWZmZXJlY2UgZG9lcyBhZmZlY3QsIGFjY29yZGluZyB0byB0aGUgZXN0aW1hdGUuIFdpdGggRWFjaCBydW4gZGlmZmVyZW5jZSBvZiAxLCBpbmNyZWFzZXMgdGhlIG51bWJlciBvZiB3aW5zIGJ5IC4xMCwgbWVhbmluZyBpZiBhIHRlYW0gaGFzIGEgcnVuIGRpZmZlcmVuY2Ugb2YgMTAgaXQgaW5jcmVhc2UgdG8gMSB3aW4uCldpbnNSZWcgPSBsbShXIH4gUkQsIGRhdGE9bW9uZXliYWxsKQpzdW1tYXJ5KFdpbnNSZWcpCmBgYAoKCmBgYHtyfQojIFZJREVPIDMgUmVkdW5kYW50IAoKc3RyKG1vbmV5YmFsbCkKYGBgCgoKYGBge3J9CiMgUmVncmVzc2lvbiBtb2RlbCB0byBwcmVkaWN0IHJ1bnMgc2NvcmVkLiBXZSBvYmVzZXZlcmVkIE9QUCBTTEcgYW5kIEJBIGFyZSBhbGwgc2lnbmlmaWNhbnQgYXQgYSA1JSBzaWdpbmlmYWNudCBsZXZlbCwgd2hpY2ggbWVhbnMgdGhhdCB0aGVzZSB2YXJpYWJsZXMgYXJlIHJlbGlhYmxlLiBVc2luZyB0
aG9zZSB0aHJlZSBhdHRyaWJ1dGVzIGFyZSBnb29kIGJ1dCB0aGUgYmFkIG5ld3MgaXMgT0JQIGFuZCBCQSBhcmUgaGlnaGx5IGNvcnJlbGF0ZWQuClJ1bnNSZWcgPSBsbShSUyB+IE9CUCArIFNMRyArIEJBLCBkYXRhPW1vbmV5YmFsbCkKc3VtbWFyeShSdW5zUmVnKQpgYGAKCgoKCmBgYHtyfQpgYGAKCgoKYGBge3J9CiMgUmVncmVzc2lvbiBtb2RlbCB0byBwcmVkaWN0IHJ1bnMgc2NvcmVkIGFnYWluIGJ1dCByZW1vdmluZyB0aGUgYmF0dGluZyBhdmVyYWdlLCBpdCB3YXMgYWZmZWN0aW5nIHRoZSBtb2RlbCB0aGF0IHdhcyBoaWdobHkgY29ycmVsYXRlZC4gVGhhdCB3YXkgdGhlcmVzIG5vIGluZmxhdGlvbiBvZiB0aGUgZGF0YS4gClJ1bnNSZWcgPSBsbShSUyB+IE9CUCArIFNMRywgZGF0YT1tb25leWJhbGwpCnN1bW1hcnkoUnVuc1JlZykKYGBgCioqSW4gY2xhc3MgYWNpdGl2dHkgOCBwYXJ0IEEqKgoKYGBge3J9CkV4cGVjdGVkUnVucz0tODA0LjYzKzI3MzcuNzcqKDAuMzYxKSsxNTg0LjkxKigwLjQwOSkKRXhwZWN0ZWRSdW5zCmBgYAoKV2UgZXhwZWN0IHRoZSB0ZWFtIHRvIHNjb3JlIGFyb3VuZCA4MzEuOTMyIHJ1bnMuCgoKCmBgYHtyfQp2aWYoUnVuc1JlZykKYGBgCgoKYGBge3J9CiNWZXJ5IGhpZ2ggY29ycmVsYXRpb24sIDg1cGVyY2VudCAKY29yKG1vbmV5YmFsbCRCQSxtb25leWJhbGwkT0JQKQoKYGBgCgoKYGBge3J9CnN0cihtb25leWJhbGwpCgpgYGAKCgoKCmBgYHtyfQojIFJlZ3Jlc3Npb24gbW9kZWwgdG8gcHJlZGljdCBydW5zIGFsbG93ZWQKI1dlIGZvdW5kIE9PQlAgJiBPU0xHIGFyZSBib3RoIHNpZ25pZmljYW50IGF0IGEgNSUgc2lnbmlmaWNhbmNlLiBXaGljaCBtZWFucyB0aGlzIGFmZmVjdHMgd2lucwpSdW5zQWxsb3dlZFJlZyA9IGxtKFJBIH4gT09CUCArIE9TTEcsIGRhdGE9bW9uZXliYWxsKQpzdW1tYXJ5KFJ1bnNBbGxvd2VkUmVnKQpgYGAKCgoqKkluIGNsYXNzIGFjdGl2aXR5IDggcGFydCBCKioKCmBgYHtyfQpFeHBlY3RlZFJ1bnNBbGxvd2VkPS04MzcuMzgrMjkxMy42MCooMC4yNjcpKzE1MTQuMjkqKDAuMzkyKQpFeHBlY3RlZFJ1bnNBbGxvd2VkCmBgYAoKVGhlIFRlYW0gaXMgZXhwZWN0ZWQgdG8gYWxsb3cgIDUzNCBydW5zLgoKCgpgYGB7cn0KI0Fic29sdXRlbHkgY29ycmVsYXRlZAp2aWYoUnVuc0FsbG93ZWRSZWcpCmBgYAoKCg==