1 Data Exploration

ourdata<-read.csv(file="C:/HW/moneyball-training-data.csv")

# visual observation of top records

head(ourdata)
##   INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 1     1          39           1445             194              39
## 2     2          70           1339             219              22
## 3     3          86           1377             232              35
## 4     4          70           1387             209              38
## 5     5          82           1297             186              27
## 6     6          75           1279             200              36
##   TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## 1              13             143             842              NA
## 2             190             685            1075              37
## 3             137             602             917              46
## 4              96             451             922              43
## 5             102             472             920              49
## 6              92             443             973             107
##   TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## 1              NA               NA            9364               84
## 2              28               NA            1347              191
## 3              27               NA            1377              137
## 4              30               NA            1396               97
## 5              39               NA            1297              102
## 6              59               NA            1279               92
##   TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 1              927             5456            1011               NA
## 2              689             1082             193              155
## 3              602              917             175              153
## 4              454              928             164              156
## 5              472              920             138              168
## 6              443              973             123              149
library(knitr)

# mean/median/IQR/NA

kable(summary(ourdata))
INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
Min. : 1.0 Min. : 0.00 Min. : 891 Min. : 69.0 Min. : 0.00 Min. : 0.00 Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. :29.00 Min. : 1137 Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 65.0 Min. : 52.0
1st Qu.: 630.8 1st Qu.: 71.00 1st Qu.:1383 1st Qu.:208.0 1st Qu.: 34.00 1st Qu.: 42.00 1st Qu.:451.0 1st Qu.: 548.0 1st Qu.: 66.0 1st Qu.: 38.0 1st Qu.:50.50 1st Qu.: 1419 1st Qu.: 50.0 1st Qu.: 476.0 1st Qu.: 615.0 1st Qu.: 127.0 1st Qu.:131.0
Median :1270.5 Median : 82.00 Median :1454 Median :238.0 Median : 47.00 Median :102.00 Median :512.0 Median : 750.0 Median :101.0 Median : 49.0 Median :58.00 Median : 1518 Median :107.0 Median : 536.5 Median : 813.5 Median : 159.0 Median :149.0
Mean :1268.5 Mean : 80.79 Mean :1469 Mean :241.2 Mean : 55.25 Mean : 99.61 Mean :501.6 Mean : 735.6 Mean :124.8 Mean : 52.8 Mean :59.36 Mean : 1779 Mean :105.7 Mean : 553.0 Mean : 817.7 Mean : 246.5 Mean :146.4
3rd Qu.:1915.5 3rd Qu.: 92.00 3rd Qu.:1537 3rd Qu.:273.0 3rd Qu.: 72.00 3rd Qu.:147.00 3rd Qu.:580.0 3rd Qu.: 930.0 3rd Qu.:156.0 3rd Qu.: 62.0 3rd Qu.:67.00 3rd Qu.: 1682 3rd Qu.:150.0 3rd Qu.: 611.0 3rd Qu.: 968.0 3rd Qu.: 249.2 3rd Qu.:164.0
Max. :2535.0 Max. :146.00 Max. :2554 Max. :458.0 Max. :223.00 Max. :264.00 Max. :878.0 Max. :1399.0 Max. :697.0 Max. :201.0 Max. :95.00 Max. :30132 Max. :343.0 Max. :3645.0 Max. :19278.0 Max. :1898.0 Max. :228.0
NA NA NA NA NA NA NA NA’s :102 NA’s :131 NA’s :772 NA’s :2085 NA NA NA NA’s :102 NA NA’s :286
# histogram/density

# near-normal

par(mar=c(2,2,2,2))

par(mfrow = c(2,2))

hist(ourdata$TARGET_WINS)

plot(density(ourdata$TARGET_WINS))

# near-normal, long right-tail, potential outliers

hist(ourdata$TEAM_BATTING_H)

plot(density(ourdata$TEAM_BATTING_H))

# near-normal, some weirdness around the mean

hist(ourdata$TEAM_BATTING_2B)

plot(density(ourdata$TEAM_BATTING_2B))

# heavy right skewed, outliers(?)

hist(ourdata$TEAM_BATTING_3B)

plot(density(ourdata$TEAM_BATTING_3B))

# looks very weird, does not look right, potentially needs to be dropped

hist(ourdata$TEAM_BATTING_HR)

plot(density(ourdata$TEAM_BATTING_HR))

# left skewed, outliers(?)

hist(ourdata$TEAM_BATTING_BB)

plot(density(ourdata$TEAM_BATTING_BB))

# weird, potentially to be dropped

hist(ourdata$TEAM_BATTING_SO)

plot(density(ourdata$TEAM_BATTING_SO,na.rm = TRUE))

# right skewed, outliers(?)

hist(ourdata$TEAM_BASERUN_SB)

plot(density(ourdata$TEAM_BASERUN_SB,na.rm = TRUE))

# right skewed, outliers(?)

hist(ourdata$TEAM_BASERUN_CS)

plot(density(ourdata$TEAM_BASERUN_CS,na.rm = TRUE))

# right skewed, outliers(?)

hist(ourdata$TEAM_FIELDING_E)

plot(density(ourdata$TEAM_FIELDING_E,na.rm = TRUE))

# near-normal, slightly left skewed

hist(ourdata$TEAM_FIELDING_DP)

plot(density(ourdata$TEAM_FIELDING_DP,na.rm = TRUE))

# right skewed, outliers

hist(ourdata$TEAM_PITCHING_BB)

plot(density(ourdata$TEAM_PITCHING_BB,na.rm = TRUE))

# severily right skewed, to be dropped(?)

hist(ourdata$TEAM_PITCHING_H)

plot(density(ourdata$TEAM_PITCHING_H,na.rm = TRUE))

# weird, to be dropped(?)

hist(ourdata$TEAM_PITCHING_HR)

plot(density(ourdata$TEAM_PITCHING_HR,na.rm = TRUE))

# severily right skewed, to be dropped(?)

hist(ourdata$TEAM_PITCHING_SO)

plot(density(ourdata$TEAM_PITCHING_SO,na.rm = TRUE))

pairs(ourdata)

cor(ourdata, use="pairwise.complete.obs", method="pearson") 
##                          INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B
## INDEX             1.0000000000 -0.02105643   -0.017920241      0.01118301
## TARGET_WINS      -0.0210564349  1.00000000    0.388767521      0.28910365
## TEAM_BATTING_H   -0.0179202413  0.38876752    1.000000000      0.56284968
## TEAM_BATTING_2B   0.0111830135  0.28910365    0.562849678      1.00000000
## TEAM_BATTING_3B  -0.0058146834  0.14260841    0.427696575     -0.10730582
## TEAM_BATTING_HR   0.0514810468  0.17615320   -0.006544685      0.43539729
## TEAM_BATTING_BB  -0.0265672362  0.23255986   -0.072464013      0.25572610
## TEAM_BATTING_SO   0.0814501106 -0.03175071   -0.463853571      0.16268519
## TEAM_BASERUN_SB   0.0402671277  0.13513892    0.123567797     -0.19975724
## TEAM_BASERUN_CS   0.0005653743  0.02240407    0.016705668     -0.09981406
## TEAM_BATTING_HBP  0.0771930266  0.07350424   -0.029112176      0.04608475
## TEAM_PITCHING_H   0.0171031479 -0.10993705    0.302693709      0.02369219
## TEAM_PITCHING_HR  0.0509858973  0.18901373    0.072853119      0.45455082
## TEAM_PITCHING_BB -0.0152875130  0.12417454    0.094193027      0.17805420
## TEAM_PITCHING_SO  0.0558901457 -0.07843609   -0.252656790      0.06479231
## TEAM_FIELDING_E  -0.0092331265 -0.17648476    0.264902478     -0.23515099
## TEAM_FIELDING_DP  0.0200642919 -0.03485058    0.155383321      0.29087998
##                  TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB
## INDEX               -0.005814683     0.051481047     -0.02656724
## TARGET_WINS          0.142608411     0.176153200      0.23255986
## TEAM_BATTING_H       0.427696575    -0.006544685     -0.07246401
## TEAM_BATTING_2B     -0.107305824     0.435397293      0.25572610
## TEAM_BATTING_3B      1.000000000    -0.635566946     -0.28723584
## TEAM_BATTING_HR     -0.635566946     1.000000000      0.51373481
## TEAM_BATTING_BB     -0.287235841     0.513734810      1.00000000
## TEAM_BATTING_SO     -0.669781188     0.727069348      0.37975087
## TEAM_BASERUN_SB      0.533506448    -0.453578426     -0.10511564
## TEAM_BASERUN_CS      0.348764919    -0.433793868     -0.13698837
## TEAM_BATTING_HBP    -0.174247154     0.106181160      0.04746007
## TEAM_PITCHING_H      0.194879411    -0.250145481     -0.44977762
## TEAM_PITCHING_HR    -0.567836679     0.969371396      0.45955207
## TEAM_PITCHING_BB    -0.002224148     0.136927564      0.48936126
## TEAM_PITCHING_SO    -0.258818931     0.184707564     -0.02075682
## TEAM_FIELDING_E      0.509778447    -0.587339098     -0.65597081
## TEAM_FIELDING_DP    -0.323074847     0.448985348      0.43087675
##                  TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS
## INDEX                 0.08145011      0.04026713    0.0005653743
## TARGET_WINS          -0.03175071      0.13513892    0.0224040691
## TEAM_BATTING_H       -0.46385357      0.12356780    0.0167056677
## TEAM_BATTING_2B       0.16268519     -0.19975724   -0.0998140593
## TEAM_BATTING_3B      -0.66978119      0.53350645    0.3487649195
## TEAM_BATTING_HR       0.72706935     -0.45357843   -0.4337938681
## TEAM_BATTING_BB       0.37975087     -0.10511564   -0.1369883707
## TEAM_BATTING_SO       1.00000000     -0.25448923   -0.2178813684
## TEAM_BASERUN_SB      -0.25448923      1.00000000    0.6552448036
## TEAM_BASERUN_CS      -0.21788137      0.65524480    1.0000000000
## TEAM_BATTING_HBP      0.22094219     -0.06400498   -0.0705138958
## TEAM_PITCHING_H      -0.37568637      0.07328505   -0.0520078089
## TEAM_PITCHING_HR      0.66717889     -0.41651072   -0.4225660463
## TEAM_PITCHING_BB      0.03700514      0.14641513   -0.1069612356
## TEAM_PITCHING_SO      0.41623330     -0.13712861   -0.2102227352
## TEAM_FIELDING_E      -0.58466444      0.50963090    0.0483218940
## TEAM_FIELDING_DP      0.15488939     -0.49707763   -0.2142480076
##                  TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## INDEX                  0.07719303      0.01710315       0.05098590
## TARGET_WINS            0.07350424     -0.10993705       0.18901373
## TEAM_BATTING_H        -0.02911218      0.30269371       0.07285312
## TEAM_BATTING_2B        0.04608475      0.02369219       0.45455082
## TEAM_BATTING_3B       -0.17424715      0.19487941      -0.56783668
## TEAM_BATTING_HR        0.10618116     -0.25014548       0.96937140
## TEAM_BATTING_BB        0.04746007     -0.44977762       0.45955207
## TEAM_BATTING_SO        0.22094219     -0.37568637       0.66717889
## TEAM_BASERUN_SB       -0.06400498      0.07328505      -0.41651072
## TEAM_BASERUN_CS       -0.07051390     -0.05200781      -0.42256605
## TEAM_BATTING_HBP       1.00000000     -0.02769699       0.10675878
## TEAM_PITCHING_H       -0.02769699      1.00000000      -0.14161276
## TEAM_PITCHING_HR       0.10675878     -0.14161276       1.00000000
## TEAM_PITCHING_BB       0.04785137      0.32067616       0.22193750
## TEAM_PITCHING_SO       0.22157375      0.26724807       0.20588053
## TEAM_FIELDING_E        0.04178971      0.66775901      -0.49314447
## TEAM_FIELDING_DP      -0.07120824     -0.22865059       0.43917040
##                  TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E
## INDEX                -0.015287513       0.05589015    -0.009233126
## TARGET_WINS           0.124174536      -0.07843609    -0.176484759
## TEAM_BATTING_H        0.094193027      -0.25265679     0.264902478
## TEAM_BATTING_2B       0.178054204       0.06479231    -0.235150986
## TEAM_BATTING_3B      -0.002224148      -0.25881893     0.509778447
## TEAM_BATTING_HR       0.136927564       0.18470756    -0.587339098
## TEAM_BATTING_BB       0.489361263      -0.02075682    -0.655970815
## TEAM_BATTING_SO       0.037005141       0.41623330    -0.584664436
## TEAM_BASERUN_SB       0.146415134      -0.13712861     0.509630902
## TEAM_BASERUN_CS      -0.106961236      -0.21022274     0.048321894
## TEAM_BATTING_HBP      0.047851371       0.22157375     0.041789712
## TEAM_PITCHING_H       0.320676162       0.26724807     0.667759010
## TEAM_PITCHING_HR      0.221937505       0.20588053    -0.493144466
## TEAM_PITCHING_BB      1.000000000       0.48849865    -0.022837561
## TEAM_PITCHING_SO      0.488498653       1.00000000    -0.023291783
## TEAM_FIELDING_E      -0.022837561      -0.02329178     1.000000000
## TEAM_FIELDING_DP      0.324457226       0.02615804    -0.497684954
##                  TEAM_FIELDING_DP
## INDEX                  0.02006429
## TARGET_WINS           -0.03485058
## TEAM_BATTING_H         0.15538332
## TEAM_BATTING_2B        0.29087998
## TEAM_BATTING_3B       -0.32307485
## TEAM_BATTING_HR        0.44898535
## TEAM_BATTING_BB        0.43087675
## TEAM_BATTING_SO        0.15488939
## TEAM_BASERUN_SB       -0.49707763
## TEAM_BASERUN_CS       -0.21424801
## TEAM_BATTING_HBP      -0.07120824
## TEAM_PITCHING_H       -0.22865059
## TEAM_PITCHING_HR       0.43917040
## TEAM_PITCHING_BB       0.32445723
## TEAM_PITCHING_SO       0.02615804
## TEAM_FIELDING_E       -0.49768495
## TEAM_FIELDING_DP       1.00000000
# very high correlation for Home Runs - very suspicious - looks wrong

temp <- lm(TEAM_BATTING_HR~ TEAM_PITCHING_HR, data=ourdata)

with(ourdata,plot(TEAM_PITCHING_HR,TEAM_BATTING_HR))

abline(temp)

# some data is missing

sum(ourdata$TEAM_BATTING_H)
## [1] 3344058
sum(ourdata$TEAM_PITCHING_H)
## [1] 4049483
sum(ourdata$TEAM_BATTING_BB)
## [1] 1141548
sum(ourdata$TEAM_PITCHING_BB)
## [1] 1258646
ourdata$losses<-162-ourdata$TARGET_WINS

sum(ourdata$TARGET_WINS)
## [1] 183880
sum(ourdata$losses)
## [1] 184832
ourdata$losses<-NULL

Problems with the data

  1. Number of wins and number of losses is not the same, so we do not have complete data; which still could be ok for our analysis

  2. No strong correlations between dependent and independent variables

  3. TEAM_BATTING_HR and TEAM_PITCHING_HR are almost identical , correlation between them 93%. It does not make sense. Their distribution look very weird too. Possible we need to drop both variables.

  4. TEAM_PITCHING_H correlates positively, while it should have negative correlation. If we use the variable we need to explain why we use variable that is counter-intuitive.has a lot of outliers. Maximum value is 30,132 or 191 per game, which is not possible (heavily right skewed) It needs to be fixed or dropped.

  5. TEAM_BASERUN_CS correlates positively (actually close to zero), while it should have negative correlation. Right-skewed with potential outliers. If we use the variable we need to explain why we use variable that is counter-intuitive.

  6. TEAM_BASERUN_HBP has too much missing data. It needs to be dropped.

  7. TEAM_PITCHING_BB correlates positively, while it should have negative correlation. If we use the variable we need to explain why we use variable that is counter-intuitive.. It has a lot of outliers, which seem to skew the data. Maybe it could be used after we fix it.

  8. TEAM_PITCHING_SO correlates negatively, while it should have positive correlation. If we use the variable we need to explain why we use variable that is counter-intuitive.. It has a lot of outliers, which seem to skew the data. Maybe it could be used after we fix it.

  9. TEAM_FIELDING_DP correlates negatively (close to zero), while it should have positive correlation. If we use the variable we need to explain why we use variable that is counter-intuitive.

  10. Strikeout numbers both for Batting and Pitching look very low. Distributions look weird, skewed.

2 Data Preperation

# dropping bad variable

ourdata$TEAM_BATTING_HBP<-NULL

# We will keep one dataset the way it is, and we will create a copy of it to manually adjust data

ourdataM<-ourdata

# dropping pitching Home Runs

ourdataM$TEAM_PITCHING_HR<-NULL

# dropping outliers

82-(92-71)*2
## [1] 40
82+(92-71)*2
## [1] 124
# 116 Wins was the maximum, so to use 124 as cut off is very conservative; https://www.mlb.com/news/most-mlb-wins-in-a-season/c-289159676

# The minimum in more resent years was 40; http://writing.jmpressley.net/sports/worstseasons.html

ourdataM<-ourdataM[!(ourdata$TARGET_WINS<40),]

ourdataM<-ourdataM[!(ourdata$TARGET_WINS>124),]

# I use 2018 data as reference; https://www.baseball-reference.com/leagues/MLB/2018.shtml

1454-(1537-1383)*2
## [1] 1146
1454+(1537-1383)*2
## [1] 1762
ourdataM<-ourdataM[!(ourdataM$TEAM_BATTING_H<1146),]

ourdataM<-ourdataM[!(ourdataM$TEAM_BATTING_H>1762),]

238-(273-208)*2
## [1] 108
238+(273-208)*2
## [1] 368
ourdataM<-ourdataM[!(ourdataM$TEAM_BATTING_2B<108),]

ourdataM<-ourdataM[!(ourdataM$TEAM_BATTING_2B>368),]

47-(72-34)*2
## [1] -29
47+(72-34)*2
## [1] 123
ourdataM<-ourdataM[!(ourdataM$TEAM_BATTING_3B>123),]

# Home Run historical data indicates 3 was the minumum ever; http://www.baseball-almanac.com/recbooks/rb_hr7.shtml

ourdataM<-ourdataM[!(ourdataM$TEAM_BATTING_HR<3),]

512-(580-451)*2
## [1] 254
512+(580-451)*2
## [1] 770
ourdataM<-ourdataM[!(ourdataM$TEAM_BATTING_BB<254),]

ourdataM<-ourdataM[!(ourdataM$TEAM_BATTING_BB>770),]

# TEAM_BATTING_SO is very inconsistent with the 2018 reference data; I would drop the variable

ourdataM$TEAM_BATTING_SO<-NULL

101-(156-56)*2
## [1] -99
101+(156-56)*2
## [1] 301
ourdataM<-ourdataM[!(ourdata$TEAM_BASERUN_SB>301),]

1518-(1682-1419)*2
## [1] 992
1518+(1682-1419)*2
## [1] 2044
ourdataM<-ourdataM[!(ourdata$TEAM_PITCHING_H<992),]

ourdataM<-ourdataM[!(ourdata$TEAM_PITCHING_H>2044),]

536.5-(611-476)*2
## [1] 266.5
536.5+(611-476)*2
## [1] 806.5
ourdataM<-ourdataM[!(ourdataM$TEAM_PITCHING_BB<266),]

ourdataM<-ourdataM[!(ourdataM$TEAM_PITCHING_BB>807),]

813.5-(968-615)*2
## [1] 107.5
813.5+(968-615)*2
## [1] 1519.5
ourdataM<-ourdataM[!(ourdataM$TEAM_PITCHING_SO<107),]

ourdataM<-ourdataM[!(ourdataM$TEAM_PITCHING_SO>1520),]

159-(249.2-127)*2
## [1] -85.4
159+(249.2-127)*2
## [1] 403.4
ourdataM<-ourdataM[!(ourdataM$TEAM_FIELDING_E>404),]

149-(164-131)*2
## [1] 83
149+(164-131)*2
## [1] 215
ourdataM<-ourdataM[!(ourdataM$TEAM_FIELDING_DP>215),]

ourdataM<-ourdataM[!(ourdataM$TEAM_FIELDING_DP<83),]

ourdataM<-ourdataM[!is.na(ourdataM$INDEX),]

summary(ourdataM)
##      INDEX       TARGET_WINS     TEAM_BATTING_H TEAM_BATTING_2B
##  Min.   :   3   Min.   : 40.00   Min.   :1168   Min.   :130    
##  1st Qu.: 671   1st Qu.: 72.00   1st Qu.:1381   1st Qu.:215    
##  Median :1270   Median : 81.00   Median :1446   Median :244    
##  Mean   :1254   Mean   : 80.58   Mean   :1453   Mean   :247    
##  3rd Qu.:1862   3rd Qu.: 90.00   3rd Qu.:1523   3rd Qu.:278    
##  Max.   :2534   Max.   :120.00   Max.   :1758   Max.   :367    
##                                                                
##  TEAM_BATTING_3B  TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BASERUN_SB
##  Min.   : 11.00   Min.   :  5.0   Min.   :273.0   Min.   : 18.0  
##  1st Qu.: 31.00   1st Qu.: 76.0   1st Qu.:472.0   1st Qu.: 63.0  
##  Median : 41.00   Median :119.0   Median :525.0   Median : 92.0  
##  Mean   : 47.25   Mean   :115.9   Mean   :528.4   Mean   :102.1  
##  3rd Qu.: 60.00   3rd Qu.:156.0   3rd Qu.:586.0   3rd Qu.:132.0  
##  Max.   :118.00   Max.   :264.0   Max.   :770.0   Max.   :367.0  
##                                                                  
##  TEAM_BASERUN_CS  TEAM_PITCHING_H TEAM_PITCHING_BB TEAM_PITCHING_SO
##  Min.   : 11.00   Min.   :1168    Min.   :325.0    Min.   : 345.0  
##  1st Qu.: 38.00   1st Qu.:1404    1st Qu.:488.0    1st Qu.: 644.0  
##  Median : 50.00   Median :1486    Median :541.0    Median : 827.0  
##  Mean   : 53.49   Mean   :1513    Mean   :548.9    Mean   : 816.6  
##  3rd Qu.: 63.00   3rd Qu.:1586    3rd Qu.:605.5    3rd Qu.: 972.5  
##  Max.   :201.00   Max.   :2394    Max.   :807.0    Max.   :1481.0  
##  NA's   :253                                                       
##  TEAM_FIELDING_E TEAM_FIELDING_DP
##  Min.   : 65.0   Min.   : 83.0   
##  1st Qu.:121.0   1st Qu.:136.0   
##  Median :142.0   Median :151.0   
##  Mean   :159.3   Mean   :150.2   
##  3rd Qu.:181.0   3rd Qu.:165.0   
##  Max.   :397.0   Max.   :215.0   
## 
# using median to impute missing values for manually adjusted model

ourdataM$TEAM_PITCHING_SO[is.na(ourdataM$TEAM_PITCHING_SO)]<-827

ourdataM$TEAM_BASERUN_SB[is.na(ourdataM$TEAM_BASERUN_SB)]<-92

ourdataM$TEAM_FIELDING_DP[is.na(ourdataM$TEAM_FIELDING_DP)]<-151

ourdataM$TEAM_BASERUN_CS[is.na(ourdataM$TEAM_BASERUN_CS)]<-50
# using median to impute missing values for original model

ourdata$TEAM_PITCHING_SO[is.na(ourdata$TEAM_PITCHING_SO)]<-817.7

ourdata$TEAM_BASERUN_SB[is.na(ourdata$TEAM_BASERUN_SB)]<-101

ourdata$TEAM_FIELDING_DP[is.na(ourdata$TEAM_FIELDING_DP)]<-149

ourdata$TEAM_BASERUN_CS[is.na(ourdata$TEAM_BASERUN_CS)]<-49

ourdata$TEAM_BATTING_SO[is.na(ourdata$TEAM_BATTING_SO)]<-750
cor(ourdataM$TARGET_WINS,ourdataM$TEAM_BASERUN_CS, method="pearson", use = "complete.obs")
## [1] -0.02438445
cor(ourdataM$TARGET_WINS,ourdataM$TEAM_PITCHING_H, method="pearson", use = "complete.obs")
## [1] 0.2322221
cor(ourdataM$TARGET_WINS,ourdataM$TEAM_PITCHING_BB, method="pearson", use = "complete.obs")
## [1] 0.3021963
cor(ourdataM$TARGET_WINS,ourdataM$TEAM_PITCHING_SO, method="pearson", use = "complete.obs")
## [1] -0.05017687
cor(ourdataM$TARGET_WINS,ourdataM$TEAM_FIELDING_DP, method="pearson", use = "complete.obs")
## [1] -0.03810618
#For 4 variables correlation still does not make sense, corrupted variables; we will NA them

ourdataM$TEAM_PITCHING_BB<-NULL

ourdataM$TEAM_PITCHING_H<-NULL

ourdataM$TEAM_PITCHING_SO<-NULL

ourdataM$TEAM_FIELDING_DP<-NULL
head(ourdataM)
##   INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 3     3          86           1377             232              35
## 4     4          70           1387             209              38
## 5     5          82           1297             186              27
## 6     6          75           1279             200              36
## 7     7          80           1244             179              54
## 8     8          85           1273             171              37
##   TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BASERUN_SB TEAM_BASERUN_CS
## 3             137             602              46              27
## 4              96             451              43              30
## 5             102             472              49              39
## 6              92             443             107              59
## 7             122             525              80              54
## 8             115             456              40              36
##   TEAM_FIELDING_E
## 3             175
## 4             164
## 5             138
## 6             123
## 7             136
## 8             112
library(car)
## Loading required package: carData
ourLM <- lm(TARGET_WINS~ TEAM_BATTING_H+TEAM_BATTING_2B+TEAM_BATTING_3B+TEAM_BATTING_BB+TEAM_BASERUN_CS+TEAM_BASERUN_SB+TEAM_FIELDING_E, data=ourdataM)

summary(ourLM)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + 
##     TEAM_BATTING_3B + TEAM_BATTING_BB + TEAM_BASERUN_CS + TEAM_BASERUN_SB + 
##     TEAM_FIELDING_E, data = ourdataM)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -33.990  -7.131  -0.053   7.501  34.768 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      3.734616   4.928213   0.758    0.449    
## TEAM_BATTING_H   0.055283   0.004213  13.122  < 2e-16 ***
## TEAM_BATTING_2B -0.074178   0.009996  -7.421 2.00e-13 ***
## TEAM_BATTING_3B  0.150994   0.020923   7.217 8.64e-13 ***
## TEAM_BATTING_BB  0.039049   0.003695  10.569  < 2e-16 ***
## TEAM_BASERUN_CS -0.080828   0.015468  -5.226 2.00e-07 ***
## TEAM_BASERUN_SB  0.079308   0.006349  12.491  < 2e-16 ***
## TEAM_FIELDING_E -0.105254   0.007874 -13.368  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.65 on 1419 degrees of freedom
## Multiple R-squared:  0.3396, Adjusted R-squared:  0.3363 
## F-statistic: 104.2 on 7 and 1419 DF,  p-value: < 2.2e-16
plot(cooks.distance(ourLM))

plot(ourLM)

ourinf<-influence(ourLM)

sum(ourinf$hat)
## [1] 8
#install.packages("faraway")

library(faraway)
## Warning: package 'faraway' was built under R version 3.5.2
## 
## Attaching package: 'faraway'
## The following objects are masked from 'package:car':
## 
##     logit, vif
halfnorm(lm.influence(ourLM)$hat,labs=row.names(ourdataM),ylab="Leverages")

ourLMsum<-summary(ourLM)

ourLMsum$sig
## [1] 10.64642
stud<-residuals(ourLM)/(ourLMsum$sig*sqrt(1-ourinf$hat))

qqnorm(stud)

abline(0,1)

dim(ourdataM)
## [1] 1427   10
jack<-rstudent(ourLM)

jack[which.max(abs(jack))]
##     1622 
## 3.299737
# 2n/(n-1-p)

qt(.05/(1421*2),1421-1-8)
## [1] -4.150314
# no outliers
ourdataM[rownames(ourdataM)=='183'|rownames(ourdataM)=='1409',]<-NA

ourdataM<-ourdataM[!is.na(ourdataM$TARGET_WINS),]
LMTBH <- lm(TARGET_WINS~ TEAM_BATTING_H, data=ourdataM)

summary(LMTBH)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H, data = ourdataM)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -41.653  -8.536   0.310   8.712  37.032 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    17.783658   4.493969   3.957 7.96e-05 ***
## TEAM_BATTING_H  0.043213   0.003084  14.013  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.26 on 1423 degrees of freedom
## Multiple R-squared:  0.1213, Adjusted R-squared:  0.1206 
## F-statistic: 196.4 on 1 and 1423 DF,  p-value: < 2.2e-16
with(ourdataM,plot(TEAM_BATTING_H,TARGET_WINS))
 
abline(LMTBH)

LMTBH_R = rstandard(LMTBH)

plot(ourdataM$TEAM_BATTING_H, LMTBH_R, ylab="Standardized Residuals", xlab="BATTING HITS", main="WINS") 
abline(0, 0)         

LMTB2B <- lm(TARGET_WINS~ TEAM_BATTING_2B, data=ourdataM)

summary(LMTB2B)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_2B, data = ourdataM)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -40.588  -8.933   0.350   8.926  41.091 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     65.23438    1.97606  33.012  < 2e-16 ***
## TEAM_BATTING_2B  0.06216    0.00788   7.888 6.06e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.8 on 1423 degrees of freedom
## Multiple R-squared:  0.0419, Adjusted R-squared:  0.04122 
## F-statistic: 62.23 on 1 and 1423 DF,  p-value: 6.058e-15
with(ourdataM,plot(TEAM_BATTING_2B,TARGET_WINS))
abline(LMTB2B)

LMTB2B_R = rstandard(LMTB2B)

plot(ourdataM$TEAM_BATTING_2B, LMTB2B_R, ylab="Residuals", xlab="BATTING HITS 2 BASE", main="WINS") 
abline(0, 0)  

# QUDRATIC ?

LMTB3B <- lm(TARGET_WINS~ TEAM_BATTING_3B, data=ourdataM)

summary(LMTB3B)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_3B, data = ourdataM)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -40.675  -9.079   0.580   9.202  36.782 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     77.76037    0.83617  92.996  < 2e-16 ***
## TEAM_BATTING_3B  0.05993    0.01613   3.716  0.00021 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.02 on 1423 degrees of freedom
## Multiple R-squared:  0.00961,    Adjusted R-squared:  0.008914 
## F-statistic: 13.81 on 1 and 1423 DF,  p-value: 0.0002104
with(ourdataM,plot(TEAM_BATTING_3B,TARGET_WINS))
abline(LMTB3B)

LMTB3B_R = rstandard(LMTB3B)

plot(ourdataM$TEAM_BATTING_3B, LMTB3B_R, ylab="Residuals", xlab="BATTING HITS 3 BASE", main="WINS") 
abline(0, 0)  

LMTBHR <- lm(TARGET_WINS~ TEAM_BATTING_HR, data=ourdataM)

summary(LMTBHR)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_HR, data = ourdataM)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -41.028  -8.874   0.472   8.781  44.893 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     73.895440   0.790878  93.435   <2e-16 ***
## TEAM_BATTING_HR  0.057674   0.006166   9.354   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.7 on 1423 degrees of freedom
## Multiple R-squared:  0.05792,    Adjusted R-squared:  0.05726 
## F-statistic: 87.49 on 1 and 1423 DF,  p-value: < 2.2e-16
with(ourdataM,plot(TEAM_BATTING_HR,TARGET_WINS))
abline(LMTBHR)

LMTBHR_R = rstandard(LMTBHR)

plot(ourdataM$TEAM_BATTING_HR, LMTBHR_R, ylab="Residuals", xlab="BATTING HITS HOME RUNS", main="WINS") 
abline(0, 0) 

LMTBBB <- lm(TARGET_WINS~ TEAM_BATTING_BB, data=ourdataM)

summary(LMTBBB)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_BB, data = ourdataM)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -44.385  -8.188   0.188   8.799  44.144 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     54.087763   2.134672   25.34   <2e-16 ***
## TEAM_BATTING_BB  0.050157   0.003992   12.56   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.41 on 1423 degrees of freedom
## Multiple R-squared:  0.09987,    Adjusted R-squared:  0.09924 
## F-statistic: 157.9 on 1 and 1423 DF,  p-value: < 2.2e-16
with(ourdataM,plot(TEAM_BATTING_BB,TARGET_WINS))
abline(LMTBBB)

LMTBBB_R = rstandard(LMTBBB)

plot(ourdataM$TEAM_BATTING_BB, LMTBHR_R, ylab="Residuals", xlab="BATTING HITS HOME RUNS", main="WINS") 
abline(0, 0) 

LMTBSB <- lm(TARGET_WINS~ TEAM_BASERUN_SB, data=ourdataM)

summary(LMTBSB)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BASERUN_SB, data = ourdataM)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -39.200  -9.059   0.475   8.911  38.901 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     76.958747   0.738738  104.18  < 2e-16 ***
## TEAM_BASERUN_SB  0.035641   0.006421    5.55 3.39e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.94 on 1423 degrees of freedom
## Multiple R-squared:  0.02119,    Adjusted R-squared:  0.0205 
## F-statistic: 30.81 on 1 and 1423 DF,  p-value: 3.394e-08
with(ourdataM,plot(TEAM_BASERUN_SB,TARGET_WINS))
abline(LMTBSB)

LMTBSB_R = rstandard(LMTBSB)

plot(ourdataM$TEAM_BASERUN_SB, LMTBSB_R, ylab="Residuals", xlab="BATTING SB RUNS", main="WINS") 
abline(0, 0) 

LMTBFE <- lm(TARGET_WINS~ TEAM_FIELDING_E, data=ourdataM)

summary(LMTBFE)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_FIELDING_E, data = ourdataM)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -38.490  -8.879   0.508   8.848  44.835 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     87.286435   1.022156  85.394  < 2e-16 ***
## TEAM_FIELDING_E -0.042087   0.006058  -6.948 5.63e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.86 on 1423 degrees of freedom
## Multiple R-squared:  0.03281,    Adjusted R-squared:  0.03213 
## F-statistic: 48.27 on 1 and 1423 DF,  p-value: 5.625e-12
with(ourdataM,plot(TEAM_FIELDING_E,TARGET_WINS))
abline(LMTBFE)

LMTBFE_R = rstandard(LMTBFE)

plot(ourdataM$TEAM_FIELDING_E, LMTBFE_R, ylab="Residuals", xlab="FIELD ERRORS", main="WINS") 
abline(0, 0) 

head(ourdataM)
##   INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 3     3          86           1377             232              35
## 4     4          70           1387             209              38
## 5     5          82           1297             186              27
## 6     6          75           1279             200              36
## 7     7          80           1244             179              54
## 8     8          85           1273             171              37
##   TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BASERUN_SB TEAM_BASERUN_CS
## 3             137             602              46              27
## 4              96             451              43              30
## 5             102             472              49              39
## 6              92             443             107              59
## 7             122             525              80              54
## 8             115             456              40              36
##   TEAM_FIELDING_E
## 3             175
## 4             164
## 5             138
## 6             123
## 7             136
## 8             112

3 Build Model

powerTransform(cbind(ourdataM$TARGET_WINS,ourdataM$TEAM_BATTING_H,ourdataM$TEAM_BATTING_2B,ourdataM$TEAM_BATTING_3B,ourdataM$TEAM_BATTING_HR,ourdataM$TEAM_BATTING_BB,ourdataM$TEAM_BASERUN_CS,ourdataM$TEAM_BASERUN_SB,ourdataM$TEAM_FIELDING_E))
## Estimated transformation parameters 
##          Y1          Y2          Y3          Y4          Y5          Y6 
##  1.22735945  0.26192872  0.43644658  0.33568310  0.67141571  0.68145733 
##          Y7          Y8          Y9 
##  0.01584513  0.17057304 -0.29854402
ourdataM$TARGET_WINS_BC<-ourdataM$TARGET_WINS^1.23

ourdataM$TEAM_BATTING_H_BC<-ourdataM$TEAM_BATTING_H^0.26

ourdataM$TEAM_BATTING_2B_BC<-ourdataM$TEAM_BATTING_2B^0.44

ourdataM$TEAM_BATTING_3B_BC<-ourdataM$TEAM_BATTING_3B^0.34

ourdataM$TEAM_BATTING_HR_BC<-ourdataM$TEAM_BATTING_HR^0.67

ourdataM$TEAM_BATTING_BB_BC<-ourdataM$TEAM_BATTING_BB^0.68

ourdataM$TEAM_BASERUN_CS_BC<-ourdataM$TEAM_BASERUN_CS^0.02

ourdataM$TEAM_BASERUN_SB_BC<-ourdataM$TEAM_BASERUN_SB^0.17

ourdataM$TEAM_FIELDING_E_BC<-ourdataM$TEAM_FIELDING_E^(-0.3)
ourdataM[rownames(ourdataM)=='1622'|rownames(ourdataM)=='1352'|rownames(ourdataM)=='205',]<-NA

ourLM_BC <- lm(TARGET_WINS_BC~ TEAM_BATTING_H_BC+TEAM_BATTING_2B_BC+TEAM_BATTING_3B_BC+TEAM_BATTING_HR_BC+TEAM_BATTING_BB_BC+TEAM_BASERUN_CS_BC+TEAM_BASERUN_SB_BC+TEAM_FIELDING_E_BC, data=ourdataM)

summary(ourLM_BC)
## 
## Call:
## lm(formula = TARGET_WINS_BC ~ TEAM_BATTING_H_BC + TEAM_BATTING_2B_BC + 
##     TEAM_BATTING_3B_BC + TEAM_BATTING_HR_BC + TEAM_BATTING_BB_BC + 
##     TEAM_BASERUN_CS_BC + TEAM_BASERUN_SB_BC + TEAM_FIELDING_E_BC, 
##     data = ourdataM)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -101.470  -24.499   -0.183   24.388  105.146 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -411.7253   165.6148  -2.486    0.013 *  
## TEAM_BATTING_H_BC   146.1160    12.2780  11.901  < 2e-16 ***
## TEAM_BATTING_2B_BC  -14.3516     1.6789  -8.548  < 2e-16 ***
## TEAM_BATTING_3B_BC   25.0834     2.9529   8.494  < 2e-16 ***
## TEAM_BATTING_HR_BC    1.0721     0.2161   4.961 7.86e-07 ***
## TEAM_BATTING_BB_BC    1.3971     0.1365  10.233  < 2e-16 ***
## TEAM_BASERUN_CS_BC -626.9268   150.9332  -4.154 3.47e-05 ***
## TEAM_BASERUN_SB_BC   62.0789     5.9107  10.503  < 2e-16 ***
## TEAM_FIELDING_E_BC  692.0484    78.1164   8.859  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 35.48 on 1413 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.3451, Adjusted R-squared:  0.3414 
## F-statistic: 93.08 on 8 and 1413 DF,  p-value: < 2.2e-16
plot(ourLM_BC)

car::ncvTest(ourLM_BC)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 0.7091673, Df = 1, p = 0.39972
lmtest::bptest(ourLM_BC)
## 
##  studentized Breusch-Pagan test
## 
## data:  ourLM_BC
## BP = 32.658, df = 8, p-value = 7.098e-05
ourLM2 <- lm(TARGET_WINS~ ., data=ourdata)

summary(ourLM2)
## 
## Call:
## lm(formula = TARGET_WINS ~ ., data = ourdata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -49.786  -8.594   0.068   8.311  59.017 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      24.1263754  5.4055651   4.463 8.47e-06 ***
## INDEX            -0.0004470  0.0003764  -1.187  0.23521    
## TEAM_BATTING_H    0.0488248  0.0036952  13.213  < 2e-16 ***
## TEAM_BATTING_2B  -0.0209230  0.0091772  -2.280  0.02271 *  
## TEAM_BATTING_3B   0.0656207  0.0168303   3.899 9.94e-05 ***
## TEAM_BATTING_HR   0.0533970  0.0274931   1.942  0.05224 .  
## TEAM_BATTING_BB   0.0102557  0.0058396   1.756  0.07919 .  
## TEAM_BATTING_SO  -0.0083209  0.0025485  -3.265  0.00111 ** 
## TEAM_BASERUN_SB   0.0257675  0.0043656   5.902 4.12e-09 ***
## TEAM_BASERUN_CS  -0.0109832  0.0157825  -0.696  0.48656    
## TEAM_PITCHING_H  -0.0008249  0.0003678  -2.243  0.02499 *  
## TEAM_PITCHING_HR  0.0128205  0.0243873   0.526  0.59915    
## TEAM_PITCHING_BB  0.0006745  0.0041570   0.162  0.87112    
## TEAM_PITCHING_SO  0.0028316  0.0009219   3.071  0.00216 ** 
## TEAM_FIELDING_E  -0.0196272  0.0024621  -7.972 2.46e-15 ***
## TEAM_FIELDING_DP -0.1211301  0.0129512  -9.353  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.07 on 2260 degrees of freedom
## Multiple R-squared:  0.3158, Adjusted R-squared:  0.3113 
## F-statistic: 69.55 on 15 and 2260 DF,  p-value: < 2.2e-16
plot(ourLM2)

lmtest::bptest(ourLM2) 
## 
##  studentized Breusch-Pagan test
## 
## data:  ourLM2
## BP = 301.59, df = 15, p-value < 2.2e-16
car::ncvTest(ourLM2)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 22.62, Df = 1, p = 1.9742e-06
plot(cooks.distance(ourLM2))