ourdata<-read.csv(file="C:/HW/moneyball-training-data.csv")
# visual observation of top records
head(ourdata)
## INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 1 1 39 1445 194 39
## 2 2 70 1339 219 22
## 3 3 86 1377 232 35
## 4 4 70 1387 209 38
## 5 5 82 1297 186 27
## 6 6 75 1279 200 36
## TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## 1 13 143 842 NA
## 2 190 685 1075 37
## 3 137 602 917 46
## 4 96 451 922 43
## 5 102 472 920 49
## 6 92 443 973 107
## TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## 1 NA NA 9364 84
## 2 28 NA 1347 191
## 3 27 NA 1377 137
## 4 30 NA 1396 97
## 5 39 NA 1297 102
## 6 59 NA 1279 92
## TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 1 927 5456 1011 NA
## 2 689 1082 193 155
## 3 602 917 175 153
## 4 454 928 164 156
## 5 472 920 138 168
## 6 443 973 123 149
library(knitr)
# mean/median/IQR/NA
kable(summary(ourdata))
| INDEX | TARGET_WINS | TEAM_BATTING_H | TEAM_BATTING_2B | TEAM_BATTING_3B | TEAM_BATTING_HR | TEAM_BATTING_BB | TEAM_BATTING_SO | TEAM_BASERUN_SB | TEAM_BASERUN_CS | TEAM_BATTING_HBP | TEAM_PITCHING_H | TEAM_PITCHING_HR | TEAM_PITCHING_BB | TEAM_PITCHING_SO | TEAM_FIELDING_E | TEAM_FIELDING_DP | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Min. : 1.0 | Min. : 0.00 | Min. : 891 | Min. : 69.0 | Min. : 0.00 | Min. : 0.00 | Min. : 0.0 | Min. : 0.0 | Min. : 0.0 | Min. : 0.0 | Min. :29.00 | Min. : 1137 | Min. : 0.0 | Min. : 0.0 | Min. : 0.0 | Min. : 65.0 | Min. : 52.0 | |
| 1st Qu.: 630.8 | 1st Qu.: 71.00 | 1st Qu.:1383 | 1st Qu.:208.0 | 1st Qu.: 34.00 | 1st Qu.: 42.00 | 1st Qu.:451.0 | 1st Qu.: 548.0 | 1st Qu.: 66.0 | 1st Qu.: 38.0 | 1st Qu.:50.50 | 1st Qu.: 1419 | 1st Qu.: 50.0 | 1st Qu.: 476.0 | 1st Qu.: 615.0 | 1st Qu.: 127.0 | 1st Qu.:131.0 | |
| Median :1270.5 | Median : 82.00 | Median :1454 | Median :238.0 | Median : 47.00 | Median :102.00 | Median :512.0 | Median : 750.0 | Median :101.0 | Median : 49.0 | Median :58.00 | Median : 1518 | Median :107.0 | Median : 536.5 | Median : 813.5 | Median : 159.0 | Median :149.0 | |
| Mean :1268.5 | Mean : 80.79 | Mean :1469 | Mean :241.2 | Mean : 55.25 | Mean : 99.61 | Mean :501.6 | Mean : 735.6 | Mean :124.8 | Mean : 52.8 | Mean :59.36 | Mean : 1779 | Mean :105.7 | Mean : 553.0 | Mean : 817.7 | Mean : 246.5 | Mean :146.4 | |
| 3rd Qu.:1915.5 | 3rd Qu.: 92.00 | 3rd Qu.:1537 | 3rd Qu.:273.0 | 3rd Qu.: 72.00 | 3rd Qu.:147.00 | 3rd Qu.:580.0 | 3rd Qu.: 930.0 | 3rd Qu.:156.0 | 3rd Qu.: 62.0 | 3rd Qu.:67.00 | 3rd Qu.: 1682 | 3rd Qu.:150.0 | 3rd Qu.: 611.0 | 3rd Qu.: 968.0 | 3rd Qu.: 249.2 | 3rd Qu.:164.0 | |
| Max. :2535.0 | Max. :146.00 | Max. :2554 | Max. :458.0 | Max. :223.00 | Max. :264.00 | Max. :878.0 | Max. :1399.0 | Max. :697.0 | Max. :201.0 | Max. :95.00 | Max. :30132 | Max. :343.0 | Max. :3645.0 | Max. :19278.0 | Max. :1898.0 | Max. :228.0 | |
| NA | NA | NA | NA | NA | NA | NA | NA’s :102 | NA’s :131 | NA’s :772 | NA’s :2085 | NA | NA | NA | NA’s :102 | NA | NA’s :286 |
# histogram/density
# near-normal
par(mar=c(2,2,2,2))
par(mfrow = c(2,2))
hist(ourdata$TARGET_WINS)
plot(density(ourdata$TARGET_WINS))
# near-normal, long right-tail, potential outliers
hist(ourdata$TEAM_BATTING_H)
plot(density(ourdata$TEAM_BATTING_H))
# near-normal, some weirdness around the mean
hist(ourdata$TEAM_BATTING_2B)
plot(density(ourdata$TEAM_BATTING_2B))
# heavy right skewed, outliers(?)
hist(ourdata$TEAM_BATTING_3B)
plot(density(ourdata$TEAM_BATTING_3B))
# looks very weird, does not look right, potentially needs to be dropped
hist(ourdata$TEAM_BATTING_HR)
plot(density(ourdata$TEAM_BATTING_HR))
# left skewed, outliers(?)
hist(ourdata$TEAM_BATTING_BB)
plot(density(ourdata$TEAM_BATTING_BB))
# weird, potentially to be dropped
hist(ourdata$TEAM_BATTING_SO)
plot(density(ourdata$TEAM_BATTING_SO,na.rm = TRUE))
# right skewed, outliers(?)
hist(ourdata$TEAM_BASERUN_SB)
plot(density(ourdata$TEAM_BASERUN_SB,na.rm = TRUE))
# right skewed, outliers(?)
hist(ourdata$TEAM_BASERUN_CS)
plot(density(ourdata$TEAM_BASERUN_CS,na.rm = TRUE))
# right skewed, outliers(?)
hist(ourdata$TEAM_FIELDING_E)
plot(density(ourdata$TEAM_FIELDING_E,na.rm = TRUE))
# near-normal, slightly left skewed
hist(ourdata$TEAM_FIELDING_DP)
plot(density(ourdata$TEAM_FIELDING_DP,na.rm = TRUE))
# right skewed, outliers
hist(ourdata$TEAM_PITCHING_BB)
plot(density(ourdata$TEAM_PITCHING_BB,na.rm = TRUE))
# severily right skewed, to be dropped(?)
hist(ourdata$TEAM_PITCHING_H)
plot(density(ourdata$TEAM_PITCHING_H,na.rm = TRUE))
# weird, to be dropped(?)
hist(ourdata$TEAM_PITCHING_HR)
plot(density(ourdata$TEAM_PITCHING_HR,na.rm = TRUE))
# severily right skewed, to be dropped(?)
hist(ourdata$TEAM_PITCHING_SO)
plot(density(ourdata$TEAM_PITCHING_SO,na.rm = TRUE))
pairs(ourdata)
cor(ourdata, use="pairwise.complete.obs", method="pearson")
## INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B
## INDEX 1.0000000000 -0.02105643 -0.017920241 0.01118301
## TARGET_WINS -0.0210564349 1.00000000 0.388767521 0.28910365
## TEAM_BATTING_H -0.0179202413 0.38876752 1.000000000 0.56284968
## TEAM_BATTING_2B 0.0111830135 0.28910365 0.562849678 1.00000000
## TEAM_BATTING_3B -0.0058146834 0.14260841 0.427696575 -0.10730582
## TEAM_BATTING_HR 0.0514810468 0.17615320 -0.006544685 0.43539729
## TEAM_BATTING_BB -0.0265672362 0.23255986 -0.072464013 0.25572610
## TEAM_BATTING_SO 0.0814501106 -0.03175071 -0.463853571 0.16268519
## TEAM_BASERUN_SB 0.0402671277 0.13513892 0.123567797 -0.19975724
## TEAM_BASERUN_CS 0.0005653743 0.02240407 0.016705668 -0.09981406
## TEAM_BATTING_HBP 0.0771930266 0.07350424 -0.029112176 0.04608475
## TEAM_PITCHING_H 0.0171031479 -0.10993705 0.302693709 0.02369219
## TEAM_PITCHING_HR 0.0509858973 0.18901373 0.072853119 0.45455082
## TEAM_PITCHING_BB -0.0152875130 0.12417454 0.094193027 0.17805420
## TEAM_PITCHING_SO 0.0558901457 -0.07843609 -0.252656790 0.06479231
## TEAM_FIELDING_E -0.0092331265 -0.17648476 0.264902478 -0.23515099
## TEAM_FIELDING_DP 0.0200642919 -0.03485058 0.155383321 0.29087998
## TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB
## INDEX -0.005814683 0.051481047 -0.02656724
## TARGET_WINS 0.142608411 0.176153200 0.23255986
## TEAM_BATTING_H 0.427696575 -0.006544685 -0.07246401
## TEAM_BATTING_2B -0.107305824 0.435397293 0.25572610
## TEAM_BATTING_3B 1.000000000 -0.635566946 -0.28723584
## TEAM_BATTING_HR -0.635566946 1.000000000 0.51373481
## TEAM_BATTING_BB -0.287235841 0.513734810 1.00000000
## TEAM_BATTING_SO -0.669781188 0.727069348 0.37975087
## TEAM_BASERUN_SB 0.533506448 -0.453578426 -0.10511564
## TEAM_BASERUN_CS 0.348764919 -0.433793868 -0.13698837
## TEAM_BATTING_HBP -0.174247154 0.106181160 0.04746007
## TEAM_PITCHING_H 0.194879411 -0.250145481 -0.44977762
## TEAM_PITCHING_HR -0.567836679 0.969371396 0.45955207
## TEAM_PITCHING_BB -0.002224148 0.136927564 0.48936126
## TEAM_PITCHING_SO -0.258818931 0.184707564 -0.02075682
## TEAM_FIELDING_E 0.509778447 -0.587339098 -0.65597081
## TEAM_FIELDING_DP -0.323074847 0.448985348 0.43087675
## TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS
## INDEX 0.08145011 0.04026713 0.0005653743
## TARGET_WINS -0.03175071 0.13513892 0.0224040691
## TEAM_BATTING_H -0.46385357 0.12356780 0.0167056677
## TEAM_BATTING_2B 0.16268519 -0.19975724 -0.0998140593
## TEAM_BATTING_3B -0.66978119 0.53350645 0.3487649195
## TEAM_BATTING_HR 0.72706935 -0.45357843 -0.4337938681
## TEAM_BATTING_BB 0.37975087 -0.10511564 -0.1369883707
## TEAM_BATTING_SO 1.00000000 -0.25448923 -0.2178813684
## TEAM_BASERUN_SB -0.25448923 1.00000000 0.6552448036
## TEAM_BASERUN_CS -0.21788137 0.65524480 1.0000000000
## TEAM_BATTING_HBP 0.22094219 -0.06400498 -0.0705138958
## TEAM_PITCHING_H -0.37568637 0.07328505 -0.0520078089
## TEAM_PITCHING_HR 0.66717889 -0.41651072 -0.4225660463
## TEAM_PITCHING_BB 0.03700514 0.14641513 -0.1069612356
## TEAM_PITCHING_SO 0.41623330 -0.13712861 -0.2102227352
## TEAM_FIELDING_E -0.58466444 0.50963090 0.0483218940
## TEAM_FIELDING_DP 0.15488939 -0.49707763 -0.2142480076
## TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## INDEX 0.07719303 0.01710315 0.05098590
## TARGET_WINS 0.07350424 -0.10993705 0.18901373
## TEAM_BATTING_H -0.02911218 0.30269371 0.07285312
## TEAM_BATTING_2B 0.04608475 0.02369219 0.45455082
## TEAM_BATTING_3B -0.17424715 0.19487941 -0.56783668
## TEAM_BATTING_HR 0.10618116 -0.25014548 0.96937140
## TEAM_BATTING_BB 0.04746007 -0.44977762 0.45955207
## TEAM_BATTING_SO 0.22094219 -0.37568637 0.66717889
## TEAM_BASERUN_SB -0.06400498 0.07328505 -0.41651072
## TEAM_BASERUN_CS -0.07051390 -0.05200781 -0.42256605
## TEAM_BATTING_HBP 1.00000000 -0.02769699 0.10675878
## TEAM_PITCHING_H -0.02769699 1.00000000 -0.14161276
## TEAM_PITCHING_HR 0.10675878 -0.14161276 1.00000000
## TEAM_PITCHING_BB 0.04785137 0.32067616 0.22193750
## TEAM_PITCHING_SO 0.22157375 0.26724807 0.20588053
## TEAM_FIELDING_E 0.04178971 0.66775901 -0.49314447
## TEAM_FIELDING_DP -0.07120824 -0.22865059 0.43917040
## TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E
## INDEX -0.015287513 0.05589015 -0.009233126
## TARGET_WINS 0.124174536 -0.07843609 -0.176484759
## TEAM_BATTING_H 0.094193027 -0.25265679 0.264902478
## TEAM_BATTING_2B 0.178054204 0.06479231 -0.235150986
## TEAM_BATTING_3B -0.002224148 -0.25881893 0.509778447
## TEAM_BATTING_HR 0.136927564 0.18470756 -0.587339098
## TEAM_BATTING_BB 0.489361263 -0.02075682 -0.655970815
## TEAM_BATTING_SO 0.037005141 0.41623330 -0.584664436
## TEAM_BASERUN_SB 0.146415134 -0.13712861 0.509630902
## TEAM_BASERUN_CS -0.106961236 -0.21022274 0.048321894
## TEAM_BATTING_HBP 0.047851371 0.22157375 0.041789712
## TEAM_PITCHING_H 0.320676162 0.26724807 0.667759010
## TEAM_PITCHING_HR 0.221937505 0.20588053 -0.493144466
## TEAM_PITCHING_BB 1.000000000 0.48849865 -0.022837561
## TEAM_PITCHING_SO 0.488498653 1.00000000 -0.023291783
## TEAM_FIELDING_E -0.022837561 -0.02329178 1.000000000
## TEAM_FIELDING_DP 0.324457226 0.02615804 -0.497684954
## TEAM_FIELDING_DP
## INDEX 0.02006429
## TARGET_WINS -0.03485058
## TEAM_BATTING_H 0.15538332
## TEAM_BATTING_2B 0.29087998
## TEAM_BATTING_3B -0.32307485
## TEAM_BATTING_HR 0.44898535
## TEAM_BATTING_BB 0.43087675
## TEAM_BATTING_SO 0.15488939
## TEAM_BASERUN_SB -0.49707763
## TEAM_BASERUN_CS -0.21424801
## TEAM_BATTING_HBP -0.07120824
## TEAM_PITCHING_H -0.22865059
## TEAM_PITCHING_HR 0.43917040
## TEAM_PITCHING_BB 0.32445723
## TEAM_PITCHING_SO 0.02615804
## TEAM_FIELDING_E -0.49768495
## TEAM_FIELDING_DP 1.00000000
# very high correlation for Home Runs - very suspicious - looks wrong
temp <- lm(TEAM_BATTING_HR~ TEAM_PITCHING_HR, data=ourdata)
with(ourdata,plot(TEAM_PITCHING_HR,TEAM_BATTING_HR))
abline(temp)
# some data is missing
sum(ourdata$TEAM_BATTING_H)
## [1] 3344058
sum(ourdata$TEAM_PITCHING_H)
## [1] 4049483
sum(ourdata$TEAM_BATTING_BB)
## [1] 1141548
sum(ourdata$TEAM_PITCHING_BB)
## [1] 1258646
ourdata$losses<-162-ourdata$TARGET_WINS
sum(ourdata$TARGET_WINS)
## [1] 183880
sum(ourdata$losses)
## [1] 184832
ourdata$losses<-NULL
Problems with the data
Number of wins and number of losses is not the same, so we do not have complete data; which still could be ok for our analysis
No strong correlations between dependent and independent variables
TEAM_BATTING_HR and TEAM_PITCHING_HR are almost identical , correlation between them 93%. It does not make sense. Their distribution look very weird too. Possible we need to drop both variables.
TEAM_PITCHING_H correlates positively, while it should have negative correlation. If we use the variable we need to explain why we use variable that is counter-intuitive.has a lot of outliers. Maximum value is 30,132 or 191 per game, which is not possible (heavily right skewed) It needs to be fixed or dropped.
TEAM_BASERUN_CS correlates positively (actually close to zero), while it should have negative correlation. Right-skewed with potential outliers. If we use the variable we need to explain why we use variable that is counter-intuitive.
TEAM_BASERUN_HBP has too much missing data. It needs to be dropped.
TEAM_PITCHING_BB correlates positively, while it should have negative correlation. If we use the variable we need to explain why we use variable that is counter-intuitive.. It has a lot of outliers, which seem to skew the data. Maybe it could be used after we fix it.
TEAM_PITCHING_SO correlates negatively, while it should have positive correlation. If we use the variable we need to explain why we use variable that is counter-intuitive.. It has a lot of outliers, which seem to skew the data. Maybe it could be used after we fix it.
TEAM_FIELDING_DP correlates negatively (close to zero), while it should have positive correlation. If we use the variable we need to explain why we use variable that is counter-intuitive.
Strikeout numbers both for Batting and Pitching look very low. Distributions look weird, skewed.
# dropping bad variable
ourdata$TEAM_BATTING_HBP<-NULL
# We will keep one dataset the way it is, and we will create a copy of it to manually adjust data
ourdataM<-ourdata
# dropping pitching Home Runs
ourdataM$TEAM_PITCHING_HR<-NULL
# dropping outliers
82-(92-71)*2
## [1] 40
82+(92-71)*2
## [1] 124
# 116 Wins was the maximum, so to use 124 as cut off is very conservative; https://www.mlb.com/news/most-mlb-wins-in-a-season/c-289159676
# The minimum in more resent years was 40; http://writing.jmpressley.net/sports/worstseasons.html
ourdataM<-ourdataM[!(ourdata$TARGET_WINS<40),]
ourdataM<-ourdataM[!(ourdata$TARGET_WINS>124),]
# I use 2018 data as reference; https://www.baseball-reference.com/leagues/MLB/2018.shtml
1454-(1537-1383)*2
## [1] 1146
1454+(1537-1383)*2
## [1] 1762
ourdataM<-ourdataM[!(ourdataM$TEAM_BATTING_H<1146),]
ourdataM<-ourdataM[!(ourdataM$TEAM_BATTING_H>1762),]
238-(273-208)*2
## [1] 108
238+(273-208)*2
## [1] 368
ourdataM<-ourdataM[!(ourdataM$TEAM_BATTING_2B<108),]
ourdataM<-ourdataM[!(ourdataM$TEAM_BATTING_2B>368),]
47-(72-34)*2
## [1] -29
47+(72-34)*2
## [1] 123
ourdataM<-ourdataM[!(ourdataM$TEAM_BATTING_3B>123),]
# Home Run historical data indicates 3 was the minumum ever; http://www.baseball-almanac.com/recbooks/rb_hr7.shtml
ourdataM<-ourdataM[!(ourdataM$TEAM_BATTING_HR<3),]
512-(580-451)*2
## [1] 254
512+(580-451)*2
## [1] 770
ourdataM<-ourdataM[!(ourdataM$TEAM_BATTING_BB<254),]
ourdataM<-ourdataM[!(ourdataM$TEAM_BATTING_BB>770),]
# TEAM_BATTING_SO is very inconsistent with the 2018 reference data; I would drop the variable
ourdataM$TEAM_BATTING_SO<-NULL
101-(156-56)*2
## [1] -99
101+(156-56)*2
## [1] 301
ourdataM<-ourdataM[!(ourdata$TEAM_BASERUN_SB>301),]
1518-(1682-1419)*2
## [1] 992
1518+(1682-1419)*2
## [1] 2044
ourdataM<-ourdataM[!(ourdata$TEAM_PITCHING_H<992),]
ourdataM<-ourdataM[!(ourdata$TEAM_PITCHING_H>2044),]
536.5-(611-476)*2
## [1] 266.5
536.5+(611-476)*2
## [1] 806.5
ourdataM<-ourdataM[!(ourdataM$TEAM_PITCHING_BB<266),]
ourdataM<-ourdataM[!(ourdataM$TEAM_PITCHING_BB>807),]
813.5-(968-615)*2
## [1] 107.5
813.5+(968-615)*2
## [1] 1519.5
ourdataM<-ourdataM[!(ourdataM$TEAM_PITCHING_SO<107),]
ourdataM<-ourdataM[!(ourdataM$TEAM_PITCHING_SO>1520),]
159-(249.2-127)*2
## [1] -85.4
159+(249.2-127)*2
## [1] 403.4
ourdataM<-ourdataM[!(ourdataM$TEAM_FIELDING_E>404),]
149-(164-131)*2
## [1] 83
149+(164-131)*2
## [1] 215
ourdataM<-ourdataM[!(ourdataM$TEAM_FIELDING_DP>215),]
ourdataM<-ourdataM[!(ourdataM$TEAM_FIELDING_DP<83),]
ourdataM<-ourdataM[!is.na(ourdataM$INDEX),]
summary(ourdataM)
## INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B
## Min. : 3 Min. : 40.00 Min. :1168 Min. :130
## 1st Qu.: 671 1st Qu.: 72.00 1st Qu.:1381 1st Qu.:215
## Median :1270 Median : 81.00 Median :1446 Median :244
## Mean :1254 Mean : 80.58 Mean :1453 Mean :247
## 3rd Qu.:1862 3rd Qu.: 90.00 3rd Qu.:1523 3rd Qu.:278
## Max. :2534 Max. :120.00 Max. :1758 Max. :367
##
## TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BASERUN_SB
## Min. : 11.00 Min. : 5.0 Min. :273.0 Min. : 18.0
## 1st Qu.: 31.00 1st Qu.: 76.0 1st Qu.:472.0 1st Qu.: 63.0
## Median : 41.00 Median :119.0 Median :525.0 Median : 92.0
## Mean : 47.25 Mean :115.9 Mean :528.4 Mean :102.1
## 3rd Qu.: 60.00 3rd Qu.:156.0 3rd Qu.:586.0 3rd Qu.:132.0
## Max. :118.00 Max. :264.0 Max. :770.0 Max. :367.0
##
## TEAM_BASERUN_CS TEAM_PITCHING_H TEAM_PITCHING_BB TEAM_PITCHING_SO
## Min. : 11.00 Min. :1168 Min. :325.0 Min. : 345.0
## 1st Qu.: 38.00 1st Qu.:1404 1st Qu.:488.0 1st Qu.: 644.0
## Median : 50.00 Median :1486 Median :541.0 Median : 827.0
## Mean : 53.49 Mean :1513 Mean :548.9 Mean : 816.6
## 3rd Qu.: 63.00 3rd Qu.:1586 3rd Qu.:605.5 3rd Qu.: 972.5
## Max. :201.00 Max. :2394 Max. :807.0 Max. :1481.0
## NA's :253
## TEAM_FIELDING_E TEAM_FIELDING_DP
## Min. : 65.0 Min. : 83.0
## 1st Qu.:121.0 1st Qu.:136.0
## Median :142.0 Median :151.0
## Mean :159.3 Mean :150.2
## 3rd Qu.:181.0 3rd Qu.:165.0
## Max. :397.0 Max. :215.0
##
# using median to impute missing values for manually adjusted model
ourdataM$TEAM_PITCHING_SO[is.na(ourdataM$TEAM_PITCHING_SO)]<-827
ourdataM$TEAM_BASERUN_SB[is.na(ourdataM$TEAM_BASERUN_SB)]<-92
ourdataM$TEAM_FIELDING_DP[is.na(ourdataM$TEAM_FIELDING_DP)]<-151
ourdataM$TEAM_BASERUN_CS[is.na(ourdataM$TEAM_BASERUN_CS)]<-50
# using median to impute missing values for original model
ourdata$TEAM_PITCHING_SO[is.na(ourdata$TEAM_PITCHING_SO)]<-817.7
ourdata$TEAM_BASERUN_SB[is.na(ourdata$TEAM_BASERUN_SB)]<-101
ourdata$TEAM_FIELDING_DP[is.na(ourdata$TEAM_FIELDING_DP)]<-149
ourdata$TEAM_BASERUN_CS[is.na(ourdata$TEAM_BASERUN_CS)]<-49
ourdata$TEAM_BATTING_SO[is.na(ourdata$TEAM_BATTING_SO)]<-750
cor(ourdataM$TARGET_WINS,ourdataM$TEAM_BASERUN_CS, method="pearson", use = "complete.obs")
## [1] -0.02438445
cor(ourdataM$TARGET_WINS,ourdataM$TEAM_PITCHING_H, method="pearson", use = "complete.obs")
## [1] 0.2322221
cor(ourdataM$TARGET_WINS,ourdataM$TEAM_PITCHING_BB, method="pearson", use = "complete.obs")
## [1] 0.3021963
cor(ourdataM$TARGET_WINS,ourdataM$TEAM_PITCHING_SO, method="pearson", use = "complete.obs")
## [1] -0.05017687
cor(ourdataM$TARGET_WINS,ourdataM$TEAM_FIELDING_DP, method="pearson", use = "complete.obs")
## [1] -0.03810618
#For 4 variables correlation still does not make sense, corrupted variables; we will NA them
ourdataM$TEAM_PITCHING_BB<-NULL
ourdataM$TEAM_PITCHING_H<-NULL
ourdataM$TEAM_PITCHING_SO<-NULL
ourdataM$TEAM_FIELDING_DP<-NULL
head(ourdataM)
## INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 3 3 86 1377 232 35
## 4 4 70 1387 209 38
## 5 5 82 1297 186 27
## 6 6 75 1279 200 36
## 7 7 80 1244 179 54
## 8 8 85 1273 171 37
## TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BASERUN_SB TEAM_BASERUN_CS
## 3 137 602 46 27
## 4 96 451 43 30
## 5 102 472 49 39
## 6 92 443 107 59
## 7 122 525 80 54
## 8 115 456 40 36
## TEAM_FIELDING_E
## 3 175
## 4 164
## 5 138
## 6 123
## 7 136
## 8 112
library(car)
## Loading required package: carData
ourLM <- lm(TARGET_WINS~ TEAM_BATTING_H+TEAM_BATTING_2B+TEAM_BATTING_3B+TEAM_BATTING_BB+TEAM_BASERUN_CS+TEAM_BASERUN_SB+TEAM_FIELDING_E, data=ourdataM)
summary(ourLM)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B +
## TEAM_BATTING_3B + TEAM_BATTING_BB + TEAM_BASERUN_CS + TEAM_BASERUN_SB +
## TEAM_FIELDING_E, data = ourdataM)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33.990 -7.131 -0.053 7.501 34.768
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.734616 4.928213 0.758 0.449
## TEAM_BATTING_H 0.055283 0.004213 13.122 < 2e-16 ***
## TEAM_BATTING_2B -0.074178 0.009996 -7.421 2.00e-13 ***
## TEAM_BATTING_3B 0.150994 0.020923 7.217 8.64e-13 ***
## TEAM_BATTING_BB 0.039049 0.003695 10.569 < 2e-16 ***
## TEAM_BASERUN_CS -0.080828 0.015468 -5.226 2.00e-07 ***
## TEAM_BASERUN_SB 0.079308 0.006349 12.491 < 2e-16 ***
## TEAM_FIELDING_E -0.105254 0.007874 -13.368 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.65 on 1419 degrees of freedom
## Multiple R-squared: 0.3396, Adjusted R-squared: 0.3363
## F-statistic: 104.2 on 7 and 1419 DF, p-value: < 2.2e-16
plot(cooks.distance(ourLM))
plot(ourLM)
ourinf<-influence(ourLM)
sum(ourinf$hat)
## [1] 8
#install.packages("faraway")
library(faraway)
## Warning: package 'faraway' was built under R version 3.5.2
##
## Attaching package: 'faraway'
## The following objects are masked from 'package:car':
##
## logit, vif
halfnorm(lm.influence(ourLM)$hat,labs=row.names(ourdataM),ylab="Leverages")
ourLMsum<-summary(ourLM)
ourLMsum$sig
## [1] 10.64642
stud<-residuals(ourLM)/(ourLMsum$sig*sqrt(1-ourinf$hat))
qqnorm(stud)
abline(0,1)
dim(ourdataM)
## [1] 1427 10
jack<-rstudent(ourLM)
jack[which.max(abs(jack))]
## 1622
## 3.299737
# 2n/(n-1-p)
qt(.05/(1421*2),1421-1-8)
## [1] -4.150314
# no outliers
ourdataM[rownames(ourdataM)=='183'|rownames(ourdataM)=='1409',]<-NA
ourdataM<-ourdataM[!is.na(ourdataM$TARGET_WINS),]
LMTBH <- lm(TARGET_WINS~ TEAM_BATTING_H, data=ourdataM)
summary(LMTBH)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H, data = ourdataM)
##
## Residuals:
## Min 1Q Median 3Q Max
## -41.653 -8.536 0.310 8.712 37.032
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.783658 4.493969 3.957 7.96e-05 ***
## TEAM_BATTING_H 0.043213 0.003084 14.013 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.26 on 1423 degrees of freedom
## Multiple R-squared: 0.1213, Adjusted R-squared: 0.1206
## F-statistic: 196.4 on 1 and 1423 DF, p-value: < 2.2e-16
with(ourdataM,plot(TEAM_BATTING_H,TARGET_WINS))
abline(LMTBH)
LMTBH_R = rstandard(LMTBH)
plot(ourdataM$TEAM_BATTING_H, LMTBH_R, ylab="Standardized Residuals", xlab="BATTING HITS", main="WINS")
abline(0, 0)
LMTB2B <- lm(TARGET_WINS~ TEAM_BATTING_2B, data=ourdataM)
summary(LMTB2B)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_2B, data = ourdataM)
##
## Residuals:
## Min 1Q Median 3Q Max
## -40.588 -8.933 0.350 8.926 41.091
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 65.23438 1.97606 33.012 < 2e-16 ***
## TEAM_BATTING_2B 0.06216 0.00788 7.888 6.06e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.8 on 1423 degrees of freedom
## Multiple R-squared: 0.0419, Adjusted R-squared: 0.04122
## F-statistic: 62.23 on 1 and 1423 DF, p-value: 6.058e-15
with(ourdataM,plot(TEAM_BATTING_2B,TARGET_WINS))
abline(LMTB2B)
LMTB2B_R = rstandard(LMTB2B)
plot(ourdataM$TEAM_BATTING_2B, LMTB2B_R, ylab="Residuals", xlab="BATTING HITS 2 BASE", main="WINS")
abline(0, 0)
# QUDRATIC ?
LMTB3B <- lm(TARGET_WINS~ TEAM_BATTING_3B, data=ourdataM)
summary(LMTB3B)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_3B, data = ourdataM)
##
## Residuals:
## Min 1Q Median 3Q Max
## -40.675 -9.079 0.580 9.202 36.782
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 77.76037 0.83617 92.996 < 2e-16 ***
## TEAM_BATTING_3B 0.05993 0.01613 3.716 0.00021 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.02 on 1423 degrees of freedom
## Multiple R-squared: 0.00961, Adjusted R-squared: 0.008914
## F-statistic: 13.81 on 1 and 1423 DF, p-value: 0.0002104
with(ourdataM,plot(TEAM_BATTING_3B,TARGET_WINS))
abline(LMTB3B)
LMTB3B_R = rstandard(LMTB3B)
plot(ourdataM$TEAM_BATTING_3B, LMTB3B_R, ylab="Residuals", xlab="BATTING HITS 3 BASE", main="WINS")
abline(0, 0)
LMTBHR <- lm(TARGET_WINS~ TEAM_BATTING_HR, data=ourdataM)
summary(LMTBHR)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_HR, data = ourdataM)
##
## Residuals:
## Min 1Q Median 3Q Max
## -41.028 -8.874 0.472 8.781 44.893
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 73.895440 0.790878 93.435 <2e-16 ***
## TEAM_BATTING_HR 0.057674 0.006166 9.354 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.7 on 1423 degrees of freedom
## Multiple R-squared: 0.05792, Adjusted R-squared: 0.05726
## F-statistic: 87.49 on 1 and 1423 DF, p-value: < 2.2e-16
with(ourdataM,plot(TEAM_BATTING_HR,TARGET_WINS))
abline(LMTBHR)
LMTBHR_R = rstandard(LMTBHR)
plot(ourdataM$TEAM_BATTING_HR, LMTBHR_R, ylab="Residuals", xlab="BATTING HITS HOME RUNS", main="WINS")
abline(0, 0)
LMTBBB <- lm(TARGET_WINS~ TEAM_BATTING_BB, data=ourdataM)
summary(LMTBBB)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_BB, data = ourdataM)
##
## Residuals:
## Min 1Q Median 3Q Max
## -44.385 -8.188 0.188 8.799 44.144
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 54.087763 2.134672 25.34 <2e-16 ***
## TEAM_BATTING_BB 0.050157 0.003992 12.56 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.41 on 1423 degrees of freedom
## Multiple R-squared: 0.09987, Adjusted R-squared: 0.09924
## F-statistic: 157.9 on 1 and 1423 DF, p-value: < 2.2e-16
with(ourdataM,plot(TEAM_BATTING_BB,TARGET_WINS))
abline(LMTBBB)
LMTBBB_R = rstandard(LMTBBB)
plot(ourdataM$TEAM_BATTING_BB, LMTBHR_R, ylab="Residuals", xlab="BATTING HITS HOME RUNS", main="WINS")
abline(0, 0)
LMTBSB <- lm(TARGET_WINS~ TEAM_BASERUN_SB, data=ourdataM)
summary(LMTBSB)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BASERUN_SB, data = ourdataM)
##
## Residuals:
## Min 1Q Median 3Q Max
## -39.200 -9.059 0.475 8.911 38.901
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 76.958747 0.738738 104.18 < 2e-16 ***
## TEAM_BASERUN_SB 0.035641 0.006421 5.55 3.39e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.94 on 1423 degrees of freedom
## Multiple R-squared: 0.02119, Adjusted R-squared: 0.0205
## F-statistic: 30.81 on 1 and 1423 DF, p-value: 3.394e-08
with(ourdataM,plot(TEAM_BASERUN_SB,TARGET_WINS))
abline(LMTBSB)
LMTBSB_R = rstandard(LMTBSB)
plot(ourdataM$TEAM_BASERUN_SB, LMTBSB_R, ylab="Residuals", xlab="BATTING SB RUNS", main="WINS")
abline(0, 0)
LMTBFE <- lm(TARGET_WINS~ TEAM_FIELDING_E, data=ourdataM)
summary(LMTBFE)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_FIELDING_E, data = ourdataM)
##
## Residuals:
## Min 1Q Median 3Q Max
## -38.490 -8.879 0.508 8.848 44.835
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 87.286435 1.022156 85.394 < 2e-16 ***
## TEAM_FIELDING_E -0.042087 0.006058 -6.948 5.63e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.86 on 1423 degrees of freedom
## Multiple R-squared: 0.03281, Adjusted R-squared: 0.03213
## F-statistic: 48.27 on 1 and 1423 DF, p-value: 5.625e-12
with(ourdataM,plot(TEAM_FIELDING_E,TARGET_WINS))
abline(LMTBFE)
LMTBFE_R = rstandard(LMTBFE)
plot(ourdataM$TEAM_FIELDING_E, LMTBFE_R, ylab="Residuals", xlab="FIELD ERRORS", main="WINS")
abline(0, 0)
head(ourdataM)
## INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 3 3 86 1377 232 35
## 4 4 70 1387 209 38
## 5 5 82 1297 186 27
## 6 6 75 1279 200 36
## 7 7 80 1244 179 54
## 8 8 85 1273 171 37
## TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BASERUN_SB TEAM_BASERUN_CS
## 3 137 602 46 27
## 4 96 451 43 30
## 5 102 472 49 39
## 6 92 443 107 59
## 7 122 525 80 54
## 8 115 456 40 36
## TEAM_FIELDING_E
## 3 175
## 4 164
## 5 138
## 6 123
## 7 136
## 8 112
powerTransform(cbind(ourdataM$TARGET_WINS,ourdataM$TEAM_BATTING_H,ourdataM$TEAM_BATTING_2B,ourdataM$TEAM_BATTING_3B,ourdataM$TEAM_BATTING_HR,ourdataM$TEAM_BATTING_BB,ourdataM$TEAM_BASERUN_CS,ourdataM$TEAM_BASERUN_SB,ourdataM$TEAM_FIELDING_E))
## Estimated transformation parameters
## Y1 Y2 Y3 Y4 Y5 Y6
## 1.22735945 0.26192872 0.43644658 0.33568310 0.67141571 0.68145733
## Y7 Y8 Y9
## 0.01584513 0.17057304 -0.29854402
ourdataM$TARGET_WINS_BC<-ourdataM$TARGET_WINS^1.23
ourdataM$TEAM_BATTING_H_BC<-ourdataM$TEAM_BATTING_H^0.26
ourdataM$TEAM_BATTING_2B_BC<-ourdataM$TEAM_BATTING_2B^0.44
ourdataM$TEAM_BATTING_3B_BC<-ourdataM$TEAM_BATTING_3B^0.34
ourdataM$TEAM_BATTING_HR_BC<-ourdataM$TEAM_BATTING_HR^0.67
ourdataM$TEAM_BATTING_BB_BC<-ourdataM$TEAM_BATTING_BB^0.68
ourdataM$TEAM_BASERUN_CS_BC<-ourdataM$TEAM_BASERUN_CS^0.02
ourdataM$TEAM_BASERUN_SB_BC<-ourdataM$TEAM_BASERUN_SB^0.17
ourdataM$TEAM_FIELDING_E_BC<-ourdataM$TEAM_FIELDING_E^(-0.3)
ourdataM[rownames(ourdataM)=='1622'|rownames(ourdataM)=='1352'|rownames(ourdataM)=='205',]<-NA
ourLM_BC <- lm(TARGET_WINS_BC~ TEAM_BATTING_H_BC+TEAM_BATTING_2B_BC+TEAM_BATTING_3B_BC+TEAM_BATTING_HR_BC+TEAM_BATTING_BB_BC+TEAM_BASERUN_CS_BC+TEAM_BASERUN_SB_BC+TEAM_FIELDING_E_BC, data=ourdataM)
summary(ourLM_BC)
##
## Call:
## lm(formula = TARGET_WINS_BC ~ TEAM_BATTING_H_BC + TEAM_BATTING_2B_BC +
## TEAM_BATTING_3B_BC + TEAM_BATTING_HR_BC + TEAM_BATTING_BB_BC +
## TEAM_BASERUN_CS_BC + TEAM_BASERUN_SB_BC + TEAM_FIELDING_E_BC,
## data = ourdataM)
##
## Residuals:
## Min 1Q Median 3Q Max
## -101.470 -24.499 -0.183 24.388 105.146
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -411.7253 165.6148 -2.486 0.013 *
## TEAM_BATTING_H_BC 146.1160 12.2780 11.901 < 2e-16 ***
## TEAM_BATTING_2B_BC -14.3516 1.6789 -8.548 < 2e-16 ***
## TEAM_BATTING_3B_BC 25.0834 2.9529 8.494 < 2e-16 ***
## TEAM_BATTING_HR_BC 1.0721 0.2161 4.961 7.86e-07 ***
## TEAM_BATTING_BB_BC 1.3971 0.1365 10.233 < 2e-16 ***
## TEAM_BASERUN_CS_BC -626.9268 150.9332 -4.154 3.47e-05 ***
## TEAM_BASERUN_SB_BC 62.0789 5.9107 10.503 < 2e-16 ***
## TEAM_FIELDING_E_BC 692.0484 78.1164 8.859 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35.48 on 1413 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.3451, Adjusted R-squared: 0.3414
## F-statistic: 93.08 on 8 and 1413 DF, p-value: < 2.2e-16
plot(ourLM_BC)
car::ncvTest(ourLM_BC)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 0.7091673, Df = 1, p = 0.39972
lmtest::bptest(ourLM_BC)
##
## studentized Breusch-Pagan test
##
## data: ourLM_BC
## BP = 32.658, df = 8, p-value = 7.098e-05
ourLM2 <- lm(TARGET_WINS~ ., data=ourdata)
summary(ourLM2)
##
## Call:
## lm(formula = TARGET_WINS ~ ., data = ourdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.786 -8.594 0.068 8.311 59.017
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.1263754 5.4055651 4.463 8.47e-06 ***
## INDEX -0.0004470 0.0003764 -1.187 0.23521
## TEAM_BATTING_H 0.0488248 0.0036952 13.213 < 2e-16 ***
## TEAM_BATTING_2B -0.0209230 0.0091772 -2.280 0.02271 *
## TEAM_BATTING_3B 0.0656207 0.0168303 3.899 9.94e-05 ***
## TEAM_BATTING_HR 0.0533970 0.0274931 1.942 0.05224 .
## TEAM_BATTING_BB 0.0102557 0.0058396 1.756 0.07919 .
## TEAM_BATTING_SO -0.0083209 0.0025485 -3.265 0.00111 **
## TEAM_BASERUN_SB 0.0257675 0.0043656 5.902 4.12e-09 ***
## TEAM_BASERUN_CS -0.0109832 0.0157825 -0.696 0.48656
## TEAM_PITCHING_H -0.0008249 0.0003678 -2.243 0.02499 *
## TEAM_PITCHING_HR 0.0128205 0.0243873 0.526 0.59915
## TEAM_PITCHING_BB 0.0006745 0.0041570 0.162 0.87112
## TEAM_PITCHING_SO 0.0028316 0.0009219 3.071 0.00216 **
## TEAM_FIELDING_E -0.0196272 0.0024621 -7.972 2.46e-15 ***
## TEAM_FIELDING_DP -0.1211301 0.0129512 -9.353 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.07 on 2260 degrees of freedom
## Multiple R-squared: 0.3158, Adjusted R-squared: 0.3113
## F-statistic: 69.55 on 15 and 2260 DF, p-value: < 2.2e-16
plot(ourLM2)
lmtest::bptest(ourLM2)
##
## studentized Breusch-Pagan test
##
## data: ourLM2
## BP = 301.59, df = 15, p-value < 2.2e-16
car::ncvTest(ourLM2)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 22.62, Df = 1, p = 1.9742e-06
plot(cooks.distance(ourLM2))