##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Warning: package 'corrplot' was built under R version 4.0.2
## corrplot 0.84 loaded
Overview of Data
## INDEX TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B TEAM_BATTING_HR
## 1 9 1209 170 33 83
## 2 10 1221 151 29 88
## 3 14 1395 183 29 93
## 4 47 1539 309 29 159
## 5 60 1445 203 68 5
## 6 63 1431 236 53 10
## TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS
## 1 447 1080 62 50
## 2 516 929 54 39
## 3 509 816 59 47
## 4 486 914 148 57
## 5 95 416 NA NA
## 6 215 377 NA NA
## TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR TEAM_PITCHING_BB
## 1 NA 1209 83 447
## 2 NA 1221 88 516
## 3 NA 1395 93 509
## 4 42 1539 159 486
## 5 NA 3902 14 257
## 6 NA 2793 20 420
## TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 1 1080 140 156
## 2 929 135 164
## 3 816 156 153
## 4 914 124 154
## 5 1123 616 130
## 6 736 572 105
## INDEX TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## Min. : 9 Min. : 819 Min. : 44.0 Min. : 14.00
## 1st Qu.: 708 1st Qu.:1387 1st Qu.:210.0 1st Qu.: 35.00
## Median :1249 Median :1455 Median :239.0 Median : 52.00
## Mean :1264 Mean :1469 Mean :241.3 Mean : 55.91
## 3rd Qu.:1832 3rd Qu.:1548 3rd Qu.:278.5 3rd Qu.: 72.00
## Max. :2525 Max. :2170 Max. :376.0 Max. :155.00
##
## TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## Min. : 0.00 Min. : 15.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 44.50 1st Qu.:436.5 1st Qu.: 545.0 1st Qu.: 59.0
## Median :101.00 Median :509.0 Median : 686.0 Median : 92.0
## Mean : 95.63 Mean :499.0 Mean : 709.3 Mean :123.7
## 3rd Qu.:135.50 3rd Qu.:565.5 3rd Qu.: 912.0 3rd Qu.:151.8
## Max. :242.00 Max. :792.0 Max. :1268.0 Max. :580.0
## NA's :18 NA's :13
## TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## Min. : 0.00 Min. :42.00 Min. : 1155 Min. : 0.0
## 1st Qu.: 38.00 1st Qu.:53.50 1st Qu.: 1426 1st Qu.: 52.0
## Median : 49.50 Median :62.00 Median : 1515 Median :104.0
## Mean : 52.32 Mean :62.37 Mean : 1813 Mean :102.1
## 3rd Qu.: 63.00 3rd Qu.:67.50 3rd Qu.: 1681 3rd Qu.:142.5
## Max. :154.00 Max. :96.00 Max. :22768 Max. :336.0
## NA's :87 NA's :240
## TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## Min. : 136.0 Min. : 0.0 Min. : 73.0 Min. : 69.0
## 1st Qu.: 471.0 1st Qu.: 613.0 1st Qu.: 131.0 1st Qu.:131.0
## Median : 526.0 Median : 745.0 Median : 163.0 Median :148.0
## Mean : 552.4 Mean : 799.7 Mean : 249.7 Mean :146.1
## 3rd Qu.: 606.5 3rd Qu.: 938.0 3rd Qu.: 252.0 3rd Qu.:164.0
## Max. :2008.0 Max. :9963.0 Max. :1568.0 Max. :204.0
## NA's :18 NA's :31
# Load the training data; the relative path is resolved against the working
# directory.
money_b_train <- read.csv(file = "moneyball-training-data.csv")
# Preview the first six rows to check how the columns were read.
head(money_b_train, n = 6L)
## INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 1 1 39 1445 194 39
## 2 2 70 1339 219 22
## 3 3 86 1377 232 35
## 4 4 70 1387 209 38
## 5 5 82 1297 186 27
## 6 6 75 1279 200 36
## TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## 1 13 143 842 NA
## 2 190 685 1075 37
## 3 137 602 917 46
## 4 96 451 922 43
## 5 102 472 920 49
## 6 92 443 973 107
## TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## 1 NA NA 9364 84
## 2 28 NA 1347 191
## 3 27 NA 1377 137
## 4 30 NA 1396 97
## 5 39 NA 1297 102
## 6 59 NA 1279 92
## TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 1 927 5456 1011 NA
## 2 689 1082 193 155
## 3 602 917 175 153
## 4 454 928 164 156
## 5 472 920 138 168
## 6 443 973 123 149
## INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B
## Min. : 1.0 Min. : 0.00 Min. : 891 Min. : 69.0
## 1st Qu.: 630.8 1st Qu.: 71.00 1st Qu.:1383 1st Qu.:208.0
## Median :1270.5 Median : 82.00 Median :1454 Median :238.0
## Mean :1268.5 Mean : 80.79 Mean :1469 Mean :241.2
## 3rd Qu.:1915.5 3rd Qu.: 92.00 3rd Qu.:1537 3rd Qu.:273.0
## Max. :2535.0 Max. :146.00 Max. :2554 Max. :458.0
##
## TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO
## Min. : 0.00 Min. : 0.00 Min. : 0.0 Min. : 0.0
## 1st Qu.: 34.00 1st Qu.: 42.00 1st Qu.:451.0 1st Qu.: 548.0
## Median : 47.00 Median :102.00 Median :512.0 Median : 750.0
## Mean : 55.25 Mean : 99.61 Mean :501.6 Mean : 735.6
## 3rd Qu.: 72.00 3rd Qu.:147.00 3rd Qu.:580.0 3rd Qu.: 930.0
## Max. :223.00 Max. :264.00 Max. :878.0 Max. :1399.0
## NA's :102
## TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H
## Min. : 0.0 Min. : 0.0 Min. :29.00 Min. : 1137
## 1st Qu.: 66.0 1st Qu.: 38.0 1st Qu.:50.50 1st Qu.: 1419
## Median :101.0 Median : 49.0 Median :58.00 Median : 1518
## Mean :124.8 Mean : 52.8 Mean :59.36 Mean : 1779
## 3rd Qu.:156.0 3rd Qu.: 62.0 3rd Qu.:67.00 3rd Qu.: 1682
## Max. :697.0 Max. :201.0 Max. :95.00 Max. :30132
## NA's :131 NA's :772 NA's :2085
## TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 65.0
## 1st Qu.: 50.0 1st Qu.: 476.0 1st Qu.: 615.0 1st Qu.: 127.0
## Median :107.0 Median : 536.5 Median : 813.5 Median : 159.0
## Mean :105.7 Mean : 553.0 Mean : 817.7 Mean : 246.5
## 3rd Qu.:150.0 3rd Qu.: 611.0 3rd Qu.: 968.0 3rd Qu.: 249.2
## Max. :343.0 Max. :3645.0 Max. :19278.0 Max. :1898.0
## NA's :102
## TEAM_FIELDING_DP
## Min. : 52.0
## 1st Qu.:131.0
## Median :149.0
## Mean :146.4
## 3rd Qu.:164.0
## Max. :228.0
## NA's :286
## 'data.frame': 2276 obs. of 17 variables:
## $ INDEX : int 1 2 3 4 5 6 7 8 11 12 ...
## $ TARGET_WINS : int 39 70 86 70 82 75 80 85 86 76 ...
## $ TEAM_BATTING_H : int 1445 1339 1377 1387 1297 1279 1244 1273 1391 1271 ...
## $ TEAM_BATTING_2B : int 194 219 232 209 186 200 179 171 197 213 ...
## $ TEAM_BATTING_3B : int 39 22 35 38 27 36 54 37 40 18 ...
## $ TEAM_BATTING_HR : int 13 190 137 96 102 92 122 115 114 96 ...
## $ TEAM_BATTING_BB : int 143 685 602 451 472 443 525 456 447 441 ...
## $ TEAM_BATTING_SO : int 842 1075 917 922 920 973 1062 1027 922 827 ...
## $ TEAM_BASERUN_SB : int NA 37 46 43 49 107 80 40 69 72 ...
## $ TEAM_BASERUN_CS : int NA 28 27 30 39 59 54 36 27 34 ...
## $ TEAM_BATTING_HBP: int NA NA NA NA NA NA NA NA NA NA ...
## $ TEAM_PITCHING_H : int 9364 1347 1377 1396 1297 1279 1244 1281 1391 1271 ...
## $ TEAM_PITCHING_HR: int 84 191 137 97 102 92 122 116 114 96 ...
## $ TEAM_PITCHING_BB: int 927 689 602 454 472 443 525 459 447 441 ...
## $ TEAM_PITCHING_SO: int 5456 1082 917 928 920 973 1062 1033 922 827 ...
## $ TEAM_FIELDING_E : int 1011 193 175 164 138 123 136 112 127 131 ...
## $ TEAM_FIELDING_DP: int NA 155 153 156 168 149 186 136 169 159 ...
This data set consists of 2276 observations of 17 numeric variables describing baseball wins.
- INDEX
- TARGET_WINS
- TEAM_BATTING_H
- TEAM_BATTING_2B
- TEAM_BATTING_3B
- TEAM_BATTING_HR
- TEAM_BATTING_BB
- TEAM_BATTING_SO
- TEAM_BASERUN_SB
- TEAM_BASERUN_CS
- TEAM_BATTING_HBP
- TEAM_PITCHING_H
- TEAM_PITCHING_HR
- TEAM_PITCHING_BB
- TEAM_PITCHING_SO
- TEAM_FIELDING_E
- TEAM_FIELDING_DP
- INDEX
Hypothesis
Our hypothesis predicts that :
- TEAM_BATTING_H Base Hits by batters (1B,2B,3B,HR) - Positive Impact on Wins
- TEAM_BATTING_2B Doubles by batters (2B) - Positive Impact on Wins
- TEAM_BATTING_3B Triples by batters (3B) - Positive Impact on Wins
- TEAM_BATTING_HR Homeruns by batters (4B) - Positive Impact on Wins
- TEAM_BATTING_BB Walks by batters - Positive Impact on Wins
- TEAM_BATTING_HBP Batters hit by pitch (get a free base) - Positive Impact on Wins
- TEAM_BATTING_SO Strikeouts by batters - Negative Impact on Wins
- TEAM_BASERUN_SB Stolen bases - Positive Impact on Wins
- TEAM_BASERUN_CS Caught stealing - Negative Impact on Wins
- TEAM_FIELDING_E Errors - Negative Impact on Wins
- TEAM_FIELDING_DP Double Plays - Positive Impact on Wins
- TEAM_PITCHING_BB Walks allowed - Negative Impact on Wins
- TEAM_PITCHING_H Hits allowed - Negative Impact on Wins
- TEAM_PITCHING_HR Homeruns allowed - Negative Impact on Wins
- TEAM_PITCHING_SO Strikeouts by pitchers - Positive Impact on Wins
# Plot the distribution of the response, TARGET_WINS, before fitting a model.
hist(
  money_b_train[["TARGET_WINS"]],
  xlab = "TARGET_WINS",
  main = ""
)
Model 1
# Model 1: regress TARGET_WINS on all fifteen candidate predictors (every
# column except INDEX), then print the fit summary.
model <- lm(
  TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
    TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO +
    TEAM_BASERUN_SB + TEAM_BASERUN_CS + TEAM_BATTING_HBP +
    TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB +
    TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP,
  data = money_b_train
)
summary(model)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B +
## TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO +
## TEAM_BASERUN_SB + TEAM_BASERUN_CS + TEAM_BATTING_HBP + TEAM_PITCHING_H +
## TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO +
## TEAM_FIELDING_E + TEAM_FIELDING_DP, data = money_b_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -58.041 -8.558 0.145 8.907 53.331
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 28.2930631 4.5824723 6.174 7.86e-10 ***
## TEAM_BATTING_H 0.0429317 0.0034842 12.322 < 2e-16 ***
## TEAM_BATTING_2B -0.0042264 0.0097037 -0.436 0.663209
## TEAM_BATTING_3B 0.0649817 0.0170796 3.805 0.000146 ***
## TEAM_BATTING_HR 0.0703822 0.0282035 2.496 0.012648 *
## TEAM_BATTING_BB 0.0022019 0.0059003 0.373 0.709047
## TEAM_BATTING_SO -0.0101940 0.0020913 -4.875 1.17e-06 ***
## TEAM_BASERUN_SB 0.0042336 0.0042081 1.006 0.314494
## TEAM_BASERUN_CS -0.0077185 0.0109760 -0.703 0.481993
## TEAM_BATTING_HBP -0.0542327 0.0194784 -2.784 0.005410 **
## TEAM_PITCHING_H -0.0008096 0.0003800 -2.130 0.033238 *
## TEAM_PITCHING_HR -0.0025561 0.0249180 -0.103 0.918304
## TEAM_PITCHING_BB 0.0028645 0.0042178 0.679 0.497119
## TEAM_PITCHING_SO 0.0019800 0.0009422 2.101 0.035710 *
## TEAM_FIELDING_E -0.0286814 0.0030151 -9.512 < 2e-16 ***
## TEAM_FIELDING_DP -0.0657273 0.0101980 -6.445 1.41e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.23 on 2260 degrees of freedom
## Multiple R-squared: 0.299, Adjusted R-squared: 0.2944
## F-statistic: 64.27 on 15 and 2260 DF, p-value: < 2.2e-16
Backward elimination process
We use backward elimination: at each step we remove the predictor with the highest p-value, as long as that p-value is greater than 0.05, and refit the model. We stop once every remaining predictor has a p-value below 0.05.
# Backward elimination, step 1: remove TEAM_PITCHING_HR, the predictor with the
# highest p-value in the full model (0.918), and refit on the same data.
model <- update(model, . ~ . - TEAM_PITCHING_HR, data = money_b_train)
summary(model)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B +
## TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO +
## TEAM_BASERUN_SB + TEAM_BASERUN_CS + TEAM_BATTING_HBP + TEAM_PITCHING_H +
## TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP,
## data = money_b_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -58.042 -8.547 0.145 8.864 53.323
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 28.3391588 4.5593894 6.216 6.07e-10 ***
## TEAM_BATTING_H 0.0429009 0.0034704 12.362 < 2e-16 ***
## TEAM_BATTING_2B -0.0042423 0.0097003 -0.437 0.661908
## TEAM_BATTING_3B 0.0647784 0.0169605 3.819 0.000137 ***
## TEAM_BATTING_HR 0.0676580 0.0094944 7.126 1.38e-12 ***
## TEAM_BATTING_BB 0.0024621 0.0053262 0.462 0.643941
## TEAM_BATTING_SO -0.0102303 0.0020605 -4.965 7.39e-07 ***
## TEAM_BASERUN_SB 0.0042559 0.0042016 1.013 0.311203
## TEAM_BASERUN_CS -0.0077347 0.0109725 -0.705 0.480933
## TEAM_BATTING_HBP -0.0539519 0.0192808 -2.798 0.005182 **
## TEAM_PITCHING_H -0.0008071 0.0003792 -2.129 0.033385 *
## TEAM_PITCHING_BB 0.0026460 0.0036397 0.727 0.467318
## TEAM_PITCHING_SO 0.0020109 0.0008924 2.253 0.024335 *
## TEAM_FIELDING_E -0.0286897 0.0030134 -9.521 < 2e-16 ***
## TEAM_FIELDING_DP -0.0657165 0.0101952 -6.446 1.40e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.23 on 2261 degrees of freedom
## Multiple R-squared: 0.299, Adjusted R-squared: 0.2947
## F-statistic: 68.89 on 14 and 2261 DF, p-value: < 2.2e-16
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_3B +
## TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB +
## TEAM_BASERUN_CS + TEAM_BATTING_HBP + TEAM_PITCHING_H + TEAM_PITCHING_BB +
## TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP, data = money_b_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57.881 -8.564 0.157 8.871 52.974
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 28.9383318 4.3479010 6.656 3.53e-11 ***
## TEAM_BATTING_H 0.0419016 0.0026116 16.044 < 2e-16 ***
## TEAM_BATTING_3B 0.0653117 0.0169136 3.861 0.000116 ***
## TEAM_BATTING_HR 0.0678105 0.0094863 7.148 1.18e-12 ***
## TEAM_BATTING_BB 0.0023148 0.0053146 0.436 0.663204
## TEAM_BATTING_SO -0.0103433 0.0020439 -5.060 4.52e-07 ***
## TEAM_BASERUN_SB 0.0043889 0.0041898 1.048 0.294972
## TEAM_BASERUN_CS -0.0077472 0.0109705 -0.706 0.480143
## TEAM_BATTING_HBP -0.0563987 0.0184480 -3.057 0.002260 **
## TEAM_PITCHING_H -0.0008085 0.0003791 -2.133 0.033042 *
## TEAM_PITCHING_BB 0.0026416 0.0036391 0.726 0.467973
## TEAM_PITCHING_SO 0.0019677 0.0008868 2.219 0.026592 *
## TEAM_FIELDING_E -0.0285535 0.0029967 -9.528 < 2e-16 ***
## TEAM_FIELDING_DP -0.0660580 0.0101634 -6.500 9.88e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.23 on 2262 degrees of freedom
## Multiple R-squared: 0.2989, Adjusted R-squared: 0.2949
## F-statistic: 74.2 on 13 and 2262 DF, p-value: < 2.2e-16
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_3B +
## TEAM_BATTING_HR + TEAM_BATTING_SO + TEAM_BASERUN_SB + TEAM_BASERUN_CS +
## TEAM_BATTING_HBP + TEAM_PITCHING_H + TEAM_PITCHING_BB + TEAM_PITCHING_SO +
## TEAM_FIELDING_E + TEAM_FIELDING_DP, data = money_b_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57.964 -8.518 0.118 8.918 52.990
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.5329754 4.1272489 7.156 1.12e-12 ***
## TEAM_BATTING_H 0.0419019 0.0026111 16.047 < 2e-16 ***
## TEAM_BATTING_3B 0.0652171 0.0169092 3.857 0.000118 ***
## TEAM_BATTING_HR 0.0686535 0.0092851 7.394 2.00e-13 ***
## TEAM_BATTING_SO -0.0101948 0.0020150 -5.060 4.54e-07 ***
## TEAM_BASERUN_SB 0.0045692 0.0041685 1.096 0.273145
## TEAM_BASERUN_CS -0.0075804 0.0109618 -0.692 0.489303
## TEAM_BATTING_HBP -0.0567545 0.0184266 -3.080 0.002095 **
## TEAM_PITCHING_H -0.0008720 0.0003499 -2.492 0.012759 *
## TEAM_PITCHING_BB 0.0038763 0.0022814 1.699 0.089443 .
## TEAM_PITCHING_SO 0.0017785 0.0007730 2.301 0.021494 *
## TEAM_FIELDING_E -0.0288608 0.0029119 -9.911 < 2e-16 ***
## TEAM_FIELDING_DP -0.0659700 0.0101596 -6.493 1.03e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.22 on 2263 degrees of freedom
## Multiple R-squared: 0.2989, Adjusted R-squared: 0.2952
## F-statistic: 80.39 on 12 and 2263 DF, p-value: < 2.2e-16
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_3B +
## TEAM_BATTING_HR + TEAM_BATTING_SO + TEAM_BASERUN_SB + TEAM_BATTING_HBP +
## TEAM_PITCHING_H + TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E +
## TEAM_FIELDING_DP, data = money_b_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57.994 -8.566 0.139 8.920 53.075
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.9071741 4.0911511 7.310 3.68e-13 ***
## TEAM_BATTING_H 0.0416877 0.0025924 16.081 < 2e-16 ***
## TEAM_BATTING_3B 0.0660782 0.0168613 3.919 9.16e-05 ***
## TEAM_BATTING_HR 0.0693627 0.0092272 7.517 8.02e-14 ***
## TEAM_BATTING_SO -0.0105859 0.0019337 -5.474 4.88e-08 ***
## TEAM_BASERUN_SB 0.0041471 0.0041231 1.006 0.31461
## TEAM_BATTING_HBP -0.0549311 0.0182349 -3.012 0.00262 **
## TEAM_PITCHING_H -0.0009103 0.0003454 -2.635 0.00846 **
## TEAM_PITCHING_BB 0.0039810 0.0022761 1.749 0.08042 .
## TEAM_PITCHING_SO 0.0018089 0.0007717 2.344 0.01915 *
## TEAM_FIELDING_E -0.0286461 0.0028950 -9.895 < 2e-16 ***
## TEAM_FIELDING_DP -0.0674800 0.0099210 -6.802 1.32e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.22 on 2264 degrees of freedom
## Multiple R-squared: 0.2987, Adjusted R-squared: 0.2953
## F-statistic: 87.68 on 11 and 2264 DF, p-value: < 2.2e-16
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_3B +
## TEAM_BATTING_HR + TEAM_BATTING_SO + TEAM_BATTING_HBP + TEAM_PITCHING_H +
## TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP,
## data = money_b_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -58.356 -8.522 0.150 8.918 52.512
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.8812953 3.9748749 7.769 1.19e-14 ***
## TEAM_BATTING_H 0.0416515 0.0025922 16.068 < 2e-16 ***
## TEAM_BATTING_3B 0.0684400 0.0166970 4.099 4.30e-05 ***
## TEAM_BATTING_HR 0.0680528 0.0091349 7.450 1.32e-13 ***
## TEAM_BATTING_SO -0.0103713 0.0019219 -5.396 7.51e-08 ***
## TEAM_BATTING_HBP -0.0544337 0.0182282 -2.986 0.00285 **
## TEAM_PITCHING_H -0.0009219 0.0003452 -2.670 0.00763 **
## TEAM_PITCHING_BB 0.0046295 0.0021829 2.121 0.03405 *
## TEAM_PITCHING_SO 0.0016404 0.0007532 2.178 0.02953 *
## TEAM_FIELDING_E -0.0294204 0.0027908 -10.542 < 2e-16 ***
## TEAM_FIELDING_DP -0.0721842 0.0087494 -8.250 2.65e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.22 on 2265 degrees of freedom
## Multiple R-squared: 0.2984, Adjusted R-squared: 0.2953
## F-statistic: 96.35 on 10 and 2265 DF, p-value: < 2.2e-16
The intercept of our final model is 30.8812953, which is the predicted number of wins when every predictor equals 0. Approximately 30% of the variation in Target Wins can be explained by our model (multiple R-squared 0.2984, adjusted R-squared 0.2953). The F-statistic tells us whether there is a relationship between the dependent variable and the independent variables taken together. Generally, a larger F is stronger evidence of such a relationship; here we have 96.35 on 10 and 2265 degrees of freedom (p-value < 2.2e-16).
Our residuals look pretty symmetrical around 0, suggesting that our model fits the data well.
We observed that many of the points fit close to the line.
Conclusion
- Our conclusion, comparing each hypothesis with the coefficients of the final model:
- TEAM_BATTING_H Base Hits by batters (1B,2B,3B,HR) - Positive Impact on Wins Confirmed
- TEAM_BATTING_2B Doubles by batters (2B) - Positive Impact on Wins Not confirmed (removed during backward elimination, p-value > 0.05)
- TEAM_BATTING_3B Triples by batters (3B) - Positive Impact on Wins Confirmed
- TEAM_BATTING_HR Homeruns by batters (4B) - Positive Impact on Wins Confirmed
- TEAM_BATTING_BB Walks by batters - Positive Impact on Wins Not confirmed (removed during backward elimination, p-value > 0.05)
- TEAM_BATTING_HBP Batters hit by pitch (get a free base) - Positive Impact on Wins Disconfirmed (significant negative coefficient, -0.0544)
- TEAM_BATTING_SO Strikeouts by batters - Negative Impact on Wins Confirmed
- TEAM_BASERUN_SB Stolen bases - Positive Impact on Wins Not confirmed (removed during backward elimination, p-value > 0.05)
- TEAM_BASERUN_CS Caught stealing - Negative Impact on Wins Not confirmed (removed during backward elimination, p-value > 0.05)
- TEAM_FIELDING_E Errors - Negative Impact on Wins Confirmed
- TEAM_FIELDING_DP Double Plays - Positive Impact on Wins Disconfirmed (significant negative coefficient, -0.0722)
- TEAM_PITCHING_BB Walks allowed - Negative Impact on Wins Disconfirmed (significant positive coefficient, 0.0046)
- TEAM_PITCHING_H Hits allowed - Negative Impact on Wins Confirmed
- TEAM_PITCHING_HR Homeruns allowed - Negative Impact on Wins Not confirmed (removed during backward elimination, p-value > 0.05)
- TEAM_PITCHING_SO Strikeouts by pitchers - Positive Impact on Wins Confirmed
“TEAM_BATTING_H”, “TEAM_BATTING_3B”, “TEAM_BATTING_HR”, “TEAM_BATTING_SO”, “TEAM_BATTING_HBP”, “TEAM_PITCHING_H”, “TEAM_PITCHING_BB”, “TEAM_PITCHING_SO”, “TEAM_FIELDING_E” and “TEAM_FIELDING_DP” are the significant variables (p-value < 0.05) retained to predict target wins. “TEAM_BATTING_2B”, “TEAM_BATTING_BB”, “TEAM_BASERUN_SB”, “TEAM_BASERUN_CS” and “TEAM_PITCHING_HR” were not significant and were removed by backward elimination.
Prediction
Using the model we formed, we will predict the target wins for our evaluation data.
## [1] "INDEX" "TEAM_BATTING_H" "TEAM_BATTING_2B" "TEAM_BATTING_3B"
## [5] "TEAM_BATTING_HR" "TEAM_BATTING_BB" "TEAM_BATTING_SO" "TEAM_BASERUN_SB"
## [9] "TEAM_BASERUN_CS" "TEAM_BATTING_HBP" "TEAM_PITCHING_H" "TEAM_PITCHING_HR"
## [13] "TEAM_PITCHING_BB" "TEAM_PITCHING_SO" "TEAM_FIELDING_E" "TEAM_FIELDING_DP"
# Drop the five predictors that backward elimination removed from Model 1
# (their p-values were above 0.05); the remaining columns include every
# predictor the final model uses.
new_eval_model <- select(
  money_b_eval,
  -TEAM_PITCHING_HR, -TEAM_BATTING_2B, -TEAM_BATTING_BB,
  -TEAM_BASERUN_CS, -TEAM_BASERUN_SB
)
# Replace missing values with 0, presumably so that predict() yields a value
# for every row.
# NOTE(review): 0 is an arbitrary fill for count statistics such as
# TEAM_BATTING_HBP - confirm it matches how the training data was imputed.
new_eval_model[is.na(new_eval_model)] <- 0
# Predicted TARGET_WINS for each row of the evaluation data.
prediction_model <- predict(model, newdata = new_eval_model)
prediction_model
## 1 2 3 4 5 6 7 8
## 65.29066 67.05335 75.61094 83.58844 63.67519 67.05147 79.19122 71.82463
## 9 10 11 12 13 14 15 16
## 69.16859 75.83207 72.85894 85.86148 85.12034 85.10176 84.27875 79.95146
## 17 18 19 20 21 22 23 24
## 73.94082 77.55507 74.53160 93.23044 83.53652 86.22625 83.02278 76.22636
## 25 26 27 28 29 30 31 32
## 80.86718 83.07934 55.98740 77.97675 84.45316 78.42339 91.19233 86.82518
## 33 34 35 36 37 38 39 40
## 84.66657 86.55304 82.97853 86.90949 77.26532 91.69608 87.43666 92.88114
## 41 42 43 44 45 46 47 48
## 85.30480 90.80702 30.92441 94.02767 88.60586 93.81935 98.29944 73.75140
## 49 50 51 52 53 54 55 56
## 68.74731 78.75908 80.73946 86.98164 78.66985 76.60196 76.37108 79.80282
## 57 58 59 60 61 62 63 64
## 82.72422 70.91085 65.76220 79.90202 85.05061 76.37925 86.51602 83.86209
## 65 66 67 68 69 70 71 72
## 79.34097 96.32018 73.25331 79.75918 76.17212 79.46304 89.46515 75.15088
## 73 74 75 76 77 78 79 80
## 81.31482 90.27151 83.56028 85.59852 79.77346 82.02732 75.28789 78.90232
## 81 82 83 84 85 86 87 88
## 88.67058 91.46127 99.98607 79.50832 85.85703 82.59919 80.95092 83.43583
## 89 90 91 92 93 94 95 96
## 84.49936 88.60315 81.04518 83.14081 72.12435 85.15947 82.37573 82.05240
## 97 98 99 100 101 102 103 104
## 80.56066 100.73643 87.81227 87.32391 83.35173 76.72292 86.80641 84.59432
## 105 106 107 108 109 110 111 112
## 79.90443 68.64333 58.75003 76.67873 83.35499 66.96303 82.54775 82.64571
## 113 114 115 116 117 118 119 120
## 90.44238 90.70932 82.42676 80.10387 86.60076 77.60082 74.92640 74.46265
## 121 122 123 124 125 126 127 128
## 94.11966 69.79301 73.43003 71.03893 67.27271 89.39861 93.26195 77.00327
## 129 130 131 132 133 134 135 136
## 92.94760 95.25139 87.78804 80.46972 76.60287 82.46187 82.76507 65.95787
## 137 138 139 140 141 142 143 144
## 75.29018 79.95943 81.42159 79.07409 63.94306 74.73422 91.59832 77.07120
## 145 146 147 148 149 150 151 152
## 74.38314 76.10073 78.46125 79.81763 82.32677 84.42143 83.97449 80.39391
## 153 154 155 156 157 158 159 160
## 34.56344 73.39020 76.25927 70.03222 84.42043 71.77533 86.26074 74.84888
## 161 162 163 164 165 166 167 168
## 103.35588 103.08979 93.55366 104.01778 97.81376 92.49966 84.31036 83.72956
## 169 170 171 172 173 174 175 176
## 73.49488 82.14438 88.25983 82.83997 82.79442 93.14520 85.39982 76.46767
## 177 178 179 180 181 182 183 184
## 78.47613 75.04096 77.59930 78.70224 76.81687 85.54835 83.64993 82.72803
## 185 186 187 188 189 190 191 192
## 88.79406 87.21167 87.88620 59.82733 67.42666 112.68146 73.76840 82.90580
## 193 194 195 196 197 198 199 200
## 78.10002 80.51338 82.96158 70.59017 79.43493 84.55084 81.08498 85.39992
## 201 202 203 204 205 206 207 208
## 77.91625 81.13866 75.02271 86.77097 79.49386 79.66930 79.62775 78.41551
## 209 210 211 212 213 214 215 216
## 83.94179 75.94934 107.15578 98.85732 79.92905 68.66213 73.80250 86.78465
## 217 218 219 220 221 222 223 224
## 82.85373 87.33769 77.98820 77.48902 79.20992 74.20021 79.70004 72.39399
## 225 226 227 228 229 230 231 232
## 85.88911 75.22900 81.09733 76.31865 78.62687 76.02780 78.43372 92.01634
## 233 234 235 236 237 238 239 240
## 81.42248 89.05693 79.72622 74.38382 80.57963 78.15124 95.25373 71.82234
## 241 242 243 244 245 246 247 248
## 90.11123 89.48959 85.29521 82.99218 60.68792 87.90637 81.36588 85.17542
## 249 250 251 252 253 254 255 256
## 76.66661 79.72225 80.48652 54.90579 93.33442 49.92541 70.11177 76.12061
## 257 258 259
## 78.41965 79.00451 80.66990
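A negative predicted value for target wins would not be a meaningful win total; none of the predictions above are negative. Overall the predictions fall in a plausible range (about 31 to 113 wins), although the model explains only about 30% of the variation in target wins.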
Model 2
# Model 2: leave out the six variables with many missing values in the
# training data:
# TEAM_FIELDING_DP, TEAM_PITCHING_SO, TEAM_BATTING_HBP, TEAM_BASERUN_CS,
# TEAM_BASERUN_SB, TEAM_BATTING_SO
money_b_train2 <- select(
  money_b_train,
  TARGET_WINS, TEAM_BATTING_H, TEAM_BATTING_2B, TEAM_BATTING_3B,
  TEAM_BATTING_HR, TEAM_BATTING_BB, TEAM_PITCHING_H, TEAM_PITCHING_HR,
  TEAM_PITCHING_BB, TEAM_FIELDING_E
)
# Starting fit for backward elimination: all nine remaining predictors.
# NOTE(review): this fit reads money_b_train rather than money_b_train2; the
# formula names only columns kept in money_b_train2 - confirm which data frame
# is intended.
model2 <- lm(
  TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
    TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_PITCHING_H +
    TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_FIELDING_E,
  data = money_b_train
)
summary(model2)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B +
## TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_PITCHING_H +
## TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_FIELDING_E, data = money_b_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -54.423 -8.867 0.115 8.887 55.548
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.738568 3.511940 1.919 0.055140 .
## TEAM_BATTING_H 0.048908 0.003251 15.045 < 2e-16 ***
## TEAM_BATTING_2B -0.026239 0.009073 -2.892 0.003865 **
## TEAM_BATTING_3B 0.102433 0.016734 6.121 1.09e-09 ***
## TEAM_BATTING_HR 0.057039 0.026548 2.149 0.031778 *
## TEAM_BATTING_BB -0.001320 0.004840 -0.273 0.785147
## TEAM_PITCHING_H -0.001329 0.000369 -3.602 0.000323 ***
## TEAM_PITCHING_HR -0.019072 0.023835 -0.800 0.423689
## TEAM_PITCHING_BB 0.011387 0.003085 3.691 0.000228 ***
## TEAM_FIELDING_E -0.016523 0.002373 -6.963 4.34e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.48 on 2266 degrees of freedom
## Multiple R-squared: 0.2703, Adjusted R-squared: 0.2674
## F-statistic: 93.24 on 9 and 2266 DF, p-value: < 2.2e-16
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B +
## TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_PITCHING_H + TEAM_PITCHING_HR +
## TEAM_PITCHING_BB + TEAM_FIELDING_E, data = money_b_train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -54.273 -8.832 0.127 8.886 55.587
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.526453 3.423988 1.906 0.0568 .
## TEAM_BATTING_H 0.048766 0.003208 15.200 < 2e-16 ***
## TEAM_BATTING_2B -0.026072 0.009050 -2.881 0.0040 **
## TEAM_BATTING_3B 0.102196 0.016708 6.116 1.12e-09 ***
## TEAM_BATTING_HR 0.054383 0.024691 2.203 0.0277 *
## TEAM_PITCHING_H -0.001282 0.000327 -3.922 9.05e-05 ***
## TEAM_PITCHING_HR -0.016991 0.022575 -0.753 0.4517
## TEAM_PITCHING_BB 0.010755 0.002036 5.283 1.40e-07 ***
## TEAM_FIELDING_E -0.016351 0.002287 -7.149 1.18e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.48 on 2267 degrees of freedom
## Multiple R-squared: 0.2702, Adjusted R-squared: 0.2677
## F-statistic: 104.9 on 8 and 2267 DF, p-value: < 2.2e-16
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B +
## TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_PITCHING_H + TEAM_PITCHING_BB +
## TEAM_FIELDING_E, data = money_b_train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -54.763 -8.861 0.095 8.860 55.469
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.2713462 3.2775220 2.219 0.02662 *
## TEAM_BATTING_H 0.0484775 0.0031849 15.221 < 2e-16 ***
## TEAM_BATTING_2B -0.0258127 0.0090430 -2.854 0.00435 **
## TEAM_BATTING_3B 0.1010776 0.0166406 6.074 1.46e-09 ***
## TEAM_BATTING_HR 0.0366916 0.0075591 4.854 1.29e-06 ***
## TEAM_PITCHING_H -0.0013088 0.0003251 -4.026 5.87e-05 ***
## TEAM_PITCHING_BB 0.0103207 0.0019522 5.287 1.36e-07 ***
## TEAM_FIELDING_E -0.0166263 0.0022577 -7.364 2.48e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.48 on 2268 degrees of freedom
## Multiple R-squared: 0.27, Adjusted R-squared: 0.2678
## F-statistic: 119.9 on 7 and 2268 DF, p-value: < 2.2e-16
## [1] "INDEX" "TEAM_BATTING_H" "TEAM_BATTING_2B" "TEAM_BATTING_3B"
## [5] "TEAM_BATTING_HR" "TEAM_BATTING_BB" "TEAM_BATTING_SO" "TEAM_BASERUN_SB"
## [9] "TEAM_BASERUN_CS" "TEAM_BATTING_HBP" "TEAM_PITCHING_H" "TEAM_PITCHING_HR"
## [13] "TEAM_PITCHING_BB" "TEAM_PITCHING_SO" "TEAM_FIELDING_E" "TEAM_FIELDING_DP"
# Keep only the seven predictors that remain in Model 2 after backward
# elimination.
new_eval_model2 <- money_b_eval %>%
  select(
    TEAM_BATTING_H, TEAM_BATTING_2B, TEAM_BATTING_3B, TEAM_BATTING_HR,
    TEAM_PITCHING_H, TEAM_PITCHING_BB, TEAM_FIELDING_E
  )
# Replace any missing values with 0 before predicting.
new_eval_model2[is.na(new_eval_model2)] <- 0
# Predicted TARGET_WINS for each row of the evaluation data under Model 2.
prediction_model2 <- predict(model2, newdata = new_eval_model2)
prediction_model2
## 1 2 3 4 5 6 7 8
## 68.57679 70.20767 77.35107 83.60728 66.44188 67.44392 74.01699 72.52290
## 9 10 11 12 13 14 15 16
## 72.07908 75.86204 76.14127 85.66302 84.25863 82.11244 79.28366 80.65313
## 17 18 19 20 21 22 23 24
## 72.72498 80.73209 68.24429 93.15727 84.03790 86.72537 83.94422 76.45507
## 25 26 27 28 29 30 31 32
## 82.33443 84.46690 53.99437 77.34772 83.55037 76.54752 89.64897 87.49762
## 33 34 35 36 37 38 39 40
## 86.39979 88.63464 83.07959 82.97654 76.59917 90.98962 88.25264 89.93392
## 41 42 43 44 45 46 47 48
## 81.06430 86.65244 32.00565 93.94542 84.49850 91.12091 95.25990 72.55215
## 49 50 51 52 53 54 55 56
## 70.71842 77.42567 80.56279 86.18097 79.54452 75.66770 76.77920 78.91475
## 57 58 59 60 61 62 63 64
## 87.00232 70.24445 62.43238 76.94456 85.57690 82.32992 84.10415 84.08464
## 65 66 67 68 69 70 71 72
## 81.72510 88.61128 77.01994 84.45808 75.03575 84.58887 93.11545 78.11656
## 73 74 75 76 77 78 79 80
## 83.60987 87.48446 83.25982 87.59647 81.10361 79.45530 69.17038 75.34361
## 81 82 83 84 85 86 87 88
## 86.58620 91.02278 98.65784 83.24041 86.29588 81.38914 77.81345 83.29427
## 89 90 91 92 93 94 95 96
## 82.14307 85.78844 77.31626 90.17090 74.92238 80.27929 76.63840 76.41073
## 97 98 99 100 101 102 103 104
## 83.76351 101.49146 90.66066 91.80633 85.67709 75.74458 85.85636 82.51112
## 105 106 107 108 109 110 111 112
## 80.28514 75.74648 59.21657 80.05705 83.36447 63.89810 81.69559 80.89442
## 113 114 115 116 117 118 119 120
## 90.51339 88.42404 82.00004 79.88766 89.12636 79.28716 78.32773 70.56117
## 121 122 123 124 125 126 127 128
## 88.18073 64.83877 68.79647 62.89740 70.53486 89.14903 93.52098 77.13546
## 129 130 131 132 133 134 135 136
## 89.76420 96.00349 87.87496 79.55286 74.18762 83.65916 84.63120 67.92567
## 137 138 139 140 141 142 143 144
## 76.76088 79.31622 80.25903 79.00221 65.97271 70.88566 93.96534 80.09868
## 145 146 147 148 149 150 151 152
## 75.63502 76.66057 79.09194 81.58381 85.45157 81.03183 83.18578 79.69117
## 153 154 155 156 157 158 159 160
## 32.00533 74.74922 76.72696 73.53798 83.62346 70.38656 90.86799 71.82949
## 161 162 163 164 165 166 167 168
## 103.86302 102.94796 91.40787 103.43996 96.25437 92.15061 87.44536 83.28689
## 169 170 171 172 173 174 175 176
## 73.88550 80.44850 87.53529 83.90489 81.81791 91.73197 83.62750 78.62979
## 177 178 179 180 181 182 183 184
## 78.72177 78.62720 77.61974 80.23747 75.75891 82.42463 82.50687 83.40560
## 185 186 187 188 189 190 191 192
## 93.86719 84.08224 84.88270 59.90440 62.71131 106.61875 70.30532 79.80179
## 193 194 195 196 197 198 199 200
## 77.50981 80.91032 82.33698 71.26555 77.85090 81.87750 80.77272 86.39044
## 201 202 203 204 205 206 207 208
## 80.67028 82.42010 76.24716 85.64095 77.63218 78.86158 80.18659 76.75479
## 209 210 211 212 213 214 215 216
## 78.45877 74.04968 102.73424 94.95937 83.50166 71.04174 76.47425 88.75922
## 217 218 219 220 221 222 223 224
## 87.14607 86.32952 77.07959 76.85061 79.78128 75.26852 82.97115 79.90219
## 225 226 227 228 229 230 231 232
## 88.18673 76.73958 79.38705 80.08819 80.21071 76.68290 71.78995 94.28243
## 233 234 235 236 237 238 239 240
## 83.96099 86.59268 79.03424 74.35427 81.44309 78.25418 92.42152 75.30730
## 241 242 243 244 245 246 247 248
## 90.78059 88.86296 85.17144 83.49939 63.68535 86.98493 79.74425 82.77956
## 249 250 251 252 253 254 255 256
## 76.14821 84.12894 82.43395 59.19506 90.43189 46.23627 70.80823 77.18288
## 257 258 259
## 75.82183 77.87520 77.54615