library(knitr)
library(ggplot2)
require(gridExtra)
## Loading required package: gridExtra
require(ggplot2)
# Read the Moneyball training data from the GitHub URL; the first row of the
# CSV holds the column names.
mb_train <- read.csv(
  file = "https://raw.githubusercontent.com/spsstudent15/2016-02-621-W1/master/moneyball-training-data2.csv",
  header = TRUE
)
# NOTE(review): attach() places the columns of mb_train on the search path.
# The lm() calls in this section all pass `data = mb_train`, so the attach may
# be unnecessary -- confirm no other code uses bare column names before
# removing it.
attach(mb_train)
# Print the structure of the data set: dimensions, column names and types.
str(mb_train)
## 'data.frame': 2276 obs. of 17 variables:
## $ INDEX : int 1380 824 823 1383 825 2088 588 1068 2373 1103 ...
## $ TARGET_WINS : int 68 76 69 67 73 79 67 75 95 78 ...
## $ TEAM_BATTING_H : int 1378 1380 1349 1366 1453 1379 1351 1476 1481 1454 ...
## $ TEAM_BATTING_2B : int 273 287 239 297 335 273 259 304 259 309 ...
## $ TEAM_BATTING_3B : int 30 28 21 32 15 26 29 35 25 42 ...
## $ TEAM_BATTING_HR : int 209 194 182 136 222 161 200 227 235 182 ...
## $ TEAM_BATTING_BB : int 488 599 524 543 611 678 585 733 675 497 ...
## $ TEAM_BATTING_SO : int 1399 1335 1326 1320 1303 1273 1269 1264 1253 1249 ...
## $ TEAM_BASERUN_SB : int 66 77 80 139 72 129 63 65 87 110 ...
## $ TEAM_BASERUN_CS : int 36 25 34 40 23 44 21 46 51 58 ...
## $ TEAM_BATTING_HBP: int 72 81 79 68 62 41 44 58 84 74 ...
## $ TEAM_PITCHING_H : int 1378 1380 1349 1374 1453 1379 1351 2079 1481 1454 ...
## $ TEAM_PITCHING_HR: int 209 194 182 137 222 161 200 320 235 182 ...
## $ TEAM_PITCHING_BB: int 488 599 524 546 611 678 585 1033 675 497 ...
## $ TEAM_PITCHING_SO: int 1399 1335 1326 1328 1303 1273 1269 1781 1253 1249 ...
## $ TEAM_FIELDING_E : int 103 113 141 118 104 145 114 116 111 126 ...
## $ TEAM_FIELDING_DP: int 156 123 152 133 133 127 144 127 148 166 ...
# Correlation matrix of the 15 predictors (columns 3 to 17, i.e. everything
# except INDEX and TARGET_WINS), rounded to three decimals for display.
#
# With the default use = "everything", cor() returns NA for any pair in which
# a column has missing values; that is why every correlation involving
# TEAM_PITCHING_SO was reported as NA. "pairwise.complete.obs" computes each
# coefficient from the rows where both variables are observed, so those
# entries become real numbers while pairs of fully observed columns keep
# exactly the same values as before.
mb_cor <- cor(mb_train[, 3:17], use = "pairwise.complete.obs")
round(mb_cor, 3)
## TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## TEAM_BATTING_H 1.000 0.563 0.428
## TEAM_BATTING_2B 0.563 1.000 -0.107
## TEAM_BATTING_3B 0.428 -0.107 1.000
## TEAM_BATTING_HR -0.007 0.435 -0.636
## TEAM_BATTING_BB -0.072 0.256 -0.287
## TEAM_BATTING_SO -0.453 0.152 -0.656
## TEAM_BASERUN_SB 0.108 -0.183 0.486
## TEAM_BASERUN_CS 0.001 -0.046 0.136
## TEAM_BATTING_HBP -0.002 0.044 -0.043
## TEAM_PITCHING_H 0.303 0.024 0.195
## TEAM_PITCHING_HR 0.073 0.455 -0.568
## TEAM_PITCHING_BB 0.094 0.178 -0.002
## TEAM_PITCHING_SO NA NA NA
## TEAM_FIELDING_E 0.265 -0.235 0.510
## TEAM_FIELDING_DP 0.125 0.257 -0.228
## TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO
## TEAM_BATTING_H -0.007 -0.072 -0.453
## TEAM_BATTING_2B 0.435 0.256 0.152
## TEAM_BATTING_3B -0.636 -0.287 -0.656
## TEAM_BATTING_HR 1.000 0.514 0.693
## TEAM_BATTING_BB 0.514 1.000 0.371
## TEAM_BATTING_SO 0.693 0.371 1.000
## TEAM_BASERUN_SB -0.407 -0.043 -0.212
## TEAM_BASERUN_CS -0.225 -0.046 -0.103
## TEAM_BATTING_HBP 0.056 0.019 0.066
## TEAM_PITCHING_H -0.250 -0.450 -0.376
## TEAM_PITCHING_HR 0.969 0.460 0.633
## TEAM_PITCHING_BB 0.137 0.489 0.035
## TEAM_PITCHING_SO NA NA NA
## TEAM_FIELDING_E -0.587 -0.656 -0.583
## TEAM_FIELDING_DP 0.392 0.330 0.111
## TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP
## TEAM_BATTING_H 0.108 0.001 -0.002
## TEAM_BATTING_2B -0.183 -0.046 0.044
## TEAM_BATTING_3B 0.486 0.136 -0.043
## TEAM_BATTING_HR -0.407 -0.225 0.056
## TEAM_BATTING_BB -0.043 -0.046 0.019
## TEAM_BATTING_SO -0.212 -0.103 0.066
## TEAM_BASERUN_SB 1.000 0.233 -0.018
## TEAM_BASERUN_CS 0.233 1.000 -0.032
## TEAM_BATTING_HBP -0.018 -0.032 1.000
## TEAM_PITCHING_H 0.040 -0.053 -0.007
## TEAM_PITCHING_HR -0.380 -0.228 0.052
## TEAM_PITCHING_BB 0.129 -0.047 0.005
## TEAM_PITCHING_SO NA NA NA
## TEAM_FIELDING_E 0.326 -0.029 -0.018
## TEAM_FIELDING_DP -0.270 -0.102 -0.008
## TEAM_PITCHING_H TEAM_PITCHING_HR TEAM_PITCHING_BB
## TEAM_BATTING_H 0.303 0.073 0.094
## TEAM_BATTING_2B 0.024 0.455 0.178
## TEAM_BATTING_3B 0.195 -0.568 -0.002
## TEAM_BATTING_HR -0.250 0.969 0.137
## TEAM_BATTING_BB -0.450 0.460 0.489
## TEAM_BATTING_SO -0.376 0.633 0.035
## TEAM_BASERUN_SB 0.040 -0.380 0.129
## TEAM_BASERUN_CS -0.053 -0.228 -0.047
## TEAM_BATTING_HBP -0.007 0.052 0.005
## TEAM_PITCHING_H 1.000 -0.142 0.321
## TEAM_PITCHING_HR -0.142 1.000 0.222
## TEAM_PITCHING_BB 0.321 0.222 1.000
## TEAM_PITCHING_SO NA NA NA
## TEAM_FIELDING_E 0.668 -0.493 -0.023
## TEAM_FIELDING_DP -0.045 0.390 0.192
## TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## TEAM_BATTING_H NA 0.265 0.125
## TEAM_BATTING_2B NA -0.235 0.257
## TEAM_BATTING_3B NA 0.510 -0.228
## TEAM_BATTING_HR NA -0.587 0.392
## TEAM_BATTING_BB NA -0.656 0.330
## TEAM_BATTING_SO NA -0.583 0.111
## TEAM_BASERUN_SB NA 0.326 -0.270
## TEAM_BASERUN_CS NA -0.029 -0.102
## TEAM_BATTING_HBP NA -0.018 -0.008
## TEAM_PITCHING_H NA 0.668 -0.045
## TEAM_PITCHING_HR NA -0.493 0.390
## TEAM_PITCHING_BB NA -0.023 0.192
## TEAM_PITCHING_SO 1 NA NA
## TEAM_FIELDING_E NA 1.000 -0.227
## TEAM_FIELDING_DP NA -0.227 1.000
Full model: regress TARGET_WINS on all 15 predictors
# Baseline regression: TARGET_WINS explained by every predictor in the data
# set. Predictors are listed in column order, one group per line
# (batting, base running / hit-by-pitch, pitching, fielding).
m_full <- lm(
  formula = TARGET_WINS ~
    TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
    TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO +
    TEAM_BASERUN_SB + TEAM_BASERUN_CS + TEAM_BATTING_HBP +
    TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO +
    TEAM_FIELDING_E + TEAM_FIELDING_DP,
  data = mb_train
)
# Coefficient table, residual summary and fit statistics for the full model.
summary(m_full)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B +
## TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO +
## TEAM_BASERUN_SB + TEAM_BASERUN_CS + TEAM_BATTING_HBP + TEAM_PITCHING_H +
## TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO +
## TEAM_FIELDING_E + TEAM_FIELDING_DP, data = mb_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.938 -8.521 0.132 8.326 58.440
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.5116026 6.8099547 2.865 0.004208 **
## TEAM_BATTING_H 0.0488853 0.0037213 13.136 < 2e-16 ***
## TEAM_BATTING_2B -0.0240858 0.0092637 -2.600 0.009386 **
## TEAM_BATTING_3B 0.0623564 0.0169284 3.684 0.000236 ***
## TEAM_BATTING_HR 0.0647802 0.0273081 2.372 0.017770 *
## TEAM_BATTING_BB 0.0087866 0.0058039 1.514 0.130196
## TEAM_BATTING_SO -0.0096118 0.0025775 -3.729 0.000197 ***
## TEAM_BASERUN_SB 0.0212420 0.0043569 4.876 1.16e-06 ***
## TEAM_BASERUN_CS 0.0020603 0.0158179 0.130 0.896382
## TEAM_BATTING_HBP 0.0489915 0.0720771 0.680 0.496761
## TEAM_PITCHING_H -0.0011201 0.0003644 -3.074 0.002140 **
## TEAM_PITCHING_HR 0.0104297 0.0240923 0.433 0.665126
## TEAM_PITCHING_BB 0.0021713 0.0041091 0.528 0.597275
## TEAM_PITCHING_SO 0.0028132 0.0009101 3.091 0.002019 **
## TEAM_FIELDING_E -0.0170992 0.0024687 -6.926 5.67e-12 ***
## TEAM_FIELDING_DP -0.1100226 0.0135438 -8.123 7.54e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.89 on 2158 degrees of freedom
## (102 observations deleted due to missingness)
## Multiple R-squared: 0.3191, Adjusted R-squared: 0.3144
## F-statistic: 67.43 on 15 and 2158 DF, p-value: < 2.2e-16
Drop the variable with the highest p-value in the full model: TEAM_BASERUN_CS (p = 0.896)
# Backward elimination, step 1: the full set of predictors without
# TEAM_BASERUN_CS. The formula is spelled out in full so the model can be
# read on its own; update(m_full, . ~ . - TEAM_BASERUN_CS) would describe the
# same set of terms.
model1 <- lm(
  formula = TARGET_WINS ~
    TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
    TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO +
    TEAM_BASERUN_SB + TEAM_BATTING_HBP +
    TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO +
    TEAM_FIELDING_E + TEAM_FIELDING_DP,
  data = mb_train
)
# Inspect the refit to choose the next variable to remove.
summary(model1)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B +
## TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO +
## TEAM_BASERUN_SB + TEAM_BATTING_HBP + TEAM_PITCHING_H + TEAM_PITCHING_HR +
## TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP,
## data = mb_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.942 -8.544 0.188 8.321 58.443
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.6529169 6.7214408 2.924 0.003493 **
## TEAM_BATTING_H 0.0488988 0.0037190 13.148 < 2e-16 ***
## TEAM_BATTING_2B -0.0240623 0.0092599 -2.599 0.009425 **
## TEAM_BATTING_3B 0.0622775 0.0169137 3.682 0.000237 ***
## TEAM_BATTING_HR 0.0643854 0.0271331 2.373 0.017734 *
## TEAM_BATTING_BB 0.0087721 0.0058016 1.512 0.130672
## TEAM_BATTING_SO -0.0096038 0.0025762 -3.728 0.000198 ***
## TEAM_BASERUN_SB 0.0213681 0.0042470 5.031 5.27e-07 ***
## TEAM_BATTING_HBP 0.0488209 0.0720488 0.678 0.498093
## TEAM_PITCHING_H -0.0011168 0.0003634 -3.073 0.002147 **
## TEAM_PITCHING_HR 0.0105503 0.0240690 0.438 0.661186
## TEAM_PITCHING_BB 0.0021474 0.0041041 0.523 0.600868
## TEAM_PITCHING_SO 0.0028142 0.0009098 3.093 0.002006 **
## TEAM_FIELDING_E -0.0171753 0.0023981 -7.162 1.09e-12 ***
## TEAM_FIELDING_DP -0.1100727 0.0135353 -8.132 7.02e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.89 on 2159 degrees of freedom
## (102 observations deleted due to missingness)
## Multiple R-squared: 0.3191, Adjusted R-squared: 0.3147
## F-statistic: 72.28 on 14 and 2159 DF, p-value: < 2.2e-16
Drop the next variable with the highest p-value, TEAM_PITCHING_HR (p = 0.661 in model1), so that two variables are now removed: TEAM_BASERUN_CS and TEAM_PITCHING_HR
# Backward elimination, step 2: the full set of predictors without
# TEAM_BASERUN_CS and TEAM_PITCHING_HR.
model2 <- lm(
  formula = TARGET_WINS ~
    TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
    TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO +
    TEAM_BASERUN_SB + TEAM_BATTING_HBP +
    TEAM_PITCHING_H + TEAM_PITCHING_BB + TEAM_PITCHING_SO +
    TEAM_FIELDING_E + TEAM_FIELDING_DP,
  data = mb_train
)
# Inspect the refit to choose the next variable to remove.
summary(model2)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B +
## TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO +
## TEAM_BASERUN_SB + TEAM_BATTING_HBP + TEAM_PITCHING_H + TEAM_PITCHING_BB +
## TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP, data = mb_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.973 -8.476 0.220 8.335 58.493
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.4133827 6.6979371 2.898 0.003788 **
## TEAM_BATTING_H 0.0490845 0.0036941 13.287 < 2e-16 ***
## TEAM_BATTING_2B -0.0241890 0.0092536 -2.614 0.009011 **
## TEAM_BATTING_3B 0.0631924 0.0167812 3.766 0.000171 ***
## TEAM_BATTING_HR 0.0754749 0.0098043 7.698 2.09e-14 ***
## TEAM_BATTING_BB 0.0077223 0.0052831 1.462 0.143969
## TEAM_BATTING_SO -0.0094303 0.0025451 -3.705 0.000216 ***
## TEAM_BASERUN_SB 0.0212290 0.0042343 5.014 5.78e-07 ***
## TEAM_BATTING_HBP 0.0483618 0.0720277 0.671 0.502017
## TEAM_PITCHING_H -0.0011275 0.0003626 -3.110 0.001898 **
## TEAM_PITCHING_BB 0.0030536 0.0035447 0.861 0.389073
## TEAM_PITCHING_SO 0.0026891 0.0008637 3.113 0.001873 **
## TEAM_FIELDING_E -0.0171314 0.0023956 -7.151 1.17e-12 ***
## TEAM_FIELDING_DP -0.1100116 0.0135320 -8.130 7.17e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.89 on 2160 degrees of freedom
## (102 observations deleted due to missingness)
## Multiple R-squared: 0.3191, Adjusted R-squared: 0.315
## F-statistic: 77.85 on 13 and 2160 DF, p-value: < 2.2e-16
Drop the next variable with the highest p-value, TEAM_BATTING_HBP (p = 0.502 in model2), so that three variables are now removed: TEAM_BASERUN_CS, TEAM_PITCHING_HR, and TEAM_BATTING_HBP
# Backward elimination, step 3: the full set of predictors without
# TEAM_BASERUN_CS, TEAM_PITCHING_HR and TEAM_BATTING_HBP.
model3 <- lm(
  formula = TARGET_WINS ~
    TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
    TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO +
    TEAM_BASERUN_SB +
    TEAM_PITCHING_H + TEAM_PITCHING_BB + TEAM_PITCHING_SO +
    TEAM_FIELDING_E + TEAM_FIELDING_DP,
  data = mb_train
)
# Coefficient table and fit statistics for the reduced model.
summary(model3)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B +
## TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO +
## TEAM_BASERUN_SB + TEAM_PITCHING_H + TEAM_PITCHING_BB + TEAM_PITCHING_SO +
## TEAM_FIELDING_E + TEAM_FIELDING_DP, data = mb_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.981 -8.464 0.229 8.339 58.446
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.1422779 5.3232260 4.160 3.31e-05 ***
## TEAM_BATTING_H 0.0490957 0.0036936 13.292 < 2e-16 ***
## TEAM_BATTING_2B -0.0240428 0.0092499 -2.599 0.009406 **
## TEAM_BATTING_3B 0.0631322 0.0167789 3.763 0.000173 ***
## TEAM_BATTING_HR 0.0754887 0.0098030 7.701 2.05e-14 ***
## TEAM_BATTING_BB 0.0077493 0.0052822 1.467 0.142508
## TEAM_BATTING_SO -0.0093647 0.0025429 -3.683 0.000236 ***
## TEAM_BASERUN_SB 0.0212000 0.0042336 5.008 5.96e-07 ***
## TEAM_PITCHING_H -0.0011279 0.0003625 -3.111 0.001887 **
## TEAM_PITCHING_BB 0.0030540 0.0035442 0.862 0.388959
## TEAM_PITCHING_SO 0.0026849 0.0008636 3.109 0.001901 **
## TEAM_FIELDING_E -0.0170842 0.0023942 -7.136 1.31e-12 ***
## TEAM_FIELDING_DP -0.1102425 0.0135259 -8.150 6.07e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.89 on 2161 degrees of freedom
## (102 observations deleted due to missingness)
## Multiple R-squared: 0.3189, Adjusted R-squared: 0.3151
## F-statistic: 84.33 on 12 and 2161 DF, p-value: < 2.2e-16