library(knitr)
library(ggplot2)

# Load gridExtra with library() rather than require(): library() stops with an
# error when the package is missing, whereas require() only warns and returns
# FALSE, letting the script fail later at the first use of the package.
# ggplot2 is already loaded at the top of the file, so it is not loaded again.
library(gridExtra)

# Read the moneyball training data (CSV with a header row) from GitHub.
mb_train_url <- paste0(
  "https://raw.githubusercontent.com/spsstudent15/2016-02-621-W1/",
  "master/moneyball-training-data2.csv"
)
mb_train <- read.csv(mb_train_url, header = TRUE)

# NOTE(review): attach() puts the columns of mb_train on the search path so
# they can be referenced by bare name. The models in this section all pass
# `data = mb_train` and do not need it; it is kept only in case later code
# relies on bare column names. Prefer removing it once that is confirmed.
attach(mb_train)

# describe structure of dataset
str(mb_train)
## 'data.frame':    2276 obs. of  17 variables:
##  $ INDEX           : int  1380 824 823 1383 825 2088 588 1068 2373 1103 ...
##  $ TARGET_WINS     : int  68 76 69 67 73 79 67 75 95 78 ...
##  $ TEAM_BATTING_H  : int  1378 1380 1349 1366 1453 1379 1351 1476 1481 1454 ...
##  $ TEAM_BATTING_2B : int  273 287 239 297 335 273 259 304 259 309 ...
##  $ TEAM_BATTING_3B : int  30 28 21 32 15 26 29 35 25 42 ...
##  $ TEAM_BATTING_HR : int  209 194 182 136 222 161 200 227 235 182 ...
##  $ TEAM_BATTING_BB : int  488 599 524 543 611 678 585 733 675 497 ...
##  $ TEAM_BATTING_SO : int  1399 1335 1326 1320 1303 1273 1269 1264 1253 1249 ...
##  $ TEAM_BASERUN_SB : int  66 77 80 139 72 129 63 65 87 110 ...
##  $ TEAM_BASERUN_CS : int  36 25 34 40 23 44 21 46 51 58 ...
##  $ TEAM_BATTING_HBP: int  72 81 79 68 62 41 44 58 84 74 ...
##  $ TEAM_PITCHING_H : int  1378 1380 1349 1374 1453 1379 1351 2079 1481 1454 ...
##  $ TEAM_PITCHING_HR: int  209 194 182 137 222 161 200 320 235 182 ...
##  $ TEAM_PITCHING_BB: int  488 599 524 546 611 678 585 1033 675 497 ...
##  $ TEAM_PITCHING_SO: int  1399 1335 1326 1328 1303 1273 1269 1781 1253 1249 ...
##  $ TEAM_FIELDING_E : int  103 113 141 118 104 145 114 116 111 126 ...
##  $ TEAM_FIELDING_DP: int  156 123 152 133 133 127 144 127 148 166 ...
# Correlation matrix of the 15 predictor columns (columns 3-17 of mb_train,
# TEAM_BATTING_H through TEAM_FIELDING_DP; INDEX and TARGET_WINS are skipped).
#
# cor() defaults to use = "everything", which returns NA for every pair that
# involves a column containing a missing value. TEAM_PITCHING_SO has missing
# values, so its whole row and column came out as NA. Computing each
# correlation from the rows where both variables are present gives usable
# values for that column and leaves the pairs without missing data unchanged.
#
# NOTE: the matrix echoed below was printed before this change and still shows
# NA for TEAM_PITCHING_SO; re-run to refresh it.
mb_cor <- cor(mb_train[, 3:17], use = "pairwise.complete.obs")

round(mb_cor, 3)
##                  TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## TEAM_BATTING_H            1.000           0.563           0.428
## TEAM_BATTING_2B           0.563           1.000          -0.107
## TEAM_BATTING_3B           0.428          -0.107           1.000
## TEAM_BATTING_HR          -0.007           0.435          -0.636
## TEAM_BATTING_BB          -0.072           0.256          -0.287
## TEAM_BATTING_SO          -0.453           0.152          -0.656
## TEAM_BASERUN_SB           0.108          -0.183           0.486
## TEAM_BASERUN_CS           0.001          -0.046           0.136
## TEAM_BATTING_HBP         -0.002           0.044          -0.043
## TEAM_PITCHING_H           0.303           0.024           0.195
## TEAM_PITCHING_HR          0.073           0.455          -0.568
## TEAM_PITCHING_BB          0.094           0.178          -0.002
## TEAM_PITCHING_SO             NA              NA              NA
## TEAM_FIELDING_E           0.265          -0.235           0.510
## TEAM_FIELDING_DP          0.125           0.257          -0.228
##                  TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO
## TEAM_BATTING_H            -0.007          -0.072          -0.453
## TEAM_BATTING_2B            0.435           0.256           0.152
## TEAM_BATTING_3B           -0.636          -0.287          -0.656
## TEAM_BATTING_HR            1.000           0.514           0.693
## TEAM_BATTING_BB            0.514           1.000           0.371
## TEAM_BATTING_SO            0.693           0.371           1.000
## TEAM_BASERUN_SB           -0.407          -0.043          -0.212
## TEAM_BASERUN_CS           -0.225          -0.046          -0.103
## TEAM_BATTING_HBP           0.056           0.019           0.066
## TEAM_PITCHING_H           -0.250          -0.450          -0.376
## TEAM_PITCHING_HR           0.969           0.460           0.633
## TEAM_PITCHING_BB           0.137           0.489           0.035
## TEAM_PITCHING_SO              NA              NA              NA
## TEAM_FIELDING_E           -0.587          -0.656          -0.583
## TEAM_FIELDING_DP           0.392           0.330           0.111
##                  TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP
## TEAM_BATTING_H             0.108           0.001           -0.002
## TEAM_BATTING_2B           -0.183          -0.046            0.044
## TEAM_BATTING_3B            0.486           0.136           -0.043
## TEAM_BATTING_HR           -0.407          -0.225            0.056
## TEAM_BATTING_BB           -0.043          -0.046            0.019
## TEAM_BATTING_SO           -0.212          -0.103            0.066
## TEAM_BASERUN_SB            1.000           0.233           -0.018
## TEAM_BASERUN_CS            0.233           1.000           -0.032
## TEAM_BATTING_HBP          -0.018          -0.032            1.000
## TEAM_PITCHING_H            0.040          -0.053           -0.007
## TEAM_PITCHING_HR          -0.380          -0.228            0.052
## TEAM_PITCHING_BB           0.129          -0.047            0.005
## TEAM_PITCHING_SO              NA              NA               NA
## TEAM_FIELDING_E            0.326          -0.029           -0.018
## TEAM_FIELDING_DP          -0.270          -0.102           -0.008
##                  TEAM_PITCHING_H TEAM_PITCHING_HR TEAM_PITCHING_BB
## TEAM_BATTING_H             0.303            0.073            0.094
## TEAM_BATTING_2B            0.024            0.455            0.178
## TEAM_BATTING_3B            0.195           -0.568           -0.002
## TEAM_BATTING_HR           -0.250            0.969            0.137
## TEAM_BATTING_BB           -0.450            0.460            0.489
## TEAM_BATTING_SO           -0.376            0.633            0.035
## TEAM_BASERUN_SB            0.040           -0.380            0.129
## TEAM_BASERUN_CS           -0.053           -0.228           -0.047
## TEAM_BATTING_HBP          -0.007            0.052            0.005
## TEAM_PITCHING_H            1.000           -0.142            0.321
## TEAM_PITCHING_HR          -0.142            1.000            0.222
## TEAM_PITCHING_BB           0.321            0.222            1.000
## TEAM_PITCHING_SO              NA               NA               NA
## TEAM_FIELDING_E            0.668           -0.493           -0.023
## TEAM_FIELDING_DP          -0.045            0.390            0.192
##                  TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## TEAM_BATTING_H                 NA           0.265            0.125
## TEAM_BATTING_2B                NA          -0.235            0.257
## TEAM_BATTING_3B                NA           0.510           -0.228
## TEAM_BATTING_HR                NA          -0.587            0.392
## TEAM_BATTING_BB                NA          -0.656            0.330
## TEAM_BATTING_SO                NA          -0.583            0.111
## TEAM_BASERUN_SB                NA           0.326           -0.270
## TEAM_BASERUN_CS                NA          -0.029           -0.102
## TEAM_BATTING_HBP               NA          -0.018           -0.008
## TEAM_PITCHING_H                NA           0.668           -0.045
## TEAM_PITCHING_HR               NA          -0.493            0.390
## TEAM_PITCHING_BB               NA          -0.023            0.192
## TEAM_PITCHING_SO                1              NA               NA
## TEAM_FIELDING_E                NA           1.000           -0.227
## TEAM_FIELDING_DP               NA          -0.227            1.000

Full model

# Full model: linear regression of TARGET_WINS on all 15 predictor columns of
# mb_train. Terms are listed in the original order so the coefficient table
# keeps the same row order.
m_full <- lm(
  formula = TARGET_WINS ~
    TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + TEAM_BATTING_HR +
    TEAM_BATTING_BB + TEAM_BATTING_SO +
    TEAM_BASERUN_SB + TEAM_BASERUN_CS +
    TEAM_BATTING_HBP +
    TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO +
    TEAM_FIELDING_E + TEAM_FIELDING_DP,
  data = mb_train
)
summary(m_full)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + 
##     TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + 
##     TEAM_BASERUN_SB + TEAM_BASERUN_CS + TEAM_BATTING_HBP + TEAM_PITCHING_H + 
##     TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO + 
##     TEAM_FIELDING_E + TEAM_FIELDING_DP, data = mb_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -49.938  -8.521   0.132   8.326  58.440 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      19.5116026  6.8099547   2.865 0.004208 ** 
## TEAM_BATTING_H    0.0488853  0.0037213  13.136  < 2e-16 ***
## TEAM_BATTING_2B  -0.0240858  0.0092637  -2.600 0.009386 ** 
## TEAM_BATTING_3B   0.0623564  0.0169284   3.684 0.000236 ***
## TEAM_BATTING_HR   0.0647802  0.0273081   2.372 0.017770 *  
## TEAM_BATTING_BB   0.0087866  0.0058039   1.514 0.130196    
## TEAM_BATTING_SO  -0.0096118  0.0025775  -3.729 0.000197 ***
## TEAM_BASERUN_SB   0.0212420  0.0043569   4.876 1.16e-06 ***
## TEAM_BASERUN_CS   0.0020603  0.0158179   0.130 0.896382    
## TEAM_BATTING_HBP  0.0489915  0.0720771   0.680 0.496761    
## TEAM_PITCHING_H  -0.0011201  0.0003644  -3.074 0.002140 ** 
## TEAM_PITCHING_HR  0.0104297  0.0240923   0.433 0.665126    
## TEAM_PITCHING_BB  0.0021713  0.0041091   0.528 0.597275    
## TEAM_PITCHING_SO  0.0028132  0.0009101   3.091 0.002019 ** 
## TEAM_FIELDING_E  -0.0170992  0.0024687  -6.926 5.67e-12 ***
## TEAM_FIELDING_DP -0.1100226  0.0135438  -8.123 7.54e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.89 on 2158 degrees of freedom
##   (102 observations deleted due to missingness)
## Multiple R-squared:  0.3191, Adjusted R-squared:  0.3144 
## F-statistic: 67.43 on 15 and 2158 DF,  p-value: < 2.2e-16

Drop the variable with the highest p-value in the full model: TEAM_BASERUN_CS (p = 0.896)

# Model 1: the full set of predictors except TEAM_BASERUN_CS (14 terms).
model1 <- lm(
  formula = TARGET_WINS ~
    TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + TEAM_BATTING_HR +
    TEAM_BATTING_BB + TEAM_BATTING_SO +
    TEAM_BASERUN_SB +
    TEAM_BATTING_HBP +
    TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO +
    TEAM_FIELDING_E + TEAM_FIELDING_DP,
  data = mb_train
)
summary(model1)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + 
##     TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + 
##     TEAM_BASERUN_SB + TEAM_BATTING_HBP + TEAM_PITCHING_H + TEAM_PITCHING_HR + 
##     TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP, 
##     data = mb_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -49.942  -8.544   0.188   8.321  58.443 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      19.6529169  6.7214408   2.924 0.003493 ** 
## TEAM_BATTING_H    0.0488988  0.0037190  13.148  < 2e-16 ***
## TEAM_BATTING_2B  -0.0240623  0.0092599  -2.599 0.009425 ** 
## TEAM_BATTING_3B   0.0622775  0.0169137   3.682 0.000237 ***
## TEAM_BATTING_HR   0.0643854  0.0271331   2.373 0.017734 *  
## TEAM_BATTING_BB   0.0087721  0.0058016   1.512 0.130672    
## TEAM_BATTING_SO  -0.0096038  0.0025762  -3.728 0.000198 ***
## TEAM_BASERUN_SB   0.0213681  0.0042470   5.031 5.27e-07 ***
## TEAM_BATTING_HBP  0.0488209  0.0720488   0.678 0.498093    
## TEAM_PITCHING_H  -0.0011168  0.0003634  -3.073 0.002147 ** 
## TEAM_PITCHING_HR  0.0105503  0.0240690   0.438 0.661186    
## TEAM_PITCHING_BB  0.0021474  0.0041041   0.523 0.600868    
## TEAM_PITCHING_SO  0.0028142  0.0009098   3.093 0.002006 ** 
## TEAM_FIELDING_E  -0.0171753  0.0023981  -7.162 1.09e-12 ***
## TEAM_FIELDING_DP -0.1100727  0.0135353  -8.132 7.02e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.89 on 2159 degrees of freedom
##   (102 observations deleted due to missingness)
## Multiple R-squared:  0.3191, Adjusted R-squared:  0.3147 
## F-statistic: 72.28 on 14 and 2159 DF,  p-value: < 2.2e-16

Next, also drop the variable with the highest p-value in model1, TEAM_PITCHING_HR (p = 0.661), so that 2 variables are now excluded: TEAM_BASERUN_CS and TEAM_PITCHING_HR

# Model 2: the full set of predictors except TEAM_BASERUN_CS and
# TEAM_PITCHING_HR (13 terms).
model2 <- lm(
  formula = TARGET_WINS ~
    TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + TEAM_BATTING_HR +
    TEAM_BATTING_BB + TEAM_BATTING_SO +
    TEAM_BASERUN_SB +
    TEAM_BATTING_HBP +
    TEAM_PITCHING_H + TEAM_PITCHING_BB + TEAM_PITCHING_SO +
    TEAM_FIELDING_E + TEAM_FIELDING_DP,
  data = mb_train
)
summary(model2)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + 
##     TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + 
##     TEAM_BASERUN_SB + TEAM_BATTING_HBP + TEAM_PITCHING_H + TEAM_PITCHING_BB + 
##     TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP, data = mb_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -49.973  -8.476   0.220   8.335  58.493 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      19.4133827  6.6979371   2.898 0.003788 ** 
## TEAM_BATTING_H    0.0490845  0.0036941  13.287  < 2e-16 ***
## TEAM_BATTING_2B  -0.0241890  0.0092536  -2.614 0.009011 ** 
## TEAM_BATTING_3B   0.0631924  0.0167812   3.766 0.000171 ***
## TEAM_BATTING_HR   0.0754749  0.0098043   7.698 2.09e-14 ***
## TEAM_BATTING_BB   0.0077223  0.0052831   1.462 0.143969    
## TEAM_BATTING_SO  -0.0094303  0.0025451  -3.705 0.000216 ***
## TEAM_BASERUN_SB   0.0212290  0.0042343   5.014 5.78e-07 ***
## TEAM_BATTING_HBP  0.0483618  0.0720277   0.671 0.502017    
## TEAM_PITCHING_H  -0.0011275  0.0003626  -3.110 0.001898 ** 
## TEAM_PITCHING_BB  0.0030536  0.0035447   0.861 0.389073    
## TEAM_PITCHING_SO  0.0026891  0.0008637   3.113 0.001873 ** 
## TEAM_FIELDING_E  -0.0171314  0.0023956  -7.151 1.17e-12 ***
## TEAM_FIELDING_DP -0.1100116  0.0135320  -8.130 7.17e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.89 on 2160 degrees of freedom
##   (102 observations deleted due to missingness)
## Multiple R-squared:  0.3191, Adjusted R-squared:  0.315 
## F-statistic: 77.85 on 13 and 2160 DF,  p-value: < 2.2e-16

Then also drop the variable with the highest p-value in model2, TEAM_BATTING_HBP (p = 0.502), so that 3 variables are now excluded: TEAM_BASERUN_CS, TEAM_PITCHING_HR, and TEAM_BATTING_HBP

# Model 3: the full set of predictors except TEAM_BASERUN_CS,
# TEAM_PITCHING_HR, and TEAM_BATTING_HBP (12 terms).
model3 <- lm(
  formula = TARGET_WINS ~
    TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + TEAM_BATTING_HR +
    TEAM_BATTING_BB + TEAM_BATTING_SO +
    TEAM_BASERUN_SB +
    TEAM_PITCHING_H + TEAM_PITCHING_BB + TEAM_PITCHING_SO +
    TEAM_FIELDING_E + TEAM_FIELDING_DP,
  data = mb_train
)
summary(model3)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + 
##     TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + 
##     TEAM_BASERUN_SB + TEAM_PITCHING_H + TEAM_PITCHING_BB + TEAM_PITCHING_SO + 
##     TEAM_FIELDING_E + TEAM_FIELDING_DP, data = mb_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -49.981  -8.464   0.229   8.339  58.446 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      22.1422779  5.3232260   4.160 3.31e-05 ***
## TEAM_BATTING_H    0.0490957  0.0036936  13.292  < 2e-16 ***
## TEAM_BATTING_2B  -0.0240428  0.0092499  -2.599 0.009406 ** 
## TEAM_BATTING_3B   0.0631322  0.0167789   3.763 0.000173 ***
## TEAM_BATTING_HR   0.0754887  0.0098030   7.701 2.05e-14 ***
## TEAM_BATTING_BB   0.0077493  0.0052822   1.467 0.142508    
## TEAM_BATTING_SO  -0.0093647  0.0025429  -3.683 0.000236 ***
## TEAM_BASERUN_SB   0.0212000  0.0042336   5.008 5.96e-07 ***
## TEAM_PITCHING_H  -0.0011279  0.0003625  -3.111 0.001887 ** 
## TEAM_PITCHING_BB  0.0030540  0.0035442   0.862 0.388959    
## TEAM_PITCHING_SO  0.0026849  0.0008636   3.109 0.001901 ** 
## TEAM_FIELDING_E  -0.0170842  0.0023942  -7.136 1.31e-12 ***
## TEAM_FIELDING_DP -0.1102425  0.0135259  -8.150 6.07e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.89 on 2161 degrees of freedom
##   (102 observations deleted due to missingness)
## Multiple R-squared:  0.3189, Adjusted R-squared:  0.3151 
## F-statistic: 84.33 on 12 and 2161 DF,  p-value: < 2.2e-16