Part 1. Data Exploration

library(alr3)
library(car)
library(corrplot)
library(fBasics)
library(knitr)
library(MASS)

library(psych)
library(car)
library(corrplot)

Load and explore original data

mb_train <- read.csv("https://raw.githubusercontent.com/spsstudent15/2016-02-621-W1/master/moneyball-training-data.csv")
describe(mb_train)

##                  vars    n    mean      sd median trimmed    mad  min
## INDEX               1 2276 1268.46  736.35 1270.5 1268.57 952.57    1
## TARGET_WINS         2 2276   80.79   15.75   82.0   81.31  14.83    0
## TEAM_BATTING_H      3 2276 1469.27  144.59 1454.0 1459.04 114.16  891
## TEAM_BATTING_2B     4 2276  241.25   46.80  238.0  240.40  47.44   69
## TEAM_BATTING_3B     5 2276   55.25   27.94   47.0   52.18  23.72    0
## TEAM_BATTING_HR     6 2276   99.61   60.55  102.0   97.39  78.58    0
## TEAM_BATTING_BB     7 2276  501.56  122.67  512.0  512.18  94.89    0
## TEAM_BATTING_SO     8 2174  735.61  248.53  750.0  742.31 284.66    0
## TEAM_BASERUN_SB     9 2145  124.76   87.79  101.0  110.81  60.79    0
## TEAM_BASERUN_CS    10 1504   52.80   22.96   49.0   50.36  17.79    0
## TEAM_BATTING_HBP   11  191   59.36   12.97   58.0   58.86  11.86   29
## TEAM_PITCHING_H    12 2276 1779.21 1406.84 1518.0 1555.90 174.95 1137
## TEAM_PITCHING_HR   13 2276  105.70   61.30  107.0  103.16  74.13    0
## TEAM_PITCHING_BB   14 2276  553.01  166.36  536.5  542.62  98.59    0
## TEAM_PITCHING_SO   15 2174  817.73  553.09  813.5  796.93 257.23    0
## TEAM_FIELDING_E    16 2276  246.48  227.77  159.0  193.44  62.27   65
## TEAM_FIELDING_DP   17 1990  146.39   26.23  149.0  147.58  23.72   52
##                    max range  skew kurtosis    se
## INDEX             2535  2534  0.00    -1.22 15.43
## TARGET_WINS        146   146 -0.40     1.03  0.33
## TEAM_BATTING_H    2554  1663  1.57     7.28  3.03
## TEAM_BATTING_2B    458   389  0.22     0.01  0.98
## TEAM_BATTING_3B    223   223  1.11     1.50  0.59
## TEAM_BATTING_HR    264   264  0.19    -0.96  1.27
## TEAM_BATTING_BB    878   878 -1.03     2.18  2.57
## TEAM_BATTING_SO   1399  1399 -0.30    -0.32  5.33
## TEAM_BASERUN_SB    697   697  1.97     5.49  1.90
## TEAM_BASERUN_CS    201   201  1.98     7.62  0.59
## TEAM_BATTING_HBP    95    66  0.32    -0.11  0.94
## TEAM_PITCHING_H  30132 28995 10.33   141.84 29.49
## TEAM_PITCHING_HR   343   343  0.29    -0.60  1.28
## TEAM_PITCHING_BB  3645  3645  6.74    96.97  3.49
## TEAM_PITCHING_SO 19278 19278 22.17   671.19 11.86
## TEAM_FIELDING_E   1898  1833  2.99    10.97  4.77
## TEAM_FIELDING_DP   228   176 -0.39     0.18  0.59

boxplot(mb_train, xlab="Boxplot Predictor Comparitive")

Look for correlations among values

##                        INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B
## INDEX             1.00000000 -0.04895047    -0.09493748     -0.01841285
## TARGET_WINS      -0.04895047  1.00000000     0.46994665      0.31298400
## TEAM_BATTING_H   -0.09493748  0.46994665     1.00000000      0.56177286
## TEAM_BATTING_2B  -0.01841285  0.31298400     0.56177286      1.00000000
## TEAM_BATTING_3B  -0.04639454 -0.12434586     0.21391883      0.04203441
## TEAM_BATTING_HR  -0.09532668  0.42241683     0.39627593      0.25099045
## TEAM_BATTING_BB   0.04702541  0.46868793     0.19735234      0.19749256
## TEAM_BATTING_SO   0.04132959 -0.22889273    -0.34174328     -0.06415123
## TEAM_BASERUN_SB  -0.03159666  0.01483639     0.07167495     -0.18768279
## TEAM_BASERUN_CS  -0.07692325 -0.17875598    -0.09377545     -0.20413884
## TEAM_BATTING_HBP  0.07719303  0.07350424    -0.02911218      0.04608475
## TEAM_PITCHING_H  -0.08865725  0.47123431     0.99919269      0.56045355
## TEAM_PITCHING_HR -0.09361594  0.42246683     0.39495630      0.24999875
## TEAM_PITCHING_BB  0.04958287  0.46839882     0.19529071      0.19592157
## TEAM_PITCHING_SO  0.04466127 -0.22936481    -0.34445001     -0.06616615
## TEAM_FIELDING_E  -0.02004841 -0.38668800    -0.25381638     -0.19427027
## TEAM_FIELDING_DP  0.13168916 -0.19586601     0.01776946     -0.02488808
##                  TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB
## INDEX                -0.04639454     -0.09532668      0.04702541
## TARGET_WINS          -0.12434586      0.42241683      0.46868793
## TEAM_BATTING_H        0.21391883      0.39627593      0.19735234
## TEAM_BATTING_2B       0.04203441      0.25099045      0.19749256
## TEAM_BATTING_3B       1.00000000     -0.21879927     -0.20584392
## TEAM_BATTING_HR      -0.21879927      1.00000000      0.45638161
## TEAM_BATTING_BB      -0.20584392      0.45638161      1.00000000
## TEAM_BATTING_SO      -0.19291841      0.21045444      0.21833871
## TEAM_BASERUN_SB       0.16946086     -0.19021893     -0.08806123
## TEAM_BASERUN_CS       0.23213978     -0.27579838     -0.20878051
## TEAM_BATTING_HBP     -0.17424715      0.10618116      0.04746007
## TEAM_PITCHING_H       0.21250322      0.39549390      0.19848687
## TEAM_PITCHING_HR     -0.21973263      0.99993259      0.45659283
## TEAM_PITCHING_BB     -0.20675383      0.45542468      0.99988140
## TEAM_PITCHING_SO     -0.19386654      0.20829574      0.21793253
## TEAM_FIELDING_E      -0.06513145      0.01567397     -0.07847126
## TEAM_FIELDING_DP      0.13314758     -0.06182222     -0.07929078
##                  TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS
## INDEX                 0.04132959     -0.03159666    -0.076923249
## TARGET_WINS          -0.22889273      0.01483639    -0.178755979
## TEAM_BATTING_H       -0.34174328      0.07167495    -0.093775445
## TEAM_BATTING_2B      -0.06415123     -0.18768279    -0.204138837
## TEAM_BATTING_3B      -0.19291841      0.16946086     0.232139777
## TEAM_BATTING_HR       0.21045444     -0.19021893    -0.275798375
## TEAM_BATTING_BB       0.21833871     -0.08806123    -0.208780510
## TEAM_BATTING_SO       1.00000000     -0.07475974    -0.056130355
## TEAM_BASERUN_SB      -0.07475974      1.00000000     0.624737808
## TEAM_BASERUN_CS      -0.05613035      0.62473781     1.000000000
## TEAM_BATTING_HBP      0.22094219     -0.06400498    -0.070513896
## TEAM_PITCHING_H      -0.34145321      0.07395373    -0.092977893
## TEAM_PITCHING_HR      0.21111617     -0.18948057    -0.275471495
## TEAM_PITCHING_BB      0.21895783     -0.08741902    -0.208470154
## TEAM_PITCHING_SO      0.99976835     -0.07351325    -0.055308336
## TEAM_FIELDING_E       0.30814540      0.04292341     0.207701189
## TEAM_FIELDING_DP     -0.12319072     -0.13023054    -0.006764233
##                  TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## INDEX                  0.07719303     -0.08865725      -0.09361594
## TARGET_WINS            0.07350424      0.47123431       0.42246683
## TEAM_BATTING_H        -0.02911218      0.99919269       0.39495630
## TEAM_BATTING_2B        0.04608475      0.56045355       0.24999875
## TEAM_BATTING_3B       -0.17424715      0.21250322      -0.21973263
## TEAM_BATTING_HR        0.10618116      0.39549390       0.99993259
## TEAM_BATTING_BB        0.04746007      0.19848687       0.45659283
## TEAM_BATTING_SO        0.22094219     -0.34145321       0.21111617
## TEAM_BASERUN_SB       -0.06400498      0.07395373      -0.18948057
## TEAM_BASERUN_CS       -0.07051390     -0.09297789      -0.27547150
## TEAM_BATTING_HBP       1.00000000     -0.02769699       0.10675878
## TEAM_PITCHING_H       -0.02769699      1.00000000       0.39463199
## TEAM_PITCHING_HR       0.10675878      0.39463199       1.00000000
## TEAM_PITCHING_BB       0.04785137      0.19703302       0.45580983
## TEAM_PITCHING_SO       0.22157375     -0.34330646       0.20920115
## TEAM_FIELDING_E        0.04178971     -0.25073028       0.01689330
## TEAM_FIELDING_DP      -0.07120824      0.01416807      -0.06292475
##                  TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E
## INDEX                  0.04958287       0.04466127     -0.02004841
## TARGET_WINS            0.46839882      -0.22936481     -0.38668800
## TEAM_BATTING_H         0.19529071      -0.34445001     -0.25381638
## TEAM_BATTING_2B        0.19592157      -0.06616615     -0.19427027
## TEAM_BATTING_3B       -0.20675383      -0.19386654     -0.06513145
## TEAM_BATTING_HR        0.45542468       0.20829574      0.01567397
## TEAM_BATTING_BB        0.99988140       0.21793253     -0.07847126
## TEAM_BATTING_SO        0.21895783       0.99976835      0.30814540
## TEAM_BASERUN_SB       -0.08741902      -0.07351325      0.04292341
## TEAM_BASERUN_CS       -0.20847015      -0.05530834      0.20770119
## TEAM_BATTING_HBP       0.04785137       0.22157375      0.04178971
## TEAM_PITCHING_H        0.19703302      -0.34330646     -0.25073028
## TEAM_PITCHING_HR       0.45580983       0.20920115      0.01689330
## TEAM_PITCHING_BB       1.00000000       0.21887700     -0.07692315
## TEAM_PITCHING_SO       0.21887700       1.00000000      0.31008407
## TEAM_FIELDING_E       -0.07692315       0.31008407      1.00000000
## TEAM_FIELDING_DP      -0.08040645      -0.12492321      0.04020581
##                  TEAM_FIELDING_DP
## INDEX                 0.131689160
## TARGET_WINS          -0.195866006
## TEAM_BATTING_H        0.017769456
## TEAM_BATTING_2B      -0.024888081
## TEAM_BATTING_3B       0.133147578
## TEAM_BATTING_HR      -0.061822219
## TEAM_BATTING_BB      -0.079290775
## TEAM_BATTING_SO      -0.123190715
## TEAM_BASERUN_SB      -0.130230537
## TEAM_BASERUN_CS      -0.006764233
## TEAM_BATTING_HBP     -0.071208241
## TEAM_PITCHING_H       0.014168073
## TEAM_PITCHING_HR     -0.062924751
## TEAM_PITCHING_BB     -0.080406452
## TEAM_PITCHING_SO     -0.124923213
## TEAM_FIELDING_E       0.040205814
## TEAM_FIELDING_DP      1.000000000

Part 2. Data Preparation

New Variable Creation

Creating a new column for batting singles and eliminating hits for batting

mb_e <- read.csv("https://raw.githubusercontent.com/spsstudent15/2016-02-621-W1/master/moneyball-training-data.csv") 
#eliminate index column
mb_e1 <- mb_e[,-1]

#add singles column for hitting
mb_e1$TEAM_BATTING_1B <- as.numeric(mb_e1$TEAM_BATTING_H-mb_e1$TEAM_BATTING_2B-mb_e1$TEAM_BATTING_3B-mb_e1$TEAM_BATTING_HR)
mb_e1 <- mb_e1[,-2]
mb_e1 <- as.data.frame(mb_e1)

Building a regression model and filling in NA’s for SB.

Note: This approach is suggested in LMAR p. 201. “A more sophisticated alternative to mean imputation is to use regression methods to predict the missing values of the covariates.”

SB <- lm(data=mb_e1, TEAM_BASERUN_SB~.)
summary(SB)

#eliminate CS as there are no blank SB's with a value for CS + eliminate pitching, wins and fielding variables
SB1 <- lm(data=mb_e1, TEAM_BASERUN_SB~TEAM_BATTING_1B + TEAM_BATTING_2B + TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB)
summary(SB1)

#eliminate singles
SB2 <- lm(data=mb_e1, TEAM_BASERUN_SB~TEAM_BATTING_2B + TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB)
summary(SB2)

#fill in NA for SB
mb_e2 <- mb_e1
mb_e2$TEAM_BASERUN_SB[is.na(mb_e2$TEAM_BASERUN_SB)]<-round(79.36805-0.19419*mb_e2$TEAM_BATTING_2B+1.41686*mb_e2$TEAM_BATTING_3B-0.24513*mb_e2$TEAM_BATTING_HR+0.08060*mb_e2$TEAM_BATTING_BB)

Building a linear regression model and filling in NA’s for CS

CS <- lm(data=mb_e2, TEAM_BASERUN_CS~.)
summary(CS)

#eliminate wins, pitching and fielding 
CS1 <- lm(data=mb_e2, TEAM_BASERUN_CS~TEAM_BASERUN_SB +TEAM_BATTING_1B +TEAM_BATTING_2B + TEAM_BATTING_3B +TEAM_BATTING_BB + TEAM_BATTING_HR)
summary(CS1)

#eliminate walks
CS2 <- lm(data=mb_e2, TEAM_BASERUN_CS~TEAM_BASERUN_SB +TEAM_BATTING_2B + TEAM_BATTING_3B +TEAM_BATTING_1B + TEAM_BATTING_HR)
summary(CS2)

#fill in NA for CS
mb_e3 <- mb_e2
mb_e3$TEAM_BASERUN_CS[is.na(mb_e3$TEAM_BASERUN_CS)]<-round(49.356793+0.322543*mb_e3$TEAM_BASERUN_SB-0.044486*mb_e3$TEAM_BATTING_2B+0.281124*mb_e3$TEAM_BATTING_3B-0.10797*mb_e3$TEAM_BATTING_HR-0.014034*mb_e3$TEAM_BATTING_1B)

Building a regression model and filling in NA’s for batting SO’s

BSO <- lm(data=mb_e3, TEAM_BATTING_SO~.)
summary(BSO)

#eliminate fielding and wins and baserunning and HBP and pitching SO's as it contains similar blanks
BSO1 <- lm(data=mb_e3, TEAM_BATTING_SO~TEAM_BATTING_1B + TEAM_BATTING_2B + TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_PITCHING_H + TEAM_PITCHING_BB + TEAM_PITCHING_HR)
summary(BSO1)

#eliminate pitching HR's
BSO2 <- lm(data=mb_e3, TEAM_BATTING_SO~TEAM_BATTING_1B + TEAM_BATTING_2B + TEAM_BATTING_3B + TEAM_BATTING_BB + TEAM_PITCHING_H  + TEAM_PITCHING_BB + TEAM_BATTING_HR)
summary(BSO2)

#fill in NA for batting SO
mb_e4 <- mb_e3
mb_e4$TEAM_BATTING_SO[is.na(mb_e4$TEAM_BATTING_SO)]<-round(1605-0.8434*mb_e4$TEAM_BATTING_1B+0.2832*mb_e4$TEAM_BATTING_2B-1.348*mb_e4$TEAM_BATTING_3B-0.3493*mb_e4$TEAM_BATTING_BB-0.02903*mb_e4$TEAM_PITCHING_H+0.1657*mb_e4$TEAM_PITCHING_BB+1.703*mb_e$TEAM_BATTING_HR)

Building a regression model and filling in NA’s for pitching SO’s

PSO <- lm(data=mb_e4, TEAM_PITCHING_SO~.)
summary(PSO)

#eliminate wins, fielding, baserunning and HBP
PSO1 <- lm(data=mb_e4, TEAM_PITCHING_SO~TEAM_BATTING_1B + TEAM_BATTING_2B + TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_PITCHING_H + TEAM_BATTING_SO + TEAM_PITCHING_BB + TEAM_PITCHING_HR)
summary(PSO1)

#eliminate batting 3B's
PSO2 <- lm(data=mb_e4, TEAM_PITCHING_SO~TEAM_BATTING_1B + TEAM_BATTING_2B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_PITCHING_H + TEAM_BATTING_SO + TEAM_PITCHING_BB + TEAM_PITCHING_HR)
summary(PSO2)

#replace NA with values for pitching SO
mb_e5 <- mb_e4
mb_e5$TEAM_PITCHING_SO[is.na(mb_e5$TEAM_PITCHING_SO)]<-round(4422.87422-0.46455*mb_e5$TEAM_BATTING_1B + 0.72823*mb_e5$TEAM_BATTING_2B +8.06941*mb_e5$TEAM_BATTING_HR -3.45005*mb_e5$TEAM_BATTING_BB-0.01815*mb_e5$TEAM_PITCHING_H+1.20501*mb_e5$TEAM_BATTING_SO+3.03598*mb_e5$TEAM_PITCHING_BB-8.40807*mb_e$TEAM_PITCHING_HR)

Building a linear regression and filling in NA’s for DP

#build a regression model for DP's

DP <- lm(data=mb_e5, TEAM_FIELDING_DP~.)
summary(DP)

#eliminate wins, hitting, HBP

DP1 <- lm(data=mb_e5, TEAM_FIELDING_DP~ TEAM_BASERUN_SB+TEAM_BASERUN_CS+TEAM_PITCHING_H+TEAM_PITCHING_HR+TEAM_PITCHING_BB+TEAM_PITCHING_SO+TEAM_FIELDING_E)
summary(DP1)

#eliminate pitching hits allowed
DP2 <- lm(data=mb_e5, TEAM_FIELDING_DP~ TEAM_BASERUN_SB+TEAM_BASERUN_CS+TEAM_PITCHING_HR+TEAM_PITCHING_BB+TEAM_PITCHING_SO+TEAM_FIELDING_E)
summary(DP2)

#eliminate CS as it makes no sense
DP3 <- lm(data=mb_e5, TEAM_FIELDING_DP~ TEAM_BASERUN_SB+TEAM_PITCHING_HR+TEAM_PITCHING_BB+TEAM_PITCHING_SO+TEAM_FIELDING_E)
summary(DP3)

#replace NA with values for DP
mb_e6 <- mb_e5
mb_e6$TEAM_FIELDING_DP[is.na(mb_e6$TEAM_FIELDING_DP)]<- round(158.8-0.1235*mb_e6$TEAM_BASERUN_SB +0.03320*mb_e6$TEAM_PITCHING_HR+0.02815*mb_e6$TEAM_PITCHING_BB - 0.006109*mb_e6$TEAM_PITCHING_SO - 0.06573*mb_e6$TEAM_FIELDING_E)

summary(mb_e6)
#only NA's left are HBP

Data Imputation

# read EVALUATION data set
eval_data <- read.csv("https://raw.githubusercontent.com/jtopor/CUNY-MSDA-621/master/HW-1/moneyball-evaluation-data.csv")

# read training data set
mb_e <- read.csv("https://raw.githubusercontent.com/jtopor/CUNY-MSDA-621/master/HW-1/moneyball-training-data.csv")

#eliminate index column
# mb_e1 <- mb_e[,-1]
mb_e1 <- mb_e

#####Creating a new column for batting singles and eliminating hits for batting

#add singles column for hitting
mb_e1$TEAM_BATTING_1B <- as.numeric(mb_e1$TEAM_BATTING_H-mb_e1$TEAM_BATTING_2B-mb_e1$TEAM_BATTING_3B-mb_e1$TEAM_BATTING_HR)
mb_e1 <- mb_e1[,-3]
mb_e1 <- as.data.frame(mb_e1)

eval_data$TEAM_BATTING_1B <- as.numeric(eval_data$TEAM_BATTING_H - eval_data$TEAM_BATTING_2B - eval_data$TEAM_BATTING_3B - eval_data$TEAM_BATTING_HR)

# HITS is in second column in eval data
eval_data <- eval_data[,-2]

# ADD A DUMMY COLUMN TO EVAL DATA FOR TARGET WINS
eval_data$TARGET_WINS <- 0

Eliminate HBP, CS, and pitching HR’s.

mb <- mb_e1[,-c(9,10,12)]
# summary(mb)

eval_data <- eval_data[,-c(8,9,11)]
# summary(eval_data)

Build model for batting SO using Gelman approach

#take out double plays + pitching SO + SB as data set is incomplete + Wins as they are not present in the evaluation data

BSO.1 <- lm(data=mb, TEAM_BATTING_SO~. - INDEX -TEAM_FIELDING_DP -TEAM_PITCHING_SO -TEAM_BASERUN_SB -TARGET_WINS)
summary(BSO.1)

## 
## Call:
## lm(formula = TEAM_BATTING_SO ~ . - INDEX - TEAM_FIELDING_DP - 
##     TEAM_PITCHING_SO - TEAM_BASERUN_SB - TARGET_WINS, data = mb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -367.76  -82.46   -3.12   76.44  401.62 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      1978.17499   44.47257  44.481  < 2e-16 ***
## TEAM_BATTING_2B    -0.17459    0.17430  -1.002 0.316646    
## TEAM_BATTING_3B    -2.29879    0.24381  -9.429  < 2e-16 ***
## TEAM_BATTING_HR     0.88912    0.18212   4.882 1.14e-06 ***
## TEAM_BATTING_BB     0.98359    0.43292   2.272 0.023203 *  
## TEAM_PITCHING_H     0.36442    0.15123   2.410 0.016066 *  
## TEAM_PITCHING_BB   -1.17748    0.41011  -2.871 0.004137 ** 
## TEAM_FIELDING_E    -0.29146    0.07594  -3.838 0.000128 ***
## TEAM_BATTING_1B    -1.45473    0.16446  -8.845  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 114.1 on 1826 degrees of freedom
##   (441 observations deleted due to missingness)
## Multiple R-squared:  0.7248, Adjusted R-squared:  0.7236 
## F-statistic: 601.1 on 8 and 1826 DF,  p-value: < 2.2e-16

#eliminate doubles
BSO.2 <- lm(data=mb, TEAM_BATTING_SO~. - INDEX -TEAM_FIELDING_DP -TEAM_PITCHING_SO -TEAM_BASERUN_SB - TARGET_WINS -TEAM_BATTING_2B)
summary(BSO.2)

## 
## Call:
## lm(formula = TEAM_BATTING_SO ~ . - INDEX - TEAM_FIELDING_DP - 
##     TEAM_PITCHING_SO - TEAM_BASERUN_SB - TARGET_WINS - TEAM_BATTING_2B, 
##     data = mb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -366.77  -82.05   -2.74   76.89  399.16 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      1975.11400   44.36750  44.517  < 2e-16 ***
## TEAM_BATTING_3B    -2.17815    0.21199 -10.275  < 2e-16 ***
## TEAM_BATTING_HR     1.02471    0.12183   8.411  < 2e-16 ***
## TEAM_BATTING_BB     0.59635    0.19483   3.061 0.002240 ** 
## TEAM_PITCHING_H     0.22723    0.06414   3.543 0.000406 ***
## TEAM_PITCHING_BB   -0.81040    0.18409  -4.402 1.13e-05 ***
## TEAM_FIELDING_E    -0.26762    0.07212  -3.711 0.000213 ***
## TEAM_BATTING_1B    -1.31541    0.08777 -14.988  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 114.1 on 1827 degrees of freedom
##   (441 observations deleted due to missingness)
## Multiple R-squared:  0.7246, Adjusted R-squared:  0.7236 
## F-statistic: 686.8 on 7 and 1827 DF,  p-value: < 2.2e-16

vif(BSO.2)

##  TEAM_BATTING_3B  TEAM_BATTING_HR  TEAM_BATTING_BB  TEAM_PITCHING_H 
##         2.984944         6.215565        39.110249        17.594555 
## TEAM_PITCHING_BB  TEAM_FIELDING_E  TEAM_BATTING_1B 
##        45.384546         2.448073         8.602393

# vif says remove TEAM_PITCHING_BB
BSO.3 <- lm(data=mb, TEAM_BATTING_SO~. - INDEX -TEAM_FIELDING_DP -TEAM_PITCHING_SO -TEAM_BASERUN_SB - TARGET_WINS -TEAM_BATTING_2B - TEAM_PITCHING_BB)
summary(BSO.3)

## 
## Call:
## lm(formula = TEAM_BATTING_SO ~ . - INDEX - TEAM_FIELDING_DP - 
##     TEAM_PITCHING_SO - TEAM_BASERUN_SB - TARGET_WINS - TEAM_BATTING_2B - 
##     TEAM_PITCHING_BB, data = mb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -385.33  -80.03   -1.99   77.48  399.57 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     1980.01367   44.57596  44.419  < 2e-16 ***
## TEAM_BATTING_3B   -1.79602    0.19437  -9.240  < 2e-16 ***
## TEAM_BATTING_HR    1.42972    0.08027  17.812  < 2e-16 ***
## TEAM_BATTING_BB   -0.24810    0.03431  -7.230 7.06e-13 ***
## TEAM_PITCHING_H   -0.03911    0.02139  -1.828   0.0677 .  
## TEAM_FIELDING_E   -0.29047    0.07229  -4.018 6.11e-05 ***
## TEAM_BATTING_1B   -0.99171    0.04816 -20.591  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 114.6 on 1828 degrees of freedom
##   (441 observations deleted due to missingness)
## Multiple R-squared:  0.7217, Adjusted R-squared:  0.7208 
## F-statistic: 790.1 on 6 and 1828 DF,  p-value: < 2.2e-16

# pvals say remove PITCHING_H
BSO.4 <- lm(data=mb, TEAM_BATTING_SO~. - INDEX -TEAM_FIELDING_DP -TEAM_PITCHING_SO -TEAM_BASERUN_SB - TARGET_WINS -TEAM_BATTING_2B - TEAM_PITCHING_BB - TEAM_PITCHING_H)
summary(BSO.4)

## 
## Call:
## lm(formula = TEAM_BATTING_SO ~ . - INDEX - TEAM_FIELDING_DP - 
##     TEAM_PITCHING_SO - TEAM_BASERUN_SB - TARGET_WINS - TEAM_BATTING_2B - 
##     TEAM_PITCHING_BB - TEAM_PITCHING_H, data = mb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -384.09  -79.88   -1.51   77.07  393.44 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     1986.07279   44.48099  44.650  < 2e-16 ***
## TEAM_BATTING_3B   -1.86025    0.19129  -9.725  < 2e-16 ***
## TEAM_BATTING_HR    1.37824    0.07521  18.325  < 2e-16 ***
## TEAM_BATTING_BB   -0.25047    0.03431  -7.300 4.28e-13 ***
## TEAM_FIELDING_E   -0.29947    0.07217  -4.149 3.49e-05 ***
## TEAM_BATTING_1B   -1.04331    0.03905 -26.719  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 114.7 on 1829 degrees of freedom
##   (441 observations deleted due to missingness)
## Multiple R-squared:  0.7212, Adjusted R-squared:  0.7204 
## F-statistic: 946.2 on 5 and 1829 DF,  p-value: < 2.2e-16

vif(BSO.4)

## TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB TEAM_FIELDING_E 
##        2.403283        2.342414        1.199339        2.424099 
## TEAM_BATTING_1B 
##        1.683596

##All p-values are low with a 686.8 F-statistic and adjusted R squared of 0.7236
#take a look
par(mfrow=c(2,2))
plot(BSO.2)

# ---------------------------------------
# function definition for impute function
impute <- function (a, a.impute){
  ifelse (is.na(a), a.impute,a)
}
# ---------------------------------------

#prediction function
pred.BSO <- round(predict(BSO.4, mb))
BSO.imp <- impute(mb$TEAM_BATTING_SO, pred.BSO)

# impute the evaluation data
pred_eval.BSO <- round(predict(BSO.4, eval_data))
eval.BSO.imp <- impute(eval_data$TEAM_BATTING_SO, pred_eval.BSO)

###################################################
# Jims added code for diagnostics of imputation

# first, check summaries to ensure similar values
summary(mb$TEAM_BATTING_SO)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     0.0   548.0   750.0   735.6   930.0  1399.0     102

summary(BSO.imp)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0   543.8   730.0   727.6   925.0  1399.0

# now plot side-by-side histograms to check similarity of distributions
par(mfrow = c(2,2))
hist(mb$TEAM_BATTING_SO, breaks = 200)
hist(BSO.imp, breaks = 200)

# ------------------ eval data checks ------------------------
# first, check summaries to ensure similar values
summary(eval_data$TEAM_BATTING_SO)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     0.0   545.0   686.0   709.3   912.0  1268.0      18

summary(eval.BSO.imp)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0   532.5   677.0   699.8   904.5  1268.0

# now plot side-by-side histograms to check similarity of distributions
par(mfrow = c(2,2))

hist(eval_data$TEAM_BATTING_SO, breaks = 30)
hist(eval.BSO.imp, breaks = 30)
###################################################

# update dataframes with imputed values
mb1 <- mb
mb1$TEAM_BATTING_SO <- BSO.imp

eval_data.1 <- eval_data
eval_data.1$TEAM_BATTING_SO <- eval.BSO.imp

Build model for Pitching SO

#take out double plays + SB as data set is incomplete and wins as they are not present in evaluation data

PSO.1 <- lm(data=mb1, TEAM_PITCHING_SO~. - INDEX -TEAM_FIELDING_DP -TEAM_BASERUN_SB - TARGET_WINS)
summary(PSO.1)

## 
## Call:
## lm(formula = TEAM_PITCHING_SO ~ . - INDEX - TEAM_FIELDING_DP - 
##     TEAM_BASERUN_SB - TARGET_WINS, data = mb1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -130.988   -3.674    1.299    4.675  147.753 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       0.693339   8.122849   0.085    0.932    
## TEAM_BATTING_2B  -0.291561   0.022062 -13.216  < 2e-16 ***
## TEAM_BATTING_3B  -0.268622   0.031593  -8.503  < 2e-16 ***
## TEAM_BATTING_HR  -0.269698   0.023194 -11.628  < 2e-16 ***
## TEAM_BATTING_BB  -0.883985   0.054857 -16.114  < 2e-16 ***
## TEAM_BATTING_SO   1.044883   0.002961 352.860  < 2e-16 ***
## TEAM_PITCHING_H   0.270090   0.019167  14.092  < 2e-16 ***
## TEAM_PITCHING_BB  0.850575   0.052011  16.354  < 2e-16 ***
## TEAM_FIELDING_E  -0.057521   0.009648  -5.962 2.99e-09 ***
## TEAM_BATTING_1B  -0.280665   0.021251 -13.207  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.43 on 1825 degrees of freedom
##   (441 observations deleted due to missingness)
## Multiple R-squared:  0.9958, Adjusted R-squared:  0.9958 
## F-statistic: 4.809e+04 on 9 and 1825 DF,  p-value: < 2.2e-16

vif(PSO.1)

##  TEAM_BATTING_2B  TEAM_BATTING_3B  TEAM_BATTING_HR  TEAM_BATTING_BB 
##         7.903547         4.140572        14.070952       193.640509 
##  TEAM_BATTING_SO  TEAM_PITCHING_H TEAM_PITCHING_BB  TEAM_FIELDING_E 
##         3.633459        98.139615       226.271134         2.736403 
##  TEAM_BATTING_1B 
##        31.499003

# vif says remove TEAM_PITCHING_BB

PSO.2 <- lm(data=mb1, TEAM_PITCHING_SO~. - INDEX -TEAM_FIELDING_DP -TEAM_BASERUN_SB - TARGET_WINS - TEAM_PITCHING_BB)
summary(PSO.2)

## 
## Call:
## lm(formula = TEAM_PITCHING_SO ~ . - INDEX - TEAM_FIELDING_DP - 
##     TEAM_BASERUN_SB - TARGET_WINS - TEAM_PITCHING_BB, data = mb1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -126.748   -4.347    1.094    5.334  149.232 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     11.722036   8.665285   1.353   0.1763    
## TEAM_BATTING_2B -0.613714   0.010633 -57.719   <2e-16 ***
## TEAM_BATTING_3B -0.578732   0.027050 -21.395   <2e-16 ***
## TEAM_BATTING_HR -0.601828   0.011993 -50.181   <2e-16 ***
## TEAM_BATTING_BB  0.010252   0.004704   2.180   0.0294 *  
## TEAM_BATTING_SO  1.041636   0.003163 329.346   <2e-16 ***
## TEAM_PITCHING_H  0.580032   0.003060 189.545   <2e-16 ***
## TEAM_FIELDING_E -0.097555   0.009990  -9.765   <2e-16 ***
## TEAM_BATTING_1B -0.610139   0.007239 -84.289   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.45 on 1826 degrees of freedom
##   (441 observations deleted due to missingness)
## Multiple R-squared:  0.9952, Adjusted R-squared:  0.9952 
## F-statistic: 4.719e+04 on 8 and 1826 DF,  p-value: < 2.2e-16

vif(PSO.2)

## TEAM_BATTING_2B TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB 
##        1.602077        2.648887        3.282938        1.242359 
## TEAM_BATTING_SO TEAM_PITCHING_H TEAM_FIELDING_E TEAM_BATTING_1B 
##        3.617130        2.183095        2.560245        3.189218

#all low P value and F statistic of 4719 with adj R squared of 0.9952
#take a look
par(mfrow=c(2,2))
plot(PSO.2)

#place back in the data base with imputed data for SO's
pred.PSO <- round(predict(PSO.2, mb1))
PSO.imp <- impute(mb1$TEAM_PITCHING_SO, pred.PSO)

# impute the evaluation data
pred_eval.PSO <- round(predict(PSO.2, eval_data.1))
eval.PSO.imp <- impute(eval_data.1$TEAM_PITCHING_SO, pred_eval.PSO)

###################################################
# Jims added code for diagnostics of imputation

# first, check summaries to ensure similar values
summary(mb1$TEAM_PITCHING_SO)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     0.0   615.0   813.5   817.7   968.0 19280.0     102

summary(PSO.imp)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0   606.0   797.0   807.4   957.0 19280.0

# now plot side-by-side histograms to check similarity of distributions
par(mfrow = c(2,2))
hist(mb1$TEAM_PITCHING_SO, breaks = 200)
hist(PSO.imp, breaks = 200)

# ------------------ eval data checks ------------------------
# first, check summaries to ensure similar values
summary(eval_data.1$TEAM_PITCHING_SO)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     0.0   613.0   745.0   799.7   938.0  9963.0      18

summary(eval.PSO.imp)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0   596.0   728.0   785.4   927.5  9963.0

# now plot side-by-side histograms to check similarity of distributions
par(mfrow = c(2,2))

hist(eval_data.1$TEAM_PITCHING_SO, breaks = 30)
hist(eval.PSO.imp, breaks = 30)

###################################################

# update dataframes with imputed values 

mb2 <- mb1
mb2$TEAM_PITCHING_SO <- PSO.imp

eval_data.2 <- eval_data.1
eval_data.2$TEAM_PITCHING_SO <- eval.PSO.imp

Build model for SB

#Take out DP as incomplete data and target wins
SB.1 <- lm(data=mb2, TEAM_BASERUN_SB~. -INDEX -TEAM_FIELDING_DP - TARGET_WINS)
summary(SB.1)

## 
## Call:
## lm(formula = TEAM_BASERUN_SB ~ . - INDEX - TEAM_FIELDING_DP - 
##     TARGET_WINS, data = mb2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -115.054  -32.496   -5.714   28.546  206.666 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -181.56290   25.61997  -7.087 1.92e-12 ***
## TEAM_BATTING_2B    -0.21829    0.07305  -2.988 0.002841 ** 
## TEAM_BATTING_3B     0.31700    0.10053   3.153 0.001640 ** 
## TEAM_BATTING_HR    -0.61700    0.07599  -8.119 8.27e-16 ***
## TEAM_BATTING_BB     0.12841    0.18355   0.700 0.484255    
## TEAM_BATTING_SO     0.43941    0.07872   5.582 2.71e-08 ***
## TEAM_PITCHING_H     0.20258    0.06368   3.181 0.001489 ** 
## TEAM_PITCHING_BB   -0.08197    0.17430  -0.470 0.638206    
## TEAM_PITCHING_SO   -0.25116    0.07480  -3.358 0.000801 ***
## TEAM_FIELDING_E     0.32079    0.02796  11.473  < 2e-16 ***
## TEAM_BATTING_1B    -0.12224    0.07046  -1.735 0.082937 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 46.26 on 1926 degrees of freedom
##   (339 observations deleted due to missingness)
## Multiple R-squared:  0.3552, Adjusted R-squared:  0.3518 
## F-statistic: 106.1 on 10 and 1926 DF,  p-value: < 2.2e-16

#eliminate pitching BB's
SB.2 <- lm(data=mb2, TEAM_BASERUN_SB~. -INDEX -TEAM_FIELDING_DP -TEAM_PITCHING_BB - TARGET_WINS)
summary(SB.2)

## 
## Call:
## lm(formula = TEAM_BASERUN_SB ~ . - INDEX - TEAM_FIELDING_DP - 
##     TEAM_PITCHING_BB - TARGET_WINS, data = mb2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -115.234  -32.317   -5.706   28.512  206.540 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -182.64502   25.51127  -7.159 1.15e-12 ***
## TEAM_BATTING_2B    -0.19473    0.05316  -3.663 0.000256 ***
## TEAM_BATTING_3B     0.34027    0.08749   3.889 0.000104 ***
## TEAM_BATTING_HR    -0.59236    0.05503 -10.765  < 2e-16 ***
## TEAM_BATTING_BB     0.04234    0.01376   3.076 0.002127 ** 
## TEAM_BATTING_SO     0.45256    0.07357   6.151 9.32e-10 ***
## TEAM_PITCHING_H     0.17992    0.04164   4.321 1.63e-05 ***
## TEAM_PITCHING_SO   -0.26348    0.07004  -3.762 0.000174 ***
## TEAM_FIELDING_E     0.32263    0.02768  11.655  < 2e-16 ***
## TEAM_BATTING_1B    -0.09787    0.04774  -2.050 0.040499 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 46.25 on 1927 degrees of freedom
##   (339 observations deleted due to missingness)
## Multiple R-squared:  0.3551, Adjusted R-squared:  0.3521 
## F-statistic: 117.9 on 9 and 1927 DF,  p-value: < 2.2e-16

#eliminate singles
SB.3 <- lm(data=mb2, TEAM_BASERUN_SB~. - INDEX -TEAM_FIELDING_DP -TEAM_PITCHING_BB -TEAM_BATTING_1B - TARGET_WINS)
summary(SB.3)

## 
## Call:
## lm(formula = TEAM_BASERUN_SB ~ . - INDEX - TEAM_FIELDING_DP - 
##     TEAM_PITCHING_BB - TEAM_BATTING_1B - TARGET_WINS, data = mb2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -113.836  -32.731   -5.467   28.523  205.971 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -201.72737   23.77215  -8.486  < 2e-16 ***
## TEAM_BATTING_2B    -0.11845    0.03799  -3.118  0.00185 ** 
## TEAM_BATTING_3B     0.40962    0.08075   5.073 4.30e-07 ***
## TEAM_BATTING_HR    -0.51178    0.03855 -13.277  < 2e-16 ***
## TEAM_BATTING_BB     0.04187    0.01377   3.040  0.00240 ** 
## TEAM_BATTING_SO     0.32253    0.03731   8.645  < 2e-16 ***
## TEAM_PITCHING_H     0.10155    0.01650   6.154 9.17e-10 ***
## TEAM_PITCHING_SO   -0.13495    0.03125  -4.318 1.66e-05 ***
## TEAM_FIELDING_E     0.33595    0.02693  12.475  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 46.29 on 1928 degrees of freedom
##   (339 observations deleted due to missingness)
## Multiple R-squared:  0.3537, Adjusted R-squared:  0.351 
## F-statistic: 131.9 on 8 and 1928 DF,  p-value: < 2.2e-16

#simplify the model by taking out pitching
SB.4 <- lm(data=mb2, TEAM_BASERUN_SB~. - INDEX -TEAM_FIELDING_DP -TEAM_PITCHING_BB -TEAM_BATTING_1B - TARGET_WINS - TEAM_PITCHING_SO - TEAM_PITCHING_H)
summary(SB.4)

## 
## Call:
## lm(formula = TEAM_BASERUN_SB ~ . - INDEX - TEAM_FIELDING_DP - 
##     TEAM_PITCHING_BB - TEAM_BATTING_1B - TARGET_WINS - TEAM_PITCHING_SO - 
##     TEAM_PITCHING_H, data = mb2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -117.914  -33.217   -5.623   29.822  199.657 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -83.147587  14.021226  -5.930 3.58e-09 ***
## TEAM_BATTING_2B   0.041592   0.028337   1.468   0.1423    
## TEAM_BATTING_3B   0.572336   0.077093   7.424 1.70e-13 ***
## TEAM_BATTING_HR  -0.405437   0.034672 -11.694  < 2e-16 ***
## TEAM_BATTING_BB   0.035111   0.013862   2.533   0.0114 *  
## TEAM_BATTING_SO   0.151522   0.008054  18.814  < 2e-16 ***
## TEAM_FIELDING_E   0.360947   0.026930  13.403  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 46.77 on 1930 degrees of freedom
##   (339 observations deleted due to missingness)
## Multiple R-squared:  0.3395, Adjusted R-squared:  0.3374 
## F-statistic: 165.3 on 6 and 1930 DF,  p-value: < 2.2e-16

#add singles back in
SB.5 <- lm(data=mb2, TEAM_BASERUN_SB~. - INDEX -TEAM_FIELDING_DP -TEAM_PITCHING_BB - TARGET_WINS - TEAM_PITCHING_SO - TEAM_PITCHING_H)
summary(SB.5)

## 
## Call:
## lm(formula = TEAM_BASERUN_SB ~ . - INDEX - TEAM_FIELDING_DP - 
##     TEAM_PITCHING_BB - TARGET_WINS - TEAM_PITCHING_SO - TEAM_PITCHING_H, 
##     data = mb2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -114.276  -33.011   -5.119   28.546  203.041 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -1.872e+02  2.563e+01  -7.305 4.05e-13 ***
## TEAM_BATTING_2B -1.075e-03  2.952e-02  -0.036  0.97096    
## TEAM_BATTING_3B  5.195e-01  7.742e-02   6.710 2.55e-11 ***
## TEAM_BATTING_HR -4.080e-01  3.448e-02 -11.834  < 2e-16 ***
## TEAM_BATTING_BB  3.995e-02  1.382e-02   2.891  0.00388 ** 
## TEAM_BATTING_SO  1.762e-01  9.496e-03  18.558  < 2e-16 ***
## TEAM_FIELDING_E  3.577e-01  2.678e-02  13.356  < 2e-16 ***
## TEAM_BATTING_1B  9.198e-02  1.901e-02   4.839 1.41e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 46.5 on 1929 degrees of freedom
##   (339 observations deleted due to missingness)
## Multiple R-squared:  0.3474, Adjusted R-squared:  0.345 
## F-statistic: 146.7 on 7 and 1929 DF,  p-value: < 2.2e-16

#eliminate doubles
SB.6 <- lm(data=mb2, TEAM_BASERUN_SB~. - INDEX -TEAM_FIELDING_DP -TEAM_PITCHING_BB - TARGET_WINS - TEAM_PITCHING_SO - TEAM_PITCHING_H - TEAM_BATTING_2B)
summary(SB.6)

## 
## Call:
## lm(formula = TEAM_BASERUN_SB ~ . - INDEX - TEAM_FIELDING_DP - 
##     TEAM_PITCHING_BB - TARGET_WINS - TEAM_PITCHING_SO - TEAM_PITCHING_H - 
##     TEAM_BATTING_2B, data = mb2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -114.233  -32.996   -5.134   28.584  203.038 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -1.872e+02  2.561e+01  -7.309 3.91e-13 ***
## TEAM_BATTING_3B  5.190e-01  7.610e-02   6.820 1.21e-11 ***
## TEAM_BATTING_HR -4.084e-01  3.261e-02 -12.522  < 2e-16 ***
## TEAM_BATTING_BB  3.991e-02  1.376e-02   2.900  0.00377 ** 
## TEAM_BATTING_SO  1.762e-01  9.475e-03  18.596  < 2e-16 ***
## TEAM_FIELDING_E  3.579e-01  2.624e-02  13.642  < 2e-16 ***
## TEAM_BATTING_1B  9.177e-02  1.813e-02   5.060 4.58e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 46.49 on 1930 degrees of freedom
##   (339 observations deleted due to missingness)
## Multiple R-squared:  0.3474, Adjusted R-squared:  0.3454 
## F-statistic: 171.2 on 6 and 1930 DF,  p-value: < 2.2e-16

#eliminate walks
SB.7 <- lm(data=mb2, TEAM_BASERUN_SB~. - INDEX -TEAM_FIELDING_DP -TEAM_PITCHING_BB - TARGET_WINS - TEAM_PITCHING_SO - TEAM_PITCHING_H - TEAM_BATTING_2B - TEAM_BATTING_BB)
summary(SB.7)

## 
## Call:
## lm(formula = TEAM_BASERUN_SB ~ . - INDEX - TEAM_FIELDING_DP - 
##     TEAM_PITCHING_BB - TARGET_WINS - TEAM_PITCHING_SO - TEAM_PITCHING_H - 
##     TEAM_BATTING_2B - TEAM_BATTING_BB, data = mb2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -114.972  -32.412   -5.034   29.206  206.559 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -1.609e+02  2.400e+01  -6.705 2.63e-11 ***
## TEAM_BATTING_3B  5.197e-01  7.624e-02   6.816 1.25e-11 ***
## TEAM_BATTING_HR -3.807e-01  3.124e-02 -12.185  < 2e-16 ***
## TEAM_BATTING_SO  1.715e-01  9.351e-03  18.336  < 2e-16 ***
## TEAM_FIELDING_E  3.453e-01  2.592e-02  13.320  < 2e-16 ***
## TEAM_BATTING_1B  8.923e-02  1.815e-02   4.917 9.54e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 46.58 on 1931 degrees of freedom
##   (339 observations deleted due to missingness)
## Multiple R-squared:  0.3446, Adjusted R-squared:  0.3429 
## F-statistic:   203 on 5 and 1931 DF,  p-value: < 2.2e-16

#all low P value and F statistic of 202.9 with adj R squared of 0.3427
#take a look
par(mfrow=c(2,2))
plot(SB.7)

#place back in the data base with imputed data for SB's
pred.SB <- round(predict(SB.7, mb2))
SB.imp <- impute(mb2$TEAM_BASERUN_SB, pred.SB)

# impute the evaluation data
pred_eval.SB <- round(predict(SB.7, eval_data.2))
eval.SB.imp <- impute(eval_data.2$TEAM_BASERUN_SB, pred_eval.SB)

###################################################
# Jims added code for diagnostics of imputation

# first, check summaries to ensure similar values
summary(mb2$TEAM_BASERUN_SB)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     0.0    66.0   101.0   124.8   156.0   697.0     131

summary(SB.imp)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    67.0   106.0   137.8   172.0   697.0

# now plot side-by-side histograms to check similarity of distributions
par(mfrow = c(2,2))
hist(mb2$TEAM_BASERUN_SB, breaks = 200)
hist(SB.imp, breaks = 200)

# ------------------ eval data checks ------------------------
# first, check summaries to ensure similar values
summary(eval_data.2$TEAM_BASERUN_SB)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     0.0    59.0    92.0   123.7   151.8   580.0      13

summary(eval.SB.imp)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    60.5    96.0   134.9   164.5   580.0

# now plot side-by-side histograms to check similarity of distributions
par(mfrow = c(2,2))

hist(eval_data.2$TEAM_BASERUN_SB, breaks = 30)
hist(eval.SB.imp, breaks = 30)
###################################################

# update dataframes with imputed values
mb3 <- mb2
mb3$TEAM_BASERUN_SB <- SB.imp

eval_data.3 <- eval_data.2
eval_data.3$TEAM_BASERUN_SB <- eval.SB.imp

Build model to replace DP

#remove target wins
DP.1 <- lm(data=mb3, TEAM_FIELDING_DP~. - INDEX -TARGET_WINS)
summary(DP.1)

## 
## Call:
## lm(formula = TEAM_FIELDING_DP ~ . - INDEX - TARGET_WINS, data = mb3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -56.685 -13.815  -0.923  13.056  64.642 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      119.570410  11.008885  10.861  < 2e-16 ***
## TEAM_BATTING_2B    0.015947   0.013100   1.217  0.22360    
## TEAM_BATTING_3B   -0.095545   0.031158  -3.066  0.00220 ** 
## TEAM_BATTING_HR    0.062465   0.016446   3.798  0.00015 ***
## TEAM_BATTING_BB    0.195238   0.027750   7.036 2.72e-12 ***
## TEAM_BATTING_SO   -0.107809   0.015781  -6.831 1.12e-11 ***
## TEAM_BASERUN_SB   -0.126130   0.009838 -12.820  < 2e-16 ***
## TEAM_PITCHING_H    0.009159   0.004069   2.251  0.02448 *  
## TEAM_PITCHING_BB  -0.153349   0.025468  -6.021 2.06e-09 ***
## TEAM_PITCHING_SO   0.090351   0.014318   6.310 3.43e-10 ***
## TEAM_FIELDING_E   -0.068602   0.009653  -7.107 1.65e-12 ***
## TEAM_BATTING_1B    0.024316   0.010083   2.412  0.01598 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.05 on 1978 degrees of freedom
##   (286 observations deleted due to missingness)
## Multiple R-squared:  0.4185, Adjusted R-squared:  0.4153 
## F-statistic: 129.4 on 11 and 1978 DF,  p-value: < 2.2e-16

#remove batting 2B's
DP.2 <- lm(data=mb3, TEAM_FIELDING_DP~. - INDEX -TARGET_WINS - TEAM_BATTING_2B)
summary(DP.2)

## 
## Call:
## lm(formula = TEAM_FIELDING_DP ~ . - INDEX - TARGET_WINS - TEAM_BATTING_2B, 
##     data = mb3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -56.620 -14.051  -1.043  12.979  64.061 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      120.547069  10.980952  10.978  < 2e-16 ***
## TEAM_BATTING_3B   -0.087844   0.030513  -2.879  0.00403 ** 
## TEAM_BATTING_HR    0.065419   0.016268   4.021 6.00e-05 ***
## TEAM_BATTING_BB    0.200115   0.027463   7.287 4.56e-13 ***
## TEAM_BATTING_SO   -0.107359   0.015779  -6.804 1.34e-11 ***
## TEAM_BASERUN_SB   -0.126208   0.009839 -12.827  < 2e-16 ***
## TEAM_PITCHING_H    0.010913   0.003806   2.868  0.00418 ** 
## TEAM_PITCHING_BB  -0.157287   0.025265  -6.226 5.84e-10 ***
## TEAM_PITCHING_SO   0.090027   0.014318   6.288 3.95e-10 ***
## TEAM_FIELDING_E   -0.072227   0.009183  -7.865 6.01e-15 ***
## TEAM_BATTING_1B    0.024008   0.010081   2.381  0.01734 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.06 on 1979 degrees of freedom
##   (286 observations deleted due to missingness)
## Multiple R-squared:  0.4181, Adjusted R-squared:  0.4152 
## F-statistic: 142.2 on 10 and 1979 DF,  p-value: < 2.2e-16

# results show that EVERYTHING ELSE is statistically signficant, so:

# run vif to check for collinearity
vif(DP.2)

##  TEAM_BATTING_3B  TEAM_BATTING_HR  TEAM_BATTING_BB  TEAM_BATTING_SO 
##         2.378033         4.489647        40.081217        61.937276 
##  TEAM_BASERUN_SB  TEAM_PITCHING_H TEAM_PITCHING_BB TEAM_PITCHING_SO 
##         2.207539        10.533005        33.459414        51.795257 
##  TEAM_FIELDING_E  TEAM_BATTING_1B 
##         6.392623         4.748132

# results show TEAM_BATTING_SO should be removed

# remove TEAM_BATTING_SO
DP.3 <- lm(data=mb3, TEAM_FIELDING_DP~. - INDEX -TARGET_WINS -TEAM_BATTING_2B - TEAM_BATTING_SO)
summary(DP.3)

## 
## Call:
## lm(formula = TEAM_FIELDING_DP ~ . - INDEX - TARGET_WINS - TEAM_BATTING_2B - 
##     TEAM_BATTING_SO, data = mb3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -70.858 -14.365  -0.308  13.180  68.663 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      107.364168  10.931590   9.821  < 2e-16 ***
## TEAM_BATTING_3B   -0.075566   0.030806  -2.453  0.01425 *  
## TEAM_BATTING_HR    0.049820   0.016289   3.059  0.00225 ** 
## TEAM_BATTING_BB    0.059620   0.018312   3.256  0.00115 ** 
## TEAM_BASERUN_SB   -0.135254   0.009860 -13.717  < 2e-16 ***
## TEAM_PITCHING_H    0.016359   0.003763   4.348 1.45e-05 ***
## TEAM_PITCHING_BB  -0.022297   0.015820  -1.409  0.15887    
## TEAM_PITCHING_SO  -0.003527   0.004037  -0.874  0.38244    
## TEAM_FIELDING_E   -0.057520   0.009027  -6.372 2.31e-10 ***
## TEAM_BATTING_1B    0.020596   0.010183   2.023  0.04326 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.28 on 1980 degrees of freedom
##   (286 observations deleted due to missingness)
## Multiple R-squared:  0.4045, Adjusted R-squared:  0.4018 
## F-statistic: 149.4 on 9 and 1980 DF,  p-value: < 2.2e-16

# p-value says remove TEAM_PITCHING_SO;


# remove TEAM_PITCHING_SO
DP.4 <- lm(data=mb3, TEAM_FIELDING_DP~. - INDEX -TEAM_BATTING_2B -TARGET_WINS -TEAM_BATTING_2B - TEAM_BATTING_SO - TEAM_PITCHING_SO)
summary(DP.4)

## 
## Call:
## lm(formula = TEAM_FIELDING_DP ~ . - INDEX - TEAM_BATTING_2B - 
##     TARGET_WINS - TEAM_BATTING_2B - TEAM_BATTING_SO - TEAM_PITCHING_SO, 
##     data = mb3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -69.801 -14.399  -0.536  13.355  68.091 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      100.469748   7.563815  13.283  < 2e-16 ***
## TEAM_BATTING_3B   -0.067701   0.029459  -2.298 0.021658 *  
## TEAM_BATTING_HR    0.045067   0.015352   2.936 0.003368 ** 
## TEAM_BATTING_BB    0.063875   0.017651   3.619 0.000303 ***
## TEAM_BASERUN_SB   -0.138490   0.009137 -15.157  < 2e-16 ***
## TEAM_PITCHING_H    0.015675   0.003680   4.259 2.15e-05 ***
## TEAM_PITCHING_BB  -0.025444   0.015403  -1.652 0.098711 .  
## TEAM_FIELDING_E   -0.055612   0.008758  -6.350 2.66e-10 ***
## TEAM_BATTING_1B    0.025138   0.008755   2.871 0.004134 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.28 on 1981 degrees of freedom
##   (286 observations deleted due to missingness)
## Multiple R-squared:  0.4043, Adjusted R-squared:  0.4018 
## F-statistic:   168 on 8 and 1981 DF,  p-value: < 2.2e-16

vif(DP.4)

##  TEAM_BATTING_3B  TEAM_BATTING_HR  TEAM_BATTING_BB  TEAM_BASERUN_SB 
##         2.167319         3.909442        16.189150         1.861261 
##  TEAM_PITCHING_H TEAM_PITCHING_BB  TEAM_FIELDING_E  TEAM_BATTING_1B 
##         9.630593        12.159984         5.684878         3.501740

# P values and vif both indicate remove TEAM_PITCHING_BB

# remove TEAM_PITCHING_BB
DP.5 <- lm(data=mb3, TEAM_FIELDING_DP~. - INDEX -TARGET_WINS -TEAM_BATTING_2B - TEAM_BATTING_SO - TEAM_PITCHING_SO - TEAM_PITCHING_BB)
summary(DP.5)

## 
## Call:
## lm(formula = TEAM_FIELDING_DP ~ . - INDEX - TARGET_WINS - TEAM_BATTING_2B - 
##     TEAM_BATTING_SO - TEAM_PITCHING_SO - TEAM_PITCHING_BB, data = mb3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -64.547 -14.352  -0.402  13.240  66.253 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     98.606000   7.482458  13.178  < 2e-16 ***
## TEAM_BATTING_3B -0.071477   0.029383  -2.433   0.0151 *  
## TEAM_BATTING_HR  0.057086   0.013524   4.221 2.54e-05 ***
## TEAM_BATTING_BB  0.036352   0.005830   6.235 5.51e-10 ***
## TEAM_BASERUN_SB -0.138165   0.009139 -15.118  < 2e-16 ***
## TEAM_PITCHING_H  0.011078   0.002409   4.598 4.53e-06 ***
## TEAM_FIELDING_E -0.051135   0.008332  -6.137 1.01e-09 ***
## TEAM_BATTING_1B  0.032220   0.007637   4.219 2.57e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.29 on 1982 degrees of freedom
##   (286 observations deleted due to missingness)
## Multiple R-squared:  0.4034, Adjusted R-squared:  0.4013 
## F-statistic: 191.5 on 7 and 1982 DF,  p-value: < 2.2e-16

vif(DP.5)

## TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BASERUN_SB 
##        2.154275        3.031272        1.764845        1.860396 
## TEAM_PITCHING_H TEAM_FIELDING_E TEAM_BATTING_1B 
##        4.123897        5.140583        2.662224

# vif says remove TEAM_FIELDING_E; p-values all < .05 so remove TEAM_FIELDING_E

DP.6 <- lm(data=mb3, TEAM_FIELDING_DP~. - INDEX -TARGET_WINS -TEAM_BATTING_2B - TEAM_BATTING_SO - TEAM_PITCHING_SO - TEAM_PITCHING_BB - TEAM_FIELDING_E)
summary(DP.6)

## 
## Call:
## lm(formula = TEAM_FIELDING_DP ~ . - INDEX - TARGET_WINS - TEAM_BATTING_2B - 
##     TEAM_BATTING_SO - TEAM_PITCHING_SO - TEAM_PITCHING_BB - TEAM_FIELDING_E, 
##     data = mb3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -59.558 -14.399  -0.152  13.362  64.018 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     86.427823   7.280960  11.870  < 2e-16 ***
## TEAM_BATTING_3B -0.093997   0.029422  -3.195  0.00142 ** 
## TEAM_BATTING_HR  0.091627   0.012411   7.383 2.27e-13 ***
## TEAM_BATTING_BB  0.045730   0.005678   8.053 1.38e-15 ***
## TEAM_BASERUN_SB -0.155983   0.008745 -17.836  < 2e-16 ***
## TEAM_PITCHING_H  0.001250   0.001817   0.688  0.49148    
## TEAM_BATTING_1B  0.044340   0.007446   5.955 3.07e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.48 on 1983 degrees of freedom
##   (286 observations deleted due to missingness)
## Multiple R-squared:  0.3921, Adjusted R-squared:  0.3903 
## F-statistic: 213.2 on 6 and 1983 DF,  p-value: < 2.2e-16

vif(DP.6)

## TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BASERUN_SB 
##        2.120678        2.506351        1.643620        1.672663 
## TEAM_PITCHING_H TEAM_BATTING_1B 
##        2.301907        2.484225

# now no collinearity but p-values say remove TEAM_PITCHING_H

DP.7 <- lm(data=mb3, TEAM_FIELDING_DP~. - INDEX -TARGET_WINS -TEAM_BATTING_2B - TEAM_BATTING_SO - TEAM_PITCHING_SO - TEAM_PITCHING_BB - TEAM_FIELDING_E - TEAM_PITCHING_H)
summary(DP.7)

## 
## Call:
## lm(formula = TEAM_FIELDING_DP ~ . - INDEX - TARGET_WINS - TEAM_BATTING_2B - 
##     TEAM_BATTING_SO - TEAM_PITCHING_SO - TEAM_PITCHING_BB - TEAM_FIELDING_E - 
##     TEAM_PITCHING_H, data = mb3)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -59.58 -14.41  -0.19  13.38  63.83 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     85.333602   7.104233  12.012  < 2e-16 ***
## TEAM_BATTING_3B -0.094593   0.029405  -3.217  0.00132 ** 
## TEAM_BATTING_HR  0.094530   0.011670   8.100 9.47e-16 ***
## TEAM_BATTING_BB  0.044401   0.005339   8.316  < 2e-16 ***
## TEAM_BASERUN_SB -0.153554   0.008000 -19.194  < 2e-16 ***
## TEAM_BATTING_1B  0.047370   0.006003   7.892 4.88e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.48 on 1984 degrees of freedom
##   (286 observations deleted due to missingness)
## Multiple R-squared:  0.3919, Adjusted R-squared:  0.3904 
## F-statistic: 255.8 on 5 and 1984 DF,  p-value: < 2.2e-16

vif(DP.7)

## TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BASERUN_SB 
##        2.118844        2.216648        1.453424        1.400175 
## TEAM_BATTING_1B 
##        1.615010

# no collinearity, all p-values < .05 so stop


#all low P value and F statistic of 255.8 with adj R squared of 0.3904
#take a look
par(mfrow=c(2,2))
plot(DP.7)

#place back in the data base with imputed data for SB's
# NOTE: Changed DP.4 to DP.7 here
pred.DP <- round(predict(DP.7, mb3))
DP.imp <- impute(mb3$TEAM_FIELDING_DP, pred.DP)

# impute the evaluation data
pred_eval.DP <- round(predict(DP.7, eval_data.3))
eval.DP.imp <- impute(eval_data.3$TEAM_FIELDING_DP, pred_eval.DP)

###################################################
# Jims added code for diagnostics of imputation

# first, check summaries to ensure similar values
summary(mb3$TEAM_FIELDING_DP)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    52.0   131.0   149.0   146.4   164.0   228.0     286

summary(DP.imp)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    48.0   124.0   145.0   141.5   162.0   228.0

# now plot side-by-side histograms to check similarity of distributions
par(mfrow = c(2,2))
hist(mb3$TEAM_FIELDING_DP, breaks = 200)
hist(DP.imp, breaks = 200)

# ------------------ eval data checks ------------------------
# first, check summaries to ensure similar values
summary(eval_data.3$TEAM_FIELDING_DP)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    69.0   131.0   148.0   146.1   164.0   204.0      31

summary(eval.DP.imp)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    52.0   123.5   146.0   141.3   160.5   204.0

# now plot side-by-side histograms to check similarity of distributions
par(mfrow = c(2,2))

hist(eval_data.3$TEAM_FIELDING_DP, breaks = 30)
hist(eval.DP.imp, breaks = 30)
###################################################

# update data frames with imputed values
mb4 <- mb3
mb4$TEAM_FIELDING_DP <- DP.imp

eval_data.4 <- eval_data.3
eval_data.4$TEAM_FIELDING_DP <- eval.DP.imp

Eliminate unhistorical outliers - DO THIS FOR THE EVAL DATA AS WELL

# check rowcount before removal of outliers
nrow(mb4)

## [1] 2276

nrow(eval_data.4)

## [1] 259

############## TEAM PITCHING_SO ############################
#most pitching SO's is 1450.  So delete all records with more than 1450 pitching SO's
mb5 <- mb4

# fixed error in this line: dataframe in 'which' call was mb1 so changed to mb5
mb5 <- mb5[which(mb5$TEAM_PITCHING_SO < 1450),]

eval_data.4 <- eval_data.4[which(eval_data.4$TEAM_PITCHING_SO < 1450),]

# check rowcount
nrow(mb5)

## [1] 2251

nrow(eval_data.4)

## [1] 257

############ TEAM_PITCHING_H ##############################
#most ever hits by a team is 1730.  So delete all pitching hits >3000 to be conservative with the median
mb6 <- mb5
mb6 <- mb6[which(mb6$TEAM_PITCHING_H < 3001),]

eval_data.4 <- eval_data.4[which(eval_data.4$TEAM_PITCHING_H < 3001),]

# check rowcount
nrow(mb6)

## [1] 2174

nrow(eval_data.4)

## [1] 249

############ TEAM_FIELDING_E ##############################
#most ever errors by a team is 639 by 1883 Philadelphia.  Prorating to 162 games gives a value of 1046.
mb7 <- mb6
mb7 <- mb7[which(mb7$TEAM_FIELDING_E < 1047),]

eval_data.4 <- eval_data.4[which(eval_data.4$TEAM_FIELDING_E < 1047),]

# ----------------------------------------------------------------------
# ----------------------------------------------------------------------

# check rowcount: result is 2172 => removed total of 104 rows
nrow(mb7)

## [1] 2172

nrow(eval_data.4)

## [1] 248

dim(mb)-dim(mb7)

## [1] 104   0

#we removed 104 rows total due to outliers in TRAINING data set.

# we removed 11 rows from the EVALUATION data set

# now renumber rows of dataframe so that there are no gaps in row numbers
rownames(mb7) <- 1:nrow(mb7)
rownames(eval_data.4) <- 1:nrow(eval_data.4)

# drop INDEX column from training set
# mb7 <- mb7[,-1]

# now drop dummy column from evaluation data
# eval_data.4 <- eval_data.4[,-14]


# create CSV files containing updated data sets
#SMK commented out for now-doesn't work on a mac
#write.csv(mb7, file = "C:/SQLData/621-HW1-Clean-Data.csv", row.names = FALSE, col.names = TRUE)

#write.csv(eval_data.4, file = "C:/SQLData/621-HW1-Clean-EvalData-.csv", row.names = FALSE, col.names = TRUE)

Part 3. Build Models

Model 1: General Model Using Backward Selection

library(car)

mb_clean <- read.csv("https://raw.githubusercontent.com/spsstudent15/2016-02-621-W1/master/621-HW1-Clean-Data.csv")

Build Models

Model using all remaining variables as a starting point

Yields r^2= 0.3347, Adj r^2 = 0.3322, F = 136

# keep the clean data set pure
mb <- mb_clean

# use p-value elimination
model <- lm(data=mb, TARGET_WINS ~ . - INDEX)
summary(model)

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX, data = mb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -41.179  -7.684   0.193   7.338  63.241 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      48.966514   5.630361   8.697  < 2e-16 ***
## TEAM_BATTING_2B  -0.042963   0.009058  -4.743 2.24e-06 ***
## TEAM_BATTING_3B   0.119709   0.017111   6.996 3.50e-12 ***
## TEAM_BATTING_HR   0.076628   0.010255   7.473 1.14e-13 ***
## TEAM_BATTING_BB   0.179272   0.016504  10.863  < 2e-16 ***
## TEAM_BATTING_SO  -0.047387   0.011793  -4.018 6.06e-05 ***
## TEAM_BASERUN_SB   0.067351   0.004963  13.570  < 2e-16 ***
## TEAM_PITCHING_H   0.038622   0.004966   7.778 1.13e-14 ***
## TEAM_PITCHING_BB -0.139535   0.014311  -9.750  < 2e-16 ***
## TEAM_PITCHING_SO  0.029867   0.011011   2.712  0.00673 ** 
## TEAM_FIELDING_E  -0.076068   0.003894 -19.535  < 2e-16 ***
## TEAM_FIELDING_DP -0.118907   0.013074  -9.095  < 2e-16 ***
## TEAM_BATTING_1B  -0.011641   0.006975  -1.669  0.09526 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.62 on 2159 degrees of freedom
## Multiple R-squared:  0.3658, Adjusted R-squared:  0.3623 
## F-statistic: 103.8 on 12 and 2159 DF,  p-value: < 2.2e-16

# p-value indicates remove TEAM_BATTING_1B

# --------------------
# remove TEAM_BATTING_1B
model.2 <- lm(data=mb, TARGET_WINS ~ . - INDEX - TEAM_BATTING_1B)
summary(model.2)

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX - TEAM_BATTING_1B, data = mb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -41.306  -7.736   0.089   7.278  61.942 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      43.726614   4.675707   9.352  < 2e-16 ***
## TEAM_BATTING_2B  -0.037030   0.008334  -4.443 9.32e-06 ***
## TEAM_BATTING_3B   0.123030   0.017002   7.236 6.37e-13 ***
## TEAM_BATTING_HR   0.084581   0.009084   9.311  < 2e-16 ***
## TEAM_BATTING_BB   0.175496   0.016355  10.731  < 2e-16 ***
## TEAM_BATTING_SO  -0.057558   0.010100  -5.699 1.37e-08 ***
## TEAM_BASERUN_SB   0.066640   0.004947  13.471  < 2e-16 ***
## TEAM_PITCHING_H   0.032020   0.003003  10.663  < 2e-16 ***
## TEAM_PITCHING_BB -0.136552   0.014205  -9.613  < 2e-16 ***
## TEAM_PITCHING_SO  0.040806   0.008851   4.610 4.26e-06 ***
## TEAM_FIELDING_E  -0.074297   0.003748 -19.823  < 2e-16 ***
## TEAM_FIELDING_DP -0.120293   0.013053  -9.216  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.62 on 2160 degrees of freedom
## Multiple R-squared:  0.365,  Adjusted R-squared:  0.3618 
## F-statistic: 112.9 on 11 and 2160 DF,  p-value: < 2.2e-16

# p-values are OK so check collinearity
vif(model.2)

##  TEAM_BATTING_2B  TEAM_BATTING_3B  TEAM_BATTING_HR  TEAM_BATTING_BB 
##         2.319929         3.359689         4.643042        43.103947 
##  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_PITCHING_H TEAM_PITCHING_BB 
##        83.755735         3.378362         9.471591        37.110476 
## TEAM_PITCHING_SO  TEAM_FIELDING_E TEAM_FIELDING_DP 
##        62.742239         4.973211         2.082577

# vif says remove TEAM_BATTING_SO or PITCHING_SO, so remove PITCHING_SO per other models

# -------------------
#eliminate TEAM_PITCHING_SO
model.3 <- lm(data=mb, TARGET_WINS ~. - INDEX - TEAM_BATTING_1B - TEAM_PITCHING_SO)
summary(model.3)

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX - TEAM_BATTING_1B - TEAM_PITCHING_SO, 
##     data = mb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -43.949  -7.717   0.285   7.601  63.510 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      38.652404   4.565554   8.466  < 2e-16 ***
## TEAM_BATTING_2B  -0.042295   0.008294  -5.099 3.71e-07 ***
## TEAM_BATTING_3B   0.107468   0.016741   6.419 1.68e-10 ***
## TEAM_BATTING_HR   0.077621   0.009000   8.625  < 2e-16 ***
## TEAM_BATTING_BB   0.123390   0.011876  10.390  < 2e-16 ***
## TEAM_BATTING_SO  -0.012083   0.002182  -5.538 3.43e-08 ***
## TEAM_BASERUN_SB   0.061650   0.004850  12.713  < 2e-16 ***
## TEAM_PITCHING_H   0.036828   0.002829  13.017  < 2e-16 ***
## TEAM_PITCHING_BB -0.090392   0.010123  -8.929  < 2e-16 ***
## TEAM_FIELDING_E  -0.071010   0.003697 -19.208  < 2e-16 ***
## TEAM_FIELDING_DP -0.119234   0.013112  -9.093  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.67 on 2161 degrees of freedom
## Multiple R-squared:  0.3588, Adjusted R-squared:  0.3558 
## F-statistic: 120.9 on 10 and 2161 DF,  p-value: < 2.2e-16

vif(model.3)

##  TEAM_BATTING_2B  TEAM_BATTING_3B  TEAM_BATTING_HR  TEAM_BATTING_BB 
##         2.276374         3.227243         4.514804        22.517049 
##  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_PITCHING_H TEAM_PITCHING_BB 
##         3.871698         3.216671         8.329160        18.672715 
##  TEAM_FIELDING_E TEAM_FIELDING_DP 
##         4.793258         2.081932

# vif says remove TEAM_BATTING_BB or PITCHINNG_BB so go with PITCHING_BB

# -------------------
#eliminate TEAM_PITCHING_BB
model.4 <- lm(data=mb, TARGET_WINS ~. - INDEX - TEAM_BATTING_1B - TEAM_PITCHING_SO - TEAM_PITCHING_BB)
summary(model.4)

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX - TEAM_BATTING_1B - TEAM_PITCHING_SO - 
##     TEAM_PITCHING_BB, data = mb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -42.780  -8.035   0.320   7.776  69.832 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      62.579012   3.763135  16.629  < 2e-16 ***
## TEAM_BATTING_2B  -0.008694   0.007525  -1.155    0.248    
## TEAM_BATTING_3B   0.134601   0.016760   8.031 1.57e-15 ***
## TEAM_BATTING_HR   0.103304   0.008681  11.899  < 2e-16 ***
## TEAM_BATTING_BB   0.020938   0.003117   6.716 2.37e-11 ***
## TEAM_BATTING_SO  -0.016911   0.002152  -7.859 6.07e-15 ***
## TEAM_BASERUN_SB   0.063121   0.004934  12.793  < 2e-16 ***
## TEAM_PITCHING_H   0.016891   0.001769   9.549  < 2e-16 ***
## TEAM_FIELDING_E  -0.069910   0.003761 -18.586  < 2e-16 ***
## TEAM_FIELDING_DP -0.108598   0.013294  -8.169 5.22e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.89 on 2162 degrees of freedom
## Multiple R-squared:  0.3351, Adjusted R-squared:  0.3323 
## F-statistic: 121.1 on 9 and 2162 DF,  p-value: < 2.2e-16

# --------------------
# eliminate TEAM_BATTING_2B
model.5 <- lm(data=mb, TARGET_WINS ~. - INDEX - TEAM_BATTING_1B - TEAM_PITCHING_SO - TEAM_PITCHING_BB - TEAM_BATTING_2B)
summary(model.5)

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX - TEAM_BATTING_1B - TEAM_PITCHING_SO - 
##     TEAM_PITCHING_BB - TEAM_BATTING_2B, data = mb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -42.412  -8.089   0.329   7.776  68.696 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      62.392894   3.759976  16.594  < 2e-16 ***
## TEAM_BATTING_3B   0.132616   0.016673   7.954 2.89e-15 ***
## TEAM_BATTING_HR   0.101500   0.008541  11.884  < 2e-16 ***
## TEAM_BATTING_BB   0.020574   0.003102   6.633 4.14e-11 ***
## TEAM_BATTING_SO  -0.017105   0.002145  -7.973 2.49e-15 ***
## TEAM_BASERUN_SB   0.063188   0.004934  12.806  < 2e-16 ***
## TEAM_PITCHING_H   0.015911   0.001553  10.248  < 2e-16 ***
## TEAM_FIELDING_E  -0.068573   0.003579 -19.158  < 2e-16 ***
## TEAM_FIELDING_DP -0.108871   0.013293  -8.190 4.40e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.89 on 2163 degrees of freedom
## Multiple R-squared:  0.3347, Adjusted R-squared:  0.3322 
## F-statistic:   136 on 8 and 2163 DF,  p-value: < 2.2e-16

# p-values < .05 so check for collinearity
vif(model.5)

##  TEAM_BATTING_3B  TEAM_BATTING_HR  TEAM_BATTING_BB  TEAM_BATTING_SO 
##         3.088083         3.922549         1.481720         3.611668 
##  TEAM_BASERUN_SB  TEAM_PITCHING_H  TEAM_FIELDING_E TEAM_FIELDING_DP 
##         3.212525         2.419816         4.334892         2.064098

# no colinearity so STOP HERE

DIAGNOSTICS

Plots are linear except for errors

# CREATE ADDED VARIABLE PLOTS TO ASSESS predictor vs response
avPlots(model.5, id.n = 2)

SUMMARY MODEL DIAGNOSTIC PLOTS

Plots: Lack of constant variability in Resid vs. Fitted. Normal QQ shows a bit of skew in upper right end but not drastic; Residuals appear to be within 2 std devs. Outliers at 1920, 1737, 393, 1515

# plot summary residual plots
par(mfrow=c(2,2))
plot(model.5)

PLOT STANDARDIZED RESIDUALS AGAINST EACH PREDICTOR

Results show lack of constant variability for several variables: 3B, HR, SB, Pitch_H, Fielding_E, BATT_BB, BATT_SO, FIELDING_DP

# get rows have no NA's from data frame
# NoNA <- mb_mods[!rowSums(is.na(mb_mods[1:13])), ]

StanRes1 <- rstandard(model.5)
par(mfrow=c(2,2))

plot(mb$TEAM_BATTING_3B, StanRes1, ylab="Standardized Residuals")
plot(mb$TEAM_BATTING_HR, StanRes1, ylab="Standardized Residuals")
plot(mb$TEAM_BATTING_BB, StanRes1, ylab="Standardized Residuals")
plot(mb$TEAM_BATTING_SO, StanRes1, ylab="Standardized Residuals")

plot(mb$TEAM_BASERUN_SB, StanRes1, ylab="Standardized Residuals")
plot(mb$TEAM_PITCHING_H, StanRes1, ylab="Standardized Residuals")
plot(mb$TEAM_FIELDING_E, StanRes1, ylab="Standardized Residuals")
plot(mb$TEAM_FIELDING_DP, StanRes1, ylab="Standardized Residuals")

PLOT Y AGAINST FITTED VALUES

Plot shows a linear relationship whose slope might be skewed by outliers in upper right of plot

# get rows have no NA's from data frame
# NoNA <- mb_mods[!rowSums(is.na(mb_mods[1:13])), ]

fit1 <- model.5$fitted.values
# nrow(fit1)

par(mfrow = c(1,1))
plot(fit1, mb$TARGET_WINS,xlab="Fitted Values")
abline(lsfit(fit1, mb$TARGET_WINS),lty=2)

Now cleanup data objects that are no longer required

rm(model, model.2, model.3, model.4)

REMOVE OUTLIERS AND REFIT

Per Cooks Distance, remove items 1920, 1737, 393, 1515

############ FIRST SET OF OUTLIERS ######################
# drop outlier records from data set
mb_rem <- mb_clean[-c(1920, 1737, 393, 1515),]

# renumber rows
rownames(mb_rem) <- 1:nrow(mb_rem)

Now refit first model from above: all variables

Yields r^2= 0.3504, Adj r^2 = 0.348, F = 145.6

# keep the clean data set pure
mb <- mb_rem

# use p-value elimination
model <- lm(data=mb, TARGET_WINS ~ . - INDEX)
summary(model)

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX, data = mb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -40.312  -7.606   0.169   7.370  45.974 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      52.702659   5.533457   9.524  < 2e-16 ***
## TEAM_BATTING_2B  -0.048364   0.008899  -5.435 6.10e-08 ***
## TEAM_BATTING_3B   0.131471   0.016835   7.809 8.91e-15 ***
## TEAM_BATTING_HR   0.077797   0.010045   7.745 1.46e-14 ***
## TEAM_BATTING_BB   0.184502   0.016217  11.377  < 2e-16 ***
## TEAM_BATTING_SO  -0.049773   0.011616  -4.285 1.91e-05 ***
## TEAM_BASERUN_SB   0.069365   0.004863  14.265  < 2e-16 ***
## TEAM_PITCHING_H   0.039055   0.004879   8.005 1.94e-15 ***
## TEAM_PITCHING_BB -0.144207   0.014074 -10.246  < 2e-16 ***
## TEAM_PITCHING_SO  0.031063   0.010855   2.862  0.00425 ** 
## TEAM_FIELDING_E  -0.081383   0.003851 -21.135  < 2e-16 ***
## TEAM_FIELDING_DP -0.119896   0.012795  -9.370  < 2e-16 ***
## TEAM_BATTING_1B  -0.013777   0.006856  -2.010  0.04460 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.37 on 2155 degrees of freedom
## Multiple R-squared:  0.3839, Adjusted R-squared:  0.3805 
## F-statistic: 111.9 on 12 and 2155 DF,  p-value: < 2.2e-16

vif(model)

##  TEAM_BATTING_2B  TEAM_BATTING_3B  TEAM_BATTING_HR  TEAM_BATTING_BB 
##         2.753180         3.434748         5.922306        44.068099 
##  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_PITCHING_H TEAM_PITCHING_BB 
##       115.662896         3.377840        25.603370        37.973500 
## TEAM_PITCHING_SO  TEAM_FIELDING_E TEAM_FIELDING_DP  TEAM_BATTING_1B 
##        98.479804         5.337711         2.083157         8.252408

# vif indicates remove TEAM_PITCHING_SO

# --------------------
# remove TEAM_PITCHING_SO
model.2 <- lm(data=mb, TARGET_WINS ~ . - INDEX - TEAM_PITCHING_SO)
summary(model.2)

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX - TEAM_PITCHING_SO, data = mb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -41.494  -7.559   0.328   7.446  45.527 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      55.515612   5.454519  10.178  < 2e-16 ***
## TEAM_BATTING_2B  -0.056967   0.008389  -6.790 1.44e-11 ***
## TEAM_BATTING_3B   0.120478   0.016419   7.338 3.06e-13 ***
## TEAM_BATTING_HR   0.066476   0.009248   7.188 9.02e-13 ***
## TEAM_BATTING_BB   0.162650   0.014330  11.350  < 2e-16 ***
## TEAM_BATTING_SO  -0.017208   0.002335  -7.368 2.45e-13 ***
## TEAM_BASERUN_SB   0.067604   0.004831  13.992  < 2e-16 ***
## TEAM_PITCHING_H   0.048059   0.003735  12.866  < 2e-16 ***
## TEAM_PITCHING_BB -0.124471   0.012289 -10.129  < 2e-16 ***
## TEAM_FIELDING_E  -0.081495   0.003857 -21.130  < 2e-16 ***
## TEAM_FIELDING_DP -0.117961   0.012798  -9.217  < 2e-16 ***
## TEAM_BATTING_1B  -0.025520   0.005501  -4.639 3.71e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.39 on 2156 degrees of freedom
## Multiple R-squared:  0.3816, Adjusted R-squared:  0.3784 
## F-statistic: 120.9 on 11 and 2156 DF,  p-value: < 2.2e-16

# p-values are OK so check collinearity
vif(model.2)

##  TEAM_BATTING_2B  TEAM_BATTING_3B  TEAM_BATTING_HR  TEAM_BATTING_BB 
##         2.438954         3.255919         5.003748        34.297363 
##  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_PITCHING_H TEAM_PITCHING_BB 
##         4.659832         3.323768        14.956756        28.854387 
##  TEAM_FIELDING_E TEAM_FIELDING_DP  TEAM_BATTING_1B 
##         5.337159         2.077336         5.295547

# vif says remove TEAM_PITCHING_BB

# -------------------
#eliminate TEAM_PITCHING_BB
model.3 <- lm(data=mb, TARGET_WINS ~. - INDEX - TEAM_PITCHING_SO - TEAM_PITCHING_BB)
summary(model.3)

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX - TEAM_PITCHING_SO - TEAM_PITCHING_BB, 
##     data = mb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -42.824  -7.989   0.294   7.673  42.263 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      58.243821   5.574685  10.448  < 2e-16 ***
## TEAM_BATTING_2B  -0.014168   0.007416  -1.910   0.0562 .  
## TEAM_BATTING_3B   0.142709   0.016650   8.571  < 2e-16 ***
## TEAM_BATTING_HR   0.106734   0.008545  12.491  < 2e-16 ***
## TEAM_BATTING_BB   0.020695   0.003061   6.762 1.75e-11 ***
## TEAM_BATTING_SO  -0.015795   0.002386  -6.621 4.49e-11 ***
## TEAM_BASERUN_SB   0.063058   0.004923  12.810  < 2e-16 ***
## TEAM_PITCHING_H   0.015173   0.001890   8.028 1.61e-15 ***
## TEAM_FIELDING_E  -0.072930   0.003851 -18.940  < 2e-16 ***
## TEAM_FIELDING_DP -0.111549   0.013080  -8.528  < 2e-16 ***
## TEAM_BATTING_1B   0.007382   0.004543   1.625   0.1043    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.65 on 2157 degrees of freedom
## Multiple R-squared:  0.3522, Adjusted R-squared:  0.3492 
## F-statistic: 117.3 on 10 and 2157 DF,  p-value: < 2.2e-16

# p-values say remove TEAM_BATTING_1B

# -------------------
#eliminate TEAM_BATTING_1B
model.4 <- lm(data=mb, TARGET_WINS ~. - INDEX - TEAM_BATTING_1B - TEAM_PITCHING_SO - TEAM_PITCHING_BB)
summary(model.4)

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX - TEAM_BATTING_1B - TEAM_PITCHING_SO - 
##     TEAM_PITCHING_BB, data = mb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -42.166  -7.970   0.318   7.689  41.838 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      65.022700   3.699490  17.576  < 2e-16 ***
## TEAM_BATTING_2B  -0.013481   0.007407  -1.820   0.0689 .  
## TEAM_BATTING_3B   0.146314   0.016508   8.863  < 2e-16 ***
## TEAM_BATTING_HR   0.105677   0.008524  12.398  < 2e-16 ***
## TEAM_BATTING_BB   0.020919   0.003059   6.839 1.03e-11 ***
## TEAM_BATTING_SO  -0.017600   0.002112  -8.334  < 2e-16 ***
## TEAM_BASERUN_SB   0.064519   0.004842  13.326  < 2e-16 ***
## TEAM_PITCHING_H   0.016390   0.001736   9.443  < 2e-16 ***
## TEAM_FIELDING_E  -0.074524   0.003725 -20.007  < 2e-16 ***
## TEAM_FIELDING_DP -0.109715   0.013037  -8.416  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.66 on 2158 degrees of freedom
## Multiple R-squared:  0.3514, Adjusted R-squared:  0.3487 
## F-statistic: 129.9 on 9 and 2158 DF,  p-value: < 2.2e-16

vif(model.4)

##  TEAM_BATTING_2B  TEAM_BATTING_3B  TEAM_BATTING_HR  TEAM_BATTING_BB 
##         1.814319         3.140953         4.055967         1.491080 
##  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_PITCHING_H  TEAM_FIELDING_E 
##         3.636171         3.185297         3.081945         4.750943 
## TEAM_FIELDING_DP 
##         2.056837

# pvals say remove TEAM_BATTING_2B
model.5 <- lm(data=mb, TARGET_WINS ~. - INDEX - TEAM_BATTING_2B - TEAM_BATTING_1B - TEAM_PITCHING_SO - TEAM_PITCHING_BB)
summary(model.5)

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX - TEAM_BATTING_2B - TEAM_BATTING_1B - 
##     TEAM_PITCHING_SO - TEAM_PITCHING_BB, data = mb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -41.605  -8.030   0.317   7.757  42.164 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      64.703618   3.697313  17.500  < 2e-16 ***
## TEAM_BATTING_3B   0.143076   0.016420   8.713  < 2e-16 ***
## TEAM_BATTING_HR   0.102858   0.008386  12.265  < 2e-16 ***
## TEAM_BATTING_BB   0.020349   0.003044   6.684 2.94e-11 ***
## TEAM_BATTING_SO  -0.017890   0.002107  -8.491  < 2e-16 ***
## TEAM_BASERUN_SB   0.064609   0.004844  13.338  < 2e-16 ***
## TEAM_PITCHING_H   0.014888   0.001528   9.745  < 2e-16 ***
## TEAM_FIELDING_E  -0.072407   0.003541 -20.450  < 2e-16 ***
## TEAM_FIELDING_DP -0.110117   0.013042  -8.443  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.66 on 2159 degrees of freedom
## Multiple R-squared:  0.3504, Adjusted R-squared:  0.348 
## F-statistic: 145.6 on 8 and 2159 DF,  p-value: < 2.2e-16

vif(model.5)

##  TEAM_BATTING_3B  TEAM_BATTING_HR  TEAM_BATTING_BB  TEAM_BATTING_SO 
##         3.104460         3.922081         1.475492         3.615480 
##  TEAM_BASERUN_SB  TEAM_PITCHING_H  TEAM_FIELDING_E TEAM_FIELDING_DP 
##         3.184962         2.384787         4.287809         2.056247

# vif and pvals OK so stop

SUMMARY MODEL DIAGNOSTIC PLOTS

Plots: Lack of constant variability in Resid vs. Fitted. Normal QQ shows a bit of skew in upper right end but not drastic; Residuals appear to be within 2 std devs. Might not be a good model.

# plot summary residual plots
par(mfrow=c(2,2))
plot(model.4)

Plots show outliers so remove them and re-fit

Per Cooks Distance, remove items 1931, 391, 820, 1933, 835, 2124

############ FIRST SET OF OUTLIERS ######################
# drop outlier records from data set
mb_rem2 <- mb[-c(1931, 391, 820, 1933, 835, 2124),]

# renumber rows
rownames(mb_rem2) <- 1:nrow(mb_rem2)

Model using all remaining variables as a starting point

Yields r^2= 0.3598, Adj r^2 = 0.3572, F = 134.4

# keep the clean data set pure
mb <- mb_rem2

# use p-value elimination
model <- lm(data=mb, TARGET_WINS ~ . - INDEX )
summary(model)

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX, data = mb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -33.835  -7.607   0.200   7.360  45.489 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      53.581847   5.491812   9.757  < 2e-16 ***
## TEAM_BATTING_2B  -0.051601   0.008852  -5.829 6.40e-09 ***
## TEAM_BATTING_3B   0.131444   0.016694   7.874 5.42e-15 ***
## TEAM_BATTING_HR   0.080041   0.009950   8.044 1.42e-15 ***
## TEAM_BATTING_BB   0.171434   0.016181  10.595  < 2e-16 ***
## TEAM_BATTING_SO  -0.042020   0.011568  -3.633 0.000287 ***
## TEAM_BASERUN_SB   0.068541   0.004809  14.252  < 2e-16 ***
## TEAM_PITCHING_H   0.039065   0.004831   8.086 1.02e-15 ***
## TEAM_PITCHING_BB -0.132140   0.014071  -9.391  < 2e-16 ***
## TEAM_PITCHING_SO  0.023143   0.010823   2.138 0.032608 *  
## TEAM_FIELDING_E  -0.080763   0.003829 -21.093  < 2e-16 ***
## TEAM_FIELDING_DP -0.118407   0.012650  -9.360  < 2e-16 ***
## TEAM_BATTING_1B  -0.013730   0.006805  -2.018 0.043761 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.24 on 2149 degrees of freedom
## Multiple R-squared:  0.3885, Adjusted R-squared:  0.3851 
## F-statistic: 113.8 on 12 and 2149 DF,  p-value: < 2.2e-16

# p-values all < .05 so check collinearity
vif(model)

##  TEAM_BATTING_2B  TEAM_BATTING_3B  TEAM_BATTING_HR  TEAM_BATTING_BB 
##         2.747597         3.421239         5.926475        44.308309 
##  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_PITCHING_H TEAM_PITCHING_BB 
##       116.779721         3.345094        25.197359        38.414216 
## TEAM_PITCHING_SO  TEAM_FIELDING_E TEAM_FIELDING_DP  TEAM_BATTING_1B 
##        99.525121         5.265275         2.072666         8.259655

# vif says remove TEAM_PITCHING_SO

# -------------------
#eliminate TEAM_PITCHING_SO
model.2 <- lm(data=mb, TARGET_WINS ~. - INDEX - TEAM_PITCHING_SO)
summary(model.2)

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX - TEAM_PITCHING_SO, data = mb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -33.771  -7.636   0.258   7.342  45.148 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      55.723220   5.404214  10.311  < 2e-16 ***
## TEAM_BATTING_2B  -0.057976   0.008342  -6.950 4.82e-12 ***
## TEAM_BATTING_3B   0.123500   0.016289   7.582 5.04e-14 ***
## TEAM_BATTING_HR   0.071757   0.009173   7.823 8.03e-15 ***
## TEAM_BATTING_BB   0.154860   0.014216  10.893  < 2e-16 ***
## TEAM_BATTING_SO  -0.017783   0.002312  -7.691 2.20e-14 ***
## TEAM_BASERUN_SB   0.067235   0.004774  14.083  < 2e-16 ***
## TEAM_PITCHING_H   0.045685   0.003712  12.308  < 2e-16 ***
## TEAM_PITCHING_BB -0.117136   0.012207  -9.596  < 2e-16 ***
## TEAM_FIELDING_E  -0.080871   0.003832 -21.106  < 2e-16 ***
## TEAM_FIELDING_DP -0.116948   0.012642  -9.251  < 2e-16 ***
## TEAM_BATTING_1B  -0.022420   0.005463  -4.104 4.22e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.25 on 2150 degrees of freedom
## Multiple R-squared:  0.3872, Adjusted R-squared:  0.384 
## F-statistic: 123.5 on 11 and 2150 DF,  p-value: < 2.2e-16

# p-values OK so check collinearity
vif(model.2)

##  TEAM_BATTING_2B  TEAM_BATTING_3B  TEAM_BATTING_HR  TEAM_BATTING_BB 
##         2.435898         3.251820         5.028029        34.141690 
##  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_PITCHING_H TEAM_PITCHING_BB 
##         4.657608         3.291089        14.849242        28.862800 
##  TEAM_FIELDING_E TEAM_FIELDING_DP  TEAM_BATTING_1B 
##         5.264363         2.066635         5.314335

# vif says remove TEAM_PITCHING_BB

# -------------------
#eliminate TEAM_BATTING_BB
model.3 <- lm(data=mb, TARGET_WINS ~. - INDEX - TEAM_PITCHING_SO - TEAM_PITCHING_BB)
summary(model.3)

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX - TEAM_PITCHING_SO - TEAM_PITCHING_BB, 
##     data = mb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -37.474  -7.928   0.289   7.545  42.297 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      58.415319   5.509998  10.602  < 2e-16 ***
## TEAM_BATTING_2B  -0.018310   0.007397  -2.475   0.0134 *  
## TEAM_BATTING_3B   0.145166   0.016470   8.814  < 2e-16 ***
## TEAM_BATTING_HR   0.109983   0.008436  13.038  < 2e-16 ***
## TEAM_BATTING_BB   0.021452   0.003028   7.084 1.89e-12 ***
## TEAM_BATTING_SO  -0.016549   0.002357  -7.021 2.93e-12 ***
## TEAM_BASERUN_SB   0.063016   0.004854  12.984  < 2e-16 ***
## TEAM_PITCHING_H   0.014679   0.001865   7.872 5.48e-15 ***
## TEAM_FIELDING_E  -0.073003   0.003821 -19.104  < 2e-16 ***
## TEAM_FIELDING_DP -0.110856   0.012890  -8.600  < 2e-16 ***
## TEAM_BATTING_1B   0.008578   0.004498   1.907   0.0567 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.48 on 2151 degrees of freedom
## Multiple R-squared:  0.3609, Adjusted R-squared:  0.358 
## F-statistic: 121.5 on 10 and 2151 DF,  p-value: < 2.2e-16

# p-values says remove TEAM_BATTING_1B

# -------------------
#eliminate TEAM_BATTING_1B
model.4 <- lm(data=mb, TARGET_WINS ~. - INDEX - TEAM_PITCHING_SO - TEAM_PITCHING_BB - TEAM_BATTING_1B)
summary(model.4)

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX - TEAM_PITCHING_SO - TEAM_PITCHING_BB - 
##     TEAM_BATTING_1B, data = mb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -37.519  -8.001   0.336   7.614  41.812 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      66.261095   3.667019  18.069  < 2e-16 ***
## TEAM_BATTING_2B  -0.017347   0.007384  -2.349   0.0189 *  
## TEAM_BATTING_3B   0.149391   0.016330   9.148  < 2e-16 ***
## TEAM_BATTING_HR   0.108695   0.008414  12.919  < 2e-16 ***
## TEAM_BATTING_BB   0.021727   0.003027   7.178 9.67e-13 ***
## TEAM_BATTING_SO  -0.018634   0.002089  -8.920  < 2e-16 ***
## TEAM_BASERUN_SB   0.064701   0.004775  13.549  < 2e-16 ***
## TEAM_PITCHING_H   0.016082   0.001715   9.379  < 2e-16 ***
## TEAM_FIELDING_E  -0.074833   0.003701 -20.219  < 2e-16 ***
## TEAM_FIELDING_DP -0.108762   0.012851  -8.463  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.49 on 2152 degrees of freedom
## Multiple R-squared:  0.3598, Adjusted R-squared:  0.3572 
## F-statistic: 134.4 on 9 and 2152 DF,  p-value: < 2.2e-16

# p-values OK so check collinearity
vif(model.4)

##  TEAM_BATTING_2B  TEAM_BATTING_3B  TEAM_BATTING_HR  TEAM_BATTING_BB 
##         1.829117         3.131630         4.053547         1.482965 
##  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_PITCHING_H  TEAM_FIELDING_E 
##         3.643091         3.155029         3.036321         4.706195 
## TEAM_FIELDING_DP 
##         2.046463

# vif OK so STOP

# get MSE of residuals
anova(model.4)

## Analysis of Variance Table
## 
## Response: TARGET_WINS
##                    Df Sum Sq Mean Sq  F value    Pr(>F)    
## TEAM_BATTING_2B     1  23852   23852 180.7371 < 2.2e-16 ***
## TEAM_BATTING_3B     1  12881   12881  97.6052 < 2.2e-16 ***
## TEAM_BATTING_HR     1  30779   30779 233.2296 < 2.2e-16 ***
## TEAM_BATTING_BB     1  15162   15162 114.8928 < 2.2e-16 ***
## TEAM_BATTING_SO     1   5965    5965  45.2032 2.268e-11 ***
## TEAM_BASERUN_SB     1  12795   12795  96.9513 < 2.2e-16 ***
## TEAM_PITCHING_H     1     63      63   0.4781    0.4894    
## TEAM_FIELDING_E     1  48693   48693 368.9741 < 2.2e-16 ***
## TEAM_FIELDING_DP    1   9452    9452  71.6227 < 2.2e-16 ***
## Residuals        2152 283999     132                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Diagnostics

Plots are linear except for errors

# CREATE ADDED VARIABLE PLOTS TO ASSESS predictor vs response
avPlots(model.4, id.n = 2)

SUMMARY MODEL DIAGNOSTIC PLOTS

Plots: Lack of constant variability in Resid vs. Fitted. Normal QQ shows a bit of skew in upper right end but not drastic; Residuals appear to be within 2 std devs.

# plot summary residual plots
par(mfrow=c(2,2))
plot(model.4)

PLOT STANDARDIZED RESIDUALS AGAINST EACH PREDICTOR

Results show lack of constant variability for several variables: 3B, HR, SB, Pitch_H, Fielding_E, PITCH_BB, PITCH_SO, FIELDING_DP

# get rows have no NA's from data frame
# NoNA <- mb_mods[!rowSums(is.na(mb_mods[1:13])), ]

StanRes1 <- rstandard(model.4)
par(mfrow=c(2,2))

plot(mb$TEAM_BATTING_2B, StanRes1, ylab="Standardized Residuals")
plot(mb$TEAM_BATTING_3B, StanRes1, ylab="Standardized Residuals")
plot(mb$TEAM_BATTING_HR, StanRes1, ylab="Standardized Residuals")
plot(mb$TEAM_BATTING_BB, StanRes1, ylab="Standardized Residuals")

plot(mb$TEAM_BATTING_SO, StanRes1, ylab="Standardized Residuals")
plot(mb$TEAM_BASERUN_SB, StanRes1, ylab="Standardized Residuals")
plot(mb$TEAM_PITCHING_H, StanRes1, ylab="Standardized Residuals")
plot(mb$TEAM_FIELDING_E, StanRes1, ylab="Standardized Residuals")

plot(mb$TEAM_FIELDING_DP, StanRes1, ylab="Standardized Residuals")

PLOT Y AGAINST FITTED VALUES

Plot shows a linear relationship whose slope might be skewed by outliers in upper right of plot

# get rows have no NA's from data frame
# NoNA <- mb_mods[!rowSums(is.na(mb_mods[1:13])), ]

fit1 <- model.4$fitted.values
# nrow(fit1)

par(mfrow = c(1,1))
plot(fit1, mb$TARGET_WINS,xlab="Fitted Values")
abline(lsfit(fit1, mb$TARGET_WINS),lty=2)

Now try same model but with FIELD_E transformed by Box-Cox recommended power transform

# TEAM_FIELDING_E: Box-cox yields -1 => 1/y

mb$TEAM_FIELDING_E <- 1/mb$TEAM_FIELDING_E

Yields r^2= 0.3168, Adj r^2 = 0.3143, F = 124.8

# use p-value elimination
model <- lm(data=mb, TARGET_WINS ~ . - INDEX)
summary(model)

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX, data = mb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -42.440  -7.481   0.083   7.789  41.052 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       2.414e+01  5.587e+00   4.321 1.62e-05 ***
## TEAM_BATTING_2B  -2.377e-02  8.948e-03  -2.656  0.00797 ** 
## TEAM_BATTING_3B   1.520e-01  1.771e-02   8.585  < 2e-16 ***
## TEAM_BATTING_HR   8.730e-02  1.038e-02   8.414  < 2e-16 ***
## TEAM_BATTING_BB   1.226e-01  1.663e-02   7.372 2.38e-13 ***
## TEAM_BATTING_SO  -4.739e-02  1.204e-02  -3.936 8.55e-05 ***
## TEAM_BASERUN_SB   3.330e-02  4.524e-03   7.360 2.60e-13 ***
## TEAM_PITCHING_H   1.082e-02  4.739e-03   2.284  0.02250 *  
## TEAM_PITCHING_BB -8.508e-02  1.440e-02  -5.907 4.05e-09 ***
## TEAM_PITCHING_SO  2.482e-02  1.126e-02   2.205  0.02754 *  
## TEAM_FIELDING_E   3.247e+03  2.057e+02  15.781  < 2e-16 ***
## TEAM_FIELDING_DP -1.143e-01  1.318e-02  -8.673  < 2e-16 ***
## TEAM_BATTING_1B   1.841e-02  6.831e-03   2.696  0.00708 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.69 on 2149 degrees of freedom
## Multiple R-squared:  0.3385, Adjusted R-squared:  0.3348 
## F-statistic: 91.65 on 12 and 2149 DF,  p-value: < 2.2e-16

# p-values all < .05 so check collinearity

vif(model)

##  TEAM_BATTING_2B  TEAM_BATTING_3B  TEAM_BATTING_HR  TEAM_BATTING_BB 
##         2.595850         3.558823         5.957395        43.240147 
##  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_PITCHING_H TEAM_PITCHING_BB 
##       116.965149         2.736438        22.414047        37.210796 
## TEAM_PITCHING_SO  TEAM_FIELDING_E TEAM_FIELDING_DP  TEAM_BATTING_1B 
##        99.513424         4.243335         2.081098         7.692617

# vif says remove TEAM_PITCHING_SO

# -------------------
#eliminate TEAM_PITCHING_SO
model.2 <- lm(data=mb, TARGET_WINS ~. - INDEX - TEAM_PITCHING_SO)
summary(model.2)

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX - TEAM_PITCHING_SO, data = mb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -40.855  -7.541   0.243   7.891  39.291 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       2.640e+01  5.498e+00   4.802 1.68e-06 ***
## TEAM_BATTING_2B  -3.055e-02  8.410e-03  -3.633 0.000287 ***
## TEAM_BATTING_3B   1.435e-01  1.730e-02   8.296  < 2e-16 ***
## TEAM_BATTING_HR   7.844e-02  9.574e-03   8.193 4.34e-16 ***
## TEAM_BATTING_BB   1.047e-01  1.453e-02   7.205 8.01e-13 ***
## TEAM_BATTING_SO  -2.140e-02  2.459e-03  -8.701  < 2e-16 ***
## TEAM_BASERUN_SB   3.184e-02  4.480e-03   7.108 1.59e-12 ***
## TEAM_PITCHING_H   1.788e-02  3.498e-03   5.112 3.48e-07 ***
## TEAM_PITCHING_BB -6.891e-02  1.241e-02  -5.553 3.16e-08 ***
## TEAM_FIELDING_E   3.250e+03  2.059e+02  15.784  < 2e-16 ***
## TEAM_FIELDING_DP -1.128e-01  1.317e-02  -8.558  < 2e-16 ***
## TEAM_BATTING_1B   9.141e-03  5.388e-03   1.697 0.089933 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.7 on 2150 degrees of freedom
## Multiple R-squared:  0.337,  Adjusted R-squared:  0.3336 
## F-statistic: 99.36 on 11 and 2150 DF,  p-value: < 2.2e-16

# p-values says remove TEAM_BATTING_1B

# -------------------
#eliminate TEAM_BATTING_1B
model.3 <- lm(data=mb, TARGET_WINS ~. - INDEX - TEAM_PITCHING_SO - TEAM_BATTING_1B)
summary(model.3)

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX - TEAM_PITCHING_SO - TEAM_BATTING_1B, 
##     data = mb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -43.322  -7.571   0.210   7.727  39.339 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       3.140e+01  4.642e+00   6.765 1.71e-11 ***
## TEAM_BATTING_2B  -3.284e-02  8.305e-03  -3.954 7.94e-05 ***
## TEAM_BATTING_3B   1.435e-01  1.731e-02   8.290  < 2e-16 ***
## TEAM_BATTING_HR   7.398e-02  9.211e-03   8.032 1.56e-15 ***
## TEAM_BATTING_BB   1.187e-01  1.199e-02   9.902  < 2e-16 ***
## TEAM_BATTING_SO  -2.305e-02  2.258e-03 -10.212  < 2e-16 ***
## TEAM_BASERUN_SB   3.240e-02  4.469e-03   7.250 5.79e-13 ***
## TEAM_PITCHING_H   2.159e-02  2.733e-03   7.900 4.42e-15 ***
## TEAM_PITCHING_BB -8.082e-02  1.024e-02  -7.893 4.67e-15 ***
## TEAM_FIELDING_E   3.274e+03  2.055e+02  15.931  < 2e-16 ***
## TEAM_FIELDING_DP -1.111e-01  1.315e-02  -8.453  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.7 on 2151 degrees of freedom
## Multiple R-squared:  0.3361, Adjusted R-squared:  0.3331 
## F-statistic: 108.9 on 10 and 2151 DF,  p-value: < 2.2e-16

vif(model.3)

##  TEAM_BATTING_2B  TEAM_BATTING_3B  TEAM_BATTING_HR  TEAM_BATTING_BB 
##         2.230211         3.389546         4.682396        22.413215 
##  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_PITCHING_H TEAM_PITCHING_BB 
##         4.100904         2.663642         7.432730        18.756439 
##  TEAM_FIELDING_E TEAM_FIELDING_DP 
##         4.223007         2.063646

# p-values says remove TEAM_PITCHING_BB

# -------------------
#eliminate TEAM_BATTING_BB
model.4 <- lm(data=mb, TARGET_WINS ~. - INDEX - TEAM_PITCHING_SO - TEAM_BATTING_1B - TEAM_PITCHING_BB)
summary(model.4)

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX - TEAM_PITCHING_SO - TEAM_BATTING_1B - 
##     TEAM_PITCHING_BB, data = mb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -42.522  -7.843   0.386   7.968  39.441 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       5.286e+01  3.815e+00  13.856  < 2e-16 ***
## TEAM_BATTING_2B  -3.694e-03  7.545e-03  -0.490   0.6245    
## TEAM_BATTING_3B   1.696e-01  1.723e-02   9.847  < 2e-16 ***
## TEAM_BATTING_HR   9.684e-02  8.867e-03  10.921  < 2e-16 ***
## TEAM_BATTING_BB   2.722e-02  3.103e-03   8.771  < 2e-16 ***
## TEAM_BATTING_SO  -2.741e-02  2.220e-03 -12.349  < 2e-16 ***
## TEAM_BASERUN_SB   3.418e-02  4.527e-03   7.550 6.38e-14 ***
## TEAM_PITCHING_H   3.888e-03  1.584e-03   2.455   0.0142 *  
## TEAM_FIELDING_E   3.282e+03  2.084e+02  15.744  < 2e-16 ***
## TEAM_FIELDING_DP -1.022e-01  1.328e-02  -7.693 2.17e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.87 on 2152 degrees of freedom
## Multiple R-squared:  0.3169, Adjusted R-squared:  0.3141 
## F-statistic: 110.9 on 9 and 2152 DF,  p-value: < 2.2e-16

# p-values say remove doubles
model.5 <- lm(data=mb, TARGET_WINS ~. - INDEX - TEAM_PITCHING_SO - TEAM_BATTING_1B - TEAM_PITCHING_BB - TEAM_BATTING_2B)
summary(model.5)

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX - TEAM_PITCHING_SO - TEAM_BATTING_1B - 
##     TEAM_PITCHING_BB - TEAM_BATTING_2B, data = mb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -42.512  -7.826   0.349   8.005  39.581 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       5.288e+01  3.815e+00  13.862  < 2e-16 ***
## TEAM_BATTING_3B   1.685e-01  1.707e-02   9.873  < 2e-16 ***
## TEAM_BATTING_HR   9.616e-02  8.757e-03  10.981  < 2e-16 ***
## TEAM_BATTING_BB   2.701e-02  3.075e-03   8.785  < 2e-16 ***
## TEAM_BATTING_SO  -2.741e-02  2.219e-03 -12.350  < 2e-16 ***
## TEAM_BASERUN_SB   3.443e-02  4.496e-03   7.658 2.84e-14 ***
## TEAM_PITCHING_H   3.568e-03  1.442e-03   2.474   0.0134 *  
## TEAM_FIELDING_E   3.252e+03  1.997e+02  16.288  < 2e-16 ***
## TEAM_FIELDING_DP -1.023e-01  1.328e-02  -7.708 1.94e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.86 on 2153 degrees of freedom
## Multiple R-squared:  0.3168, Adjusted R-squared:  0.3143 
## F-statistic: 124.8 on 8 and 2153 DF,  p-value: < 2.2e-16

vif(model.5)

##  TEAM_BATTING_3B  TEAM_BATTING_HR  TEAM_BATTING_BB  TEAM_BATTING_SO 
##         3.206158         4.116662         1.434886         3.855309 
##  TEAM_BASERUN_SB  TEAM_PITCHING_H  TEAM_FIELDING_E TEAM_FIELDING_DP 
##         2.622091         2.014360         3.877112         2.047278

# vif OK so STOP

# turn off scientific formatting of results
options(scipen=999)
model.5

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX - TEAM_PITCHING_SO - TEAM_BATTING_1B - 
##     TEAM_PITCHING_BB - TEAM_BATTING_2B, data = mb)
## 
## Coefficients:
##      (Intercept)   TEAM_BATTING_3B   TEAM_BATTING_HR   TEAM_BATTING_BB  
##        52.876911          0.168490          0.096164          0.027014  
##  TEAM_BATTING_SO   TEAM_BASERUN_SB   TEAM_PITCHING_H   TEAM_FIELDING_E  
##        -0.027410          0.034431          0.003568       3252.309198  
## TEAM_FIELDING_DP  
##        -0.102330

anova(model.5)

## Analysis of Variance Table
## 
## Response: TARGET_WINS
##                    Df Sum Sq Mean Sq  F value                Pr(>F)    
## TEAM_BATTING_3B     1   7851    7851  55.7747   0.00000000000011740 ***
## TEAM_BATTING_HR     1  55626   55626 395.1504 < 0.00000000000000022 ***
## TEAM_BATTING_BB     1  16652   16652 118.2947 < 0.00000000000000022 ***
## TEAM_BATTING_SO     1   6913    6913  49.1095   0.00000000000322538 ***
## TEAM_BASERUN_SB     1  12274   12274  87.1878 < 0.00000000000000022 ***
## TEAM_PITCHING_H     1    118     118   0.8379                0.3601    
## TEAM_FIELDING_E     1  32763   32763 232.7428 < 0.00000000000000022 ***
## TEAM_FIELDING_DP    1   8364    8364  59.4139   0.00000000000001937 ***
## Residuals        2153 303080     141                                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Diagnostics

Plots are all linear

# CREATE ADDED VARIABLE PLOTS TO ASSESS predictor vs response
avPlots(model.5, id.n = 2)

SUMMARY MODEL DIAGNOSTIC PLOTS

Plots: Some lack of constant variability in Resid vs. Fitted. Normal QQ shows a bit of skew in lower left end but not drastic; Residuals not all within 2 std devs.

# plot summary residual plots
par(mfrow=c(2,2))
plot(model.5)

PLOT STANDARDIZED RESIDUALS AGAINST EACH PREDICTOR

Results show lack of constant variability for several variables: 3B, HR, BB, SO, SB, Pitch_H, Fielding_E

# get rows have no NA's from data frame
# NoNA <- mb_mods[!rowSums(is.na(mb_mods[1:13])), ]

StanRes1 <- rstandard(model.5)
par(mfrow=c(2,2))

plot(mb$TEAM_BATTING_3B, StanRes1, ylab="Standardized Residuals")
plot(mb$TEAM_BATTING_HR, StanRes1, ylab="Standardized Residuals")
plot(mb$TEAM_BATTING_BB, StanRes1, ylab="Standardized Residuals")
plot(mb$TEAM_BATTING_SO, StanRes1, ylab="Standardized Residuals")

plot(mb$TEAM_BASERUN_SB, StanRes1, ylab="Standardized Residuals")
plot(mb$TEAM_PITCHING_H, StanRes1, ylab="Standardized Residuals")
plot(mb$TEAM_FIELDING_E, StanRes1, ylab="Standardized Residuals")
plot(mb$TEAM_FIELDING_DP, StanRes1, ylab="Standardized Residuals")

PLOT Y AGAINST FITTED VALUES

Plot shows a linear relationship whose slope might be slightly skewed by outliers in upper right of plot

# get rows have no NA's from data frame
# NoNA <- mb_mods[!rowSums(is.na(mb_mods[1:13])), ]

fit1 <- model.5$fitted.values
# nrow(fit1)

par(mfrow = c(1,1))
plot(fit1, mb$TARGET_WINS,xlab="Fitted Values")
abline(lsfit(fit1, mb$TARGET_WINS),lty=2)

# clean up objects in memory
rm(list = ls())

Model 2: Total Bases

library(car)

mb_clean <- read.csv("https://raw.githubusercontent.com/spsstudent15/2016-02-621-W1/master/621-HW1-Clean-Data.csv")

Build a model with Total Bases added and all of the other hitting vars removed

First, create the new variable and discard its components

mb_t <- mb_clean

mb_t$TOTAL_BASES <- mb_clean$TEAM_BATTING_1B + (2 * mb_clean$TEAM_BATTING_2B) + 
                       (3 * mb_clean$TEAM_BATTING_3B) + (4 * mb_clean$TEAM_BATTING_HR)

# plot histogram to check shape of distribution
par(mfrow = c(1,1))
hist(mb_t$TOTAL_BASES, breaks = 200)

# now drop 1B, 2B, 3B, HR
mb_tb <- mb_t[,c(1, 2, 6, 7, 8, 9, 10, 11, 12, 13, 15)]

###################################################################
# check correlation with WINS and run simple linear model
cor(mb_tb$TARGET_WINS, mb_tb$TOTAL_BASES)

## [1] 0.3817283

mtest <- lm(data=mb_tb, TARGET_WINS ~ TOTAL_BASES)
summary(mtest)

## 
## Call:
## lm(formula = TARGET_WINS ~ TOTAL_BASES, data = mb_tb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -50.515  -9.242   0.330   9.134  49.895 
## 
## Coefficients:
##              Estimate Std. Error t value            Pr(>|t|)    
## (Intercept) 33.309929   2.498670   13.33 <0.0000000000000002 ***
## TOTAL_BASES  0.022526   0.001171   19.24 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.45 on 2170 degrees of freedom
## Multiple R-squared:  0.1457, Adjusted R-squared:  0.1453 
## F-statistic: 370.1 on 1 and 2170 DF,  p-value: < 0.00000000000000022

# shows .381 correlation and Adj R^2 of 0.1453

plot(mb_tb$TARGET_WINS ~ mb_tb$TOTAL_BASES)
abline(lm(mb_tb$TARGET_WINS ~ mb_tb$TOTAL_BASES), lty=2)

# plot doesn't show unusual relationship
######################################################################

Yields r^2= 0.3175, Adj. R^2 = 0.3153, F = 143.8

# fit model
model <- lm(data=mb_tb, TARGET_WINS ~ . - INDEX)
summary(model)

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX, data = mb_tb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -41.709  -8.023  -0.060   7.601  53.946 
## 
## Coefficients:
##                   Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)      39.488500   4.028435   9.802 < 0.0000000000000002 ***
## TEAM_BATTING_BB   0.150859   0.016604   9.085 < 0.0000000000000002 ***
## TEAM_BATTING_SO  -0.068106   0.011085  -6.144  0.00000000095530289 ***
## TEAM_BASERUN_SB   0.066840   0.004931  13.554 < 0.0000000000000002 ***
## TEAM_PITCHING_H   0.017824   0.003839   4.643  0.00000363574932803 ***
## TEAM_PITCHING_BB -0.113929   0.014384  -7.921  0.00000000000000375 ***
## TEAM_PITCHING_SO  0.051088   0.009671   5.283  0.00000014005177514 ***
## TEAM_FIELDING_E  -0.063746   0.003731 -17.084 < 0.0000000000000002 ***
## TEAM_FIELDING_DP -0.122049   0.013247  -9.213 < 0.0000000000000002 ***
## TOTAL_BASES       0.014634   0.002346   6.239  0.00000000052927091 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.87 on 2162 degrees of freedom
## Multiple R-squared:  0.3368, Adjusted R-squared:  0.3341 
## F-statistic:   122 on 9 and 2162 DF,  p-value: < 0.00000000000000022

# All p-values < .05 so check collinearity
vif(model)

##  TEAM_BATTING_BB  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_PITCHING_H 
##        42.583711        96.685910         3.217711        14.833674 
## TEAM_PITCHING_BB TEAM_PITCHING_SO  TEAM_FIELDING_E TEAM_FIELDING_DP 
##        36.467493        71.783178         4.724229         2.055734 
##      TOTAL_BASES 
##         5.151420

# vif indicates remove TEAM_BATTING_SO or TEAM_PITCHING_SO. CHoose PITCHING_SO as in other models

# ----------------------
# remove TEAM_PITCHING_SO
model.2 <- lm(data=mb_tb, TARGET_WINS ~ . - INDEX - TEAM_PITCHING_SO)
summary(model.2)

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX - TEAM_PITCHING_SO, data = mb_tb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -42.524  -7.899   0.111   7.765  53.216 
## 
## Coefficients:
##                   Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)      32.957133   3.857791   8.543 < 0.0000000000000002 ***
## TEAM_BATTING_BB   0.099826   0.013589   7.346    0.000000000000287 ***
## TEAM_BATTING_SO  -0.010106   0.001537  -6.576    0.000000000060537 ***
## TEAM_BASERUN_SB   0.060228   0.004799  12.549 < 0.0000000000000002 ***
## TEAM_PITCHING_H   0.027950   0.003347   8.352 < 0.0000000000000002 ***
## TEAM_PITCHING_BB -0.067548   0.011464  -5.892    0.000000004411900 ***
## TEAM_FIELDING_E  -0.061895   0.003738 -16.559 < 0.0000000000000002 ***
## TEAM_FIELDING_DP -0.118816   0.013315  -8.923 < 0.0000000000000002 ***
## TOTAL_BASES       0.009245   0.002125   4.350    0.000014249957148 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.94 on 2163 degrees of freedom
## Multiple R-squared:  0.3283, Adjusted R-squared:  0.3258 
## F-statistic: 132.1 on 8 and 2163 DF,  p-value: < 0.00000000000000022

# All p-values < .05 so check collinearity
vif(model.2)

##  TEAM_BATTING_BB  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_PITCHING_H 
##        28.169559         1.835552         3.010433        11.135442 
## TEAM_PITCHING_BB  TEAM_FIELDING_E TEAM_FIELDING_DP      TOTAL_BASES 
##        22.880704         4.682562         2.051346         4.177202

# vif indicates remove TEAM_PITCHING_BB

# ----------------------
# remove TEAM_PITCHING_BB
model.3 <- lm(data=mb_tb, TARGET_WINS ~ . - INDEX - TEAM_PITCHING_SO - TEAM_PITCHING_BB)
summary(model.3)

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX - TEAM_PITCHING_SO - TEAM_PITCHING_BB, 
##     data = mb_tb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -42.761  -8.220   0.160   7.833  58.011 
## 
## Coefficients:
##                   Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)      45.087126   3.287882  13.713 < 0.0000000000000002 ***
## TEAM_BATTING_BB   0.021880   0.003130   6.990     0.00000000000365 ***
## TEAM_BATTING_SO  -0.013037   0.001465  -8.897 < 0.0000000000000002 ***
## TEAM_BASERUN_SB   0.060536   0.004836  12.517 < 0.0000000000000002 ***
## TEAM_PITCHING_H   0.011527   0.001867   6.175     0.00000000078674 ***
## TEAM_FIELDING_E  -0.059620   0.003747 -15.912 < 0.0000000000000002 ***
## TEAM_FIELDING_DP -0.114299   0.013396  -8.532 < 0.0000000000000002 ***
## TOTAL_BASES       0.017639   0.001590  11.097 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.04 on 2164 degrees of freedom
## Multiple R-squared:  0.3175, Adjusted R-squared:  0.3153 
## F-statistic: 143.8 on 7 and 2164 DF,  p-value: < 0.00000000000000022

# All p-values < .05 so check collinearity
vif(model.3)

##  TEAM_BATTING_BB  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_PITCHING_H 
##         1.471732         1.643149         3.010077         3.411334 
##  TEAM_FIELDING_E TEAM_FIELDING_DP      TOTAL_BASES 
##         4.632607         2.044547         2.300575

# no further collinearity issues so STOP

# check 95% confidence intervals for coefficients
confint(model.3)

##                         2.5 %      97.5 %
## (Intercept)      38.639390134 51.53486272
## TEAM_BATTING_BB   0.015741199  0.02801786
## TEAM_BATTING_SO  -0.015910936 -0.01016372
## TEAM_BASERUN_SB   0.051051063  0.07002015
## TEAM_PITCHING_H   0.007866465  0.01518776
## TEAM_FIELDING_E  -0.066967986 -0.05227251
## TEAM_FIELDING_DP -0.140569909 -0.08802864
## TOTAL_BASES       0.014522018  0.02075646

Diagnostics

Plots for Fielding_E shows skew. Others are pretty linear

# CREATE ADDED VARIABLE PLOTS TO ASSESS predictor vs response
avPlots(model.3, id.n = 2)

SUMMARY MODEL DIAGNOSTIC PLOTS

Plots look good except for outliers. Lack of Constant variability in Resid vs. Fitted at very large values of Yhat; normal distribution of residuals except for outliers, most residuals within 2 std dev and well within Cook’s distance

#Figure 5.6 on page 129 MARR text
par(mfrow=c(2,2))
plot(model.3)

PLOT STANDARDIZED RESIDUALS AGAINST EACH PREDICTOR

Results show lack of constant variability for BASERUN_SB, PITCHING_H, PITCHING_SO, FIELDING_E, FIELDING_DP

# get rows have no NA's from data frame
# NoNA <- mb_mods[!rowSums(is.na(mb_mods[1:13])), ]

StanRes1 <- rstandard(model.3)
par(mfrow=c(2,2))

plot(mb_tb$TEAM_BATTING_BB, StanRes1, ylab="Standardized Residuals")
plot(mb_tb$TEAM_BATTING_SO, StanRes1, ylab="Standardized Residuals")
plot(mb_tb$TEAM_BASERUN_SB, StanRes1, ylab="Standardized Residuals")
plot(mb_tb$TEAM_PITCHING_H, StanRes1, ylab="Standardized Residuals")

plot(mb_tb$TEAM_FIELDING_E, StanRes1, ylab="Standardized Residuals")
plot(mb_tb$TEAM_FIELDING_DP, StanRes1, ylab="Standardized Residuals")
plot(mb_tb$TOTAL_BASES, StanRes1, ylab="Standardized Residuals")

PLOT Y AGAINST FITTED VALUES

Might be some skew due to outliers

# get rows have no NA's from data frame
# NoNA <- mb_mods[!rowSums(is.na(mb_mods[1:13])), ]

fit1 <- model.3$fitted.values

par(mfrow = c(1,1))
plot(fit1, mb_tb$TARGET_WINS,xlab="Fitted Values")
abline(lsfit(fit1, mb_tb$TARGET_WINS),lty=2)

REMOVE OUTLIERS AND REFIT

Per Cooks Distance, remove items 1920, 1737, 393, 1515

############ FIRST SET OF OUTLIERS ######################
# drop outlier records from data set
mb_rem <- mb_tb[-c(1920, 1737, 393, 1515),]

# renumber rows
rownames(mb_rem) <- 1:nrow(mb_rem)

Now refit first model from above: all variables

Yields r^2= 0.3287, Adj r^2 = 0.3265, F = 151.1

# keep the clean data set pure
mb <- mb_rem

# use p-value elimination
model <- lm(data=mb, TARGET_WINS ~ . - INDEX)
summary(model)

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX, data = mb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -41.516  -8.071   0.049   7.522  47.810 
## 
## Coefficients:
##                   Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)      42.441249   3.987025  10.645 < 0.0000000000000002 ***
## TEAM_BATTING_BB   0.154335   0.016410   9.405 < 0.0000000000000002 ***
## TEAM_BATTING_SO  -0.072357   0.010950  -6.608 0.000000000048869495 ***
## TEAM_BASERUN_SB   0.069009   0.004860  14.199 < 0.0000000000000002 ***
## TEAM_PITCHING_H   0.016802   0.003788   4.435 0.000009658905673535 ***
## TEAM_PITCHING_BB -0.117154   0.014229  -8.234 0.000000000000000311 ***
## TEAM_PITCHING_SO  0.053942   0.009551   5.648 0.000000018371250928 ***
## TEAM_FIELDING_E  -0.067499   0.003699 -18.248 < 0.0000000000000002 ***
## TEAM_FIELDING_DP -0.123715   0.013030  -9.495 < 0.0000000000000002 ***
## TOTAL_BASES       0.014732   0.002311   6.375 0.000000000222616213 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.67 on 2158 degrees of freedom
## Multiple R-squared:  0.3493, Adjusted R-squared:  0.3466 
## F-statistic: 128.7 on 9 and 2158 DF,  p-value: < 0.00000000000000022

# pvals all < .05 so check collinearity

vif(model)

##  TEAM_BATTING_BB  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_PITCHING_H 
##        42.781793        97.442002         3.199402        14.634840 
## TEAM_PITCHING_BB TEAM_PITCHING_SO  TEAM_FIELDING_E TEAM_FIELDING_DP 
##        36.800848        72.280509         4.670280         2.048269 
##      TOTAL_BASES 
##         5.161481

# vif indicates remove TEAM_BATTING_SO and PITCHING_SO. CHoose PITCHING_SO again

# --------------------
# remove TEAM_PITCHING_SO
model.2 <- lm(data=mb, TARGET_WINS ~ . - INDEX - TEAM_PITCHING_SO)
summary(model.2)

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX - TEAM_PITCHING_SO, data = mb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -41.867  -7.942   0.093   7.845  44.545 
## 
## Coefficients:
##                   Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)      35.499141   3.819886   9.293 < 0.0000000000000002 ***
## TEAM_BATTING_BB   0.100306   0.013428   7.470    0.000000000000116 ***
## TEAM_BATTING_SO  -0.011103   0.001519  -7.308    0.000000000000379 ***
## TEAM_BASERUN_SB   0.061945   0.004730  13.097 < 0.0000000000000002 ***
## TEAM_PITCHING_H   0.027469   0.003307   8.305 < 0.0000000000000002 ***
## TEAM_PITCHING_BB -0.068029   0.011341  -5.998    0.000000002329868 ***
## TEAM_FIELDING_E  -0.065403   0.003707 -17.645 < 0.0000000000000002 ***
## TEAM_FIELDING_DP -0.120236   0.013108  -9.173 < 0.0000000000000002 ***
## TOTAL_BASES       0.009060   0.002096   4.323    0.000016126085257 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.76 on 2159 degrees of freedom
## Multiple R-squared:  0.3397, Adjusted R-squared:  0.3373 
## F-statistic: 138.8 on 8 and 2159 DF,  p-value: < 0.00000000000000022

# p-values are OK so check collinearity
vif(model.2)

##  TEAM_BATTING_BB  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_PITCHING_H 
##        28.243542         1.849306         2.987512        10.997742 
## TEAM_PITCHING_BB  TEAM_FIELDING_E TEAM_FIELDING_DP      TOTAL_BASES 
##        23.050031         4.623291         2.043691         4.186652

# vif says remove TEAM_BATTING_BB

# -------------------
#eliminate TEAM_BATTING_BB or PITCHING_BB so choose PITCHING_BB
model.3 <- lm(data=mb, TARGET_WINS ~. - INDEX - TEAM_PITCHING_BB - TEAM_PITCHING_SO)
summary(model.3)

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX - TEAM_PITCHING_BB - TEAM_PITCHING_SO, 
##     data = mb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -42.456  -8.196   0.174   7.785  43.234 
## 
## Coefficients:
##                   Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)      47.746708   3.254440  14.671 < 0.0000000000000002 ***
## TEAM_BATTING_BB   0.021878   0.003085   7.092     0.00000000000178 ***
## TEAM_BATTING_SO  -0.014068   0.001448  -9.715 < 0.0000000000000002 ***
## TEAM_BASERUN_SB   0.062245   0.004768  13.055 < 0.0000000000000002 ***
## TEAM_PITCHING_H   0.010923   0.001840   5.937     0.00000000336791 ***
## TEAM_FIELDING_E  -0.063123   0.003717 -16.983 < 0.0000000000000002 ***
## TEAM_FIELDING_DP -0.115737   0.013192  -8.773 < 0.0000000000000002 ***
## TOTAL_BASES       0.017495   0.001567  11.166 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.85 on 2160 degrees of freedom
## Multiple R-squared:  0.3287, Adjusted R-squared:  0.3265 
## F-statistic: 151.1 on 7 and 2160 DF,  p-value: < 0.00000000000000022

# p-values OK so check collinearity
vif(model.3)

##  TEAM_BATTING_BB  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_PITCHING_H 
##         1.466753         1.653485         2.987180         3.348759 
##  TEAM_FIELDING_E TEAM_FIELDING_DP      TOTAL_BASES 
##         4.574645         2.036999         2.302144

# vif and pvals OK so STOP HERE

# check 95% confidence intervals for coefficients
confint(model.3)

##                         2.5 %      97.5 %
## (Intercept)      41.364546361 54.12886979
## TEAM_BATTING_BB   0.015828537  0.02792739
## TEAM_BATTING_SO  -0.016907860 -0.01122823
## TEAM_BASERUN_SB   0.052894909  0.07159443
## TEAM_PITCHING_H   0.007315473  0.01453144
## TEAM_FIELDING_E  -0.070411557 -0.05583395
## TEAM_FIELDING_DP -0.141607287 -0.08986612
## TOTAL_BASES       0.014422320  0.02056738

Diagnostics

Plots for Fielding_E shows skew. Others are pretty linear

# CREATE ADDED VARIABLE PLOTS TO ASSESS predictor vs response
avPlots(model.3, id.n = 2)

SUMMARY MODEL DIAGNOSTIC PLOTS

Plots: Lack of constant variability in Resid vs. Fitted. Normal QQ shows a bit of skew in upper right end but not drastic; Some residuals appear to be outside 2 std devs. Might not be a good model. Outliers at 1528, 1922, 820, 1933, 1733, 835

# plot summary residual plots
par(mfrow=c(2,2))
plot(model.3)

Plots show outliers so remove them and re-fit

Per Cooks Distance, remove 1528, 1922, 820, 1933, 1733, 835

############ SECOND SET OF OUTLIERS ######################
# drop outlier records from data set
mb_rem2 <- mb[-c(1528, 1922, 820, 1933, 1733, 835),]

# renumber rows
rownames(mb_rem2) <- 1:nrow(mb_rem2)

Model using all remaining variables as a starting point

Yields r^2= 0.33655, Adj r^2 = 0.3343, F = 156

# keep the clean data set pure
mb <- mb_rem2

# use p-value elimination
model <- lm(data=mb, TARGET_WINS ~ . - INDEX)
summary(model)

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX, data = mb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -37.179  -8.045   0.044   7.493  44.739 
## 
## Coefficients:
##                   Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)      43.011185   3.992399  10.773 < 0.0000000000000002 ***
## TEAM_BATTING_BB   0.147363   0.016384   8.994 < 0.0000000000000002 ***
## TEAM_BATTING_SO  -0.068624   0.010941  -6.272   0.0000000004293871 ***
## TEAM_BASERUN_SB   0.069068   0.004809  14.363 < 0.0000000000000002 ***
## TEAM_PITCHING_H   0.016311   0.003801   4.291   0.0000185345938702 ***
## TEAM_PITCHING_BB -0.110736   0.014225  -7.784   0.0000000000000108 ***
## TEAM_PITCHING_SO  0.049930   0.009554   5.226   0.0000001900846778 ***
## TEAM_FIELDING_E  -0.067668   0.003665 -18.461 < 0.0000000000000002 ***
## TEAM_FIELDING_DP -0.123738   0.012888  -9.601 < 0.0000000000000002 ***
## TOTAL_BASES       0.015058   0.002300   6.546   0.0000000000734869 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.54 on 2152 degrees of freedom
## Multiple R-squared:  0.3547, Adjusted R-squared:  0.352 
## F-statistic: 131.4 on 9 and 2152 DF,  p-value: < 0.00000000000000022

# p-values all < .05 so check collinearity

vif(model)

##  TEAM_BATTING_BB  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_PITCHING_H 
##        43.067052        98.922118         3.175291        14.605057 
## TEAM_PITCHING_BB TEAM_PITCHING_SO  TEAM_FIELDING_E TEAM_FIELDING_DP 
##        37.353473        73.550826         4.602276         2.044406 
##      TOTAL_BASES 
##         5.191730

# vif says remove TEAM_BATTING_SO

# -------------------
#eliminate TEAM_BATTING_SO or PITCHING_SO so choose PITCHING_SO again
model.2 <- lm(data=mb, TARGET_WINS ~. - INDEX - TEAM_PITCHING_SO)
summary(model.2)

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX - TEAM_PITCHING_SO, data = mb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -37.995  -7.977   0.065   7.750  42.855 
## 
## Coefficients:
##                   Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)      36.541180   3.818699   9.569 < 0.0000000000000002 ***
## TEAM_BATTING_BB   0.097250   0.013365   7.276  0.00000000000047808 ***
## TEAM_BATTING_SO  -0.011989   0.001512  -7.928  0.00000000000000353 ***
## TEAM_BASERUN_SB   0.062616   0.004676  13.391 < 0.0000000000000002 ***
## TEAM_PITCHING_H   0.026191   0.003317   7.895  0.00000000000000459 ***
## TEAM_PITCHING_BB -0.065120   0.011301  -5.762  0.00000000948828590 ***
## TEAM_FIELDING_E  -0.065835   0.003671 -17.935 < 0.0000000000000002 ***
## TEAM_FIELDING_DP -0.120626   0.012953  -9.313 < 0.0000000000000002 ***
## TOTAL_BASES       0.009848   0.002086   4.722  0.00000248533829969 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.61 on 2153 degrees of freedom
## Multiple R-squared:  0.3465, Adjusted R-squared:  0.3441 
## F-statistic: 142.7 on 8 and 2153 DF,  p-value: < 0.00000000000000022

# p-values OK so check collinearity
vif(model.2)

##  TEAM_BATTING_BB  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_PITCHING_H 
##        28.313487         1.866693         2.965972        10.991773 
## TEAM_PITCHING_BB  TEAM_FIELDING_E TEAM_FIELDING_DP      TOTAL_BASES 
##        23.288881         4.560154         2.040042         4.216460

# vif says remove TEAM_BATTING_BB or PITCHING_BB so choose PITCHING_BB again

# -------------------
#eliminate TEAM_PITCHING_BB
model.3 <- lm(data=mb, TARGET_WINS ~. - INDEX - TEAM_PITCHING_SO - TEAM_PITCHING_BB)
summary(model.3)

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX - TEAM_PITCHING_SO - TEAM_PITCHING_BB, 
##     data = mb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -38.210  -8.207   0.091   7.683  41.423 
## 
## Coefficients:
##                   Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)      48.485759   3.231012  15.006 < 0.0000000000000002 ***
## TEAM_BATTING_BB   0.022237   0.003050   7.291    0.000000000000431 ***
## TEAM_BATTING_SO  -0.014906   0.001436 -10.384 < 0.0000000000000002 ***
## TEAM_BASERUN_SB   0.062907   0.004711  13.355 < 0.0000000000000002 ***
## TEAM_PITCHING_H   0.010197   0.001831   5.571    0.000000028565450 ***
## TEAM_FIELDING_E  -0.063675   0.003679 -17.308 < 0.0000000000000002 ***
## TEAM_FIELDING_DP -0.116577   0.013030  -8.947 < 0.0000000000000002 ***
## TOTAL_BASES       0.017963   0.001550  11.591 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.7 on 2154 degrees of freedom
## Multiple R-squared:  0.3365, Adjusted R-squared:  0.3343 
## F-statistic:   156 on 7 and 2154 DF,  p-value: < 0.00000000000000022

vif(model.3)

##  TEAM_BATTING_BB  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_PITCHING_H 
##         1.452828         1.657492         2.965626         3.297456 
##  TEAM_FIELDING_E TEAM_FIELDING_DP      TOTAL_BASES 
##         4.512577         2.034037         2.293841

# p-values and VIF OK so STOP

anova(model.3)

## Analysis of Variance Table
## 
## Response: TARGET_WINS
##                    Df Sum Sq Mean Sq F value                Pr(>F)    
## TEAM_BATTING_BB     1  30649   30649 223.999 < 0.00000000000000022 ***
## TEAM_BATTING_SO     1   7298    7298  53.341  0.000000000000392795 ***
## TEAM_BASERUN_SB     1   9655    9655  70.563 < 0.00000000000000022 ***
## TEAM_PITCHING_H     1  12282   12282  89.763 < 0.00000000000000022 ***
## TEAM_FIELDING_E     1  62709   62709 458.310 < 0.00000000000000022 ***
## TEAM_FIELDING_DP    1   8464    8464  61.863  0.000000000000005775 ***
## TOTAL_BASES         1  18384   18384 134.357 < 0.00000000000000022 ***
## Residuals        2154 294726     137                                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# check 95% confidence intervals for coefficients
confint(model.3)

##                         2.5 %      97.5 %
## (Intercept)      42.149531058 54.82198731
## TEAM_BATTING_BB   0.016256026  0.02821876
## TEAM_BATTING_SO  -0.017720942 -0.01209071
## TEAM_BASERUN_SB   0.053669147  0.07214436
## TEAM_PITCHING_H   0.006607378  0.01378706
## TEAM_FIELDING_E  -0.070888990 -0.05646022
## TEAM_FIELDING_DP -0.142128959 -0.09102440
## TOTAL_BASES       0.014924337  0.02100264

Diagnostics

Plots for Fielding_E shows skew. Others are pretty linear

# CREATE ADDED VARIABLE PLOTS TO ASSESS predictor vs response
avPlots(model.3, id.n = 2)

SUMMARY MODEL DIAGNOSTIC PLOTS

Plots: Lack of constant variability in Resid vs. Fitted but only for extreme outliers. Normal QQ looks very good; Some Residuals appear to be outside 2 std devs

# plot summary residual plots
par(mfrow=c(2,2))
plot(model.3)

PLOT STANDARDIZED RESIDUALS AGAINST EACH PREDICTOR

Results show lack of constant variability for BASERUN_SB, PITCHING_H, PITCHING_SO, FIELDING_E, FIELDING_DP

# get rows have no NA's from data frame
# NoNA <- mb_mods[!rowSums(is.na(mb_mods[1:13])), ]

StanRes1 <- rstandard(model.3)
par(mfrow=c(2,2))

plot(mb$TEAM_BATTING_BB, StanRes1, ylab="Standardized Residuals")
plot(mb$TEAM_BATTING_SO, StanRes1, ylab="Standardized Residuals")
plot(mb$TEAM_BASERUN_SB, StanRes1, ylab="Standardized Residuals")
plot(mb$TEAM_PITCHING_H, StanRes1, ylab="Standardized Residuals")

plot(mb$TEAM_FIELDING_E, StanRes1, ylab="Standardized Residuals")
plot(mb$TEAM_FIELDING_DP, StanRes1, ylab="Standardized Residuals")
plot(mb$TOTAL_BASES, StanRes1, ylab="Standardized Residuals")

PLOT Y AGAINST FITTED VALUES

Might be some skew due to outliers

# get rows have no NA's from data frame
# NoNA <- mb_mods[!rowSums(is.na(mb_mods[1:13])), ]

fit1 <- model.3$fitted.values

par(mfrow = c(1,1))
plot(fit1, mb$TARGET_WINS,xlab="Fitted Values")
abline(lsfit(fit1, mb$TARGET_WINS),lty=2)

# ——————————————————————————————- # ——————————————————————————————-

Now try same model but with FIELD_E transformed according to Box-Cox

# TEAM_FIELDING_E: Box-Cox yields power xform of -1 => 1/y

mb$TEAM_FIELDING_E <- 1/mb$TEAM_FIELDING_E

Now refit first model from above: all variables

Model using all remaining variables as a starting point

Yields r^2= 0.3048, Adj r^2 = 0.3029, F = 157.5

# fit model
model <- lm(data=mb, TARGET_WINS ~ . - INDEX)
summary(model)

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX, data = mb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -42.449  -7.663   0.017   7.875  44.521 
## 
## Coefficients:
##                     Estimate  Std. Error t value             Pr(>|t|)    
## (Intercept)        33.940450    4.062200   8.355 < 0.0000000000000002 ***
## TEAM_BATTING_BB     0.109123    0.016792   6.499    0.000000000100466 ***
## TEAM_BATTING_SO    -0.053628    0.011230  -4.775    0.000001914819121 ***
## TEAM_BASERUN_SB     0.040792    0.004460   9.146 < 0.0000000000000002 ***
## TEAM_PITCHING_H     0.004967    0.003821   1.300              0.19381    
## TEAM_PITCHING_BB   -0.074106    0.014533  -5.099    0.000000370699407 ***
## TEAM_PITCHING_SO    0.027384    0.009820   2.789              0.00534 ** 
## TEAM_FIELDING_E  2695.772954  195.354383  13.799 < 0.0000000000000002 ***
## TEAM_FIELDING_DP   -0.117721    0.013328  -8.833 < 0.0000000000000002 ***
## TOTAL_BASES         0.017392    0.002373   7.328    0.000000000000329 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.91 on 2152 degrees of freedom
## Multiple R-squared:  0.3133, Adjusted R-squared:  0.3104 
## F-statistic: 109.1 on 9 and 2152 DF,  p-value: < 0.00000000000000022

# p-vals say remove TEAM_PITCHING_H

# ----------------------
# remove TEAM_PITCHING_H
model.2 <- lm(data=mb, TARGET_WINS ~ . - INDEX - TEAM_PITCHING_H)
summary(model.2)

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX - TEAM_PITCHING_H, data = mb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -42.790  -7.818   0.083   7.925  45.364 
## 
## Coefficients:
##                     Estimate  Std. Error t value             Pr(>|t|)    
## (Intercept)        37.568737    2.951626  12.728 < 0.0000000000000002 ***
## TEAM_BATTING_BB     0.103043    0.016130   6.388     0.00000000020485 ***
## TEAM_BATTING_SO    -0.062243    0.009067  -6.865     0.00000000000868 ***
## TEAM_BASERUN_SB     0.042673    0.004219  10.113 < 0.0000000000000002 ***
## TEAM_PITCHING_BB   -0.069557    0.014107  -4.931     0.00000088225033 ***
## TEAM_PITCHING_SO    0.034309    0.008250   4.159     0.00003328511411 ***
## TEAM_FIELDING_E  2664.401602  193.888720  13.742 < 0.0000000000000002 ***
## TEAM_FIELDING_DP   -0.116848    0.013313  -8.777 < 0.0000000000000002 ***
## TOTAL_BASES         0.020029    0.001231  16.267 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.91 on 2153 degrees of freedom
## Multiple R-squared:  0.3128, Adjusted R-squared:  0.3102 
## F-statistic: 122.5 on 8 and 2153 DF,  p-value: < 0.00000000000000022

vif(model.2)

##  TEAM_BATTING_BB  TEAM_BATTING_SO  TEAM_BASERUN_SB TEAM_PITCHING_BB 
##        39.211674        63.814882         2.296332        34.506646 
## TEAM_PITCHING_SO  TEAM_FIELDING_E TEAM_FIELDING_DP      TOTAL_BASES 
##        51.516155         3.631039         2.049298         1.397321

# vif says remove TEAM_BATTING_SO or PITCHING_SO so discard PITCHING_SO again

# ----------------------
# remove TEAM_PITCHING_SO
model.3 <- lm(data=mb, TARGET_WINS ~ . - INDEX - TEAM_PITCHING_H - TEAM_PITCHING_SO)
summary(model.3)

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX - TEAM_PITCHING_H - TEAM_PITCHING_SO, 
##     data = mb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -43.511  -7.762   0.113   8.049  44.351 
## 
## Coefficients:
##                     Estimate  Std. Error t value             Pr(>|t|)    
## (Intercept)        39.139798    2.938399  13.320 < 0.0000000000000002 ***
## TEAM_BATTING_BB     0.042183    0.006807   6.197       0.000000000688 ***
## TEAM_BATTING_SO    -0.025194    0.001690 -14.906 < 0.0000000000000002 ***
## TEAM_BASERUN_SB     0.041624    0.004228   9.845 < 0.0000000000000002 ***
## TEAM_PITCHING_BB   -0.016370    0.005975  -2.740               0.0062 ** 
## TEAM_FIELDING_E  2640.049236  194.531863  13.571 < 0.0000000000000002 ***
## TEAM_FIELDING_DP   -0.113486    0.013339  -8.508 < 0.0000000000000002 ***
## TOTAL_BASES         0.019976    0.001236  16.164 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.95 on 2154 degrees of freedom
## Multiple R-squared:  0.3072, Adjusted R-squared:  0.305 
## F-statistic: 136.5 on 7 and 2154 DF,  p-value: < 0.00000000000000022

vif(model.3)

##  TEAM_BATTING_BB  TEAM_BATTING_SO  TEAM_BASERUN_SB TEAM_PITCHING_BB 
##         6.931405         2.200940         2.288133         6.142926 
##  TEAM_FIELDING_E TEAM_FIELDING_DP      TOTAL_BASES 
##         3.627727         2.041739         1.397168

# vif say remove TEAM_BATTING_BB or PITCHING_BB - go with PITCHING_BB again

# ----------------------
# remove TEAM_PITCHING_BB
model.4 <- lm(data=mb, TARGET_WINS ~ . - INDEX - TEAM_PITCHING_H - TEAM_PITCHING_SO - TEAM_PITCHING_BB)
summary(model.4)

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX - TEAM_PITCHING_H - TEAM_PITCHING_SO - 
##     TEAM_PITCHING_BB, data = mb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -43.763  -7.899   0.209   8.121  43.323 
## 
## Coefficients:
##                     Estimate  Std. Error t value            Pr(>|t|)    
## (Intercept)        39.164119    2.942818  13.308 <0.0000000000000002 ***
## TEAM_BATTING_BB     0.025404    0.002976   8.535 <0.0000000000000002 ***
## TEAM_BATTING_SO    -0.024551    0.001676 -14.645 <0.0000000000000002 ***
## TEAM_BASERUN_SB     0.038058    0.004028   9.447 <0.0000000000000002 ***
## TEAM_FIELDING_E  2714.549336  192.912713  14.071 <0.0000000000000002 ***
## TEAM_FIELDING_DP   -0.114953    0.013348  -8.612 <0.0000000000000002 ***
## TOTAL_BASES         0.019678    0.001233  15.961 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.97 on 2155 degrees of freedom
## Multiple R-squared:  0.3048, Adjusted R-squared:  0.3029 
## F-statistic: 157.5 on 6 and 2155 DF,  p-value: < 0.00000000000000022

# pvals all < .05 so check collinearity
vif(model.4)

##  TEAM_BATTING_BB  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_FIELDING_E 
##         1.321097         2.158564         2.071297         3.556849 
## TEAM_FIELDING_DP      TOTAL_BASES 
##         2.038449         1.386329

options(scipen=999)
model.4

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX - TEAM_PITCHING_H - TEAM_PITCHING_SO - 
##     TEAM_PITCHING_BB, data = mb)
## 
## Coefficients:
##      (Intercept)   TEAM_BATTING_BB   TEAM_BATTING_SO   TEAM_BASERUN_SB  
##         39.16412           0.02540          -0.02455           0.03806  
##  TEAM_FIELDING_E  TEAM_FIELDING_DP       TOTAL_BASES  
##       2714.54934          -0.11495           0.01968

anova(model.4)

## Analysis of Variance Table
## 
## Response: TARGET_WINS
##                    Df Sum Sq Mean Sq F value                Pr(>F)    
## TEAM_BATTING_BB     1  30649   30649 213.905 < 0.00000000000000022 ***
## TEAM_BATTING_SO     1   7298    7298  50.937 0.0000000000012976902 ***
## TEAM_BASERUN_SB     1   9655    9655  67.383 0.0000000000000003804 ***
## TEAM_FIELDING_E     1  46368   46368 323.609 < 0.00000000000000022 ***
## TEAM_FIELDING_DP    1   4920    4920  34.334 0.0000000053558270759 ***
## TOTAL_BASES         1  36500   36500 254.739 < 0.00000000000000022 ***
## Residuals        2155 308778     143                                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# no collinearity so STOP
# check 95% confidence intervals for coefficients
confint(model.4)

##                          2.5 %        97.5 %
## (Intercept)        33.39305950   44.93517803
## TEAM_BATTING_BB     0.01956697    0.03124054
## TEAM_BATTING_SO    -0.02783859   -0.02126358
## TEAM_BASERUN_SB     0.03015808    0.04595838
## TEAM_FIELDING_E  2336.23488507 3092.86378594
## TEAM_FIELDING_DP   -0.14112951   -0.08877633
## TOTAL_BASES         0.01725985    0.02209542

Diagnostics

Plots show all variables are linear to response

# CREATE ADDED VARIABLE PLOTS TO ASSESS predictor vs response
avPlots(model.4, id.n = 2)

SUMMARY MODEL DIAGNOSTIC PLOTS

Plots: Some lack of Constant variability in Resid vs. Fitted at both ends; normal distribution of residuals; many residuals outside of 2 std devs PROBABLY NOT A GOOD MODEL

#Figure 5.6 on page 129 MARR text
par(mfrow=c(2,2))
plot(model.4)

PLOT STANDARDIZED RESIDUALS AGAINST EACH PREDICTOR

Results show lack of constant variability for PITCHING_SO, FIELDING_E

# get rows have no NA's from data frame
# NoNA <- mb_mods[!rowSums(is.na(mb_mods[1:13])), ]

StanRes1 <- rstandard(model.4)
par(mfrow=c(2,2))

plot(mb$TEAM_BATTING_BB, StanRes1, ylab="Standardized Residuals")
plot(mb$TEAM_BATTING_SO, StanRes1, ylab="Standardized Residuals")
plot(mb$TEAM_BASERUN_SB, StanRes1, ylab="Standardized Residuals")
plot(mb$TEAM_FIELDING_E, StanRes1, ylab="Standardized Residuals")

plot(mb$TEAM_FIELDING_DP, StanRes1, ylab="Standardized Residuals")
plot(mb$TOTAL_BASES, StanRes1, ylab="Standardized Residuals")

PLOT Y AGAINST FITTED VALUES

Plot shows a linear relationship with no pattern or skew

# get rows have no NA's from data frame
# NoNA <- mb_mods[!rowSums(is.na(mb_mods[1:13])), ]

fit1 <- model.4$fitted.values

par(mfrow = c(1,1))
plot(fit1, mb$TARGET_WINS,xlab="Fitted Values")
abline(lsfit(fit1, mb$TARGET_WINS),lty=2)

# clean up objects in memory
rm(list = ls())

Model 3: Total Bases PLUS

library(car)

# read clean data set from Github

mb_clean <- read.csv("https://raw.githubusercontent.com/spsstudent15/2016-02-621-W1/master/621-HW1-Clean-Data.csv")

Build a model with Total Bases + SB + BB added and all of the other hitting vars removed

# create new variable and drop its components
mb_t <- mb_clean

mb_t$TB_PLUS <- mb_clean$TEAM_BATTING_1B + (2 * mb_clean$TEAM_BATTING_2B) + 
                       (3 * mb_clean$TEAM_BATTING_3B) + (4 * mb_clean$TEAM_BATTING_HR) + 
                       mb_clean$TEAM_BATTING_BB + mb_clean$TEAM_BASERUN_SB

par(mfrow = c(1,1))
hist(mb_t$TB_PLUS, breaks = 200)

# now drop 1B, 2B, 3B, HR, BB, SB
mb_tbp <- mb_t[,c(1, 2, 7, 9, 10, 11, 12, 13, 15)]

###################################################################
# check correlation with WINS and run simple linear model
cor(mb_tbp$TARGET_WINS, mb_tbp$TB_PLUS)

## [1] 0.4478658

mtest <- lm(data=mb_tbp, TARGET_WINS ~ TB_PLUS)
summary(mtest)

## 
## Call:
## lm(formula = TARGET_WINS ~ TB_PLUS, data = mb_tbp)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -45.805  -9.087   0.294   8.845  47.414 
## 
## Coefficients:
##               Estimate Std. Error t value             Pr(>|t|)    
## (Intercept) 18.7055981  2.6867939   6.962     0.00000000000443 ***
## TB_PLUS      0.0225394  0.0009659  23.334 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.01 on 2170 degrees of freedom
## Multiple R-squared:  0.2006, Adjusted R-squared:  0.2002 
## F-statistic: 544.5 on 1 and 2170 DF,  p-value: < 0.00000000000000022

# shows .448 correlation and Adj R^2 of 0.2002 => better than TOTAL_BASES

plot(mb_tbp$TARGET_WINS ~ mb_tbp$TB_PLUS)
abline(lm(mb_tbp$TARGET_WINS ~ mb_tbp$TB_PLUS), lty=2)

# plot doesn't show unusual relationship
######################################################################

Yields r^2= 0.2845, Adj. R^2 = 0.2832, F = 215.4

# fit model
model <- lm(data=mb_tbp, TARGET_WINS ~ . - INDEX)
summary(model)

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX, data = mb_tbp)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -44.983  -8.050   0.222   8.210  55.388 
## 
## Coefficients:
##                   Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)      51.167966   3.948869  12.958 < 0.0000000000000002 ***
## TEAM_BATTING_SO  -0.036400   0.010160  -3.583             0.000347 ***
## TEAM_PITCHING_H   0.001971   0.003587   0.549             0.582772    
## TEAM_PITCHING_BB -0.006469   0.003749  -1.726             0.084516 .  
## TEAM_PITCHING_SO  0.020377   0.008595   2.371             0.017836 *  
## TEAM_FIELDING_E  -0.042058   0.003130 -13.437 < 0.0000000000000002 ***
## TEAM_FIELDING_DP -0.157922   0.012357 -12.780 < 0.0000000000000002 ***
## TB_PLUS           0.026357   0.002152  12.247 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.22 on 2164 degrees of freedom
## Multiple R-squared:  0.2965, Adjusted R-squared:  0.2942 
## F-statistic: 130.3 on 7 and 2164 DF,  p-value: < 0.00000000000000022

# p-vals say remove TEAM_PITCHING_H

# ----------------------
# remove TEAM_PITCHING_H
model.2 <- lm(data=mb_tbp, TARGET_WINS ~ . - INDEX - TEAM_PITCHING_H)
summary(model.2)

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX - TEAM_PITCHING_H, data = mb_tbp)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -45.105  -8.066   0.209   8.169  55.518 
## 
## Coefficients:
##                   Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)      52.382736   3.271336  16.013 < 0.0000000000000002 ***
## TEAM_BATTING_SO  -0.041392   0.004546  -9.106 < 0.0000000000000002 ***
## TEAM_PITCHING_BB -0.007564   0.003175  -2.383               0.0173 *  
## TEAM_PITCHING_SO  0.024536   0.004070   6.028        0.00000000195 ***
## TEAM_FIELDING_E  -0.041729   0.003072 -13.584 < 0.0000000000000002 ***
## TEAM_FIELDING_DP -0.157238   0.012293 -12.791 < 0.0000000000000002 ***
## TB_PLUS           0.027353   0.001161  23.559 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.22 on 2165 degrees of freedom
## Multiple R-squared:  0.2964, Adjusted R-squared:  0.2945 
## F-statistic:   152 on 6 and 2165 DF,  p-value: < 0.00000000000000022

# All p-values < .05 so check collinearity
vif(model.2)

##  TEAM_BATTING_SO TEAM_PITCHING_BB TEAM_PITCHING_SO  TEAM_FIELDING_E 
##        15.346530         1.676592        12.003206         3.021956 
## TEAM_FIELDING_DP          TB_PLUS 
##         1.670760         1.637689

# vif indicates remove TEAM_BATTING_SO or TEAM_PITCHING_SO, so try removing PITCHING_SO

# remove TEAM_PITCHING_SO
model.3 <- lm(data=mb_tbp, TARGET_WINS ~ . - INDEX - TEAM_PITCHING_H - TEAM_PITCHING_SO)
summary(model.3)

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX - TEAM_PITCHING_H - TEAM_PITCHING_SO, 
##     data = mb_tbp)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -46.713  -8.090   0.168   8.157  58.048 
## 
## Coefficients:
##                   Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)      49.697441   3.267190  15.211 <0.0000000000000002 ***
## TEAM_BATTING_SO  -0.015274   0.001386 -11.022 <0.0000000000000002 ***
## TEAM_PITCHING_BB -0.001905   0.003057  -0.623               0.533    
## TEAM_FIELDING_E  -0.030940   0.002517 -12.293 <0.0000000000000002 ***
## TEAM_FIELDING_DP -0.150607   0.012343 -12.202 <0.0000000000000002 ***
## TB_PLUS           0.026000   0.001148  22.641 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.32 on 2166 degrees of freedom
## Multiple R-squared:  0.2846, Adjusted R-squared:  0.283 
## F-statistic: 172.3 on 5 and 2166 DF,  p-value: < 0.00000000000000022

# p-vals say remove TEAM_PITCHING_BB

# ----------------------
# remove TEAM_PITCHING_BB
model.4 <- lm(data=mb_tbp, TARGET_WINS ~ . - INDEX - TEAM_PITCHING_H - TEAM_PITCHING_SO - TEAM_PITCHING_BB)
summary(model.4)

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX - TEAM_PITCHING_H - TEAM_PITCHING_SO - 
##     TEAM_PITCHING_BB, data = mb_tbp)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -46.491  -8.058   0.195   8.127  58.467 
## 
## Coefficients:
##                    Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)      49.7754486  3.2643296   15.25 <0.0000000000000002 ***
## TEAM_BATTING_SO  -0.0151681  0.0013752  -11.03 <0.0000000000000002 ***
## TEAM_FIELDING_E  -0.0309293  0.0025164  -12.29 <0.0000000000000002 ***
## TEAM_FIELDING_DP -0.1512808  0.0122935  -12.31 <0.0000000000000002 ***
## TB_PLUS           0.0255985  0.0009504   26.93 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.32 on 2167 degrees of freedom
## Multiple R-squared:  0.2845, Adjusted R-squared:  0.2832 
## F-statistic: 215.4 on 4 and 2167 DF,  p-value: < 0.00000000000000022

# pvals all < .05 so check collinearity
vif(model.4)

##  TEAM_BATTING_SO  TEAM_FIELDING_E TEAM_FIELDING_DP          TB_PLUS 
##         1.382409         1.995902         1.644666         1.080068

# no collinearity so STOP

Diagnostics

Plots for Fielding_E shows skew. Others are pretty linear

# CREATE ADDED VARIABLE PLOTS TO ASSESS predictor vs response
avPlots(model.4, id.n = 2)

SUMMARY MODEL DIAGNOSTIC PLOTS

Plots: outliers at 2012, 1820, 859. Lack of Constant variability in Resid vs. Fitted at very large values of Yhat; normal distribution of residuals except for outliers, most residuals within 2 std dev and well within Cook’s distance

#Figure 5.6 on page 129 MARR text
par(mfrow=c(2,2))
plot(model.4)

PLOT STANDARDIZED RESIDUALS AGAINST EACH PREDICTOR

Results show lack of constant variability for PITCHING_SO, FIELDING_E, FIELDING_DP, TB_PLUS

# get rows have no NA's from data frame
# NoNA <- mb_mods[!rowSums(is.na(mb_mods[1:13])), ]

StanRes1 <- rstandard(model.4)
par(mfrow=c(2,2))

plot(mb_tbp$TEAM_BATTING_SO, StanRes1, ylab="Standardized Residuals")
plot(mb_tbp$TEAM_FIELDING_E, StanRes1, ylab="Standardized Residuals")
plot(mb_tbp$TEAM_FIELDING_DP, StanRes1, ylab="Standardized Residuals")
plot(mb_tbp$TB_PLUS, StanRes1, ylab="Standardized Residuals")

PLOT Y AGAINST FITTED VALUES

Plot shows a linear relationship with no pattern or skew

# get rows have no NA's from data frame
# NoNA <- mb_mods[!rowSums(is.na(mb_mods[1:13])), ]

fit1 <- model.4$fitted.values

par(mfrow = c(1,1))
plot(fit1, mb_tbp$TARGET_WINS,xlab="Fitted Values")
abline(lsfit(fit1, mb_tbp$TARGET_WINS),lty=2)

REMOVE OUTLIERS AND REFIT

Per Cooks Distance, remove items 836, 821, 1920, 1737, 1515

############ FIRST SET OF OUTLIERS ######################
# drop outlier records from data set
mb_rem <- mb_tbp[-c(836, 821, 1920, 1737, 1515),]

# save first data set
mb_tbp_orig <- mb_tbp

# renumber rows
rownames(mb_rem) <- 1:nrow(mb_rem)

Now refit first model from above: all variables

Yields r^2= 0.2944, Adj r^2 = 0.2931, F = 225.5

# keep the clean data set pure
mb_tbp <- mb_rem

# fit model
model <- lm(data=mb_tbp, TARGET_WINS ~ .- INDEX)
summary(model)

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX, data = mb_tbp)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -43.431  -8.016   0.145   8.107  46.678 
## 
## Coefficients:
##                    Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)      54.3616387  3.9168318  13.879 < 0.0000000000000002 ***
## TEAM_BATTING_SO  -0.0391427  0.0100487  -3.895             0.000101 ***
## TEAM_PITCHING_H   0.0004962  0.0035469   0.140             0.888754    
## TEAM_PITCHING_BB -0.0063810  0.0037061  -1.722             0.085255 .  
## TEAM_PITCHING_SO  0.0216633  0.0084996   2.549             0.010880 *  
## TEAM_FIELDING_E  -0.0441685  0.0031036 -14.232 < 0.0000000000000002 ***
## TEAM_FIELDING_DP -0.1600617  0.0121816 -13.140 < 0.0000000000000002 ***
## TB_PLUS           0.0266454  0.0021256  12.536 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.03 on 2159 degrees of freedom
## Multiple R-squared:  0.3048, Adjusted R-squared:  0.3025 
## F-statistic: 135.2 on 7 and 2159 DF,  p-value: < 0.00000000000000022

# p-vals say remove TEAM_PITCHING_H

# ----------------------
# remove TEAM_PITCHING_H
model.2 <- lm(data=mb_tbp, TARGET_WINS ~ . - INDEX - TEAM_PITCHING_H)
summary(model.2)

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX - TEAM_PITCHING_H, data = mb_tbp)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -43.427  -7.980   0.143   8.125  46.678 
## 
## Coefficients:
##                   Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)      54.670270   3.235709  16.896 < 0.0000000000000002 ***
## TEAM_BATTING_SO  -0.040401   0.004482  -9.014 < 0.0000000000000002 ***
## TEAM_PITCHING_BB -0.006657   0.003139  -2.121                0.034 *  
## TEAM_PITCHING_SO  0.022711   0.004019   5.651         0.0000000181 ***
## TEAM_FIELDING_E  -0.044089   0.003051 -14.451 < 0.0000000000000002 ***
## TEAM_FIELDING_DP -0.159894   0.012120 -13.193 < 0.0000000000000002 ***
## TB_PLUS           0.026896   0.001146  23.459 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.03 on 2160 degrees of freedom
## Multiple R-squared:  0.3048, Adjusted R-squared:  0.3028 
## F-statistic: 157.8 on 6 and 2160 DF,  p-value: < 0.00000000000000022

# All p-values < .05 so check collinearity
vif(model.2)

##  TEAM_BATTING_SO TEAM_PITCHING_BB TEAM_PITCHING_SO  TEAM_FIELDING_E 
##        15.343276         1.680924        12.009587         2.981041 
## TEAM_FIELDING_DP          TB_PLUS 
##         1.666368         1.641975

# vif indicates remove TEAM_BATTING_SO or TEAM_PITCHING_SO, so remove PITCHING_SO again

# ----------------------
# remove TEAM_PITCHING_SO
model.3 <- lm(data=mb_tbp, TARGET_WINS ~ . - INDEX - TEAM_PITCHING_H - TEAM_PITCHING_SO)
summary(model.3)

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX - TEAM_PITCHING_H - TEAM_PITCHING_SO, 
##     data = mb_tbp)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -42.795  -8.018   0.197   8.113  46.888 
## 
## Coefficients:
##                   Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)      52.267886   3.230538  16.179 <0.0000000000000002 ***
## TEAM_BATTING_SO  -0.016264   0.001368 -11.886 <0.0000000000000002 ***
## TEAM_PITCHING_BB -0.001378   0.003018  -0.457               0.648    
## TEAM_FIELDING_E  -0.034211   0.002518 -13.585 <0.0000000000000002 ***
## TEAM_FIELDING_DP -0.153843   0.012159 -12.653 <0.0000000000000002 ***
## TB_PLUS           0.025628   0.001132  22.633 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.12 on 2161 degrees of freedom
## Multiple R-squared:  0.2945, Adjusted R-squared:  0.2928 
## F-statistic: 180.4 on 5 and 2161 DF,  p-value: < 0.00000000000000022

# p-vals say remove TEAM_PITCHING_BB


# ----------------------
# remove TEAM_PITCHING_BB
model.4 <- lm(data=mb_tbp, TARGET_WINS ~ . - INDEX - TEAM_PITCHING_H - TEAM_PITCHING_SO - TEAM_PITCHING_BB)
summary(model.4)

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX - TEAM_PITCHING_H - TEAM_PITCHING_SO - 
##     TEAM_PITCHING_BB, data = mb_tbp)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -42.678  -7.990   0.207   8.069  46.752 
## 
## Coefficients:
##                    Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)      52.3303439  3.2270486   16.22 <0.0000000000000002 ***
## TEAM_BATTING_SO  -0.0161904  0.0013585  -11.92 <0.0000000000000002 ***
## TEAM_FIELDING_E  -0.0342138  0.0025179  -13.59 <0.0000000000000002 ***
## TEAM_FIELDING_DP -0.1543417  0.0121072  -12.75 <0.0000000000000002 ***
## TB_PLUS           0.0253371  0.0009362   27.06 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.12 on 2162 degrees of freedom
## Multiple R-squared:  0.2944, Adjusted R-squared:  0.2931 
## F-statistic: 225.5 on 4 and 2162 DF,  p-value: < 0.00000000000000022

# pvals all < .05 so check collinearity
vif(model.4)

##  TEAM_BATTING_SO  TEAM_FIELDING_E TEAM_FIELDING_DP          TB_PLUS 
##         1.390169         2.002462         1.640012         1.079847

anova(model)

## Analysis of Variance Table
## 
## Response: TARGET_WINS
##                    Df Sum Sq Mean Sq F value                Pr(>F)    
## TEAM_BATTING_SO     1   2189    2189  15.114             0.0001043 ***
## TEAM_PITCHING_H     1  13926   13926  96.157 < 0.00000000000000022 ***
## TEAM_PITCHING_BB    1  21937   21937 151.472 < 0.00000000000000022 ***
## TEAM_PITCHING_SO    1  36638   36638 252.979 < 0.00000000000000022 ***
## TEAM_FIELDING_E     1  14063   14063  97.103 < 0.00000000000000022 ***
## TEAM_FIELDING_DP    1  25556   25556 176.456 < 0.00000000000000022 ***
## TB_PLUS             1  22758   22758 157.139 < 0.00000000000000022 ***
## Residuals        2159 312681     145                                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# no collinearity so STOP

Diagnostics

Plots for Fielding_E shows skew. Others are pretty linear

# CREATE ADDED VARIABLE PLOTS TO ASSESS predictor vs response
avPlots(model.4, id.n = 2)

SUMMARY MODEL DIAGNOSTIC PLOTS

Plots: Some lack of Constant variability in Resid vs. Fitted at very large values of Yhat; normal distribution of residuals; most residuals within 2 std dev and well within Cook’s distance

#Figure 5.6 on page 129 MARR text
par(mfrow=c(2,2))
plot(model.4)

PLOT STANDARDIZED RESIDUALS AGAINST EACH PREDICTOR

Results show lack of constant variability for PITCHING_SO, FIELDING_E, FIELDING_DP, TB_PLUS

# get rows have no NA's from data frame
# NoNA <- mb_mods[!rowSums(is.na(mb_mods[1:13])), ]

StanRes1 <- rstandard(model.4)
par(mfrow=c(2,2))

plot(mb_tbp$TEAM_BATTING_SO, StanRes1, ylab="Standardized Residuals")
plot(mb_tbp$TEAM_FIELDING_E, StanRes1, ylab="Standardized Residuals")
plot(mb_tbp$TEAM_FIELDING_DP, StanRes1, ylab="Standardized Residuals")
plot(mb_tbp$TB_PLUS, StanRes1, ylab="Standardized Residuals")

PLOT Y AGAINST FITTED VALUES

Plot shows a linear relationship with no pattern or skew

# get rows have no NA's from data frame
# NoNA <- mb_mods[!rowSums(is.na(mb_mods[1:13])), ]

fit1 <- model.4$fitted.values

par(mfrow = c(1,1))
plot(fit1, mb_tbp$TARGET_WINS,xlab="Fitted Values")
abline(lsfit(fit1, mb_tbp$TARGET_WINS),lty=2)

Now try same model but with FIELD_E transformed using Box-Cox

# TEAM_FIELDING_E: Box-cox says -1 power transform => 1/y 

mb_tbp$TEAM_FIELDING_E <- 1/mb_tbp$TEAM_FIELDING_E

Now refit first model from above: Start with all variables

Yields r^2= 0.2932, Adj r^2 = 0.2919, F = 224.3

# fit model
model <- lm(data=mb_tbp, TARGET_WINS ~ . - INDEX)
summary(model)

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX, data = mb_tbp)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -43.857  -7.906   0.310   8.238  43.204 
## 
## Coefficients:
##                     Estimate  Std. Error t value             Pr(>|t|)    
## (Intercept)        44.224515    3.776056  11.712 < 0.0000000000000002 ***
## TEAM_BATTING_SO    -0.033405    0.010094  -3.310              0.00095 ***
## TEAM_PITCHING_H    -0.003282    0.003541  -0.927              0.35409    
## TEAM_PITCHING_BB   -0.002060    0.003735  -0.552              0.58128    
## TEAM_PITCHING_SO    0.008654    0.008520   1.016              0.30983    
## TEAM_FIELDING_E  2367.159014  184.199571  12.851 < 0.0000000000000002 ***
## TEAM_FIELDING_DP   -0.138889    0.011786 -11.784 < 0.0000000000000002 ***
## TB_PLUS             0.023995    0.002172  11.049 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.13 on 2159 degrees of freedom
## Multiple R-squared:  0.2936, Adjusted R-squared:  0.2913 
## F-statistic: 128.2 on 7 and 2159 DF,  p-value: < 0.00000000000000022

# p-vals say remove TEAM_PITCHING_BB

# ----------------------
# remove TEAM_PITCHING_BB
model.2 <- lm(data=mb_tbp, TARGET_WINS ~ . - INDEX - TEAM_PITCHING_BB)
summary(model.2)

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX - TEAM_PITCHING_BB, data = mb_tbp)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -43.992  -7.888   0.292   8.267  43.111 
## 
## Coefficients:
##                     Estimate  Std. Error t value             Pr(>|t|)    
## (Intercept)        43.622192    3.614144  12.070 < 0.0000000000000002 ***
## TEAM_BATTING_SO    -0.030130    0.008160  -3.692             0.000228 ***
## TEAM_PITCHING_H    -0.002208    0.002957  -0.747             0.455333    
## TEAM_PITCHING_SO    0.005899    0.006900   0.855             0.392714    
## TEAM_FIELDING_E  2371.413436  184.008382  12.888 < 0.0000000000000002 ***
## TEAM_FIELDING_DP   -0.139854    0.011654 -12.001 < 0.0000000000000002 ***
## TB_PLUS             0.023137    0.001515  15.268 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.13 on 2160 degrees of freedom
## Multiple R-squared:  0.2935, Adjusted R-squared:  0.2915 
## F-statistic: 149.5 on 6 and 2160 DF,  p-value: < 0.00000000000000022

# p-values say remove TEAM_PITCHING_H

# ----------------------
# remove TEAM_PITCHING_H
model.3 <- lm(data=mb_tbp, TARGET_WINS ~ . - INDEX - TEAM_PITCHING_BB - TEAM_PITCHING_H)
summary(model.3)

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX - TEAM_PITCHING_BB - TEAM_PITCHING_H, 
##     data = mb_tbp)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -44.307  -7.918   0.345   8.182  42.508 
## 
## Coefficients:
##                      Estimate   Std. Error t value             Pr(>|t|)
## (Intercept)        42.0041157    2.8921302  14.524 < 0.0000000000000002
## TEAM_BATTING_SO    -0.0247576    0.0038507  -6.429       0.000000000157
## TEAM_PITCHING_SO    0.0013897    0.0033380   0.416                0.677
## TEAM_FIELDING_E  2387.2244422  182.7672567  13.062 < 0.0000000000000002
## TEAM_FIELDING_DP   -0.1396085    0.0116479 -11.986 < 0.0000000000000002
## TB_PLUS             0.0222592    0.0009561  23.280 < 0.0000000000000002
##                     
## (Intercept)      ***
## TEAM_BATTING_SO  ***
## TEAM_PITCHING_SO    
## TEAM_FIELDING_E  ***
## TEAM_FIELDING_DP ***
## TB_PLUS          ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.13 on 2161 degrees of freedom
## Multiple R-squared:  0.2933, Adjusted R-squared:  0.2917 
## F-statistic: 179.4 on 5 and 2161 DF,  p-value: < 0.00000000000000022

# p-vals say remove TEAM_PITCHING_SO

# ----------------------
# remove TEAM_PITCHING_SO
model.4 <- lm(data=mb_tbp, TARGET_WINS ~ . - INDEX - TEAM_PITCHING_BB - TEAM_PITCHING_H - TEAM_PITCHING_SO)
summary(model.4)

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX - TEAM_PITCHING_BB - TEAM_PITCHING_H - 
##     TEAM_PITCHING_SO, data = mb_tbp)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -44.310  -7.909   0.320   8.191  42.644 
## 
## Coefficients:
##                      Estimate   Std. Error t value            Pr(>|t|)    
## (Intercept)        42.1597649    2.8673139   14.70 <0.0000000000000002 ***
## TEAM_BATTING_SO    -0.0233150    0.0016789  -13.89 <0.0000000000000002 ***
## TEAM_FIELDING_E  2366.8259932  176.0432309   13.45 <0.0000000000000002 ***
## TEAM_FIELDING_DP   -0.1400776    0.0115910  -12.09 <0.0000000000000002 ***
## TB_PLUS             0.0222811    0.0009545   23.34 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.13 on 2162 degrees of freedom
## Multiple R-squared:  0.2932, Adjusted R-squared:  0.2919 
## F-statistic: 224.3 on 4 and 2162 DF,  p-value: < 0.00000000000000022

# pvals all < .05 so check collinearity
vif(model.4)

##  TEAM_BATTING_SO  TEAM_FIELDING_E TEAM_FIELDING_DP          TB_PLUS 
##         2.119682         2.906955         1.500660         1.120613

# no collinearity so STOP
options(scipen=999)
model.4

## 
## Call:
## lm(formula = TARGET_WINS ~ . - INDEX - TEAM_PITCHING_BB - TEAM_PITCHING_H - 
##     TEAM_PITCHING_SO, data = mb_tbp)
## 
## Coefficients:
##      (Intercept)   TEAM_BATTING_SO   TEAM_FIELDING_E  TEAM_FIELDING_DP  
##         42.15976          -0.02331        2366.82599          -0.14008  
##          TB_PLUS  
##          0.02228

anova(model.4)

## Analysis of Variance Table
## 
## Response: TARGET_WINS
##                    Df Sum Sq Mean Sq F value                Pr(>F)    
## TEAM_BATTING_SO     1   2189    2189  14.888             0.0001174 ***
## TEAM_FIELDING_E     1  34681   34681 235.888 < 0.00000000000000022 ***
## TEAM_FIELDING_DP    1  14903   14903 101.365 < 0.00000000000000022 ***
## TB_PLUS             1  80112   80112 544.895 < 0.00000000000000022 ***
## Residuals        2162 317863     147                                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Diagnostics

Plots show all variables are linear to response

# CREATE ADDED VARIABLE PLOTS TO ASSESS predictor vs response
avPlots(model.4, id.n = 2)

SUMMARY MODEL DIAGNOSTIC PLOTS

Plots: Some lack of Constant variability in Resid vs. Fitted at very large values of Yhat; normal distribution of residuals; most residuals within 2 std dev and well within Cook’s distance

#Figure 5.6 on page 129 MARR text
par(mfrow=c(2,2))
plot(model.4)

PLOT STANDARDIZED RESIDUALS AGAINST EACH PREDICTOR

Results show lack of constant variability for PITCHING_SO, FIELDING_E, FIELDING_DP, TB_PLUS

# get rows have no NA's from data frame
# NoNA <- mb_mods[!rowSums(is.na(mb_mods[1:13])), ]

StanRes1 <- rstandard(model.4)
par(mfrow=c(2,2))

plot(mb_tbp$TEAM_PITCHING_SO, StanRes1, ylab="Standardized Residuals")
plot(mb_tbp$TEAM_FIELDING_E, StanRes1, ylab="Standardized Residuals")
plot(mb_tbp$TEAM_FIELDING_DP, StanRes1, ylab="Standardized Residuals")
plot(mb_tbp$TB_PLUS, StanRes1, ylab="Standardized Residuals")

PLOT Y AGAINST FITTED VALUES

Plot shows a linear relationship with no pattern or skew

# get rows have no NA's from data frame
# NoNA <- mb_mods[!rowSums(is.na(mb_mods[1:13])), ]

fit1 <- model.4$fitted.values

par(mfrow = c(1,1))
plot(fit1, mb_tbp$TARGET_WINS,xlab="Fitted Values")
abline(lsfit(fit1, mb_tbp$TARGET_WINS),lty=2)

# clean up objects in memory
rm(list = ls())

Model 4: Sabermetrics Model

library(car)

# read EVALUATION data set
eval_data <- read.csv("https://raw.githubusercontent.com/jtopor/CUNY-MSDA-621/master/HW-1/moneyball-evaluation-data.csv")

# read training data set
mb_e <- read.csv("https://raw.githubusercontent.com/jtopor/CUNY-MSDA-621/master/HW-1/moneyball-training-data.csv")

#eliminate index column
# mb_e1 <- mb_e[,-1]
mb_e1 <- mb_e

#####Creating a new column for batting singles and eliminating hits for batting

#add singles column for hitting
mb_e1$TEAM_BATTING_1B <- as.numeric(mb_e1$TEAM_BATTING_H-mb_e1$TEAM_BATTING_2B-mb_e1$TEAM_BATTING_3B-mb_e1$TEAM_BATTING_HR)
mb_e1 <- mb_e1[,-3]
mb_e1 <- as.data.frame(mb_e1)

eval_data$TEAM_BATTING_1B <- as.numeric(eval_data$TEAM_BATTING_H - eval_data$TEAM_BATTING_2B - eval_data$TEAM_BATTING_3B - eval_data$TEAM_BATTING_HR)

# HITS is in second column in eval data
eval_data <- eval_data[,-2]

# ADD A DUMMY COLUMN TO EVAL DATA FOR TARGET WINS
eval_data$TARGET_WINS <- 0

Eliminate HBP, CS, and pitching HR’s.

mb <- mb_e1[,-c(9,10,12)]
# summary(mb)

eval_data <- eval_data[,-c(8,9,11)]
# summary(eval_data)

Build model for batting SO using Gelman approach

#take out double plays + pitching SO + SB as data set is incomplete + Wins as they are not present in the evaluation data

BSO.1 <- lm(data=mb, TEAM_BATTING_SO~. - INDEX -TEAM_FIELDING_DP -TEAM_PITCHING_SO -TEAM_BASERUN_SB -TARGET_WINS)
summary(BSO.1)

## 
## Call:
## lm(formula = TEAM_BATTING_SO ~ . - INDEX - TEAM_FIELDING_DP - 
##     TEAM_PITCHING_SO - TEAM_BASERUN_SB - TARGET_WINS, data = mb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -367.76  -82.46   -3.12   76.44  401.62 
## 
## Coefficients:
##                    Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)      1978.17499   44.47257  44.481 < 0.0000000000000002 ***
## TEAM_BATTING_2B    -0.17459    0.17430  -1.002             0.316646    
## TEAM_BATTING_3B    -2.29879    0.24381  -9.429 < 0.0000000000000002 ***
## TEAM_BATTING_HR     0.88912    0.18212   4.882           0.00000114 ***
## TEAM_BATTING_BB     0.98359    0.43292   2.272             0.023203 *  
## TEAM_PITCHING_H     0.36442    0.15123   2.410             0.016066 *  
## TEAM_PITCHING_BB   -1.17748    0.41011  -2.871             0.004137 ** 
## TEAM_FIELDING_E    -0.29146    0.07594  -3.838             0.000128 ***
## TEAM_BATTING_1B    -1.45473    0.16446  -8.845 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 114.1 on 1826 degrees of freedom
##   (441 observations deleted due to missingness)
## Multiple R-squared:  0.7248, Adjusted R-squared:  0.7236 
## F-statistic: 601.1 on 8 and 1826 DF,  p-value: < 0.00000000000000022

#eliminate doubles
BSO.2 <- lm(data=mb, TEAM_BATTING_SO~. - INDEX -TEAM_FIELDING_DP -TEAM_PITCHING_SO -TEAM_BASERUN_SB - TARGET_WINS -TEAM_BATTING_2B)
summary(BSO.2)

## 
## Call:
## lm(formula = TEAM_BATTING_SO ~ . - INDEX - TEAM_FIELDING_DP - 
##     TEAM_PITCHING_SO - TEAM_BASERUN_SB - TARGET_WINS - TEAM_BATTING_2B, 
##     data = mb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -366.77  -82.05   -2.74   76.89  399.16 
## 
## Coefficients:
##                    Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)      1975.11400   44.36750  44.517 < 0.0000000000000002 ***
## TEAM_BATTING_3B    -2.17815    0.21199 -10.275 < 0.0000000000000002 ***
## TEAM_BATTING_HR     1.02471    0.12183   8.411 < 0.0000000000000002 ***
## TEAM_BATTING_BB     0.59635    0.19483   3.061             0.002240 ** 
## TEAM_PITCHING_H     0.22723    0.06414   3.543             0.000406 ***
## TEAM_PITCHING_BB   -0.81040    0.18409  -4.402            0.0000113 ***
## TEAM_FIELDING_E    -0.26762    0.07212  -3.711             0.000213 ***
## TEAM_BATTING_1B    -1.31541    0.08777 -14.988 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 114.1 on 1827 degrees of freedom
##   (441 observations deleted due to missingness)
## Multiple R-squared:  0.7246, Adjusted R-squared:  0.7236 
## F-statistic: 686.8 on 7 and 1827 DF,  p-value: < 0.00000000000000022

vif(BSO.2)

##  TEAM_BATTING_3B  TEAM_BATTING_HR  TEAM_BATTING_BB  TEAM_PITCHING_H 
##         2.984944         6.215565        39.110249        17.594555 
## TEAM_PITCHING_BB  TEAM_FIELDING_E  TEAM_BATTING_1B 
##        45.384546         2.448073         8.602393

# vif says remove TEAM_PITCHING_BB
BSO.3 <- lm(data=mb, TEAM_BATTING_SO~. - INDEX -TEAM_FIELDING_DP -TEAM_PITCHING_SO -TEAM_BASERUN_SB - TARGET_WINS -TEAM_BATTING_2B - TEAM_PITCHING_BB)
summary(BSO.3)

## 
## Call:
## lm(formula = TEAM_BATTING_SO ~ . - INDEX - TEAM_FIELDING_DP - 
##     TEAM_PITCHING_SO - TEAM_BASERUN_SB - TARGET_WINS - TEAM_BATTING_2B - 
##     TEAM_PITCHING_BB, data = mb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -385.33  -80.03   -1.99   77.48  399.57 
## 
## Coefficients:
##                   Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)     1980.01367   44.57596  44.419 < 0.0000000000000002 ***
## TEAM_BATTING_3B   -1.79602    0.19437  -9.240 < 0.0000000000000002 ***
## TEAM_BATTING_HR    1.42972    0.08027  17.812 < 0.0000000000000002 ***
## TEAM_BATTING_BB   -0.24810    0.03431  -7.230    0.000000000000706 ***
## TEAM_PITCHING_H   -0.03911    0.02139  -1.828               0.0677 .  
## TEAM_FIELDING_E   -0.29047    0.07229  -4.018    0.000061107070801 ***
## TEAM_BATTING_1B   -0.99171    0.04816 -20.591 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 114.6 on 1828 degrees of freedom
##   (441 observations deleted due to missingness)
## Multiple R-squared:  0.7217, Adjusted R-squared:  0.7208 
## F-statistic: 790.1 on 6 and 1828 DF,  p-value: < 0.00000000000000022

# pvals say remove PITCHING_H
BSO.4 <- lm(data=mb, TEAM_BATTING_SO~. - INDEX -TEAM_FIELDING_DP -TEAM_PITCHING_SO -TEAM_BASERUN_SB - TARGET_WINS -TEAM_BATTING_2B - TEAM_PITCHING_BB - TEAM_PITCHING_H)
summary(BSO.4)

## 
## Call:
## lm(formula = TEAM_BATTING_SO ~ . - INDEX - TEAM_FIELDING_DP - 
##     TEAM_PITCHING_SO - TEAM_BASERUN_SB - TARGET_WINS - TEAM_BATTING_2B - 
##     TEAM_PITCHING_BB - TEAM_PITCHING_H, data = mb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -384.09  -79.88   -1.51   77.07  393.44 
## 
## Coefficients:
##                   Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)     1986.07279   44.48099  44.650 < 0.0000000000000002 ***
## TEAM_BATTING_3B   -1.86025    0.19129  -9.725 < 0.0000000000000002 ***
## TEAM_BATTING_HR    1.37824    0.07521  18.325 < 0.0000000000000002 ***
## TEAM_BATTING_BB   -0.25047    0.03431  -7.300    0.000000000000428 ***
## TEAM_FIELDING_E   -0.29947    0.07217  -4.149    0.000034869615225 ***
## TEAM_BATTING_1B   -1.04331    0.03905 -26.719 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 114.7 on 1829 degrees of freedom
##   (441 observations deleted due to missingness)
## Multiple R-squared:  0.7212, Adjusted R-squared:  0.7204 
## F-statistic: 946.2 on 5 and 1829 DF,  p-value: < 0.00000000000000022

vif(BSO.4)

## TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB TEAM_FIELDING_E 
##        2.403283        2.342414        1.199339        2.424099 
## TEAM_BATTING_1B 
##        1.683596

##All p-values are low with a 686.8 F-statistic and adjusted R squared of 0.7236
#take a look
par(mfrow=c(2,2))
plot(BSO.2)

# ---------------------------------------
# function definition for impute function
impute <- function (a, a.impute){
  ifelse (is.na(a), a.impute,a)
}
# ---------------------------------------

#prediction function
pred.BSO <- round(predict(BSO.4, mb))
BSO.imp <- impute(mb$TEAM_BATTING_SO, pred.BSO)

# impute the evaluation data
pred_eval.BSO <- round(predict(BSO.4, eval_data))
eval.BSO.imp <- impute(eval_data$TEAM_BATTING_SO, pred_eval.BSO)

###################################################
# Jims added code for diagnostics of imputation

# first, check summaries to ensure similar values
summary(mb$TEAM_BATTING_SO)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     0.0   548.0   750.0   735.6   930.0  1399.0     102

summary(BSO.imp)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0   543.8   730.0   727.6   925.0  1399.0

# now plot side-by-side histograms to check similarity of distributions
par(mfrow = c(2,2))
hist(mb$TEAM_BATTING_SO, breaks = 200)
hist(BSO.imp, breaks = 200)

# ------------------ eval data checks ------------------------
# first, check summaries to ensure similar values
summary(eval_data$TEAM_BATTING_SO)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     0.0   545.0   686.0   709.3   912.0  1268.0      18

summary(eval.BSO.imp)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0   532.5   677.0   699.8   904.5  1268.0

# now plot side-by-side histograms to check similarity of distributions
par(mfrow = c(2,2))

hist(eval_data$TEAM_BATTING_SO, breaks = 30)
hist(eval.BSO.imp, breaks = 30)
###################################################

# update dataframes with imputed values
mb1 <- mb
mb1$TEAM_BATTING_SO <- BSO.imp

eval_data.1 <- eval_data
eval_data.1$TEAM_BATTING_SO <- eval.BSO.imp

Build model for Pitching SO

#take out double plays + SB as data set is incomplete and wins as they are not present in evaluation data

PSO.1 <- lm(data=mb1, TEAM_PITCHING_SO~. - INDEX -TEAM_FIELDING_DP -TEAM_BASERUN_SB - TARGET_WINS)
summary(PSO.1)

## 
## Call:
## lm(formula = TEAM_PITCHING_SO ~ . - INDEX - TEAM_FIELDING_DP - 
##     TEAM_BASERUN_SB - TARGET_WINS, data = mb1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -130.988   -3.674    1.299    4.675  147.753 
## 
## Coefficients:
##                   Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)       0.693339   8.122849   0.085                0.932    
## TEAM_BATTING_2B  -0.291561   0.022062 -13.216 < 0.0000000000000002 ***
## TEAM_BATTING_3B  -0.268622   0.031593  -8.503 < 0.0000000000000002 ***
## TEAM_BATTING_HR  -0.269698   0.023194 -11.628 < 0.0000000000000002 ***
## TEAM_BATTING_BB  -0.883985   0.054857 -16.114 < 0.0000000000000002 ***
## TEAM_BATTING_SO   1.044883   0.002961 352.860 < 0.0000000000000002 ***
## TEAM_PITCHING_H   0.270090   0.019167  14.092 < 0.0000000000000002 ***
## TEAM_PITCHING_BB  0.850575   0.052011  16.354 < 0.0000000000000002 ***
## TEAM_FIELDING_E  -0.057521   0.009648  -5.962        0.00000000299 ***
## TEAM_BATTING_1B  -0.280665   0.021251 -13.207 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.43 on 1825 degrees of freedom
##   (441 observations deleted due to missingness)
## Multiple R-squared:  0.9958, Adjusted R-squared:  0.9958 
## F-statistic: 4.809e+04 on 9 and 1825 DF,  p-value: < 0.00000000000000022

vif(PSO.1)

##  TEAM_BATTING_2B  TEAM_BATTING_3B  TEAM_BATTING_HR  TEAM_BATTING_BB 
##         7.903547         4.140572        14.070952       193.640509 
##  TEAM_BATTING_SO  TEAM_PITCHING_H TEAM_PITCHING_BB  TEAM_FIELDING_E 
##         3.633459        98.139615       226.271134         2.736403 
##  TEAM_BATTING_1B 
##        31.499003

# vif says remove TEAM_PITCHING_BB

PSO.2 <- lm(data=mb1, TEAM_PITCHING_SO~. - INDEX -TEAM_FIELDING_DP -TEAM_BASERUN_SB - TARGET_WINS - TEAM_PITCHING_BB)
summary(PSO.2)

## 
## Call:
## lm(formula = TEAM_PITCHING_SO ~ . - INDEX - TEAM_FIELDING_DP - 
##     TEAM_BASERUN_SB - TARGET_WINS - TEAM_PITCHING_BB, data = mb1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -126.748   -4.347    1.094    5.334  149.232 
## 
## Coefficients:
##                  Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)     11.722036   8.665285   1.353              0.1763    
## TEAM_BATTING_2B -0.613714   0.010633 -57.719 <0.0000000000000002 ***
## TEAM_BATTING_3B -0.578732   0.027050 -21.395 <0.0000000000000002 ***
## TEAM_BATTING_HR -0.601828   0.011993 -50.181 <0.0000000000000002 ***
## TEAM_BATTING_BB  0.010252   0.004704   2.180              0.0294 *  
## TEAM_BATTING_SO  1.041636   0.003163 329.346 <0.0000000000000002 ***
## TEAM_PITCHING_H  0.580032   0.003060 189.545 <0.0000000000000002 ***
## TEAM_FIELDING_E -0.097555   0.009990  -9.765 <0.0000000000000002 ***
## TEAM_BATTING_1B -0.610139   0.007239 -84.289 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.45 on 1826 degrees of freedom
##   (441 observations deleted due to missingness)
## Multiple R-squared:  0.9952, Adjusted R-squared:  0.9952 
## F-statistic: 4.719e+04 on 8 and 1826 DF,  p-value: < 0.00000000000000022

vif(PSO.2)

## TEAM_BATTING_2B TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB 
##        1.602077        2.648887        3.282938        1.242359 
## TEAM_BATTING_SO TEAM_PITCHING_H TEAM_FIELDING_E TEAM_BATTING_1B 
##        3.617130        2.183095        2.560245        3.189218

#all low P value and F statistic of 4719 with adj R squared of 0.9952
#take a look
par(mfrow=c(2,2))
plot(PSO.2)

#place back in the data base with imputed data for SO's
pred.PSO <- round(predict(PSO.2, mb1))
PSO.imp <- impute(mb1$TEAM_PITCHING_SO, pred.PSO)

# impute the evaluation data
pred_eval.PSO <- round(predict(PSO.2, eval_data.1))
eval.PSO.imp <- impute(eval_data.1$TEAM_PITCHING_SO, pred_eval.PSO)

###################################################
# Jims added code for diagnostics of imputation

# first, check summaries to ensure similar values
summary(mb1$TEAM_PITCHING_SO)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     0.0   615.0   813.5   817.7   968.0 19280.0     102

summary(PSO.imp)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0   606.0   797.0   807.4   957.0 19280.0

# now plot side-by-side histograms to check similarity of distributions
par(mfrow = c(2,2))
hist(mb1$TEAM_PITCHING_SO, breaks = 200)
hist(PSO.imp, breaks = 200)

# ------------------ eval data checks ------------------------
# first, check summaries to ensure similar values
summary(eval_data.1$TEAM_PITCHING_SO)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     0.0   613.0   745.0   799.7   938.0  9963.0      18

summary(eval.PSO.imp)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0   596.0   728.0   785.4   927.5  9963.0

# now plot side-by-side histograms to check similarity of distributions
par(mfrow = c(2,2))

hist(eval_data.1$TEAM_PITCHING_SO, breaks = 30)
hist(eval.PSO.imp, breaks = 30)

###################################################

# update dataframes with imputed values 

mb2 <- mb1
mb2$TEAM_PITCHING_SO <- PSO.imp

eval_data.2 <- eval_data.1
eval_data.2$TEAM_PITCHING_SO <- eval.PSO.imp

Build model for SB

#Take out DP as incomplete data and target wins
SB.1 <- lm(data=mb2, TEAM_BASERUN_SB~. -INDEX -TEAM_FIELDING_DP - TARGET_WINS)
summary(SB.1)

## 
## Call:
## lm(formula = TEAM_BASERUN_SB ~ . - INDEX - TEAM_FIELDING_DP - 
##     TARGET_WINS, data = mb2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -115.054  -32.496   -5.714   28.546  206.666 
## 
## Coefficients:
##                    Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)      -181.56290   25.61997  -7.087 0.000000000001918208 ***
## TEAM_BATTING_2B    -0.21829    0.07305  -2.988             0.002841 ** 
## TEAM_BATTING_3B     0.31700    0.10053   3.153             0.001640 ** 
## TEAM_BATTING_HR    -0.61700    0.07599  -8.119 0.000000000000000827 ***
## TEAM_BATTING_BB     0.12841    0.18355   0.700             0.484255    
## TEAM_BATTING_SO     0.43941    0.07872   5.582 0.000000027139177568 ***
## TEAM_PITCHING_H     0.20258    0.06368   3.181             0.001489 ** 
## TEAM_PITCHING_BB   -0.08197    0.17430  -0.470             0.638206    
## TEAM_PITCHING_SO   -0.25116    0.07480  -3.358             0.000801 ***
## TEAM_FIELDING_E     0.32079    0.02796  11.473 < 0.0000000000000002 ***
## TEAM_BATTING_1B    -0.12224    0.07046  -1.735             0.082937 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 46.26 on 1926 degrees of freedom
##   (339 observations deleted due to missingness)
## Multiple R-squared:  0.3552, Adjusted R-squared:  0.3518 
## F-statistic: 106.1 on 10 and 1926 DF,  p-value: < 0.00000000000000022

#eliminate pitching BB's
SB.2 <- lm(data=mb2, TEAM_BASERUN_SB~. -INDEX -TEAM_FIELDING_DP -TEAM_PITCHING_BB - TARGET_WINS)
summary(SB.2)

## 
## Call:
## lm(formula = TEAM_BASERUN_SB ~ . - INDEX - TEAM_FIELDING_DP - 
##     TEAM_PITCHING_BB - TARGET_WINS, data = mb2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -115.234  -32.317   -5.706   28.512  206.540 
## 
## Coefficients:
##                    Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)      -182.64502   25.51127  -7.159     0.00000000000115 ***
## TEAM_BATTING_2B    -0.19473    0.05316  -3.663             0.000256 ***
## TEAM_BATTING_3B     0.34027    0.08749   3.889             0.000104 ***
## TEAM_BATTING_HR    -0.59236    0.05503 -10.765 < 0.0000000000000002 ***
## TEAM_BATTING_BB     0.04234    0.01376   3.076             0.002127 ** 
## TEAM_BATTING_SO     0.45256    0.07357   6.151     0.00000000093186 ***
## TEAM_PITCHING_H     0.17992    0.04164   4.321     0.00001629409305 ***
## TEAM_PITCHING_SO   -0.26348    0.07004  -3.762             0.000174 ***
## TEAM_FIELDING_E     0.32263    0.02768  11.655 < 0.0000000000000002 ***
## TEAM_BATTING_1B    -0.09787    0.04774  -2.050             0.040499 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 46.25 on 1927 degrees of freedom
##   (339 observations deleted due to missingness)
## Multiple R-squared:  0.3551, Adjusted R-squared:  0.3521 
## F-statistic: 117.9 on 9 and 1927 DF,  p-value: < 0.00000000000000022

#eliminate singles
SB.3 <- lm(data=mb2, TEAM_BASERUN_SB~. - INDEX -TEAM_FIELDING_DP -TEAM_PITCHING_BB -TEAM_BATTING_1B - TARGET_WINS)
summary(SB.3)

## 
## Call:
## lm(formula = TEAM_BASERUN_SB ~ . - INDEX - TEAM_FIELDING_DP - 
##     TEAM_PITCHING_BB - TEAM_BATTING_1B - TARGET_WINS, data = mb2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -113.836  -32.731   -5.467   28.523  205.971 
## 
## Coefficients:
##                    Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)      -201.72737   23.77215  -8.486 < 0.0000000000000002 ***
## TEAM_BATTING_2B    -0.11845    0.03799  -3.118              0.00185 ** 
## TEAM_BATTING_3B     0.40962    0.08075   5.073       0.000000429809 ***
## TEAM_BATTING_HR    -0.51178    0.03855 -13.277 < 0.0000000000000002 ***
## TEAM_BATTING_BB     0.04187    0.01377   3.040              0.00240 ** 
## TEAM_BATTING_SO     0.32253    0.03731   8.645 < 0.0000000000000002 ***
## TEAM_PITCHING_H     0.10155    0.01650   6.154       0.000000000917 ***
## TEAM_PITCHING_SO   -0.13495    0.03125  -4.318       0.000016553542 ***
## TEAM_FIELDING_E     0.33595    0.02693  12.475 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 46.29 on 1928 degrees of freedom
##   (339 observations deleted due to missingness)
## Multiple R-squared:  0.3537, Adjusted R-squared:  0.351 
## F-statistic: 131.9 on 8 and 1928 DF,  p-value: < 0.00000000000000022

#simplify the model by taking out pitching
SB.4 <- lm(data=mb2, TEAM_BASERUN_SB~. - INDEX -TEAM_FIELDING_DP -TEAM_PITCHING_BB -TEAM_BATTING_1B - TARGET_WINS - TEAM_PITCHING_SO - TEAM_PITCHING_H)
summary(SB.4)

## 
## Call:
## lm(formula = TEAM_BASERUN_SB ~ . - INDEX - TEAM_FIELDING_DP - 
##     TEAM_PITCHING_BB - TEAM_BATTING_1B - TARGET_WINS - TEAM_PITCHING_SO - 
##     TEAM_PITCHING_H, data = mb2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -117.914  -33.217   -5.623   29.822  199.657 
## 
## Coefficients:
##                   Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)     -83.147587  14.021226  -5.930     0.00000000357797 ***
## TEAM_BATTING_2B   0.041592   0.028337   1.468               0.1423    
## TEAM_BATTING_3B   0.572336   0.077093   7.424     0.00000000000017 ***
## TEAM_BATTING_HR  -0.405437   0.034672 -11.694 < 0.0000000000000002 ***
## TEAM_BATTING_BB   0.035111   0.013862   2.533               0.0114 *  
## TEAM_BATTING_SO   0.151522   0.008054  18.814 < 0.0000000000000002 ***
## TEAM_FIELDING_E   0.360947   0.026930  13.403 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 46.77 on 1930 degrees of freedom
##   (339 observations deleted due to missingness)
## Multiple R-squared:  0.3395, Adjusted R-squared:  0.3374 
## F-statistic: 165.3 on 6 and 1930 DF,  p-value: < 0.00000000000000022

#add singles back in
SB.5 <- lm(data=mb2, TEAM_BASERUN_SB~. - INDEX -TEAM_FIELDING_DP -TEAM_PITCHING_BB - TARGET_WINS - TEAM_PITCHING_SO - TEAM_PITCHING_H)
summary(SB.5)

## 
## Call:
## lm(formula = TEAM_BASERUN_SB ~ . - INDEX - TEAM_FIELDING_DP - 
##     TEAM_PITCHING_BB - TARGET_WINS - TEAM_PITCHING_SO - TEAM_PITCHING_H, 
##     data = mb2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -114.276  -33.011   -5.119   28.546  203.041 
## 
## Coefficients:
##                    Estimate  Std. Error t value             Pr(>|t|)    
## (Intercept)     -187.229045   25.631485  -7.305    0.000000000000405 ***
## TEAM_BATTING_2B   -0.001075    0.029522  -0.036              0.97096    
## TEAM_BATTING_3B    0.519499    0.077423   6.710    0.000000000025523 ***
## TEAM_BATTING_HR   -0.407981    0.034476 -11.834 < 0.0000000000000002 ***
## TEAM_BATTING_BB    0.039950    0.013818   2.891              0.00388 ** 
## TEAM_BATTING_SO    0.176220    0.009496  18.558 < 0.0000000000000002 ***
## TEAM_FIELDING_E    0.357712    0.026783  13.356 < 0.0000000000000002 ***
## TEAM_BATTING_1B    0.091975    0.019007   4.839    0.000001408714177 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 46.5 on 1929 degrees of freedom
##   (339 observations deleted due to missingness)
## Multiple R-squared:  0.3474, Adjusted R-squared:  0.345 
## F-statistic: 146.7 on 7 and 1929 DF,  p-value: < 0.00000000000000022

#eliminate doubles
SB.6 <- lm(data=mb2, TEAM_BASERUN_SB~. - INDEX -TEAM_FIELDING_DP -TEAM_PITCHING_BB - TARGET_WINS - TEAM_PITCHING_SO - TEAM_PITCHING_H - TEAM_BATTING_2B)
summary(SB.6)

## 
## Call:
## lm(formula = TEAM_BASERUN_SB ~ . - INDEX - TEAM_FIELDING_DP - 
##     TEAM_PITCHING_BB - TARGET_WINS - TEAM_PITCHING_SO - TEAM_PITCHING_H - 
##     TEAM_BATTING_2B, data = mb2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -114.233  -32.996   -5.134   28.584  203.038 
## 
## Coefficients:
##                    Estimate  Std. Error t value             Pr(>|t|)    
## (Intercept)     -187.197711   25.610401  -7.309    0.000000000000391 ***
## TEAM_BATTING_3B    0.518983    0.076096   6.820    0.000000000012126 ***
## TEAM_BATTING_HR   -0.408387    0.032612 -12.522 < 0.0000000000000002 ***
## TEAM_BATTING_BB    0.039905    0.013760   2.900              0.00377 ** 
## TEAM_BATTING_SO    0.176199    0.009475  18.596 < 0.0000000000000002 ***
## TEAM_FIELDING_E    0.357907    0.026237  13.642 < 0.0000000000000002 ***
## TEAM_BATTING_1B    0.091768    0.018135   5.060    0.000000458248716 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 46.49 on 1930 degrees of freedom
##   (339 observations deleted due to missingness)
## Multiple R-squared:  0.3474, Adjusted R-squared:  0.3454 
## F-statistic: 171.2 on 6 and 1930 DF,  p-value: < 0.00000000000000022

#eliminate walks
SB.7 <- lm(data=mb2, TEAM_BASERUN_SB~. - INDEX -TEAM_FIELDING_DP -TEAM_PITCHING_BB - TARGET_WINS - TEAM_PITCHING_SO - TEAM_PITCHING_H - TEAM_BATTING_2B - TEAM_BATTING_BB)
summary(SB.7)

## 
## Call:
## lm(formula = TEAM_BASERUN_SB ~ . - INDEX - TEAM_FIELDING_DP - 
##     TEAM_PITCHING_BB - TARGET_WINS - TEAM_PITCHING_SO - TEAM_PITCHING_H - 
##     TEAM_BATTING_2B - TEAM_BATTING_BB, data = mb2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -114.972  -32.412   -5.034   29.206  206.559 
## 
## Coefficients:
##                    Estimate  Std. Error t value             Pr(>|t|)    
## (Intercept)     -160.935881   24.001884  -6.705      0.0000000000263 ***
## TEAM_BATTING_3B    0.519666    0.076241   6.816      0.0000000000125 ***
## TEAM_BATTING_HR   -0.380719    0.031245 -12.185 < 0.0000000000000002 ***
## TEAM_BATTING_SO    0.171459    0.009351  18.336 < 0.0000000000000002 ***
## TEAM_FIELDING_E    0.345292    0.025923  13.320 < 0.0000000000000002 ***
## TEAM_BATTING_1B    0.089234    0.018148   4.917      0.0000009536160 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 46.58 on 1931 degrees of freedom
##   (339 observations deleted due to missingness)
## Multiple R-squared:  0.3446, Adjusted R-squared:  0.3429 
## F-statistic:   203 on 5 and 1931 DF,  p-value: < 0.00000000000000022

#all low P value and F statistic of 202.9 with adj R squared of 0.3427
#take a look
par(mfrow=c(2,2))
plot(SB.7)

#place back in the data base with imputed data for SB's
pred.SB <- round(predict(SB.7, mb2))
SB.imp <- impute(mb2$TEAM_BASERUN_SB, pred.SB)

# impute the evaluation data
pred_eval.SB <- round(predict(SB.7, eval_data.2))
eval.SB.imp <- impute(eval_data.2$TEAM_BASERUN_SB, pred_eval.SB)

###################################################
# Jims added code for diagnostics of imputation

# first, check summaries to ensure similar values
summary(mb2$TEAM_BASERUN_SB)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     0.0    66.0   101.0   124.8   156.0   697.0     131

summary(SB.imp)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    67.0   106.0   137.8   172.0   697.0

# now plot side-by-side histograms to check similarity of distributions
par(mfrow = c(2,2))
hist(mb2$TEAM_BASERUN_SB, breaks = 200)
hist(SB.imp, breaks = 200)

# ------------------ eval data checks ------------------------
# first, check summaries to ensure similar values
summary(eval_data.2$TEAM_BASERUN_SB)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     0.0    59.0    92.0   123.7   151.8   580.0      13

summary(eval.SB.imp)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    60.5    96.0   134.9   164.5   580.0

# now plot side-by-side histograms to check similarity of distributions
par(mfrow = c(2,2))

hist(eval_data.2$TEAM_BASERUN_SB, breaks = 30)
hist(eval.SB.imp, breaks = 30)
###################################################

# update dataframes with imputed values
mb3 <- mb2
mb3$TEAM_BASERUN_SB <- SB.imp

eval_data.3 <- eval_data.2
eval_data.3$TEAM_BASERUN_SB <- eval.SB.imp

Build model to replace DP

#remove target wins
DP.1 <- lm(data=mb3, TEAM_FIELDING_DP~. - INDEX -TARGET_WINS)
summary(DP.1)

## 
## Call:
## lm(formula = TEAM_FIELDING_DP ~ . - INDEX - TARGET_WINS, data = mb3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -56.685 -13.815  -0.923  13.056  64.642 
## 
## Coefficients:
##                    Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)      119.570410  11.008885  10.861 < 0.0000000000000002 ***
## TEAM_BATTING_2B    0.015947   0.013100   1.217              0.22360    
## TEAM_BATTING_3B   -0.095545   0.031158  -3.066              0.00220 ** 
## TEAM_BATTING_HR    0.062465   0.016446   3.798              0.00015 ***
## TEAM_BATTING_BB    0.195238   0.027750   7.036     0.00000000000272 ***
## TEAM_BATTING_SO   -0.107809   0.015781  -6.831     0.00000000001116 ***
## TEAM_BASERUN_SB   -0.126130   0.009838 -12.820 < 0.0000000000000002 ***
## TEAM_PITCHING_H    0.009159   0.004069   2.251              0.02448 *  
## TEAM_PITCHING_BB  -0.153349   0.025468  -6.021     0.00000000205698 ***
## TEAM_PITCHING_SO   0.090351   0.014318   6.310     0.00000000034296 ***
## TEAM_FIELDING_E   -0.068602   0.009653  -7.107     0.00000000000165 ***
## TEAM_BATTING_1B    0.024316   0.010083   2.412              0.01598 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.05 on 1978 degrees of freedom
##   (286 observations deleted due to missingness)
## Multiple R-squared:  0.4185, Adjusted R-squared:  0.4153 
## F-statistic: 129.4 on 11 and 1978 DF,  p-value: < 0.00000000000000022

#remove batting 2B's
DP.2 <- lm(data=mb3, TEAM_FIELDING_DP~. - INDEX -TARGET_WINS - TEAM_BATTING_2B)
summary(DP.2)

## 
## Call:
## lm(formula = TEAM_FIELDING_DP ~ . - INDEX - TARGET_WINS - TEAM_BATTING_2B, 
##     data = mb3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -56.620 -14.051  -1.043  12.979  64.061 
## 
## Coefficients:
##                    Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)      120.547069  10.980952  10.978 < 0.0000000000000002 ***
## TEAM_BATTING_3B   -0.087844   0.030513  -2.879              0.00403 ** 
## TEAM_BATTING_HR    0.065419   0.016268   4.021  0.00006003296882195 ***
## TEAM_BATTING_BB    0.200115   0.027463   7.287  0.00000000000045639 ***
## TEAM_BATTING_SO   -0.107359   0.015779  -6.804  0.00000000001344522 ***
## TEAM_BASERUN_SB   -0.126208   0.009839 -12.827 < 0.0000000000000002 ***
## TEAM_PITCHING_H    0.010913   0.003806   2.868              0.00418 ** 
## TEAM_PITCHING_BB  -0.157287   0.025265  -6.226  0.00000000058410192 ***
## TEAM_PITCHING_SO   0.090027   0.014318   6.288  0.00000000039501280 ***
## TEAM_FIELDING_E   -0.072227   0.009183  -7.865  0.00000000000000601 ***
## TEAM_BATTING_1B    0.024008   0.010081   2.381              0.01734 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.06 on 1979 degrees of freedom
##   (286 observations deleted due to missingness)
## Multiple R-squared:  0.4181, Adjusted R-squared:  0.4152 
## F-statistic: 142.2 on 10 and 1979 DF,  p-value: < 0.00000000000000022

# results show that EVERYTHING ELSE is statistically signficant, so:

# run vif to check for collinearity
vif(DP.2)

##  TEAM_BATTING_3B  TEAM_BATTING_HR  TEAM_BATTING_BB  TEAM_BATTING_SO 
##         2.378033         4.489647        40.081217        61.937276 
##  TEAM_BASERUN_SB  TEAM_PITCHING_H TEAM_PITCHING_BB TEAM_PITCHING_SO 
##         2.207539        10.533005        33.459414        51.795257 
##  TEAM_FIELDING_E  TEAM_BATTING_1B 
##         6.392623         4.748132

# results show TEAM_BATTING_SO should be removed

# remove TEAM_BATTING_SO
DP.3 <- lm(data=mb3, TEAM_FIELDING_DP~. - INDEX -TARGET_WINS -TEAM_BATTING_2B - TEAM_BATTING_SO)
summary(DP.3)

## 
## Call:
## lm(formula = TEAM_FIELDING_DP ~ . - INDEX - TARGET_WINS - TEAM_BATTING_2B - 
##     TEAM_BATTING_SO, data = mb3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -70.858 -14.365  -0.308  13.180  68.663 
## 
## Coefficients:
##                    Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)      107.364168  10.931590   9.821 < 0.0000000000000002 ***
## TEAM_BATTING_3B   -0.075566   0.030806  -2.453              0.01425 *  
## TEAM_BATTING_HR    0.049820   0.016289   3.059              0.00225 ** 
## TEAM_BATTING_BB    0.059620   0.018312   3.256              0.00115 ** 
## TEAM_BASERUN_SB   -0.135254   0.009860 -13.717 < 0.0000000000000002 ***
## TEAM_PITCHING_H    0.016359   0.003763   4.348       0.000014452154 ***
## TEAM_PITCHING_BB  -0.022297   0.015820  -1.409              0.15887    
## TEAM_PITCHING_SO  -0.003527   0.004037  -0.874              0.38244    
## TEAM_FIELDING_E   -0.057520   0.009027  -6.372       0.000000000231 ***
## TEAM_BATTING_1B    0.020596   0.010183   2.023              0.04326 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.28 on 1980 degrees of freedom
##   (286 observations deleted due to missingness)
## Multiple R-squared:  0.4045, Adjusted R-squared:  0.4018 
## F-statistic: 149.4 on 9 and 1980 DF,  p-value: < 0.00000000000000022

# p-value says remove TEAM_PITCHING_SO;


# remove TEAM_PITCHING_SO
DP.4 <- lm(data=mb3, TEAM_FIELDING_DP~. - INDEX -TEAM_BATTING_2B -TARGET_WINS -TEAM_BATTING_2B - TEAM_BATTING_SO - TEAM_PITCHING_SO)
summary(DP.4)

## 
## Call:
## lm(formula = TEAM_FIELDING_DP ~ . - INDEX - TEAM_BATTING_2B - 
##     TARGET_WINS - TEAM_BATTING_2B - TEAM_BATTING_SO - TEAM_PITCHING_SO, 
##     data = mb3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -69.801 -14.399  -0.536  13.355  68.091 
## 
## Coefficients:
##                    Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)      100.469748   7.563815  13.283 < 0.0000000000000002 ***
## TEAM_BATTING_3B   -0.067701   0.029459  -2.298             0.021658 *  
## TEAM_BATTING_HR    0.045067   0.015352   2.936             0.003368 ** 
## TEAM_BATTING_BB    0.063875   0.017651   3.619             0.000303 ***
## TEAM_BASERUN_SB   -0.138490   0.009137 -15.157 < 0.0000000000000002 ***
## TEAM_PITCHING_H    0.015675   0.003680   4.259       0.000021452526 ***
## TEAM_PITCHING_BB  -0.025444   0.015403  -1.652             0.098711 .  
## TEAM_FIELDING_E   -0.055612   0.008758  -6.350       0.000000000266 ***
## TEAM_BATTING_1B    0.025138   0.008755   2.871             0.004134 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.28 on 1981 degrees of freedom
##   (286 observations deleted due to missingness)
## Multiple R-squared:  0.4043, Adjusted R-squared:  0.4018 
## F-statistic:   168 on 8 and 1981 DF,  p-value: < 0.00000000000000022

vif(DP.4)

##  TEAM_BATTING_3B  TEAM_BATTING_HR  TEAM_BATTING_BB  TEAM_BASERUN_SB 
##         2.167319         3.909442        16.189150         1.861261 
##  TEAM_PITCHING_H TEAM_PITCHING_BB  TEAM_FIELDING_E  TEAM_BATTING_1B 
##         9.630593        12.159984         5.684878         3.501740

# P values and vif both indicate remove TEAM_PITCHING_BB

# remove TEAM_PITCHING_BB
DP.5 <- lm(data=mb3, TEAM_FIELDING_DP~. - INDEX -TARGET_WINS -TEAM_BATTING_2B - TEAM_BATTING_SO - TEAM_PITCHING_SO - TEAM_PITCHING_BB)
summary(DP.5)

## 
## Call:
## lm(formula = TEAM_FIELDING_DP ~ . - INDEX - TARGET_WINS - TEAM_BATTING_2B - 
##     TEAM_BATTING_SO - TEAM_PITCHING_SO - TEAM_PITCHING_BB, data = mb3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -64.547 -14.352  -0.402  13.240  66.253 
## 
## Coefficients:
##                  Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)     98.606000   7.482458  13.178 < 0.0000000000000002 ***
## TEAM_BATTING_3B -0.071477   0.029383  -2.433               0.0151 *  
## TEAM_BATTING_HR  0.057086   0.013524   4.221       0.000025413117 ***
## TEAM_BATTING_BB  0.036352   0.005830   6.235       0.000000000551 ***
## TEAM_BASERUN_SB -0.138165   0.009139 -15.118 < 0.0000000000000002 ***
## TEAM_PITCHING_H  0.011078   0.002409   4.598       0.000004528538 ***
## TEAM_FIELDING_E -0.051135   0.008332  -6.137       0.000000001010 ***
## TEAM_BATTING_1B  0.032220   0.007637   4.219       0.000025688127 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.29 on 1982 degrees of freedom
##   (286 observations deleted due to missingness)
## Multiple R-squared:  0.4034, Adjusted R-squared:  0.4013 
## F-statistic: 191.5 on 7 and 1982 DF,  p-value: < 0.00000000000000022

vif(DP.5)

## TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BASERUN_SB 
##        2.154275        3.031272        1.764845        1.860396 
## TEAM_PITCHING_H TEAM_FIELDING_E TEAM_BATTING_1B 
##        4.123897        5.140583        2.662224

# vif says remove TEAM_FIELDING_E; p-values all < .05 so remove TEAM_FIELDING_E

DP.6 <- lm(data=mb3, TEAM_FIELDING_DP~. - INDEX -TARGET_WINS -TEAM_BATTING_2B - TEAM_BATTING_SO - TEAM_PITCHING_SO - TEAM_PITCHING_BB - TEAM_FIELDING_E)
summary(DP.6)

## 
## Call:
## lm(formula = TEAM_FIELDING_DP ~ . - INDEX - TARGET_WINS - TEAM_BATTING_2B - 
##     TEAM_BATTING_SO - TEAM_PITCHING_SO - TEAM_PITCHING_BB - TEAM_FIELDING_E, 
##     data = mb3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -59.558 -14.399  -0.152  13.362  64.018 
## 
## Coefficients:
##                  Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)     86.427823   7.280960  11.870 < 0.0000000000000002 ***
## TEAM_BATTING_3B -0.093997   0.029422  -3.195              0.00142 ** 
## TEAM_BATTING_HR  0.091627   0.012411   7.383  0.00000000000022678 ***
## TEAM_BATTING_BB  0.045730   0.005678   8.053  0.00000000000000138 ***
## TEAM_BASERUN_SB -0.155983   0.008745 -17.836 < 0.0000000000000002 ***
## TEAM_PITCHING_H  0.001250   0.001817   0.688              0.49148    
## TEAM_BATTING_1B  0.044340   0.007446   5.955  0.00000000306506646 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.48 on 1983 degrees of freedom
##   (286 observations deleted due to missingness)
## Multiple R-squared:  0.3921, Adjusted R-squared:  0.3903 
## F-statistic: 213.2 on 6 and 1983 DF,  p-value: < 0.00000000000000022

vif(DP.6)

## TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BASERUN_SB 
##        2.120678        2.506351        1.643620        1.672663 
## TEAM_PITCHING_H TEAM_BATTING_1B 
##        2.301907        2.484225

# now no collinearity but p-values say remove TEAM_PITCHING_H

DP.7 <- lm(data=mb3, TEAM_FIELDING_DP~. - INDEX -TARGET_WINS -TEAM_BATTING_2B - TEAM_BATTING_SO - TEAM_PITCHING_SO - TEAM_PITCHING_BB - TEAM_FIELDING_E - TEAM_PITCHING_H)
summary(DP.7)

## 
## Call:
## lm(formula = TEAM_FIELDING_DP ~ . - INDEX - TARGET_WINS - TEAM_BATTING_2B - 
##     TEAM_BATTING_SO - TEAM_PITCHING_SO - TEAM_PITCHING_BB - TEAM_FIELDING_E - 
##     TEAM_PITCHING_H, data = mb3)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -59.58 -14.41  -0.19  13.38  63.83 
## 
## Coefficients:
##                  Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)     85.333602   7.104233  12.012 < 0.0000000000000002 ***
## TEAM_BATTING_3B -0.094593   0.029405  -3.217              0.00132 ** 
## TEAM_BATTING_HR  0.094530   0.011670   8.100 0.000000000000000947 ***
## TEAM_BATTING_BB  0.044401   0.005339   8.316 < 0.0000000000000002 ***
## TEAM_BASERUN_SB -0.153554   0.008000 -19.194 < 0.0000000000000002 ***
## TEAM_BATTING_1B  0.047370   0.006003   7.892 0.000000000000004884 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.48 on 1984 degrees of freedom
##   (286 observations deleted due to missingness)
## Multiple R-squared:  0.3919, Adjusted R-squared:  0.3904 
## F-statistic: 255.8 on 5 and 1984 DF,  p-value: < 0.00000000000000022

vif(DP.7)

## TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BASERUN_SB 
##        2.118844        2.216648        1.453424        1.400175 
## TEAM_BATTING_1B 
##        1.615010

# no collinearity, all p-values < .05 so stop


#all low P value and F statistic of 255.8 with adj R squared of 0.3904
#take a look
par(mfrow=c(2,2))
plot(DP.7)

#place back in the data base with imputed data for SB's
# NOTE: Changed DP.4 to DP.7 here
pred.DP <- round(predict(DP.7, mb3))
DP.imp <- impute(mb3$TEAM_FIELDING_DP, pred.DP)

# impute the evaluation data
pred_eval.DP <- round(predict(DP.7, eval_data.3))
eval.DP.imp <- impute(eval_data.3$TEAM_FIELDING_DP, pred_eval.DP)

###################################################
# Jims added code for diagnostics of imputation

# first, check summaries to ensure similar values
summary(mb3$TEAM_FIELDING_DP)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    52.0   131.0   149.0   146.4   164.0   228.0     286

summary(DP.imp)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    48.0   124.0   145.0   141.5   162.0   228.0

# now plot side-by-side histograms to check similarity of distributions
par(mfrow = c(2,2))
hist(mb3$TEAM_FIELDING_DP, breaks = 200)
hist(DP.imp, breaks = 200)

# ------------------ eval data checks ------------------------
# first, check summaries to ensure similar values
summary(eval_data.3$TEAM_FIELDING_DP)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    69.0   131.0   148.0   146.1   164.0   204.0      31

summary(eval.DP.imp)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    52.0   123.5   146.0   141.3   160.5   204.0

# now plot side-by-side histograms to check similarity of distributions
par(mfrow = c(2,2))

hist(eval_data.3$TEAM_FIELDING_DP, breaks = 30)
hist(eval.DP.imp, breaks = 30)
###################################################

# update data frames with imputed values
mb4 <- mb3
mb4$TEAM_FIELDING_DP <- DP.imp

eval_data.4 <- eval_data.3
eval_data.4$TEAM_FIELDING_DP <- eval.DP.imp

Eliminate unhistorical outliers - DO THIS FOR THE EVAL DATA AS WELL

# check rowcount before removal of outliers
nrow(mb4)

## [1] 2276

nrow(eval_data.4)

## [1] 259

############## TEAM PITCHING_SO ############################
#most pitching SO's is 1450.  So delete all records with more than 1450 pitching SO's
mb5 <- mb4

# fixed error in this line: dataframe in 'which' call was mb1 so changed to mb5
mb5 <- mb5[which(mb5$TEAM_PITCHING_SO < 1450),]

# eval_data.4 <- eval_data.4[which(eval_data.4$TEAM_PITCHING_SO < 1450),]

# check rowcount
nrow(mb5)

## [1] 2251

nrow(eval_data.4)

## [1] 259

############ TEAM_PITCHING_H ##############################
#most ever hits by a team is 1730.  So delete all pitching hits >3000 to be conservative with the median
mb6 <- mb5
mb6 <- mb6[which(mb6$TEAM_PITCHING_H < 3001),]

# eval_data.4 <- eval_data.4[which(eval_data.4$TEAM_PITCHING_H < 3001),]

# check rowcount
nrow(mb6)

## [1] 2174

nrow(eval_data.4)

## [1] 259

############ TEAM_FIELDING_E ##############################
#most ever errors by a team is 639 by 1883 Philadelphia.  Prorating to 162 games gives a value of 1046.
mb7 <- mb6
mb7 <- mb7[which(mb7$TEAM_FIELDING_E < 1047),]

# eval_data.4 <- eval_data.4[which(eval_data.4$TEAM_FIELDING_E < 1047),]

# ----------------------------------------------------------------------
# ----------------------------------------------------------------------

# check rowcount: result is 2172 => removed total of 104 rows
nrow(mb7)

## [1] 2172

nrow(eval_data.4)

## [1] 259

dim(mb)-dim(mb7)

## [1] 104   0

#we removed 104 rows total due to outliers in TRAINING data set.

# we removed 11 rows from the EVALUATION data set

# now renumber rows of dataframe so that there are no gaps in row numbers
rownames(mb7) <- 1:nrow(mb7)
rownames(eval_data.4) <- 1:nrow(eval_data.4)

# drop INDEX column from training set
# mb7 <- mb7[,-1]

# now drop dummy column from evaluation data
# eval_data.4 <- eval_data.4[,-14]

# create CSV files containing updated data sets

write.csv(mb7, file = "C:/SQLData/621-HW1-Clean-Data.csv", row.names = FALSE, col.names = TRUE)

write.csv(eval_data.4, file = "C:/SQLData/621-HW1-Clean-EvalData-.csv", row.names = FALSE, col.names = TRUE)

Model 5: Box-Cox First

Eliminate HBP, CS, and pitching HR’s.

Build model for batting SO using Gelman approach

Build model for Pitching SO

Build model for SB

Build model to replace DP

Eliminate unhistorical outliers - DO THIS FOR THE EVAL DATA AS WELL

# create CSV files containing updated data sets
write.csv(mb7, file = "/Users/scottkarr/IS621Summer2016/HW1/621-HW1-Clean-Data.csv", row.names = FALSE, col.names = TRUE)

write.csv(eval_data.4, file = "/Users/scottkarr/IS621Summer2016/HW1/621-HW1-Clean-EvalData-.csv", row.names = FALSE, col.names = TRUE)

SINGLE PREDICTOR ANALYSIS & TRANSFORMATIONS

Model SMK Generlized Equation

Review descriptive statistics to confirm each variable is within acceptable bounds and contains no missing data. Review Density plots of 13 variables for skewness to identify which may require transformation.

Evaluate Correlations

Evaluate Correlation between predictors so as to not introduce collinearity into the model.

## [1] 2157

Model Selection Strategy

Two common strategies for adding or removing variables in a multiple regression model are called backward elimination and forward selection. These techniques are often referred to as stepwise model selection strategies, because they add or delete one variable at a time as they âstepâ through the candidate predictors. Model 1 uses the forward selection strategy which adds variables two-at-a-time until variables cannot be found that improve the model as measured by adjusted \(R^2\). Diez, D.M., Barr, C.D., & Ãetinkaya-Rundel, M. (2015). OpenIntro Statistics (3rd Ed). pg. 378

Start with p.Hits & p.Walks

Add b.Singles & b.Doubles

Removed p.Hits

Added Stolen Bases and Double Plays

Added b.Walks and p.Strikeouts

Remove b.Walks

Add b.StrikeOuts, & b.Slugging

Add b.Fielding

\[ \begin{aligned} \widehat{wins} &= \hat{\beta}_0 + \hat{\beta}_1 \times p.Walks \hat{\beta}_2 \times b.Singles + \hat{\beta}_3 \times b.Doubles + \hat{\beta}_4 \times b.Stolen Bases + \end{aligned} \] \[ \begin{aligned} \hat{\beta}_5 \times f.Double Plays + \hat{\beta}_6 \times b.Strike Outs + \hat{\beta}_7 \times b.Slugging + \hat{\beta}_8 \times b.Fielding Yield + \end{aligned} \] nrow(lm.smk)

#VARIABLES
#variables have been transformed first as individual predictors
Wins  <- mW
Index <- lm.smk$INDEX 
p.Walks <- tmp.BB
b.Singles <- tmb.1B
b.Doubles <- tmb.2B
b.StolenBases <- tmb.SB
b.StrikeOuts <- tmb.SO
b.Slugging <- tmb.SL
f.Fielding <- tmf.FY

m1 <- lm(Wins ~ -Index+p.Walks+b.Singles+b.Doubles+b.StolenBases+b.StrikeOuts+b.Slugging+f.Fielding)

#PAIRWISE PLOT
par(mfrow=c(1,1))
pairs(Wins ~ p.Walks+b.Singles+b.StolenBases+b.StrikeOuts+b.Slugging+f.Fielding)

#MODEL DIAGNOSTICS
summary(m1)

## 
## Call:
## lm(formula = Wins ~ -Index + p.Walks + b.Singles + b.Doubles + 
##     b.StolenBases + b.StrikeOuts + b.Slugging + f.Fielding)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -43.318  -7.845  -0.053   7.816  39.368 
## 
## Coefficients:
##                      Estimate      Std. Error t value             Pr(>|t|)
## (Intercept)          67.03042        12.67433   5.289      0.0000001357003
## p.Walks              27.93429         3.00029   9.311 < 0.0000000000000002
## b.Singles     -19003746.29526   2670169.82691  -7.117      0.0000000000015
## b.Doubles             0.03065         0.21302   0.144                0.886
## b.StolenBases      -121.06738        12.99326  -9.318 < 0.0000000000000002
## b.StrikeOuts         -0.07317         0.01409  -5.195      0.0000002239702
## b.Slugging            3.01538         0.18292  16.485 < 0.0000000000000002
## f.Fielding      4032977.37953    225896.61232  17.853 < 0.0000000000000002
##                  
## (Intercept)   ***
## p.Walks       ***
## b.Singles     ***
## b.Doubles        
## b.StolenBases ***
## b.StrikeOuts  ***
## b.Slugging    ***
## f.Fielding    ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.95 on 2149 degrees of freedom
## Multiple R-squared:  0.3168, Adjusted R-squared:  0.3146 
## F-statistic: 142.4 on 7 and 2149 DF,  p-value: < 0.00000000000000022

#DIAGNOSTIC1. show collinearity of variables after checking p-values to < 0.05.
vif(m1)

##       p.Walks     b.Singles     b.Doubles b.StolenBases  b.StrikeOuts 
##      1.106592      2.857435      1.469729      1.205956      3.210354 
##    b.Slugging    f.Fielding 
##      1.301083      1.801535

#p-values are all < 0.05 and no VIFs  > 5

#DIAGNOSTIC2.  generate Added Variable Plots: should show linear relationship between response & predictors:
par(mfrow=c(2,2))
avPlots(m1, ~.,ask=FALSE, id.n = 2)

#relationship is linear

#DIAGNOSTIC3.  generate Summary Diagnostic Plots
par(mfrow=c(2,2))
plot(m1)

#Upper Left plot "Residuals vs Fitted" 
# clear predictable pattern
# uniform variability for all fitted values
#Upper Right 
#  normality in residuals
#Lower Right plot "Residuals vs. Leverage"
#  normal distribution, and uniform distribution of residuals
#  no significant leverage points

#DIAGNOSTIC4.  generate Standardized Residual Plots against each predictor
par(mfrow=c(2,2))
StanRest <- rstandard(m1)
plot(p.Walks,StanRest,ylab="Standardized Residuals")
plot(b.Singles,StanRest,ylab="Standardized Residuals")
plot(b.Doubles,StanRest,ylab="Standardized Residuals")
plot(b.StolenBases,StanRest,ylab="Standardized Residuals")

plot(f.DoublePlays,StanRest,ylab="Standardized Residuals")
plot(b.StrikeOuts,StanRest,ylab="Standardized Residuals")
plot(b.Slugging,StanRest,ylab="Standardized Residuals")
plot(f.Fielding,StanRest,ylab="Standardized Residuals")

plot(m1$fitted.values,StanRest,ylab="Standardized Residuals",xlab="Fitted Values")
#Examine plots for constant variability of residuals across ALL predictor. 
# uniform distribution of residuals

#DIAGNOSTIC5.  generate plot of Y "response variable"" against Fitted Values "regression model"
par(mfrow = c(2,2))

plot(m1$fitted.values,Wins,xlab="Fitted Values",ylab=expression(Wins^lambda))
abline(lsfit(m1$fitted.values,Wins))
plot(m1)

# normal distribution, and uniform distribution of residuals

#pred_eval.m1 <- round(predict(m1, eval_data))
#eval.BSO.imp <- impute(eval_data$TARGET_WINS, pred_eval.m1)

Part 4. Select Models

#########################################################################################
# 
# This file loads our preferred predictive model (TOTAL BASES PLUS)
# and uses that model to predict the TARGET_WINS variable of the MLB Evaluation 
# data set. 
#
# When finished, two separate files are written to a local hard disk directory:
#
# - one containing the entire EVALUATION data set after the TARGET_WINS variable has
#   been updated with the predicted values for each record;
#
# - one containing ONLY the INDEX and TARGET_WINS variables from the EVALUATION data set
#
# - NO screen output is generated at all by this code
#
#########################################################################################

# --------------------------------------------------------------
# read clean data set from Github

mb_clean <- read.csv("https://raw.githubusercontent.com/spsstudent15/2016-02-621-W1/master/621-HW1-Clean-Data.csv")  

# ---------------------------------------------------------------

# Build a model with Total Bases + SB + BB added and all of the other hitting vars removed

# create new variable and drop its components
mb_t <- mb_clean

mb_t$TB_PLUS <- mb_clean$TEAM_BATTING_1B + (2 * mb_clean$TEAM_BATTING_2B) + 
                       (3 * mb_clean$TEAM_BATTING_3B) + (4 * mb_clean$TEAM_BATTING_HR) + 
                       mb_clean$TEAM_BATTING_BB + mb_clean$TEAM_BASERUN_SB

# par(mfrow = c(1,1))
# hist(mb_t$TB_PLUS, breaks = 200)

# now drop 1B, 2B, 3B, HR, BB, SB
mb_tbp <- mb_t[,c(1, 2, 7, 9, 10, 11, 12, 13, 15)]


# -----------------------------------------------------------------------------------------

# REMOVE OUTLIERS AND REFIT

# Per Cooks Distance, remove items 836, 821, 1920, 1737, 1515


############ FIRST SET OF OUTLIERS ######################
# drop outlier records from data set
mb_rem <- mb_tbp[-c(836, 821, 1920, 1737, 1515),]

# save first data set
mb_tbp_orig <- mb_tbp

# renumber rows
rownames(mb_rem) <- 1:nrow(mb_rem)

# keep the clean data set pure
mb_tbp <- mb_rem

# -------------------------------------------------------------------------------------------

## Now try same model but with FIELD_E transformed using Box-Cox

# TEAM_FIELDING_E: Box-cox says -1 power transform => 1/y 

mb_tbp$TEAM_FIELDING_E <- 1/mb_tbp$TEAM_FIELDING_E

# Now refit first model from above: Start with all variables
model.4 <- lm(data=mb_tbp, TARGET_WINS ~ . - INDEX - TEAM_PITCHING_BB - TEAM_PITCHING_H - TEAM_PITCHING_SO)

# summary(model.4)

# Now load evaluation data set and predict TARGET WINS

# load EVAL data set
eval.d <- read.csv("https://raw.githubusercontent.com/spsstudent15/2016-02-621-W1/master/621-HW1-Clean-EvalData-.csv") 

# save original data
eval.2 <- eval.d

# creaet TB_PLUS and drop component variables
eval.2$TB_PLUS <- eval.2$TEAM_BATTING_1B + (2 * eval.2$TEAM_BATTING_2B) + 
                       (3 * eval.2$TEAM_BATTING_3B) + (4 * eval.2$TEAM_BATTING_HR) + 
                       eval.2$TEAM_BATTING_BB + eval.2$TEAM_BASERUN_SB

# par(mfrow = c(1,1))
# hist(eval.d$TB_PLUS, breaks = 30)

# now drop 1B, 2B, 3B, HR, BB, SB
eval.2 <- eval.2[,c(1, 6, 8, 9, 10, 11, 12, 14, 15)]

# transform TEAM_FIELDING_E using 1/y
eval.2$TEAM_FIELDING_E <- 1/eval.2$TEAM_FIELDING_E


# now predict TARGET_WINS using model.4
pred.TW <- round(predict(model.4, eval.2))

# add predicted variables to TARGET_WINS variable
eval.2$TARGET_WINS <- pred.TW
eval.d$TARGET_WINS <- pred.TW

# write entire updated EVAL data set to a CSV
write.csv(eval.d, file = "C:/SQLData/HW1-PRED-EVAL-ALLDATA.csv", row.names = FALSE)

# write full model EVAL data to a CSV file
write.csv(eval.d, file = "C:/SQLData/HW1-PRED-EVAL-ALL_M_DATA.csv", row.names = FALSE)

# now write just INDEX and TARGET_WINS to a separate file
eval.3 <- eval.2[,c(1,8)]

write.csv(eval.3, file = "C:/SQLData/HW1-PRED-EVAL-WINS-ONLY.csv", row.names = FALSE)

# end

# clean up objects in memory
rm(list = ls())

Data 621 Homework 1: Code Appendix

Jeff Nieman, Scott Karr, James Topor, Armenoush Aslanian-Persico

Part 1. Data Exploration

Part 2. Data Preparation

New Variable Creation

Creating a new column for batting singles and eliminating hits for batting

Building a regression model and filling in NA’s for SB.

Building a linear regression model and filling in NA’s for CS

Building a regression model and filling in NA’s for batting SO’s

Building a regression model and filling in NA’s for pitching SO’s

Building a linear regression and filling in NA’s for DP

Data Imputation

Eliminate HBP, CS, and pitching HR’s.

Build model for batting SO using Gelman approach

Build model for Pitching SO

Build model for SB

Build model to replace DP

Eliminate unhistorical outliers - DO THIS FOR THE EVAL DATA AS WELL

Part 3. Build Models

Model 1: General Model Using Backward Selection

Build Models

Model using all remaining variables as a starting point

DIAGNOSTICS

SUMMARY MODEL DIAGNOSTIC PLOTS

PLOT STANDARDIZED RESIDUALS AGAINST EACH PREDICTOR

PLOT Y AGAINST FITTED VALUES

REMOVE OUTLIERS AND REFIT

Now refit first model from above: all variables

SUMMARY MODEL DIAGNOSTIC PLOTS

Plots show outliers so remove them and re-fit

Model using all remaining variables as a starting point

Diagnostics

SUMMARY MODEL DIAGNOSTIC PLOTS

PLOT STANDARDIZED RESIDUALS AGAINST EACH PREDICTOR

PLOT Y AGAINST FITTED VALUES

Now try same model but with FIELD_E transformed by Box-Cox recommended power transform

Diagnostics

SUMMARY MODEL DIAGNOSTIC PLOTS

PLOT STANDARDIZED RESIDUALS AGAINST EACH PREDICTOR

PLOT Y AGAINST FITTED VALUES

Model 2: Total Bases

Build a model with Total Bases added and all of the other hitting vars removed

Diagnostics

SUMMARY MODEL DIAGNOSTIC PLOTS

PLOT STANDARDIZED RESIDUALS AGAINST EACH PREDICTOR

PLOT Y AGAINST FITTED VALUES

REMOVE OUTLIERS AND REFIT

Now refit first model from above: all variables

Diagnostics

SUMMARY MODEL DIAGNOSTIC PLOTS

Plots show outliers so remove them and re-fit

Model using all remaining variables as a starting point

Diagnostics

SUMMARY MODEL DIAGNOSTIC PLOTS

PLOT STANDARDIZED RESIDUALS AGAINST EACH PREDICTOR

PLOT Y AGAINST FITTED VALUES

Now try same model but with FIELD_E transformed according to Box-Cox

Model using all remaining variables as a starting point

Diagnostics

SUMMARY MODEL DIAGNOSTIC PLOTS

PLOT STANDARDIZED RESIDUALS AGAINST EACH PREDICTOR

PLOT Y AGAINST FITTED VALUES

Model 3: Total Bases PLUS

Build a model with Total Bases + SB + BB added and all of the other hitting vars removed

Diagnostics

SUMMARY MODEL DIAGNOSTIC PLOTS

PLOT STANDARDIZED RESIDUALS AGAINST EACH PREDICTOR

PLOT Y AGAINST FITTED VALUES

REMOVE OUTLIERS AND REFIT

Now refit first model from above: all variables

Diagnostics

SUMMARY MODEL DIAGNOSTIC PLOTS

PLOT STANDARDIZED RESIDUALS AGAINST EACH PREDICTOR

PLOT Y AGAINST FITTED VALUES

Now try same model but with FIELD_E transformed using Box-Cox

Diagnostics

SUMMARY MODEL DIAGNOSTIC PLOTS

PLOT STANDARDIZED RESIDUALS AGAINST EACH PREDICTOR

PLOT Y AGAINST FITTED VALUES

Model 4: Sabermetrics Model