Moneyball Data Modeling

library(stringr)
library(tidyr)
library(dplyr)
library(ggplot2)
library(psych)
library(reshape)
library(corrgram)
library(mice)
library(caret)
library(e1071)

Data Exploration

Data Load

Exploring data by implementing statistical measures & plots

# Read the training data directly from the GitHub-hosted CSV.
moneyball_df <- read.csv("https://raw.githubusercontent.com/josephsimone/Data621/master/HW1/moneyball-training-data.csv")
# Number of rows in the training frame (count() with no grouping variables).
count(moneyball_df)
## # A tibble: 1 x 1
##       n
##   <int>
## 1  2276
# Column names: INDEX (presumably a row identifier), the TARGET_WINS response,
# and fifteen TEAM_* columns.
names(moneyball_df)
##  [1] "INDEX"            "TARGET_WINS"      "TEAM_BATTING_H"   "TEAM_BATTING_2B" 
##  [5] "TEAM_BATTING_3B"  "TEAM_BATTING_HR"  "TEAM_BATTING_BB"  "TEAM_BATTING_SO" 
##  [9] "TEAM_BASERUN_SB"  "TEAM_BASERUN_CS"  "TEAM_BATTING_HBP" "TEAM_PITCHING_H" 
## [13] "TEAM_PITCHING_HR" "TEAM_PITCHING_BB" "TEAM_PITCHING_SO" "TEAM_FIELDING_E" 
## [17] "TEAM_FIELDING_DP"
# Per-column quartiles and means; the NA's rows show which columns contain
# missing values and how many.
summary(moneyball_df)
##      INDEX         TARGET_WINS     TEAM_BATTING_H TEAM_BATTING_2B
##  Min.   :   1.0   Min.   :  0.00   Min.   : 891   Min.   : 69.0  
##  1st Qu.: 630.8   1st Qu.: 71.00   1st Qu.:1383   1st Qu.:208.0  
##  Median :1270.5   Median : 82.00   Median :1454   Median :238.0  
##  Mean   :1268.5   Mean   : 80.79   Mean   :1469   Mean   :241.2  
##  3rd Qu.:1915.5   3rd Qu.: 92.00   3rd Qu.:1537   3rd Qu.:273.0  
##  Max.   :2535.0   Max.   :146.00   Max.   :2554   Max.   :458.0  
##                                                                  
##  TEAM_BATTING_3B  TEAM_BATTING_HR  TEAM_BATTING_BB TEAM_BATTING_SO 
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.0   Min.   :   0.0  
##  1st Qu.: 34.00   1st Qu.: 42.00   1st Qu.:451.0   1st Qu.: 548.0  
##  Median : 47.00   Median :102.00   Median :512.0   Median : 750.0  
##  Mean   : 55.25   Mean   : 99.61   Mean   :501.6   Mean   : 735.6  
##  3rd Qu.: 72.00   3rd Qu.:147.00   3rd Qu.:580.0   3rd Qu.: 930.0  
##  Max.   :223.00   Max.   :264.00   Max.   :878.0   Max.   :1399.0  
##                                                    NA's   :102     
##  TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H
##  Min.   :  0.0   Min.   :  0.0   Min.   :29.00    Min.   : 1137  
##  1st Qu.: 66.0   1st Qu.: 38.0   1st Qu.:50.50    1st Qu.: 1419  
##  Median :101.0   Median : 49.0   Median :58.00    Median : 1518  
##  Mean   :124.8   Mean   : 52.8   Mean   :59.36    Mean   : 1779  
##  3rd Qu.:156.0   3rd Qu.: 62.0   3rd Qu.:67.00    3rd Qu.: 1682  
##  Max.   :697.0   Max.   :201.0   Max.   :95.00    Max.   :30132  
##  NA's   :131     NA's   :772     NA's   :2085                    
##  TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO  TEAM_FIELDING_E 
##  Min.   :  0.0    Min.   :   0.0   Min.   :    0.0   Min.   :  65.0  
##  1st Qu.: 50.0    1st Qu.: 476.0   1st Qu.:  615.0   1st Qu.: 127.0  
##  Median :107.0    Median : 536.5   Median :  813.5   Median : 159.0  
##  Mean   :105.7    Mean   : 553.0   Mean   :  817.7   Mean   : 246.5  
##  3rd Qu.:150.0    3rd Qu.: 611.0   3rd Qu.:  968.0   3rd Qu.: 249.2  
##  Max.   :343.0    Max.   :3645.0   Max.   :19278.0   Max.   :1898.0  
##                                    NA's   :102                       
##  TEAM_FIELDING_DP
##  Min.   : 52.0   
##  1st Qu.:131.0   
##  Median :149.0   
##  Mean   :146.4   
##  3rd Qu.:164.0   
##  Max.   :228.0   
##  NA's   :286

The MoneyBall Data-Set is made up of 17 elements, with 2276 distinct cases.

A significant number of variables contain ‘NA’ values (missing data points).

TEAM_BATTING_HBP has the highest number of ‘NA’ values.

Exploring Outliers

# Box plot of every column on a shared y axis. stack() turns the frame into
# long form (`values`, `ind`); the y axis is zoomed to 0-1000 without dropping
# points, and the x labels are tilted so the column names stay legible.
ggplot(data = stack(moneyball_df), mapping = aes(x = ind, y = values)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 1000)) +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.background = element_rect(fill = "grey")
  )

Skewness in the data

# Long-form copy of the training data (melt() with no id variables given),
# then one density panel per variable, each with its own axis scales.
moneyball_df1 <- melt(moneyball_df)
ggplot(data = moneyball_df1, mapping = aes(x = value)) +
  geom_density(fill = "blue") +
  facet_wrap(~ variable, scales = "free")

There are several variables that are skewed

In addition, there are some outliers.

Finding correlations:

# Copy of the training data without its first column (INDEX), so the
# identifier does not enter the correlation matrix.
moneyball_df2 <- moneyball_df[,-1 ]
names(moneyball_df2)
##  [1] "TARGET_WINS"      "TEAM_BATTING_H"   "TEAM_BATTING_2B"  "TEAM_BATTING_3B" 
##  [5] "TEAM_BATTING_HR"  "TEAM_BATTING_BB"  "TEAM_BATTING_SO"  "TEAM_BASERUN_SB" 
##  [9] "TEAM_BASERUN_CS"  "TEAM_BATTING_HBP" "TEAM_PITCHING_H"  "TEAM_PITCHING_HR"
## [13] "TEAM_PITCHING_BB" "TEAM_PITCHING_SO" "TEAM_FIELDING_E"  "TEAM_FIELDING_DP"
# Correlation matrix computed on complete rows only.
# NOTE(review): drop_na() with no column arguments removes every row that has
# an NA in any column. summary(moneyball_df) reports 2085 NAs in
# TEAM_BATTING_HBP out of 2276 rows, so at most 191 rows feed this matrix.
# Consider cor(moneyball_df2, use = "pairwise.complete.obs"), or dropping
# TEAM_BATTING_HBP first -- confirm before relying on these coefficients.
cor(drop_na(moneyball_df2))
##                  TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## TARGET_WINS       1.00000000     0.46994665      0.31298400     -0.12434586
## TEAM_BATTING_H    0.46994665     1.00000000      0.56177286      0.21391883
## TEAM_BATTING_2B   0.31298400     0.56177286      1.00000000      0.04203441
## TEAM_BATTING_3B  -0.12434586     0.21391883      0.04203441      1.00000000
## TEAM_BATTING_HR   0.42241683     0.39627593      0.25099045     -0.21879927
## TEAM_BATTING_BB   0.46868793     0.19735234      0.19749256     -0.20584392
## TEAM_BATTING_SO  -0.22889273    -0.34174328     -0.06415123     -0.19291841
## TEAM_BASERUN_SB   0.01483639     0.07167495     -0.18768279      0.16946086
## TEAM_BASERUN_CS  -0.17875598    -0.09377545     -0.20413884      0.23213978
## TEAM_BATTING_HBP  0.07350424    -0.02911218      0.04608475     -0.17424715
## TEAM_PITCHING_H   0.47123431     0.99919269      0.56045355      0.21250322
## TEAM_PITCHING_HR  0.42246683     0.39495630      0.24999875     -0.21973263
## TEAM_PITCHING_BB  0.46839882     0.19529071      0.19592157     -0.20675383
## TEAM_PITCHING_SO -0.22936481    -0.34445001     -0.06616615     -0.19386654
## TEAM_FIELDING_E  -0.38668800    -0.25381638     -0.19427027     -0.06513145
## TEAM_FIELDING_DP -0.19586601     0.01776946     -0.02488808      0.13314758
##                  TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO
## TARGET_WINS           0.42241683      0.46868793     -0.22889273
## TEAM_BATTING_H        0.39627593      0.19735234     -0.34174328
## TEAM_BATTING_2B       0.25099045      0.19749256     -0.06415123
## TEAM_BATTING_3B      -0.21879927     -0.20584392     -0.19291841
## TEAM_BATTING_HR       1.00000000      0.45638161      0.21045444
## TEAM_BATTING_BB       0.45638161      1.00000000      0.21833871
## TEAM_BATTING_SO       0.21045444      0.21833871      1.00000000
## TEAM_BASERUN_SB      -0.19021893     -0.08806123     -0.07475974
## TEAM_BASERUN_CS      -0.27579838     -0.20878051     -0.05613035
## TEAM_BATTING_HBP      0.10618116      0.04746007      0.22094219
## TEAM_PITCHING_H       0.39549390      0.19848687     -0.34145321
## TEAM_PITCHING_HR      0.99993259      0.45659283      0.21111617
## TEAM_PITCHING_BB      0.45542468      0.99988140      0.21895783
## TEAM_PITCHING_SO      0.20829574      0.21793253      0.99976835
## TEAM_FIELDING_E       0.01567397     -0.07847126      0.30814540
## TEAM_FIELDING_DP     -0.06182222     -0.07929078     -0.12319072
##                  TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP
## TARGET_WINS           0.01483639    -0.178755979       0.07350424
## TEAM_BATTING_H        0.07167495    -0.093775445      -0.02911218
## TEAM_BATTING_2B      -0.18768279    -0.204138837       0.04608475
## TEAM_BATTING_3B       0.16946086     0.232139777      -0.17424715
## TEAM_BATTING_HR      -0.19021893    -0.275798375       0.10618116
## TEAM_BATTING_BB      -0.08806123    -0.208780510       0.04746007
## TEAM_BATTING_SO      -0.07475974    -0.056130355       0.22094219
## TEAM_BASERUN_SB       1.00000000     0.624737808      -0.06400498
## TEAM_BASERUN_CS       0.62473781     1.000000000      -0.07051390
## TEAM_BATTING_HBP     -0.06400498    -0.070513896       1.00000000
## TEAM_PITCHING_H       0.07395373    -0.092977893      -0.02769699
## TEAM_PITCHING_HR     -0.18948057    -0.275471495       0.10675878
## TEAM_PITCHING_BB     -0.08741902    -0.208470154       0.04785137
## TEAM_PITCHING_SO     -0.07351325    -0.055308336       0.22157375
## TEAM_FIELDING_E       0.04292341     0.207701189       0.04178971
## TEAM_FIELDING_DP     -0.13023054    -0.006764233      -0.07120824
##                  TEAM_PITCHING_H TEAM_PITCHING_HR TEAM_PITCHING_BB
## TARGET_WINS           0.47123431       0.42246683       0.46839882
## TEAM_BATTING_H        0.99919269       0.39495630       0.19529071
## TEAM_BATTING_2B       0.56045355       0.24999875       0.19592157
## TEAM_BATTING_3B       0.21250322      -0.21973263      -0.20675383
## TEAM_BATTING_HR       0.39549390       0.99993259       0.45542468
## TEAM_BATTING_BB       0.19848687       0.45659283       0.99988140
## TEAM_BATTING_SO      -0.34145321       0.21111617       0.21895783
## TEAM_BASERUN_SB       0.07395373      -0.18948057      -0.08741902
## TEAM_BASERUN_CS      -0.09297789      -0.27547150      -0.20847015
## TEAM_BATTING_HBP     -0.02769699       0.10675878       0.04785137
## TEAM_PITCHING_H       1.00000000       0.39463199       0.19703302
## TEAM_PITCHING_HR      0.39463199       1.00000000       0.45580983
## TEAM_PITCHING_BB      0.19703302       0.45580983       1.00000000
## TEAM_PITCHING_SO     -0.34330646       0.20920115       0.21887700
## TEAM_FIELDING_E      -0.25073028       0.01689330      -0.07692315
## TEAM_FIELDING_DP      0.01416807      -0.06292475      -0.08040645
##                  TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## TARGET_WINS           -0.22936481     -0.38668800     -0.195866006
## TEAM_BATTING_H        -0.34445001     -0.25381638      0.017769456
## TEAM_BATTING_2B       -0.06616615     -0.19427027     -0.024888081
## TEAM_BATTING_3B       -0.19386654     -0.06513145      0.133147578
## TEAM_BATTING_HR        0.20829574      0.01567397     -0.061822219
## TEAM_BATTING_BB        0.21793253     -0.07847126     -0.079290775
## TEAM_BATTING_SO        0.99976835      0.30814540     -0.123190715
## TEAM_BASERUN_SB       -0.07351325      0.04292341     -0.130230537
## TEAM_BASERUN_CS       -0.05530834      0.20770119     -0.006764233
## TEAM_BATTING_HBP       0.22157375      0.04178971     -0.071208241
## TEAM_PITCHING_H       -0.34330646     -0.25073028      0.014168073
## TEAM_PITCHING_HR       0.20920115      0.01689330     -0.062924751
## TEAM_PITCHING_BB       0.21887700     -0.07692315     -0.080406452
## TEAM_PITCHING_SO       1.00000000      0.31008407     -0.124923213
## TEAM_FIELDING_E        0.31008407      1.00000000      0.040205814
## TEAM_FIELDING_DP      -0.12492321      0.04020581      1.000000000
# Scatter-plot matrices for the sixteen columns, drawn in two halves of eight
# columns each so the panels stay readable.
invisible(lapply(
  list(1:8, 9:16),
  function(column_block) pairs.panels(moneyball_df2[column_block])
))

There are both Positively and Negatively Correlated Variables in this Data-Set

Data Preparation

Removing variables

# Modelling frame: every column of the training data except INDEX.
moneyball_df_f <- moneyball_df[, names(moneyball_df) != "INDEX"]
names(moneyball_df_f)
##  [1] "TARGET_WINS"      "TEAM_BATTING_H"   "TEAM_BATTING_2B"  "TEAM_BATTING_3B" 
##  [5] "TEAM_BATTING_HR"  "TEAM_BATTING_BB"  "TEAM_BATTING_SO"  "TEAM_BASERUN_SB" 
##  [9] "TEAM_BASERUN_CS"  "TEAM_BATTING_HBP" "TEAM_PITCHING_H"  "TEAM_PITCHING_HR"
## [13] "TEAM_PITCHING_BB" "TEAM_PITCHING_SO" "TEAM_FIELDING_E"  "TEAM_FIELDING_DP"

The variable TEAM_BATTING_HBP is mostly made up of missing values

Therefore, this variable will be removed completely.

# Drop TEAM_BATTING_HBP by name rather than by position, so the removal does
# not silently hit a different column if the column order ever changes.
moneyball_df_f <- moneyball_df_f[, names(moneyball_df_f) != "TEAM_BATTING_HBP"]
names(moneyball_df_f)
##  [1] "TARGET_WINS"      "TEAM_BATTING_H"   "TEAM_BATTING_2B"  "TEAM_BATTING_3B" 
##  [5] "TEAM_BATTING_HR"  "TEAM_BATTING_BB"  "TEAM_BATTING_SO"  "TEAM_BASERUN_SB" 
##  [9] "TEAM_BASERUN_CS"  "TEAM_PITCHING_H"  "TEAM_PITCHING_HR" "TEAM_PITCHING_BB"
## [13] "TEAM_PITCHING_SO" "TEAM_FIELDING_E"  "TEAM_FIELDING_DP"

TEAM_PITCHING_HR and TEAM_BATTING_HR are highly correlated, therefore we will remove one of them.

# Drop TEAM_PITCHING_HR by name rather than by position; a positional index
# depends on which columns were removed earlier and in what order.
moneyball_df_f <- moneyball_df_f[, names(moneyball_df_f) != "TEAM_PITCHING_HR"]
names(moneyball_df_f)
##  [1] "TARGET_WINS"      "TEAM_BATTING_H"   "TEAM_BATTING_2B"  "TEAM_BATTING_3B" 
##  [5] "TEAM_BATTING_HR"  "TEAM_BATTING_BB"  "TEAM_BATTING_SO"  "TEAM_BASERUN_SB" 
##  [9] "TEAM_BASERUN_CS"  "TEAM_PITCHING_H"  "TEAM_PITCHING_BB" "TEAM_PITCHING_SO"
## [13] "TEAM_FIELDING_E"  "TEAM_FIELDING_DP"

Imputing the ‘NA’s’ values

# Multiple imputation by predictive mean matching: 5 imputed data sets, 5
# iterations each. The seed fixes mice's random draws so the imputed values,
# and every statistic computed from them, are reproducible between runs.
# NOTE(review): the recorded output below came from an unseeded run; re-knit
# to refresh it.
imputed_moneyball_df_Data <- mice(
  moneyball_df_f,
  m = 5,
  maxit = 5,
  method = "pmm",
  seed = 621
)
## 
##  iter imp variable
##   1   1  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   1   2  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   1   3  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   1   4  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   1   5  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   2   1  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   2   2  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   2   3  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   2   4  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   2   5  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   3   1  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   3   2  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   3   3  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   3   4  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   3   5  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   4   1  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   4   2  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   4   3  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   4   4  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   4   5  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   5   1  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   5   2  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   5   3  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   5   4  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   5   5  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
# Replace the mids object with its first completed data set (action = 1L is
# complete()'s default; the other four imputations are not used), then check
# that no NA counts remain in the summary.
imputed_moneyball_df_Data <- mice::complete(
  imputed_moneyball_df_Data,
  action = 1L
)
summary(imputed_moneyball_df_Data)
##   TARGET_WINS     TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B 
##  Min.   :  0.00   Min.   : 891   Min.   : 69.0   Min.   :  0.00  
##  1st Qu.: 71.00   1st Qu.:1383   1st Qu.:208.0   1st Qu.: 34.00  
##  Median : 82.00   Median :1454   Median :238.0   Median : 47.00  
##  Mean   : 80.79   Mean   :1469   Mean   :241.2   Mean   : 55.25  
##  3rd Qu.: 92.00   3rd Qu.:1537   3rd Qu.:273.0   3rd Qu.: 72.00  
##  Max.   :146.00   Max.   :2554   Max.   :458.0   Max.   :223.00  
##  TEAM_BATTING_HR  TEAM_BATTING_BB TEAM_BATTING_SO  TEAM_BASERUN_SB
##  Min.   :  0.00   Min.   :  0.0   Min.   :   0.0   Min.   :  0.0  
##  1st Qu.: 42.00   1st Qu.:451.0   1st Qu.: 545.0   1st Qu.: 67.0  
##  Median :102.00   Median :512.0   Median : 735.5   Median :106.0  
##  Mean   : 99.61   Mean   :501.6   Mean   : 728.6   Mean   :137.6  
##  3rd Qu.:147.00   3rd Qu.:580.0   3rd Qu.: 925.0   3rd Qu.:170.2  
##  Max.   :264.00   Max.   :878.0   Max.   :1399.0   Max.   :697.0  
##  TEAM_BASERUN_CS  TEAM_PITCHING_H TEAM_PITCHING_BB TEAM_PITCHING_SO 
##  Min.   :  0.00   Min.   : 1137   Min.   :   0.0   Min.   :    0.0  
##  1st Qu.: 43.00   1st Qu.: 1419   1st Qu.: 476.0   1st Qu.:  611.0  
##  Median : 57.00   Median : 1518   Median : 536.5   Median :  805.5  
##  Mean   : 75.38   Mean   : 1779   Mean   : 553.0   Mean   :  811.6  
##  3rd Qu.: 91.00   3rd Qu.: 1682   3rd Qu.: 611.0   3rd Qu.:  958.2  
##  Max.   :201.00   Max.   :30132   Max.   :3645.0   Max.   :19278.0  
##  TEAM_FIELDING_E  TEAM_FIELDING_DP
##  Min.   :  65.0   Min.   : 52.0   
##  1st Qu.: 127.0   1st Qu.:125.0   
##  Median : 159.0   Median :146.0   
##  Mean   : 246.5   Mean   :141.8   
##  3rd Qu.: 249.2   3rd Qu.:162.0   
##  Max.   :1898.0   Max.   :228.0

Scaling & Centering

# Estimate Box-Cox, centering and scaling parameters from the imputed
# training data.
mb <- preProcess(
  imputed_moneyball_df_Data,
  method = c("BoxCox", "center", "scale")
)
# Apply the transformation. Wrapping the result in data.frame(mb = ...)
# prefixes every column name with "mb.", which the model formulas rely on.
# Note that TARGET_WINS is transformed along with the predictors, so models
# fitted on this frame predict on the standardized scale.
moneyball_df_final <- data.frame(mb = predict(mb, imputed_moneyball_df_Data))
summary(moneyball_df_final)
##  mb.TARGET_WINS     mb.TEAM_BATTING_H   mb.TEAM_BATTING_2B mb.TEAM_BATTING_3B
##  Min.   :-5.12888   Min.   :-7.537074   Min.   :-4.48108   Min.   :-1.9776   
##  1st Qu.:-0.62156   1st Qu.:-0.573089   1st Qu.:-0.68949   1st Qu.:-0.7606   
##  Median : 0.07676   Median :-0.003988   Median :-0.03019   Median :-0.2953   
##  Mean   : 0.00000   Mean   : 0.000000   Mean   : 0.00000   Mean   : 0.0000   
##  3rd Qu.: 0.71159   3rd Qu.: 0.586908   3rd Qu.: 0.69827   3rd Qu.: 0.5995   
##  Max.   : 4.13970   Max.   : 4.390097   Max.   : 4.05391   Max.   : 6.0042   
##  mb.TEAM_BATTING_HR mb.TEAM_BATTING_BB mb.TEAM_BATTING_SO mb.TEAM_BASERUN_SB
##  Min.   :-1.64521   Min.   :-4.08866   Min.   :-2.95701   Min.   :-1.3047   
##  1st Qu.:-0.95153   1st Qu.:-0.41215   1st Qu.:-0.74525   1st Qu.:-0.6695   
##  Median : 0.03944   Median : 0.08511   Median : 0.02785   Median :-0.2997   
##  Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.0000   
##  3rd Qu.: 0.78267   3rd Qu.: 0.63944   3rd Qu.: 0.79689   3rd Qu.: 0.3094   
##  Max.   : 2.71505   Max.   : 3.06871   Max.   : 2.72051   Max.   : 5.3033   
##  mb.TEAM_BASERUN_CS mb.TEAM_PITCHING_H mb.TEAM_PITCHING_BB mb.TEAM_PITCHING_SO
##  Min.   :-1.5092    Min.   :-2.8556    Min.   :-3.32422    Min.   :-1.49634   
##  1st Qu.:-0.6483    1st Qu.:-0.6710    1st Qu.:-0.46291    1st Qu.:-0.36981   
##  Median :-0.3680    Median :-0.1765    Median :-0.09923    Median :-0.01121   
##  Mean   : 0.0000    Mean   : 0.0000    Mean   : 0.00000    Mean   : 0.00000   
##  3rd Qu.: 0.3128    3rd Qu.: 0.4602    3rd Qu.: 0.34860    3rd Qu.: 0.27042   
##  Max.   : 2.5152    Max.   : 3.2387    Max.   :18.58645    Max.   :34.04726   
##  mb.TEAM_FIELDING_E mb.TEAM_FIELDING_DP
##  Min.   :-3.3092    Min.   :-2.59195   
##  1st Qu.:-0.7163    1st Qu.:-0.61598   
##  Median :-0.1424    Median : 0.09239   
##  Mean   : 0.0000    Mean   : 0.00000   
##  3rd Qu.: 0.7096    3rd Qu.: 0.66785   
##  Max.   : 2.1432    Max.   : 3.33606
# Long-form copy of the transformed data, then one density panel per
# variable to inspect the distributions after Box-Cox/centering/scaling.
moneyball_df_final1 <- melt(moneyball_df_final)
ggplot(data = moneyball_df_final1, mapping = aes(x = value)) +
  geom_density(fill = "blue") +
  facet_wrap(~ variable, scales = "free")

Build Models

Model1

All Variables
# Model 1: regress the transformed wins on every other column of the frame.
model1 <- lm(
  formula = mb.TARGET_WINS ~ .,
  data = moneyball_df_final
)
summary(model1)
## 
## Call:
## lm(formula = mb.TARGET_WINS ~ ., data = moneyball_df_final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.5427 -0.5134 -0.0043  0.5188  3.9201 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.459e-11  1.691e-02   0.000   1.0000    
## mb.TEAM_BATTING_H    3.893e-01  3.628e-02  10.729  < 2e-16 ***
## mb.TEAM_BATTING_2B  -2.724e-02  2.738e-02  -0.995   0.3200    
## mb.TEAM_BATTING_3B   1.545e-01  2.985e-02   5.177 2.46e-07 ***
## mb.TEAM_BATTING_HR   2.400e-01  3.788e-02   6.336 2.83e-10 ***
## mb.TEAM_BATTING_BB   1.590e-01  3.434e-02   4.632 3.83e-06 ***
## mb.TEAM_BATTING_SO  -3.712e-01  4.049e-02  -9.167  < 2e-16 ***
## mb.TEAM_BASERUN_SB   2.692e-01  3.196e-02   8.422  < 2e-16 ***
## mb.TEAM_BASERUN_CS   8.049e-02  3.282e-02   2.452   0.0143 *  
## mb.TEAM_PITCHING_H  -1.796e-01  3.802e-02  -4.723 2.47e-06 ***
## mb.TEAM_PITCHING_BB -2.562e-02  3.257e-02  -0.787   0.4315    
## mb.TEAM_PITCHING_SO  1.242e-01  2.905e-02   4.273 2.01e-05 ***
## mb.TEAM_FIELDING_E  -4.990e-01  3.810e-02 -13.097  < 2e-16 ***
## mb.TEAM_FIELDING_DP -1.749e-01  2.310e-02  -7.571 5.35e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8066 on 2262 degrees of freedom
## Multiple R-squared:  0.3532, Adjusted R-squared:  0.3494 
## F-statistic:    95 on 13 and 2262 DF,  p-value: < 2.2e-16

Model2

Significant Variables
# Model 2: model 1 restricted to the predictors retained as significant.
# mb.TEAM_PITCHING_SO used to appear twice in the formula; lm() collapses
# repeated terms, so listing it once leaves the fit unchanged and only tidies
# the formula (and the Call echoed by summary()).
model2 <- lm(
  mb.TARGET_WINS ~ mb.TEAM_BATTING_H + mb.TEAM_BATTING_3B +
    mb.TEAM_BATTING_HR + mb.TEAM_BATTING_BB + mb.TEAM_BATTING_SO +
    mb.TEAM_BASERUN_SB + mb.TEAM_PITCHING_SO + mb.TEAM_PITCHING_H +
    mb.TEAM_FIELDING_E + mb.TEAM_FIELDING_DP,
  data = moneyball_df_final
)
summary(model2)
## 
## Call:
## lm(formula = mb.TARGET_WINS ~ mb.TEAM_BATTING_H + mb.TEAM_BATTING_3B + 
##     mb.TEAM_BATTING_HR + mb.TEAM_BATTING_BB + mb.TEAM_BATTING_SO + 
##     mb.TEAM_BASERUN_SB + mb.TEAM_PITCHING_SO + mb.TEAM_PITCHING_H + 
##     mb.TEAM_PITCHING_SO + mb.TEAM_FIELDING_E + mb.TEAM_FIELDING_DP, 
##     data = moneyball_df_final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7964 -0.5098  0.0067  0.5120  3.8462 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.572e-11  1.693e-02   0.000        1    
## mb.TEAM_BATTING_H    3.705e-01  3.028e-02  12.234  < 2e-16 ***
## mb.TEAM_BATTING_3B   1.693e-01  2.931e-02   5.777 8.67e-09 ***
## mb.TEAM_BATTING_HR   2.341e-01  3.752e-02   6.239 5.23e-10 ***
## mb.TEAM_BATTING_BB   1.385e-01  2.213e-02   6.259 4.61e-10 ***
## mb.TEAM_BATTING_SO  -3.729e-01  3.895e-02  -9.573  < 2e-16 ***
## mb.TEAM_BASERUN_SB   3.159e-01  2.628e-02  12.022  < 2e-16 ***
## mb.TEAM_PITCHING_SO  1.011e-01  2.174e-02   4.648 3.54e-06 ***
## mb.TEAM_PITCHING_H  -1.970e-01  3.498e-02  -5.631 2.01e-08 ***
## mb.TEAM_FIELDING_E  -4.852e-01  3.738e-02 -12.981  < 2e-16 ***
## mb.TEAM_FIELDING_DP -1.856e-01  2.275e-02  -8.155 5.72e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8075 on 2265 degrees of freedom
## Multiple R-squared:  0.3509, Adjusted R-squared:  0.348 
## F-statistic: 122.4 on 10 and 2265 DF,  p-value: < 2.2e-16

Model3

Reduction of Variables
  • TEAM_PITCHING_SO & TEAM_BATTING_SO are highly correlated

  • TEAM_BATTING_H & TEAM_PITCHING_H are also highly correlated

  • TEAM_BATTING_SO & TEAM_PITCHING_SO are also highly correlated

# Model 3: model 2 without the two pitching terms (mb.TEAM_PITCHING_SO and
# mb.TEAM_PITCHING_H), leaving eight predictors.
model3 <- lm(
  mb.TARGET_WINS ~ mb.TEAM_BATTING_H + mb.TEAM_BATTING_3B +
    mb.TEAM_BATTING_HR + mb.TEAM_BATTING_BB + mb.TEAM_BATTING_SO +
    mb.TEAM_BASERUN_SB + mb.TEAM_FIELDING_E + mb.TEAM_FIELDING_DP,
  data = moneyball_df_final
)
summary(model3)
## 
## Call:
## lm(formula = mb.TARGET_WINS ~ mb.TEAM_BATTING_H + mb.TEAM_BATTING_3B + 
##     mb.TEAM_BATTING_HR + mb.TEAM_BATTING_BB + mb.TEAM_BATTING_SO + 
##     mb.TEAM_BASERUN_SB + mb.TEAM_FIELDING_E + mb.TEAM_FIELDING_DP, 
##     data = moneyball_df_final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7794 -0.5153 -0.0075  0.5215  3.8388 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.324e-12  1.706e-02   0.000        1    
## mb.TEAM_BATTING_H    2.656e-01  2.451e-02  10.838  < 2e-16 ***
## mb.TEAM_BATTING_3B   1.752e-01  2.943e-02   5.952 3.05e-09 ***
## mb.TEAM_BATTING_HR   1.946e-01  3.720e-02   5.232 1.84e-07 ***
## mb.TEAM_BATTING_BB   1.739e-01  2.090e-02   8.321  < 2e-16 ***
## mb.TEAM_BATTING_SO  -2.693e-01  3.507e-02  -7.678 2.39e-14 ***
## mb.TEAM_BASERUN_SB   2.842e-01  2.516e-02  11.295  < 2e-16 ***
## mb.TEAM_FIELDING_E  -5.207e-01  3.641e-02 -14.301  < 2e-16 ***
## mb.TEAM_FIELDING_DP -1.857e-01  2.260e-02  -8.219 3.40e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8139 on 2267 degrees of freedom
## Multiple R-squared:  0.3399, Adjusted R-squared:  0.3376 
## F-statistic: 145.9 on 8 and 2267 DF,  p-value: < 2.2e-16

Select Model

# Re-print model 1's summary so the three fits can be compared in one place.
summary(model1)
## 
## Call:
## lm(formula = mb.TARGET_WINS ~ ., data = moneyball_df_final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.5427 -0.5134 -0.0043  0.5188  3.9201 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.459e-11  1.691e-02   0.000   1.0000    
## mb.TEAM_BATTING_H    3.893e-01  3.628e-02  10.729  < 2e-16 ***
## mb.TEAM_BATTING_2B  -2.724e-02  2.738e-02  -0.995   0.3200    
## mb.TEAM_BATTING_3B   1.545e-01  2.985e-02   5.177 2.46e-07 ***
## mb.TEAM_BATTING_HR   2.400e-01  3.788e-02   6.336 2.83e-10 ***
## mb.TEAM_BATTING_BB   1.590e-01  3.434e-02   4.632 3.83e-06 ***
## mb.TEAM_BATTING_SO  -3.712e-01  4.049e-02  -9.167  < 2e-16 ***
## mb.TEAM_BASERUN_SB   2.692e-01  3.196e-02   8.422  < 2e-16 ***
## mb.TEAM_BASERUN_CS   8.049e-02  3.282e-02   2.452   0.0143 *  
## mb.TEAM_PITCHING_H  -1.796e-01  3.802e-02  -4.723 2.47e-06 ***
## mb.TEAM_PITCHING_BB -2.562e-02  3.257e-02  -0.787   0.4315    
## mb.TEAM_PITCHING_SO  1.242e-01  2.905e-02   4.273 2.01e-05 ***
## mb.TEAM_FIELDING_E  -4.990e-01  3.810e-02 -13.097  < 2e-16 ***
## mb.TEAM_FIELDING_DP -1.749e-01  2.310e-02  -7.571 5.35e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8066 on 2262 degrees of freedom
## Multiple R-squared:  0.3532, Adjusted R-squared:  0.3494 
## F-statistic:    95 on 13 and 2262 DF,  p-value: < 2.2e-16
# Re-print model 2's summary for the comparison.
summary(model2)
## 
## Call:
## lm(formula = mb.TARGET_WINS ~ mb.TEAM_BATTING_H + mb.TEAM_BATTING_3B + 
##     mb.TEAM_BATTING_HR + mb.TEAM_BATTING_BB + mb.TEAM_BATTING_SO + 
##     mb.TEAM_BASERUN_SB + mb.TEAM_PITCHING_SO + mb.TEAM_PITCHING_H + 
##     mb.TEAM_PITCHING_SO + mb.TEAM_FIELDING_E + mb.TEAM_FIELDING_DP, 
##     data = moneyball_df_final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7964 -0.5098  0.0067  0.5120  3.8462 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.572e-11  1.693e-02   0.000        1    
## mb.TEAM_BATTING_H    3.705e-01  3.028e-02  12.234  < 2e-16 ***
## mb.TEAM_BATTING_3B   1.693e-01  2.931e-02   5.777 8.67e-09 ***
## mb.TEAM_BATTING_HR   2.341e-01  3.752e-02   6.239 5.23e-10 ***
## mb.TEAM_BATTING_BB   1.385e-01  2.213e-02   6.259 4.61e-10 ***
## mb.TEAM_BATTING_SO  -3.729e-01  3.895e-02  -9.573  < 2e-16 ***
## mb.TEAM_BASERUN_SB   3.159e-01  2.628e-02  12.022  < 2e-16 ***
## mb.TEAM_PITCHING_SO  1.011e-01  2.174e-02   4.648 3.54e-06 ***
## mb.TEAM_PITCHING_H  -1.970e-01  3.498e-02  -5.631 2.01e-08 ***
## mb.TEAM_FIELDING_E  -4.852e-01  3.738e-02 -12.981  < 2e-16 ***
## mb.TEAM_FIELDING_DP -1.856e-01  2.275e-02  -8.155 5.72e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8075 on 2265 degrees of freedom
## Multiple R-squared:  0.3509, Adjusted R-squared:  0.348 
## F-statistic: 122.4 on 10 and 2265 DF,  p-value: < 2.2e-16
# Re-print model 3's summary for the comparison.
summary(model3)
## 
## Call:
## lm(formula = mb.TARGET_WINS ~ mb.TEAM_BATTING_H + mb.TEAM_BATTING_3B + 
##     mb.TEAM_BATTING_HR + mb.TEAM_BATTING_BB + mb.TEAM_BATTING_SO + 
##     mb.TEAM_BASERUN_SB + mb.TEAM_FIELDING_E + mb.TEAM_FIELDING_DP, 
##     data = moneyball_df_final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7794 -0.5153 -0.0075  0.5215  3.8388 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.324e-12  1.706e-02   0.000        1    
## mb.TEAM_BATTING_H    2.656e-01  2.451e-02  10.838  < 2e-16 ***
## mb.TEAM_BATTING_3B   1.752e-01  2.943e-02   5.952 3.05e-09 ***
## mb.TEAM_BATTING_HR   1.946e-01  3.720e-02   5.232 1.84e-07 ***
## mb.TEAM_BATTING_BB   1.739e-01  2.090e-02   8.321  < 2e-16 ***
## mb.TEAM_BATTING_SO  -2.693e-01  3.507e-02  -7.678 2.39e-14 ***
## mb.TEAM_BASERUN_SB   2.842e-01  2.516e-02  11.295  < 2e-16 ***
## mb.TEAM_FIELDING_E  -5.207e-01  3.641e-02 -14.301  < 2e-16 ***
## mb.TEAM_FIELDING_DP -1.857e-01  2.260e-02  -8.219 3.40e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8139 on 2267 degrees of freedom
## Multiple R-squared:  0.3399, Adjusted R-squared:  0.3376 
## F-statistic: 145.9 on 8 and 2267 DF,  p-value: < 2.2e-16

After exploring these three models, model3 is the clear choice for the predictions.

This is due to the fact that this model is more parsimonious.

In addition, there is no substantial difference in \(R^2\), adjusted \(R^2\), and \(RMSE\) among the three models.

Predictive Values

# Read the evaluation data directly from the GitHub-hosted CSV.
# NOTE(review): "evaulated" is a misspelling of "evaluated"; the name is kept
# because later code refers to it.
moneyball_evaulated_df <- read.csv("https://raw.githubusercontent.com/josephsimone/Data621/master/HW1/moneyball-evaluation-data.csv")

Removing variables

# Drop the first column of the evaluation frame, mirroring the training
# preparation. The names() output shows there is no TARGET_WINS column here,
# so every remaining column sits one position earlier than in the training
# frame.
moneyball_evaulated_df_f <- moneyball_evaulated_df[, -1]
names(moneyball_evaulated_df_f)
##  [1] "TEAM_BATTING_H"   "TEAM_BATTING_2B"  "TEAM_BATTING_3B"  "TEAM_BATTING_HR" 
##  [5] "TEAM_BATTING_BB"  "TEAM_BATTING_SO"  "TEAM_BASERUN_SB"  "TEAM_BASERUN_CS" 
##  [9] "TEAM_BATTING_HBP" "TEAM_PITCHING_H"  "TEAM_PITCHING_HR" "TEAM_PITCHING_BB"
## [13] "TEAM_PITCHING_SO" "TEAM_FIELDING_E"  "TEAM_FIELDING_DP"
# Remove the same two variables that were removed from the training data,
# selecting them by name. The previous positional indices (-10, then -11) were
# copied from the training frame and, because of the one-column shift, removed
# TEAM_PITCHING_H and TEAM_PITCHING_BB instead while keeping TEAM_BATTING_HBP
# and TEAM_PITCHING_HR.
# NOTE(review): the recorded mice log and summary further down were produced
# before this fix and still mention TEAM_BATTING_HBP; re-knit to refresh them.
moneyball_evaulated_df_f <- moneyball_evaulated_df_f[
  , setdiff(names(moneyball_evaulated_df_f), "TEAM_BATTING_HBP")
]
names(moneyball_evaulated_df_f)
##  [1] "TEAM_BATTING_H"   "TEAM_BATTING_2B"  "TEAM_BATTING_3B"  "TEAM_BATTING_HR" 
##  [5] "TEAM_BATTING_BB"  "TEAM_BATTING_SO"  "TEAM_BASERUN_SB"  "TEAM_BASERUN_CS" 
##  [9] "TEAM_PITCHING_H"  "TEAM_PITCHING_HR" "TEAM_PITCHING_BB" "TEAM_PITCHING_SO"
## [13] "TEAM_FIELDING_E"  "TEAM_FIELDING_DP"
moneyball_evaulated_df_f <- moneyball_evaulated_df_f[
  , setdiff(names(moneyball_evaulated_df_f), "TEAM_PITCHING_HR")
]
names(moneyball_evaulated_df_f)
##  [1] "TEAM_BATTING_H"   "TEAM_BATTING_2B"  "TEAM_BATTING_3B"  "TEAM_BATTING_HR" 
##  [5] "TEAM_BATTING_BB"  "TEAM_BATTING_SO"  "TEAM_BASERUN_SB"  "TEAM_BASERUN_CS" 
##  [9] "TEAM_PITCHING_H"  "TEAM_PITCHING_BB" "TEAM_PITCHING_SO" "TEAM_FIELDING_E" 
## [13] "TEAM_FIELDING_DP"
Imputing ‘NA’s’ values
# Multiple imputation of the evaluation data with the same settings as the
# training data (predictive mean matching, 5 imputations, 5 iterations). The
# seed fixes mice's random draws so the imputed values are reproducible.
# NOTE(review): the recorded output below came from an unseeded run; re-knit
# to refresh it.
imputed_moneyball_evaulated_df_Data <- mice(
  moneyball_evaulated_df_f,
  m = 5,
  maxit = 5,
  method = "pmm",
  seed = 621
)
## 
##  iter imp variable
##   1   1  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   1   2  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   1   3  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   1   4  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   1   5  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   2   1  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   2   2  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   2   3  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   2   4  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   2   5  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   3   1  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   3   2  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   3   3  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   3   4  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   3   5  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   4   1  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   4   2  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   4   3  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   4   4  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   4   5  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   5   1  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   5   2  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   5   3  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   5   4  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   5   5  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
## Warning: Number of logged events: 25
# Swap the mids object for a plain data frame containing the first
# completed data set (action = 1L is complete()'s default; the other
# imputations are not used), then inspect the per-column distributions.
imputed_moneyball_evaulated_df_Data <- complete(
  imputed_moneyball_evaulated_df_Data,
  action = 1L
)
summary(imputed_moneyball_evaulated_df_Data)
##  TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B  TEAM_BATTING_HR 
##  Min.   : 819   Min.   : 44.0   Min.   : 14.00   Min.   :  0.00  
##  1st Qu.:1387   1st Qu.:210.0   1st Qu.: 35.00   1st Qu.: 44.50  
##  Median :1455   Median :239.0   Median : 52.00   Median :101.00  
##  Mean   :1469   Mean   :241.3   Mean   : 55.91   Mean   : 95.63  
##  3rd Qu.:1548   3rd Qu.:278.5   3rd Qu.: 72.00   3rd Qu.:135.50  
##  Max.   :2170   Max.   :376.0   Max.   :155.00   Max.   :242.00  
##  TEAM_BATTING_BB TEAM_BATTING_SO  TEAM_BASERUN_SB TEAM_BASERUN_CS 
##  Min.   : 15.0   Min.   :   0.0   Min.   :  0.0   Min.   :  0.00  
##  1st Qu.:436.5   1st Qu.: 527.0   1st Qu.: 59.5   1st Qu.: 41.00  
##  Median :509.0   Median : 675.0   Median : 96.0   Median : 56.00  
##  Mean   :499.0   Mean   : 698.0   Mean   :130.3   Mean   : 65.23  
##  3rd Qu.:565.5   3rd Qu.: 904.5   3rd Qu.:159.0   3rd Qu.: 76.00  
##  Max.   :792.0   Max.   :1268.0   Max.   :580.0   Max.   :154.00  
##  TEAM_BATTING_HBP TEAM_PITCHING_HR TEAM_PITCHING_SO TEAM_FIELDING_E 
##  Min.   :42.0     Min.   :  0.0    Min.   :   0.0   Min.   :  73.0  
##  1st Qu.:49.0     1st Qu.: 52.0    1st Qu.: 621.5   1st Qu.: 131.0  
##  Median :62.0     Median :104.0    Median : 758.0   Median : 163.0  
##  Mean   :61.4     Mean   :102.1    Mean   : 805.8   Mean   : 249.7  
##  3rd Qu.:67.0     3rd Qu.:142.5    3rd Qu.: 955.5   3rd Qu.: 252.0  
##  Max.   :96.0     Max.   :336.0    Max.   :9963.0   Max.   :1568.0  
##  TEAM_FIELDING_DP
##  Min.   : 69.0   
##  1st Qu.:123.0   
##  Median :146.0   
##  Mean   :140.7   
##  3rd Qu.:160.5   
##  Max.   :204.0

Box-Cox Transformation, Centering & Scaling

# Estimate a Box-Cox + centre + scale transformation on the imputed data
# and apply it to that same data.
# NOTE(review): the transformation parameters are estimated from this data
# set itself. If model3 was fitted on data transformed by a different
# preProcess object, applying that object here instead would keep both
# data sets on the same scale -- confirm against the training section.
mb <- preProcess(
  imputed_moneyball_evaulated_df_Data,
  method = c("BoxCox", "center", "scale")
)

# Wrapping the result as `mb = ...` prefixes every column name with "mb."
# (e.g. mb.TEAM_BATTING_H), as the summary below shows.
moneyball_evaulated_df_final <- data.frame(
  mb = predict(mb, newdata = imputed_moneyball_evaulated_df_Data)
)
summary(moneyball_evaulated_df_final)
##  mb.TEAM_BATTING_H  mb.TEAM_BATTING_2B mb.TEAM_BATTING_3B mb.TEAM_BATTING_HR
##  Min.   :-5.07603   Min.   :-3.26217   Min.   :-2.64215   Min.   :-1.69766  
##  1st Qu.:-0.52836   1st Qu.:-0.67016   1st Qu.:-0.73771   1st Qu.:-0.90771  
##  Median :-0.06571   Median :-0.09057   Median : 0.08513   Median : 0.09527  
##  Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.00000  
##  3rd Qu.: 0.54648   3rd Qu.: 0.74495   3rd Qu.: 0.76149   3rd Qu.: 0.70771  
##  Max.   : 4.16429   Max.   : 3.00896   Max.   : 2.35514   Max.   : 2.59828  
##  mb.TEAM_BATTING_BB  mb.TEAM_BATTING_SO mb.TEAM_BASERUN_SB mb.TEAM_BASERUN_CS
##  Min.   :-2.859388   Min.   :-2.91029   Min.   :-1.3132    Min.   :-1.8451   
##  1st Qu.:-0.619108   1st Qu.:-0.71287   1st Qu.:-0.7136    1st Qu.:-0.6854   
##  Median : 0.008141   Median :-0.09576   Median :-0.3457    Median :-0.2611   
##  Mean   : 0.000000   Mean   : 0.00000   Mean   : 0.0000    Mean   : 0.0000   
##  3rd Qu.: 0.536019   3rd Qu.: 0.86118   3rd Qu.: 0.2892    3rd Qu.: 0.3046   
##  Max.   : 2.968415   Max.   : 2.37686   Max.   : 4.5320    Max.   : 2.5109   
##  mb.TEAM_BATTING_HBP mb.TEAM_PITCHING_HR mb.TEAM_PITCHING_SO mb.TEAM_FIELDING_E
##  Min.   :-1.6759     Min.   :-1.77169    Min.   :-1.30863    Min.   :-3.1354   
##  1st Qu.:-0.8697     1st Qu.:-0.86977    1st Qu.:-0.29934    1st Qu.:-0.7317   
##  Median : 0.2258     Median : 0.03214    Median :-0.07767    Median :-0.1378   
##  Mean   : 0.0000     Mean   : 0.00000    Mean   : 0.00000    Mean   : 0.0000   
##  3rd Qu.: 0.5543     3rd Qu.: 0.69991    3rd Qu.: 0.24306    3rd Qu.: 0.7208   
##  Max.   : 1.8928     Max.   : 4.05609    Max.   :14.87078    Max.   : 2.0409   
##  mb.TEAM_FIELDING_DP
##  Min.   :-2.2272    
##  1st Qu.:-0.6576    
##  Median : 0.1361    
##  Mean   : 0.0000    
##  3rd Qu.: 0.6704    
##  Max.   : 2.4173
# Predict from model3 on the transformed data; interval = "prediction"
# returns the point estimate (fit) together with the lower and upper
# prediction bounds (lwr, upr) for every row.
# NOTE(review): the fitted values are centred on 0, so they appear to be on
# a standardised response scale rather than in wins -- confirm whether a
# back-transformation is needed before the results are reported.
eval_data <- predict(
  object = model3,
  newdata = moneyball_evaulated_df_final,
  interval = "prediction"
)
eval_data
##              fit           lwr        upr
## 1   -1.204949252 -2.804458e+00  0.3945591
## 2   -0.960662791 -2.558789e+00  0.6374631
## 3   -0.611825601 -2.209369e+00  0.9857180
## 4    0.315243395 -1.282535e+00  1.9130216
## 5   -1.400574520 -3.001878e+00  0.2007290
## 6   -0.915830796 -2.515180e+00  0.6835183
## 7    0.209721898 -1.390966e+00  1.8104099
## 8   -0.714386909 -2.312684e+00  0.8839099
## 9   -0.683470016 -2.282011e+00  0.9150705
## 10  -0.587747895 -2.185055e+00  1.0095596
## 11  -0.901102931 -2.500176e+00  0.6979703
## 12  -0.017203229 -1.616737e+00  1.5823303
## 13   0.082370396 -1.518493e+00  1.6832337
## 14   0.022760225 -1.576903e+00  1.6224232
## 15   0.335425224 -1.265790e+00  1.9366400
## 16  -0.464435055 -2.062919e+00  1.1340486
## 17  -0.746101161 -2.344771e+00  0.8525690
## 18  -0.093865927 -1.691478e+00  1.5037457
## 19  -0.654852196 -2.254144e+00  0.9444392
## 20   0.353391307 -1.245425e+00  1.9522076
## 21   0.307083399 -1.291443e+00  1.9056100
## 22   0.189559866 -1.408885e+00  1.7880046
## 23   0.045089922 -1.553121e+00  1.6433012
## 24  -0.700977855 -2.299208e+00  0.8972526
## 25   0.177069732 -1.421693e+00  1.7758323
## 26   0.516501772 -1.083281e+00  2.1162849
## 27  -0.819534057 -2.428090e+00  0.7890223
## 28  -0.570680125 -2.168184e+00  1.0268233
## 29   0.242472629 -1.356831e+00  1.8417765
## 30  -0.610275689 -2.209812e+00  0.9892606
## 31   0.650435535 -9.487651e-01  2.2496361
## 32   0.344455555 -1.253178e+00  1.9420887
## 33   0.319923113 -1.278839e+00  1.9186857
## 34   0.165864464 -1.435154e+00  1.7668826
## 35   0.001962482 -1.595967e+00  1.5998922
## 36   0.070321296 -1.530355e+00  1.6709978
## 37  -0.261584960 -1.858390e+00  1.3352204
## 38   0.446527868 -1.153785e+00  2.0468403
## 39   0.039786120 -1.558405e+00  1.6379773
## 40   0.388577787 -1.210615e+00  1.9877706
## 41   0.153183381 -1.445985e+00  1.7523518
## 42   1.307789523 -2.936399e-01  2.9092190
## 43  -1.362227914 -2.975466e+00  0.2510100
## 44   1.685970471  7.591232e-02  3.2960286
## 45   0.631644254 -9.696813e-01  2.2329698
## 46   0.974843737 -6.247700e-01  2.5744574
## 47   0.904958005 -6.942203e-01  2.5041363
## 48  -0.474228937 -2.072016e+00  1.1235581
## 49  -0.843909009 -2.441855e+00  0.7540372
## 50  -0.162722956 -1.760130e+00  1.4346841
## 51  -0.347470752 -1.945172e+00  1.2502300
## 52   0.203734119 -1.394618e+00  1.8020863
## 53  -0.486325851 -2.085150e+00  1.1124986
## 54  -0.276984481 -1.875754e+00  1.3217854
## 55  -0.644382552 -2.241967e+00  0.9532019
## 56   0.073781825 -1.524639e+00  1.6722028
## 57   0.692182885 -9.068607e-01  2.2912264
## 58  -0.399448709 -1.997650e+00  1.1987524
## 59  -1.111577693 -2.711064e+00  0.4879090
## 60  -0.150947760 -1.748314e+00  1.4464189
## 61   0.414223819 -1.183529e+00  2.0119767
## 62   0.071206388 -1.531446e+00  1.6738589
## 63   0.396545110 -1.201161e+00  1.9942515
## 64   0.344683250 -1.255873e+00  1.9452398
## 65   0.457843894 -1.142797e+00  2.0584850
## 66   1.407828122 -1.947443e-01  3.0104006
## 67  -0.685139799 -2.283567e+00  0.9132870
## 68  -0.410850605 -2.009679e+00  1.1879774
## 69  -0.238652659 -1.837033e+00  1.3597276
## 70   0.420890528 -1.178811e+00  2.0205920
## 71   0.226238277 -1.374008e+00  1.8264850
## 72  -0.313568691 -1.915409e+00  1.2882717
## 73  -0.147247801 -1.747368e+00  1.4528723
## 74   0.533641806 -1.067886e+00  2.1351694
## 75  -0.329006360 -1.928883e+00  1.2708706
## 76  -0.298921069 -1.898748e+00  1.3009062
## 77   0.447521716 -1.150385e+00  2.0454281
## 78   0.064247343 -1.533549e+00  1.6620439
## 79  -0.528964662 -2.126976e+00  1.0690469
## 80  -0.126860456 -1.724984e+00  1.4712627
## 81   0.177843549 -1.420055e+00  1.7757425
## 82   0.262094890 -1.335852e+00  1.8600420
## 83   0.735013652 -8.638824e-01  2.3339097
## 84  -0.464040624 -2.063716e+00  1.1356346
## 85   0.189911913 -1.408529e+00  1.7883531
## 86  -0.158523399 -1.758202e+00  1.4411556
## 87   0.186696603 -1.412458e+00  1.7858514
## 88   0.307068228 -1.289886e+00  1.9040225
## 89   0.824663038 -7.751827e-01  2.4245087
## 90   0.772613479 -8.256433e-01  2.3708703
## 91   0.088710315 -1.509525e+00  1.6869457
## 92   1.398044835 -2.050514e-01  3.0011411
## 93  -0.516150155 -2.113837e+00  1.0815363
## 94  -0.046145499 -1.644681e+00  1.5523901
## 95   0.040866453 -1.557384e+00  1.6391171
## 96   0.099863143 -1.498034e+00  1.6977607
## 97   0.593121615 -1.008003e+00  2.1942459
## 98   1.014913328 -5.857952e-01  2.6156218
## 99   0.384620116 -1.214687e+00  1.9839269
## 100  0.382599883 -1.217480e+00  1.9826796
## 101 -0.123456284 -1.721863e+00  1.4749507
## 102 -0.510013593 -2.107963e+00  1.0879355
## 103  0.221908606 -1.375131e+00  1.8189485
## 104  0.234158253 -1.364170e+00  1.8324865
## 105 -0.502703219 -2.103342e+00  1.0979360
## 106 -0.854520223 -2.454217e+00  0.7451761
## 107 -1.411016368 -3.014420e+00  0.1923873
## 108 -0.039509123 -1.638496e+00  1.5594777
## 109  0.752377824 -8.459513e-01  2.3507070
## 110 -1.410583645 -3.011998e+00  0.1908305
## 111  0.382649781 -1.214837e+00  1.9801370
## 112  0.475581209 -1.122409e+00  2.0735713
## 113  0.777688213 -8.197885e-01  2.3751649
## 114  0.704439188 -8.936622e-01  2.3025406
## 115  0.054414284 -1.543616e+00  1.6524446
## 116  0.053467774 -1.544295e+00  1.6512306
## 117  0.307900241 -1.291292e+00  1.9070928
## 118  0.124222283 -1.472799e+00  1.7212432
## 119 -0.450557512 -2.048904e+00  1.1477890
## 120  0.002019377 -1.597428e+00  1.6014669
## 121  0.965032016 -6.344775e-01  2.5645416
## 122 -0.801122885 -2.399704e+00  0.7974580
## 123 -0.862394820 -2.461039e+00  0.7362497
## 124 -1.222632138 -2.824774e+00  0.3795102
## 125 -0.840806374 -2.439626e+00  0.7580135
## 126  0.143141940 -1.455227e+00  1.7415107
## 127  0.280742968 -1.317981e+00  1.8794670
## 128 -0.402261503 -1.999917e+00  1.1953942
## 129  0.564002700 -1.034159e+00  2.1621640
## 130  0.390883791 -1.207809e+00  1.9895763
## 131  0.139791767 -1.458072e+00  1.7376551
## 132  0.145970949 -1.452803e+00  1.7447447
## 133 -0.675729323 -2.278628e+00  0.9271692
## 134 -0.087958962 -1.686797e+00  1.5108788
## 135  1.294347296 -3.095100e-01  2.8982046
## 136 -0.355234712 -1.954719e+00  1.2442498
## 137 -0.225545687 -1.823626e+00  1.3725344
## 138 -0.230338492 -1.827644e+00  1.3669668
## 139  1.217983788 -3.882200e-01  2.8241876
## 140 -0.042361791 -1.640111e+00  1.5553871
## 141 -1.277065157 -2.877153e+00  0.3230223
## 142 -0.618337709 -2.216868e+00  0.9801921
## 143  0.582300135 -1.016518e+00  2.1811187
## 144 -0.620175512 -2.218688e+00  0.9783365
## 145 -0.190834347 -1.789472e+00  1.4078033
## 146 -0.411376130 -2.008644e+00  1.1858913
## 147 -0.440422685 -2.038531e+00  1.1576857
## 148  0.048700576 -1.548894e+00  1.6462956
## 149 -0.080452432 -1.679419e+00  1.5185141
## 150  0.331455840 -1.266059e+00  1.9289711
## 151  0.134686524 -1.463962e+00  1.7333348
## 152  0.492004037 -1.108879e+00  2.0928874
## 153 -1.282374628 -2.893204e+00  0.3284546
## 154 -1.099684261 -2.698677e+00  0.4993083
## 155 -0.023012790 -1.621459e+00  1.5754331
## 156 -0.935779404 -2.534900e+00  0.6633414
## 157  0.847286252 -7.520373e-01  2.4466098
## 158 -0.365896728 -1.964675e+00  1.2328812
## 159  0.485855027 -1.112615e+00  2.0843249
## 160 -0.374704365 -1.972750e+00  1.2233411
## 161  1.112886482 -4.891293e-01  2.7149023
## 162  1.602568555 -1.058603e-05  3.2051477
## 163  0.913340147 -6.857612e-01  2.5124415
## 164  1.312776860 -2.898140e-01  2.9153677
## 165  1.021487822 -5.809024e-01  2.6238781
## 166  0.927866237 -6.729192e-01  2.5286516
## 167  0.182600977 -1.416352e+00  1.7815538
## 168  0.223829928 -1.375694e+00  1.8233538
## 169 -0.664015385 -2.262914e+00  0.9348837
## 170  0.012988040 -1.585627e+00  1.6116031
## 171  0.617110859 -9.812179e-01  2.2154396
## 172  0.463766418 -1.134094e+00  2.0616267
## 173  0.070242790 -1.527298e+00  1.6677837
## 174  0.731586309 -8.669259e-01  2.3300986
## 175 -0.047837090 -1.645300e+00  1.5496262
## 176 -0.159163251 -1.757694e+00  1.4393678
## 177  0.109916256 -1.489614e+00  1.7094468
## 178 -0.846834800 -2.445965e+00  0.7522957
## 179 -0.341663318 -1.938606e+00  1.2552793
## 180 -0.160431539 -1.757963e+00  1.4371004
## 181  0.576146282 -1.025595e+00  2.1778873
## 182  0.300814907 -1.298499e+00  1.9001289
## 183  0.496625123 -1.101872e+00  2.0951223
## 184  0.536163677 -1.062069e+00  2.1343967
## 185  1.126569818 -4.763403e-01  2.7294799
## 186  0.992519080 -6.139535e-01  2.5989916
## 187  0.471125638 -1.130288e+00  2.0725389
## 188 -0.626483875 -2.227077e+00  0.9741093
## 189 -0.734675402 -2.333922e+00  0.8645711
## 190  1.536828811 -6.688687e-02  3.1405445
## 191 -0.646892861 -2.244832e+00  0.9510464
## 192 -0.059709576 -1.657108e+00  1.5376887
## 193 -0.607002551 -2.204687e+00  0.9906817
## 194 -0.478316757 -2.076157e+00  1.1195231
## 195 -0.457334155 -2.056597e+00  1.1419283
## 196 -1.092909047 -2.692292e+00  0.5064738
## 197 -0.477634658 -2.075077e+00  1.1198077
## 198  0.758201166 -8.423250e-01  2.3587274
## 199  0.040765300 -1.556962e+00  1.6384923
## 200  0.249795731 -1.348183e+00  1.8477749
## 201 -0.619838453 -2.219870e+00  0.9801927
## 202  0.123013484 -1.475440e+00  1.7214674
## 203 -0.100223039 -1.700626e+00  1.5001797
## 204  0.666719884 -9.311730e-01  2.2646128
## 205  0.098524797 -1.499549e+00  1.6965981
## 206  0.206276827 -1.391607e+00  1.8041610
## 207  0.114884319 -1.483883e+00  1.7136514
## 208  0.163130153 -1.435516e+00  1.7617760
## 209  0.157874422 -1.440160e+00  1.7559086
## 210 -0.176263778 -1.775089e+00  1.4225616
## 211  1.415982632 -1.842105e-01  3.0161757
## 212  0.583673599 -1.015319e+00  2.1826664
## 213  0.010639253 -1.587865e+00  1.6091438
## 214 -1.186660955 -2.785261e+00  0.4119395
## 215 -0.788266563 -2.387646e+00  0.8111128
## 216  0.120957776 -1.476863e+00  1.7187783
## 217 -0.199861304 -1.800655e+00  1.4009321
## 218  0.705616997 -8.927049e-01  2.3039389
## 219 -0.182353112 -1.779851e+00  1.4151448
## 220  0.087041959 -1.510510e+00  1.6845944
## 221 -0.372337360 -1.970585e+00  1.2259104
## 222 -0.556379083 -2.155578e+00  1.0428194
## 223 -0.077922175 -1.675817e+00  1.5199726
## 224 -0.273885312 -1.874472e+00  1.3267009
## 225  0.279445439 -1.332344e+00  1.8912350
## 226 -0.182511395 -1.779925e+00  1.4149021
## 227 -0.153286864 -1.751005e+00  1.4444312
## 228 -0.187254840 -1.786223e+00  1.4117137
## 229  0.429882299 -1.167874e+00  2.0276391
## 230 -0.412827614 -2.012820e+00  1.1871648
## 231 -0.095918941 -1.695166e+00  1.5033285
## 232  0.578135191 -1.020050e+00  2.1763202
## 233  0.049422904 -1.549893e+00  1.6487391
## 234  0.247373323 -1.351775e+00  1.8465216
## 235 -0.244573958 -1.841988e+00  1.3528396
## 236 -0.381578152 -1.978804e+00  1.2156472
## 237 -0.337993406 -1.937842e+00  1.2618554
## 238  0.115072221 -1.483986e+00  1.7141309
## 239  0.765655889 -8.339884e-01  2.3653001
## 240 -0.696054978 -2.293920e+00  0.9018097
## 241  0.263451726 -1.334107e+00  1.8610101
## 242  0.749194083 -8.499319e-01  2.3483201
## 243  0.260973376 -1.337135e+00  1.8590816
## 244  0.203135153 -1.395375e+00  1.8016456
## 245 -1.469623213 -3.071486e+00  0.1322397
## 246  0.132129941 -1.466827e+00  1.7310867
## 247 -0.198780547 -1.796316e+00  1.3987553
## 248  0.182762808 -1.415215e+00  1.7807410
## 249 -0.350022186 -1.947869e+00  1.2478249
## 250  0.475623256 -1.125223e+00  2.0764698
## 251  0.176426003 -1.422044e+00  1.7748962
## 252 -0.908361510 -2.508597e+00  0.6918736
## 253  0.820909307 -7.791909e-01  2.4210095
## 254 -1.920043528 -3.538537e+00 -0.3015502
## 255 -0.793410224 -2.391392e+00  0.8045712
## 256 -0.336833182 -1.937025e+00  1.2633581
## 257  0.220844585 -1.377685e+00  1.8193744
## 258  0.117576791 -1.480076e+00  1.7152297
## 259 -0.344841627 -1.943495e+00  1.2538119
# Five-number summary and mean of the fit, lwr and upr columns.
summary(eval_data)
##       fit                lwr                upr         
##  Min.   :-1.92004   Min.   :-3.53854   Min.   :-0.3015  
##  1st Qu.:-0.41111   1st Qu.:-2.00916   1st Qu.: 1.1876  
##  Median : 0.05347   Median :-1.54429   Median : 1.6512  
##  Mean   : 0.00000   Mean   :-1.59937   Mean   : 1.5994  
##  3rd Qu.: 0.38262   3rd Qu.:-1.21616   3rd Qu.: 1.9814  
##  Max.   : 1.68597   Max.   : 0.07591   Max.   : 3.2960
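# Export is disabled; uncomment to save the predictions. The path below is
# specific to the author's machine and must be adjusted before use.
#write.csv(eval_data, "C:/Users/jpsim/Documents/Data621/Prediction.csv", row.names = FALSE)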