#HW #1 Assignment - Moneyball Model

Overview: In this homework assignment, you will explore, analyze and model a data set containing approximately 2200 records. Each record represents a professional baseball team from the years 1871 to 2006 inclusive. Each record has the performance of the team for the given year, with all of the statistics adjusted to match the performance of a 162 game season.

Your objective is to build a multiple linear regression model on the training data to predict the number of wins for the team. You can only use the variables given to you (or variables that you derive from the variables provided). Below is a short description of the variables of interest in the data set:

library(knitr)
library(stringr)
## Warning: package 'stringr' was built under R version 3.2.5
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.2.5
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.2.5
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.5
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(reshape)
## Warning: package 'reshape' was built under R version 3.2.5
## 
## Attaching package: 'reshape'
## The following object is masked from 'package:dplyr':
## 
##     rename
## The following objects are masked from 'package:tidyr':
## 
##     expand, smiths
library(mice)
## Warning: package 'mice' was built under R version 3.2.5
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.2.5
## 
## Attaching package: 'mice'
## The following object is masked from 'package:tidyr':
## 
##     complete
library(caret)
## Warning: package 'caret' was built under R version 3.2.5
## Warning: replacing previous import by 'plyr::ddply' when loading 'caret'
## Warning in as.POSIXlt.POSIXct(Sys.time()): unknown timezone 'zone/tz/2018c.
## 1.0/zoneinfo/America/New_York'
## Warning: replacing previous import by 'tidyr::%>%' when loading 'broom'
## Warning: replacing previous import by 'tidyr::gather' when loading 'broom'
## Warning: replacing previous import by 'tidyr::spread' when loading 'broom'
## Warning: replacing previous import by 'rlang::!!' when loading 'recipes'
## Warning: replacing previous import by 'rlang::expr' when loading 'recipes'
## Warning: replacing previous import by 'rlang::f_lhs' when loading 'recipes'
## Warning: replacing previous import by 'rlang::f_rhs' when loading 'recipes'
## Warning: replacing previous import by 'rlang::invoke' when loading
## 'recipes'
## Warning: replacing previous import by 'rlang::is_empty' when loading
## 'recipes'
## Warning: replacing previous import by 'rlang::lang' when loading 'recipes'
## Warning: replacing previous import by 'rlang::na_dbl' when loading
## 'recipes'
## Warning: replacing previous import by 'rlang::names2' when loading
## 'recipes'
## Warning: replacing previous import by 'rlang::quos' when loading 'recipes'
## Warning: replacing previous import by 'rlang::sym' when loading 'recipes'
## Warning: replacing previous import by 'rlang::syms' when loading 'recipes'
library(e1071)
## Warning: package 'e1071' was built under R version 3.2.5

#DATA EXPLORATION:

Load the data and understand the data by using some stats and plots.

# Training data, read straight from the course repository on GitHub.
mtd <- read.csv(
  file = "https://raw.githubusercontent.com/Riteshlohiya/Data621-Assignment-1/master/moneyball-training-data.csv"
)
# Number of records; dplyr returns it as a one-row tibble with column `n`.
dplyr::count(mtd)
## # A tibble: 1 x 1
##       n
##   <int>
## 1  2276
# List the column names of the training data.
names(mtd)
##  [1] "INDEX"            "TARGET_WINS"      "TEAM_BATTING_H"  
##  [4] "TEAM_BATTING_2B"  "TEAM_BATTING_3B"  "TEAM_BATTING_HR" 
##  [7] "TEAM_BATTING_BB"  "TEAM_BATTING_SO"  "TEAM_BASERUN_SB" 
## [10] "TEAM_BASERUN_CS"  "TEAM_BATTING_HBP" "TEAM_PITCHING_H" 
## [13] "TEAM_PITCHING_HR" "TEAM_PITCHING_BB" "TEAM_PITCHING_SO"
## [16] "TEAM_FIELDING_E"  "TEAM_FIELDING_DP"
# Per-column quartiles, mean and (where present) the number of NA values.
summary(mtd)
##      INDEX         TARGET_WINS     TEAM_BATTING_H TEAM_BATTING_2B
##  Min.   :   1.0   Min.   :  0.00   Min.   : 891   Min.   : 69.0  
##  1st Qu.: 630.8   1st Qu.: 71.00   1st Qu.:1383   1st Qu.:208.0  
##  Median :1270.5   Median : 82.00   Median :1454   Median :238.0  
##  Mean   :1268.5   Mean   : 80.79   Mean   :1469   Mean   :241.2  
##  3rd Qu.:1915.5   3rd Qu.: 92.00   3rd Qu.:1537   3rd Qu.:273.0  
##  Max.   :2535.0   Max.   :146.00   Max.   :2554   Max.   :458.0  
##                                                                  
##  TEAM_BATTING_3B  TEAM_BATTING_HR  TEAM_BATTING_BB TEAM_BATTING_SO 
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.0   Min.   :   0.0  
##  1st Qu.: 34.00   1st Qu.: 42.00   1st Qu.:451.0   1st Qu.: 548.0  
##  Median : 47.00   Median :102.00   Median :512.0   Median : 750.0  
##  Mean   : 55.25   Mean   : 99.61   Mean   :501.6   Mean   : 735.6  
##  3rd Qu.: 72.00   3rd Qu.:147.00   3rd Qu.:580.0   3rd Qu.: 930.0  
##  Max.   :223.00   Max.   :264.00   Max.   :878.0   Max.   :1399.0  
##                                                    NA's   :102     
##  TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H
##  Min.   :  0.0   Min.   :  0.0   Min.   :29.00    Min.   : 1137  
##  1st Qu.: 66.0   1st Qu.: 38.0   1st Qu.:50.50    1st Qu.: 1419  
##  Median :101.0   Median : 49.0   Median :58.00    Median : 1518  
##  Mean   :124.8   Mean   : 52.8   Mean   :59.36    Mean   : 1779  
##  3rd Qu.:156.0   3rd Qu.: 62.0   3rd Qu.:67.00    3rd Qu.: 1682  
##  Max.   :697.0   Max.   :201.0   Max.   :95.00    Max.   :30132  
##  NA's   :131     NA's   :772     NA's   :2085                    
##  TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO  TEAM_FIELDING_E 
##  Min.   :  0.0    Min.   :   0.0   Min.   :    0.0   Min.   :  65.0  
##  1st Qu.: 50.0    1st Qu.: 476.0   1st Qu.:  615.0   1st Qu.: 127.0  
##  Median :107.0    Median : 536.5   Median :  813.5   Median : 159.0  
##  Mean   :105.7    Mean   : 553.0   Mean   :  817.7   Mean   : 246.5  
##  3rd Qu.:150.0    3rd Qu.: 611.0   3rd Qu.:  968.0   3rd Qu.: 249.2  
##  Max.   :343.0    Max.   :3645.0   Max.   :19278.0   Max.   :1898.0  
##                                    NA's   :102                       
##  TEAM_FIELDING_DP
##  Min.   : 52.0   
##  1st Qu.:131.0   
##  Median :149.0   
##  Mean   :146.4   
##  3rd Qu.:164.0   
##  Max.   :228.0   
##  NA's   :286

The dataset consists of 17 variables and 2276 records. Several variables have missing (NA) values; TEAM_BATTING_HBP has the most, with 2085 NAs.

Checking for outliers:

# Side-by-side boxplots of every column to look for outliers. stack() puts
# the data frame into long form (columns `values` and `ind`). The y-axis is
# limited to 0-1000 with coord_cartesian(), which zooms the view without
# removing the observations outside that range.
ggplot(data = stack(mtd), mapping = aes(x = ind, y = values)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 1000)) +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.background = element_rect(fill = "grey")
  )
## Warning: Removed 3478 rows containing non-finite values (stat_boxplot).

Checking for skewness in the data

# Long form of the training data (`variable`, `value`), one row per cell.
mtd1 <- melt(mtd)
## Using  as id variables
# One density curve per column; the scales are free because the columns
# cover very different ranges.
ggplot(data = mtd1, mapping = aes(x = value)) +
  geom_density(fill = "red") +
  facet_wrap(~variable, scales = "free")
## Warning: Removed 3478 rows containing non-finite values (stat_density).

As seen there are several variables that are skewed and also there are outliers.

Finding correlations:

# Working copy without the first column (INDEX), which is a row identifier
# and not a predictor.
mtd2 <- mtd[-1]
colnames(mtd2)
##  [1] "TARGET_WINS"      "TEAM_BATTING_H"   "TEAM_BATTING_2B" 
##  [4] "TEAM_BATTING_3B"  "TEAM_BATTING_HR"  "TEAM_BATTING_BB" 
##  [7] "TEAM_BATTING_SO"  "TEAM_BASERUN_SB"  "TEAM_BASERUN_CS" 
## [10] "TEAM_BATTING_HBP" "TEAM_PITCHING_H"  "TEAM_PITCHING_HR"
## [13] "TEAM_PITCHING_BB" "TEAM_PITCHING_SO" "TEAM_FIELDING_E" 
## [16] "TEAM_FIELDING_DP"
# Pearson correlation matrix on complete cases only: drop_na() discards every
# row that has an NA in any column.
# NOTE(review): TEAM_BATTING_HBP alone is NA in 2085 of the 2276 rows (see
# summary(mtd)), so at most 191 rows feed this matrix. Consider
# cor(mtd2, use = "pairwise.complete.obs") if all records should contribute.
mtd2 %>%
  drop_na() %>%
  cor()
##                  TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B
## TARGET_WINS       1.00000000     0.46994665      0.31298400
## TEAM_BATTING_H    0.46994665     1.00000000      0.56177286
## TEAM_BATTING_2B   0.31298400     0.56177286      1.00000000
## TEAM_BATTING_3B  -0.12434586     0.21391883      0.04203441
## TEAM_BATTING_HR   0.42241683     0.39627593      0.25099045
## TEAM_BATTING_BB   0.46868793     0.19735234      0.19749256
## TEAM_BATTING_SO  -0.22889273    -0.34174328     -0.06415123
## TEAM_BASERUN_SB   0.01483639     0.07167495     -0.18768279
## TEAM_BASERUN_CS  -0.17875598    -0.09377545     -0.20413884
## TEAM_BATTING_HBP  0.07350424    -0.02911218      0.04608475
## TEAM_PITCHING_H   0.47123431     0.99919269      0.56045355
## TEAM_PITCHING_HR  0.42246683     0.39495630      0.24999875
## TEAM_PITCHING_BB  0.46839882     0.19529071      0.19592157
## TEAM_PITCHING_SO -0.22936481    -0.34445001     -0.06616615
## TEAM_FIELDING_E  -0.38668800    -0.25381638     -0.19427027
## TEAM_FIELDING_DP -0.19586601     0.01776946     -0.02488808
##                  TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB
## TARGET_WINS          -0.12434586      0.42241683      0.46868793
## TEAM_BATTING_H        0.21391883      0.39627593      0.19735234
## TEAM_BATTING_2B       0.04203441      0.25099045      0.19749256
## TEAM_BATTING_3B       1.00000000     -0.21879927     -0.20584392
## TEAM_BATTING_HR      -0.21879927      1.00000000      0.45638161
## TEAM_BATTING_BB      -0.20584392      0.45638161      1.00000000
## TEAM_BATTING_SO      -0.19291841      0.21045444      0.21833871
## TEAM_BASERUN_SB       0.16946086     -0.19021893     -0.08806123
## TEAM_BASERUN_CS       0.23213978     -0.27579838     -0.20878051
## TEAM_BATTING_HBP     -0.17424715      0.10618116      0.04746007
## TEAM_PITCHING_H       0.21250322      0.39549390      0.19848687
## TEAM_PITCHING_HR     -0.21973263      0.99993259      0.45659283
## TEAM_PITCHING_BB     -0.20675383      0.45542468      0.99988140
## TEAM_PITCHING_SO     -0.19386654      0.20829574      0.21793253
## TEAM_FIELDING_E      -0.06513145      0.01567397     -0.07847126
## TEAM_FIELDING_DP      0.13314758     -0.06182222     -0.07929078
##                  TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS
## TARGET_WINS          -0.22889273      0.01483639    -0.178755979
## TEAM_BATTING_H       -0.34174328      0.07167495    -0.093775445
## TEAM_BATTING_2B      -0.06415123     -0.18768279    -0.204138837
## TEAM_BATTING_3B      -0.19291841      0.16946086     0.232139777
## TEAM_BATTING_HR       0.21045444     -0.19021893    -0.275798375
## TEAM_BATTING_BB       0.21833871     -0.08806123    -0.208780510
## TEAM_BATTING_SO       1.00000000     -0.07475974    -0.056130355
## TEAM_BASERUN_SB      -0.07475974      1.00000000     0.624737808
## TEAM_BASERUN_CS      -0.05613035      0.62473781     1.000000000
## TEAM_BATTING_HBP      0.22094219     -0.06400498    -0.070513896
## TEAM_PITCHING_H      -0.34145321      0.07395373    -0.092977893
## TEAM_PITCHING_HR      0.21111617     -0.18948057    -0.275471495
## TEAM_PITCHING_BB      0.21895783     -0.08741902    -0.208470154
## TEAM_PITCHING_SO      0.99976835     -0.07351325    -0.055308336
## TEAM_FIELDING_E       0.30814540      0.04292341     0.207701189
## TEAM_FIELDING_DP     -0.12319072     -0.13023054    -0.006764233
##                  TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## TARGET_WINS            0.07350424      0.47123431       0.42246683
## TEAM_BATTING_H        -0.02911218      0.99919269       0.39495630
## TEAM_BATTING_2B        0.04608475      0.56045355       0.24999875
## TEAM_BATTING_3B       -0.17424715      0.21250322      -0.21973263
## TEAM_BATTING_HR        0.10618116      0.39549390       0.99993259
## TEAM_BATTING_BB        0.04746007      0.19848687       0.45659283
## TEAM_BATTING_SO        0.22094219     -0.34145321       0.21111617
## TEAM_BASERUN_SB       -0.06400498      0.07395373      -0.18948057
## TEAM_BASERUN_CS       -0.07051390     -0.09297789      -0.27547150
## TEAM_BATTING_HBP       1.00000000     -0.02769699       0.10675878
## TEAM_PITCHING_H       -0.02769699      1.00000000       0.39463199
## TEAM_PITCHING_HR       0.10675878      0.39463199       1.00000000
## TEAM_PITCHING_BB       0.04785137      0.19703302       0.45580983
## TEAM_PITCHING_SO       0.22157375     -0.34330646       0.20920115
## TEAM_FIELDING_E        0.04178971     -0.25073028       0.01689330
## TEAM_FIELDING_DP      -0.07120824      0.01416807      -0.06292475
##                  TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E
## TARGET_WINS            0.46839882      -0.22936481     -0.38668800
## TEAM_BATTING_H         0.19529071      -0.34445001     -0.25381638
## TEAM_BATTING_2B        0.19592157      -0.06616615     -0.19427027
## TEAM_BATTING_3B       -0.20675383      -0.19386654     -0.06513145
## TEAM_BATTING_HR        0.45542468       0.20829574      0.01567397
## TEAM_BATTING_BB        0.99988140       0.21793253     -0.07847126
## TEAM_BATTING_SO        0.21895783       0.99976835      0.30814540
## TEAM_BASERUN_SB       -0.08741902      -0.07351325      0.04292341
## TEAM_BASERUN_CS       -0.20847015      -0.05530834      0.20770119
## TEAM_BATTING_HBP       0.04785137       0.22157375      0.04178971
## TEAM_PITCHING_H        0.19703302      -0.34330646     -0.25073028
## TEAM_PITCHING_HR       0.45580983       0.20920115      0.01689330
## TEAM_PITCHING_BB       1.00000000       0.21887700     -0.07692315
## TEAM_PITCHING_SO       0.21887700       1.00000000      0.31008407
## TEAM_FIELDING_E       -0.07692315       0.31008407      1.00000000
## TEAM_FIELDING_DP      -0.08040645      -0.12492321      0.04020581
##                  TEAM_FIELDING_DP
## TARGET_WINS          -0.195866006
## TEAM_BATTING_H        0.017769456
## TEAM_BATTING_2B      -0.024888081
## TEAM_BATTING_3B       0.133147578
## TEAM_BATTING_HR      -0.061822219
## TEAM_BATTING_BB      -0.079290775
## TEAM_BATTING_SO      -0.123190715
## TEAM_BASERUN_SB      -0.130230537
## TEAM_BASERUN_CS      -0.006764233
## TEAM_BATTING_HBP     -0.071208241
## TEAM_PITCHING_H       0.014168073
## TEAM_PITCHING_HR     -0.062924751
## TEAM_PITCHING_BB     -0.080406452
## TEAM_PITCHING_SO     -0.124923213
## TEAM_FIELDING_E       0.040205814
## TEAM_FIELDING_DP      1.000000000
# Pairwise panels, drawn in two halves of eight columns each so the plots
# stay readable.
pairs.panels(mtd2[, 1:8])

pairs.panels(mtd2[, 9:16])

We can see there are some positively and some negatively correlated variables.

#DATA PREPARATION

Removing the variables:

# Modelling data: everything except the first column (INDEX).
mtd_f <- mtd[-1]
colnames(mtd_f)
##  [1] "TARGET_WINS"      "TEAM_BATTING_H"   "TEAM_BATTING_2B" 
##  [4] "TEAM_BATTING_3B"  "TEAM_BATTING_HR"  "TEAM_BATTING_BB" 
##  [7] "TEAM_BATTING_SO"  "TEAM_BASERUN_SB"  "TEAM_BASERUN_CS" 
## [10] "TEAM_BATTING_HBP" "TEAM_PITCHING_H"  "TEAM_PITCHING_HR"
## [13] "TEAM_PITCHING_BB" "TEAM_PITCHING_SO" "TEAM_FIELDING_E" 
## [16] "TEAM_FIELDING_DP"

TEAM_BATTING_HBP is missing for 2085 of the 2276 records (about 92%), so the variable is removed entirely.

# Drop TEAM_BATTING_HBP by name rather than by position (it is column 10
# here). A positional index silently removes a different column as soon as
# the layout changes, e.g. in a data set that has no TARGET_WINS column.
mtd_f <- mtd_f[, names(mtd_f) != "TEAM_BATTING_HBP"]
names(mtd_f)
##  [1] "TARGET_WINS"      "TEAM_BATTING_H"   "TEAM_BATTING_2B" 
##  [4] "TEAM_BATTING_3B"  "TEAM_BATTING_HR"  "TEAM_BATTING_BB" 
##  [7] "TEAM_BATTING_SO"  "TEAM_BASERUN_SB"  "TEAM_BASERUN_CS" 
## [10] "TEAM_PITCHING_H"  "TEAM_PITCHING_HR" "TEAM_PITCHING_BB"
## [13] "TEAM_PITCHING_SO" "TEAM_FIELDING_E"  "TEAM_FIELDING_DP"

TEAM_PITCHING_HR and TEAM_BATTING_HR are almost perfectly correlated (r ≈ 0.9999 in the correlation matrix above), so one of them is redundant; TEAM_PITCHING_HR is removed.

# Drop TEAM_PITCHING_HR by name rather than by position, so the same
# statement removes the same variable regardless of the column layout.
mtd_f <- mtd_f[, names(mtd_f) != "TEAM_PITCHING_HR"]
names(mtd_f)
##  [1] "TARGET_WINS"      "TEAM_BATTING_H"   "TEAM_BATTING_2B" 
##  [4] "TEAM_BATTING_3B"  "TEAM_BATTING_HR"  "TEAM_BATTING_BB" 
##  [7] "TEAM_BATTING_SO"  "TEAM_BASERUN_SB"  "TEAM_BASERUN_CS" 
## [10] "TEAM_PITCHING_H"  "TEAM_PITCHING_BB" "TEAM_PITCHING_SO"
## [13] "TEAM_FIELDING_E"  "TEAM_FIELDING_DP"

Imputing the NAs using Mice(pmm - predictive mean matching)

# Multiple imputation by predictive mean matching: 5 imputed data sets,
# 5 iterations each. The seed fixes mice's random draws so the imputed values
# (and every model fitted on them) can be reproduced from run to run.
imputed_mtd_Data <- mice(mtd_f, m = 5, maxit = 5, method = "pmm", seed = 500)
## 
##  iter imp variable
##   1   1  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   1   2  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   1   3  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   1   4  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   1   5  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   2   1  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   2   2  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   2   3  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   2   4  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   2   5  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   3   1  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   3   2  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   3   3  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   3   4  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   3   5  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   4   1  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   4   2  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   4   3  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   4   4  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   4   5  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   5   1  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   5   2  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   5   3  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   5   4  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   5   5  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_PITCHING_SO  TEAM_FIELDING_DP
# Extract one completed data frame from the mids object. Both tidyr and mice
# export complete(), so the call is namespace-qualified instead of relying on
# the order in which the packages were attached. `action = 1` (the default)
# takes the first of the five imputed data sets; the other four are not used.
imputed_mtd_Data <- mice::complete(imputed_mtd_Data, action = 1)
summary(imputed_mtd_Data)
##   TARGET_WINS     TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B 
##  Min.   :  0.00   Min.   : 891   Min.   : 69.0   Min.   :  0.00  
##  1st Qu.: 71.00   1st Qu.:1383   1st Qu.:208.0   1st Qu.: 34.00  
##  Median : 82.00   Median :1454   Median :238.0   Median : 47.00  
##  Mean   : 80.79   Mean   :1469   Mean   :241.2   Mean   : 55.25  
##  3rd Qu.: 92.00   3rd Qu.:1537   3rd Qu.:273.0   3rd Qu.: 72.00  
##  Max.   :146.00   Max.   :2554   Max.   :458.0   Max.   :223.00  
##  TEAM_BATTING_HR  TEAM_BATTING_BB TEAM_BATTING_SO  TEAM_BASERUN_SB
##  Min.   :  0.00   Min.   :  0.0   Min.   :   0.0   Min.   :  0.0  
##  1st Qu.: 42.00   1st Qu.:451.0   1st Qu.: 544.0   1st Qu.: 67.0  
##  Median :102.00   Median :512.0   Median : 733.0   Median :106.0  
##  Mean   : 99.61   Mean   :501.6   Mean   : 728.2   Mean   :135.1  
##  3rd Qu.:147.00   3rd Qu.:580.0   3rd Qu.: 925.0   3rd Qu.:170.0  
##  Max.   :264.00   Max.   :878.0   Max.   :1399.0   Max.   :697.0  
##  TEAM_BASERUN_CS  TEAM_PITCHING_H TEAM_PITCHING_BB TEAM_PITCHING_SO 
##  Min.   :  0.00   Min.   : 1137   Min.   :   0.0   Min.   :    0.0  
##  1st Qu.: 42.00   1st Qu.: 1419   1st Qu.: 476.0   1st Qu.:  617.0  
##  Median : 57.00   Median : 1518   Median : 536.5   Median :  807.0  
##  Mean   : 74.93   Mean   : 1779   Mean   : 553.0   Mean   :  813.6  
##  3rd Qu.: 89.00   3rd Qu.: 1682   3rd Qu.: 611.0   3rd Qu.:  958.2  
##  Max.   :201.00   Max.   :30132   Max.   :3645.0   Max.   :19278.0  
##  TEAM_FIELDING_E  TEAM_FIELDING_DP
##  Min.   :  65.0   Min.   : 52     
##  1st Qu.: 127.0   1st Qu.:125     
##  Median : 159.0   Median :146     
##  Mean   : 246.5   Mean   :142     
##  3rd Qu.: 249.2   3rd Qu.:162     
##  Max.   :1898.0   Max.   :228

A Box-Cox transformation, centering and scaling were requested for every column of the imputed data (including the response, TARGET_WINS) using preProcess from the caret library.

# Estimate the preprocessing (Box-Cox, centring, scaling) on the imputed
# training data. The object is named `t`, the same name as base::t().
t <- preProcess(imputed_mtd_Data, method = c("BoxCox", "center", "scale"))
# Apply it. Wrapping the result in data.frame(t = ...) prefixes every column
# name with "t." (e.g. t.TARGET_WINS), which the model formulas refer to.
mtd_final <- data.frame(t = predict(t, imputed_mtd_Data))

summary(mtd_final)
##  t.TARGET_WINS      t.TEAM_BATTING_H    t.TEAM_BATTING_2B 
##  Min.   :-5.12888   Min.   :-7.537074   Min.   :-4.48108  
##  1st Qu.:-0.62156   1st Qu.:-0.573089   1st Qu.:-0.68949  
##  Median : 0.07676   Median :-0.003988   Median :-0.03019  
##  Mean   : 0.00000   Mean   : 0.000000   Mean   : 0.00000  
##  3rd Qu.: 0.71159   3rd Qu.: 0.586908   3rd Qu.: 0.69827  
##  Max.   : 4.13970   Max.   : 4.390097   Max.   : 4.05391  
##  t.TEAM_BATTING_3B t.TEAM_BATTING_HR  t.TEAM_BATTING_BB 
##  Min.   :-1.9776   Min.   :-1.64521   Min.   :-4.08866  
##  1st Qu.:-0.7606   1st Qu.:-0.95153   1st Qu.:-0.41215  
##  Median :-0.2953   Median : 0.03944   Median : 0.08511  
##  Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.00000  
##  3rd Qu.: 0.5995   3rd Qu.: 0.78267   3rd Qu.: 0.63944  
##  Max.   : 6.0042   Max.   : 2.71505   Max.   : 3.06871  
##  t.TEAM_BATTING_SO  t.TEAM_BASERUN_SB t.TEAM_BASERUN_CS t.TEAM_PITCHING_H
##  Min.   :-2.95639   Min.   :-1.3564   Min.   :-1.5197   Min.   :-2.8556  
##  1st Qu.:-0.74796   1st Qu.:-0.6840   1st Qu.:-0.6678   1st Qu.:-0.6710  
##  Median : 0.01931   Median :-0.2925   Median :-0.3636   Median :-0.1765  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.79876   3rd Qu.: 0.3498   3rd Qu.: 0.2854   3rd Qu.: 0.4602  
##  Max.   : 2.72302   Max.   : 5.6391   Max.   : 2.5569   Max.   : 3.2387  
##  t.TEAM_PITCHING_BB t.TEAM_PITCHING_SO t.TEAM_FIELDING_E
##  Min.   :-3.32422   Min.   :-1.50192   Min.   :-3.3092  
##  1st Qu.:-0.46291   1st Qu.:-0.36293   1st Qu.:-0.7163  
##  Median :-0.09923   Median :-0.01219   Median :-0.1424  
##  Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.0000  
##  3rd Qu.: 0.34860   3rd Qu.: 0.26701   3rd Qu.: 0.7096  
##  Max.   :18.58645   Max.   :34.08525   Max.   : 2.1432  
##  t.TEAM_FIELDING_DP
##  Min.   :-2.6139   
##  1st Qu.:-0.6257   
##  Median : 0.0870   
##  Mean   : 0.0000   
##  3rd Qu.: 0.6660   
##  Max.   : 3.3506
# Long form of the transformed data, used to re-check the distributions.
mtd_final1 <- melt(mtd_final)
## Using  as id variables
# Density of every transformed column, each facet on its own scale.
ggplot(data = mtd_final1, mapping = aes(x = value)) +
  geom_density(fill = "red") +
  facet_wrap(~variable, scales = "free")

#BUILD MODELS:

Model1:

With all variables:

# Model 1: regress the transformed wins on every other column of mtd_final.
model1 <- lm(formula = t.TARGET_WINS ~ ., data = mtd_final)
summary(model1)
## 
## Call:
## lm(formula = t.TARGET_WINS ~ ., data = mtd_final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.5760 -0.5031 -0.0020  0.5213  3.5486 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         1.189e-11  1.711e-02   0.000 1.000000    
## t.TEAM_BATTING_H    3.977e-01  3.666e-02  10.851  < 2e-16 ***
## t.TEAM_BATTING_2B  -4.161e-02  2.763e-02  -1.506 0.132248    
## t.TEAM_BATTING_3B   1.842e-01  3.010e-02   6.121 1.09e-09 ***
## t.TEAM_BATTING_HR   2.141e-01  3.839e-02   5.578 2.72e-08 ***
## t.TEAM_BATTING_BB   1.568e-01  3.487e-02   4.497 7.23e-06 ***
## t.TEAM_BATTING_SO  -3.254e-01  4.033e-02  -8.067 1.16e-15 ***
## t.TEAM_BASERUN_SB   2.284e-01  3.196e-02   7.146 1.20e-12 ***
## t.TEAM_BASERUN_CS  -1.912e-02  3.420e-02  -0.559 0.576128    
## t.TEAM_PITCHING_H  -1.405e-01  3.808e-02  -3.691 0.000229 ***
## t.TEAM_PITCHING_BB -2.218e-02  3.307e-02  -0.671 0.502582    
## t.TEAM_PITCHING_SO  1.114e-01  2.925e-02   3.807 0.000144 ***
## t.TEAM_FIELDING_E  -4.698e-01  3.859e-02 -12.174  < 2e-16 ***
## t.TEAM_FIELDING_DP -2.248e-01  2.328e-02  -9.656  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8162 on 2262 degrees of freedom
## Multiple R-squared:  0.3376, Adjusted R-squared:  0.3338 
## F-statistic: 88.69 on 13 and 2262 DF,  p-value: < 2.2e-16

Model2:

With only the significant variables:

# Model 2: keep only the predictors that were significant in model 1
# (TEAM_BATTING_2B, TEAM_BASERUN_CS and TEAM_PITCHING_BB are left out).
# Each predictor is listed exactly once. An earlier version named
# t.TEAM_PITCHING_SO twice, which R collapses into a single term, so the
# fitted coefficients are the same either way.
model2 <- lm(
  t.TARGET_WINS ~ t.TEAM_BATTING_H + t.TEAM_BATTING_3B + t.TEAM_BATTING_HR +
    t.TEAM_BATTING_BB + t.TEAM_BATTING_SO + t.TEAM_BASERUN_SB +
    t.TEAM_PITCHING_SO + t.TEAM_PITCHING_H +
    t.TEAM_FIELDING_E + t.TEAM_FIELDING_DP,
  data = mtd_final
)
summary(model2)
## 
## Call:
## lm(formula = t.TARGET_WINS ~ t.TEAM_BATTING_H + t.TEAM_BATTING_3B + 
##     t.TEAM_BATTING_HR + t.TEAM_BATTING_BB + t.TEAM_BATTING_SO + 
##     t.TEAM_BASERUN_SB + t.TEAM_PITCHING_SO + t.TEAM_PITCHING_H + 
##     t.TEAM_PITCHING_SO + t.TEAM_FIELDING_E + t.TEAM_FIELDING_DP, 
##     data = mtd_final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7306 -0.5073 -0.0023  0.5243  3.5123 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         1.282e-11  1.711e-02   0.000        1    
## t.TEAM_BATTING_H    3.723e-01  3.060e-02  12.166  < 2e-16 ***
## t.TEAM_BATTING_3B   1.829e-01  2.963e-02   6.175 7.83e-10 ***
## t.TEAM_BATTING_HR   2.221e-01  3.792e-02   5.856 5.42e-09 ***
## t.TEAM_BATTING_BB   1.351e-01  2.243e-02   6.025 1.97e-09 ***
## t.TEAM_BATTING_SO  -3.318e-01  3.877e-02  -8.558  < 2e-16 ***
## t.TEAM_BASERUN_SB   2.200e-01  2.571e-02   8.557  < 2e-16 ***
## t.TEAM_PITCHING_SO  9.408e-02  2.180e-02   4.316 1.66e-05 ***
## t.TEAM_PITCHING_H  -1.556e-01  3.491e-02  -4.457 8.71e-06 ***
## t.TEAM_FIELDING_E  -4.619e-01  3.764e-02 -12.269  < 2e-16 ***
## t.TEAM_FIELDING_DP -2.268e-01  2.287e-02  -9.916  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8162 on 2265 degrees of freedom
## Multiple R-squared:  0.3368, Adjusted R-squared:  0.3339 
## F-statistic:   115 on 10 and 2265 DF,  p-value: < 2.2e-16

Model3:

Further reducing the variables: TEAM_PITCHING_SO is highly correlated with TEAM_BATTING_SO, and TEAM_PITCHING_H is highly correlated with TEAM_BATTING_H, so the two pitching variables are dropped:

# Model 3: model 2 without the two pitching variables
# (t.TEAM_PITCHING_SO and t.TEAM_PITCHING_H).
model3 <- lm(
  formula = t.TARGET_WINS ~ t.TEAM_BATTING_H + t.TEAM_BATTING_3B +
    t.TEAM_BATTING_HR + t.TEAM_BATTING_BB + t.TEAM_BATTING_SO +
    t.TEAM_BASERUN_SB + t.TEAM_FIELDING_E + t.TEAM_FIELDING_DP,
  data = mtd_final
)
summary(model3)
## 
## Call:
## lm(formula = t.TARGET_WINS ~ t.TEAM_BATTING_H + t.TEAM_BATTING_3B + 
##     t.TEAM_BATTING_HR + t.TEAM_BATTING_BB + t.TEAM_BATTING_SO + 
##     t.TEAM_BASERUN_SB + t.TEAM_FIELDING_E + t.TEAM_FIELDING_DP, 
##     data = mtd_final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.6903 -0.5124  0.0009  0.5278  4.0952 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         1.432e-12  1.720e-02   0.000        1    
## t.TEAM_BATTING_H    2.869e-01  2.454e-02  11.692  < 2e-16 ***
## t.TEAM_BATTING_3B   1.858e-01  2.968e-02   6.259 4.61e-10 ***
## t.TEAM_BATTING_HR   1.917e-01  3.757e-02   5.103 3.62e-07 ***
## t.TEAM_BATTING_BB   1.625e-01  2.106e-02   7.719 1.74e-14 ***
## t.TEAM_BATTING_SO  -2.454e-01  3.508e-02  -6.996 3.45e-12 ***
## t.TEAM_BASERUN_SB   2.001e-01  2.489e-02   8.041 1.42e-15 ***
## t.TEAM_FIELDING_E  -4.901e-01  3.647e-02 -13.439  < 2e-16 ***
## t.TEAM_FIELDING_DP -2.275e-01  2.286e-02  -9.953  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8207 on 2267 degrees of freedom
## Multiple R-squared:  0.3288, Adjusted R-squared:  0.3265 
## F-statistic: 138.8 on 8 and 2267 DF,  p-value: < 2.2e-16

#SELECT MODELS AND PREDICTION:

# Fit summary of model 1 (all predictors), repeated here for comparison.
summary(model1)
## 
## Call:
## lm(formula = t.TARGET_WINS ~ ., data = mtd_final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.5760 -0.5031 -0.0020  0.5213  3.5486 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         1.189e-11  1.711e-02   0.000 1.000000    
## t.TEAM_BATTING_H    3.977e-01  3.666e-02  10.851  < 2e-16 ***
## t.TEAM_BATTING_2B  -4.161e-02  2.763e-02  -1.506 0.132248    
## t.TEAM_BATTING_3B   1.842e-01  3.010e-02   6.121 1.09e-09 ***
## t.TEAM_BATTING_HR   2.141e-01  3.839e-02   5.578 2.72e-08 ***
## t.TEAM_BATTING_BB   1.568e-01  3.487e-02   4.497 7.23e-06 ***
## t.TEAM_BATTING_SO  -3.254e-01  4.033e-02  -8.067 1.16e-15 ***
## t.TEAM_BASERUN_SB   2.284e-01  3.196e-02   7.146 1.20e-12 ***
## t.TEAM_BASERUN_CS  -1.912e-02  3.420e-02  -0.559 0.576128    
## t.TEAM_PITCHING_H  -1.405e-01  3.808e-02  -3.691 0.000229 ***
## t.TEAM_PITCHING_BB -2.218e-02  3.307e-02  -0.671 0.502582    
## t.TEAM_PITCHING_SO  1.114e-01  2.925e-02   3.807 0.000144 ***
## t.TEAM_FIELDING_E  -4.698e-01  3.859e-02 -12.174  < 2e-16 ***
## t.TEAM_FIELDING_DP -2.248e-01  2.328e-02  -9.656  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8162 on 2262 degrees of freedom
## Multiple R-squared:  0.3376, Adjusted R-squared:  0.3338 
## F-statistic: 88.69 on 13 and 2262 DF,  p-value: < 2.2e-16
# Fit summary of model 2 (significant predictors only).
summary(model2)
## 
## Call:
## lm(formula = t.TARGET_WINS ~ t.TEAM_BATTING_H + t.TEAM_BATTING_3B + 
##     t.TEAM_BATTING_HR + t.TEAM_BATTING_BB + t.TEAM_BATTING_SO + 
##     t.TEAM_BASERUN_SB + t.TEAM_PITCHING_SO + t.TEAM_PITCHING_H + 
##     t.TEAM_PITCHING_SO + t.TEAM_FIELDING_E + t.TEAM_FIELDING_DP, 
##     data = mtd_final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7306 -0.5073 -0.0023  0.5243  3.5123 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         1.282e-11  1.711e-02   0.000        1    
## t.TEAM_BATTING_H    3.723e-01  3.060e-02  12.166  < 2e-16 ***
## t.TEAM_BATTING_3B   1.829e-01  2.963e-02   6.175 7.83e-10 ***
## t.TEAM_BATTING_HR   2.221e-01  3.792e-02   5.856 5.42e-09 ***
## t.TEAM_BATTING_BB   1.351e-01  2.243e-02   6.025 1.97e-09 ***
## t.TEAM_BATTING_SO  -3.318e-01  3.877e-02  -8.558  < 2e-16 ***
## t.TEAM_BASERUN_SB   2.200e-01  2.571e-02   8.557  < 2e-16 ***
## t.TEAM_PITCHING_SO  9.408e-02  2.180e-02   4.316 1.66e-05 ***
## t.TEAM_PITCHING_H  -1.556e-01  3.491e-02  -4.457 8.71e-06 ***
## t.TEAM_FIELDING_E  -4.619e-01  3.764e-02 -12.269  < 2e-16 ***
## t.TEAM_FIELDING_DP -2.268e-01  2.287e-02  -9.916  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8162 on 2265 degrees of freedom
## Multiple R-squared:  0.3368, Adjusted R-squared:  0.3339 
## F-statistic:   115 on 10 and 2265 DF,  p-value: < 2.2e-16
# Fit summary of model 3 (model 2 without the two pitching variables).
summary(model3)
## 
## Call:
## lm(formula = t.TARGET_WINS ~ t.TEAM_BATTING_H + t.TEAM_BATTING_3B + 
##     t.TEAM_BATTING_HR + t.TEAM_BATTING_BB + t.TEAM_BATTING_SO + 
##     t.TEAM_BASERUN_SB + t.TEAM_FIELDING_E + t.TEAM_FIELDING_DP, 
##     data = mtd_final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.6903 -0.5124  0.0009  0.5278  4.0952 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         1.432e-12  1.720e-02   0.000        1    
## t.TEAM_BATTING_H    2.869e-01  2.454e-02  11.692  < 2e-16 ***
## t.TEAM_BATTING_3B   1.858e-01  2.968e-02   6.259 4.61e-10 ***
## t.TEAM_BATTING_HR   1.917e-01  3.757e-02   5.103 3.62e-07 ***
## t.TEAM_BATTING_BB   1.625e-01  2.106e-02   7.719 1.74e-14 ***
## t.TEAM_BATTING_SO  -2.454e-01  3.508e-02  -6.996 3.45e-12 ***
## t.TEAM_BASERUN_SB   2.001e-01  2.489e-02   8.041 1.42e-15 ***
## t.TEAM_FIELDING_E  -4.901e-01  3.647e-02 -13.439  < 2e-16 ***
## t.TEAM_FIELDING_DP -2.275e-01  2.286e-02  -9.953  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8207 on 2267 degrees of freedom
## Multiple R-squared:  0.3288, Adjusted R-squared:  0.3265 
## F-statistic: 138.8 on 8 and 2267 DF,  p-value: < 2.2e-16

From the three models, I decided to use model3 for the predictions because it is the most parsimonious. R², adjusted R² and the residual standard error (0.8207 versus 0.8162) are nearly unchanged after the predictors were removed to treat multicollinearity.

#PREDICTION:

The evaluation dataset goes through all of the same preprocessing steps.

# Read the evaluation data set straight from its GitHub URL.
med_url <- "https://raw.githubusercontent.com/Riteshlohiya/Data621-Assignment-1/master/moneyball-evaluation-data.csv"
med <- read.csv(file = med_url)

Removing the first column and the variables TEAM_PITCHING_H and TEAM_PITCHING_BB:

# Drop the first column of the evaluation data; every other column is kept.
med_f <- med[-1L]
# List the column names that remain.
colnames(med_f)
##  [1] "TEAM_BATTING_H"   "TEAM_BATTING_2B"  "TEAM_BATTING_3B" 
##  [4] "TEAM_BATTING_HR"  "TEAM_BATTING_BB"  "TEAM_BATTING_SO" 
##  [7] "TEAM_BASERUN_SB"  "TEAM_BASERUN_CS"  "TEAM_BATTING_HBP"
## [10] "TEAM_PITCHING_H"  "TEAM_PITCHING_HR" "TEAM_PITCHING_BB"
## [13] "TEAM_PITCHING_SO" "TEAM_FIELDING_E"  "TEAM_FIELDING_DP"
# Drop TEAM_PITCHING_H by name instead of by position (it is column 10 of
# med_f at this point). Selecting by name keeps removing the intended variable
# if the column order changes, and re-running the step cannot remove a
# different column by accident.
# NOTE(review): column positions differ between data sets with and without a
# response column - confirm this is the same variable that was removed from
# the training data.
med_f <- med_f[, names(med_f) != "TEAM_PITCHING_H", drop = FALSE]
names(med_f)
##  [1] "TEAM_BATTING_H"   "TEAM_BATTING_2B"  "TEAM_BATTING_3B" 
##  [4] "TEAM_BATTING_HR"  "TEAM_BATTING_BB"  "TEAM_BATTING_SO" 
##  [7] "TEAM_BASERUN_SB"  "TEAM_BASERUN_CS"  "TEAM_BATTING_HBP"
## [10] "TEAM_PITCHING_HR" "TEAM_PITCHING_BB" "TEAM_PITCHING_SO"
## [13] "TEAM_FIELDING_E"  "TEAM_FIELDING_DP"
# Drop TEAM_PITCHING_BB by name instead of by position (it is column 11 of
# med_f at this point). Selecting by name keeps removing the intended variable
# if the column order changes, and re-running the step cannot remove a
# different column by accident.
# NOTE(review): column positions differ between data sets with and without a
# response column - confirm this is the same variable that was removed from
# the training data.
med_f <- med_f[, names(med_f) != "TEAM_PITCHING_BB", drop = FALSE]
names(med_f)
##  [1] "TEAM_BATTING_H"   "TEAM_BATTING_2B"  "TEAM_BATTING_3B" 
##  [4] "TEAM_BATTING_HR"  "TEAM_BATTING_BB"  "TEAM_BATTING_SO" 
##  [7] "TEAM_BASERUN_SB"  "TEAM_BASERUN_CS"  "TEAM_BATTING_HBP"
## [10] "TEAM_PITCHING_HR" "TEAM_PITCHING_SO" "TEAM_FIELDING_E" 
## [13] "TEAM_FIELDING_DP"

Imputing the NAs using mice (pmm: predictive mean matching).

# Impute the missing values with predictive mean matching: 5 imputed data
# sets, 5 iterations each. The fixed seed makes the stochastic imputation, and
# everything computed from it afterwards, reproducible between runs.
imputed_med_Data <- mice(med_f, m = 5, maxit = 5, method = "pmm", seed = 500)
## 
##  iter imp variable
##   1   1  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   1   2  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   1   3  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   1   4  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   1   5  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   2   1  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   2   2  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   2   3  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   2   4  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   2   5  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   3   1  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   3   2  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   3   3  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   3   4  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   3   5  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   4   1  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   4   2  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   4   3  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   4   4  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   4   5  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   5   1  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   5   2  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   5   3  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   5   4  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   5   5  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_BASERUN_CS  TEAM_BATTING_HBP  TEAM_PITCHING_SO  TEAM_FIELDING_DP
# Replace the mice result with a completed data frame. action = 1 (the
# default) returns the first of the imputed data sets; the others are unused.
# The call is namespace-qualified because tidyr also exports complete(), so an
# unqualified call depends on the order in which the packages were attached.
imputed_med_Data <- mice::complete(imputed_med_Data, action = 1)
summary(imputed_med_Data)
##  TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B  TEAM_BATTING_HR 
##  Min.   : 819   Min.   : 44.0   Min.   : 14.00   Min.   :  0.00  
##  1st Qu.:1387   1st Qu.:210.0   1st Qu.: 35.00   1st Qu.: 44.50  
##  Median :1455   Median :239.0   Median : 52.00   Median :101.00  
##  Mean   :1469   Mean   :241.3   Mean   : 55.91   Mean   : 95.63  
##  3rd Qu.:1548   3rd Qu.:278.5   3rd Qu.: 72.00   3rd Qu.:135.50  
##  Max.   :2170   Max.   :376.0   Max.   :155.00   Max.   :242.00  
##  TEAM_BATTING_BB TEAM_BATTING_SO  TEAM_BASERUN_SB TEAM_BASERUN_CS 
##  Min.   : 15.0   Min.   :   0.0   Min.   :  0.0   Min.   :  0.00  
##  1st Qu.:436.5   1st Qu.: 527.0   1st Qu.: 59.0   1st Qu.: 41.00  
##  Median :509.0   Median : 677.0   Median : 95.0   Median : 56.00  
##  Mean   :499.0   Mean   : 699.3   Mean   :126.5   Mean   : 64.27  
##  3rd Qu.:565.5   3rd Qu.: 904.5   3rd Qu.:156.0   3rd Qu.: 75.00  
##  Max.   :792.0   Max.   :1268.0   Max.   :580.0   Max.   :154.00  
##  TEAM_BATTING_HBP TEAM_PITCHING_HR TEAM_PITCHING_SO TEAM_FIELDING_E 
##  Min.   :42.00    Min.   :  0.0    Min.   :   0.0   Min.   :  73.0  
##  1st Qu.:46.00    1st Qu.: 52.0    1st Qu.: 621.0   1st Qu.: 131.0  
##  Median :52.00    Median :104.0    Median : 777.0   Median : 163.0  
##  Mean   :56.02    Mean   :102.1    Mean   : 806.5   Mean   : 249.7  
##  3rd Qu.:66.00    3rd Qu.:142.5    3rd Qu.: 953.0   3rd Qu.: 252.0  
##  Max.   :96.00    Max.   :336.0    Max.   :9963.0   Max.   :1568.0  
##  TEAM_FIELDING_DP
##  Min.   : 69.0   
##  1st Qu.:121.0   
##  Median :146.0   
##  Mean   :140.4   
##  3rd Qu.:160.5   
##  Max.   :204.0

A Box-Cox transformation, centering and scaling were used to transform the individual predictors in the dataset using the caret library.

# Estimate Box-Cox, centering and scaling parameters on the imputed evaluation
# data and apply them to that same data.
# NOTE(review): the parameters are estimated from the evaluation data itself,
# while model3 was fitted on mtd_final - confirm whether the transformation
# estimated on the training data should be applied here instead.
# The preProcess object gets its own name rather than reusing `t`, the name of
# base R's transpose function; this also leaves any earlier object called `t`
# untouched.
med_pre_proc <- preProcess(imputed_med_Data,
                           method = c("BoxCox", "center", "scale"))
# The `t` argument name is deliberate: data.frame() prefixes every column with
# "t.", which yields the t.TEAM_* names used in model3's formula.
med_final <- data.frame(
  t = predict(med_pre_proc, imputed_med_Data))

summary(med_final)
##  t.TEAM_BATTING_H   t.TEAM_BATTING_2B  t.TEAM_BATTING_3B 
##  Min.   :-5.07603   Min.   :-3.26217   Min.   :-2.64215  
##  1st Qu.:-0.52836   1st Qu.:-0.67016   1st Qu.:-0.73771  
##  Median :-0.06571   Median :-0.09057   Median : 0.08513  
##  Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.00000  
##  3rd Qu.: 0.54648   3rd Qu.: 0.74495   3rd Qu.: 0.76149  
##  Max.   : 4.16429   Max.   : 3.00896   Max.   : 2.35514  
##  t.TEAM_BATTING_HR  t.TEAM_BATTING_BB   t.TEAM_BATTING_SO 
##  Min.   :-1.69766   Min.   :-2.859388   Min.   :-2.93148  
##  1st Qu.:-0.90771   1st Qu.:-0.619108   1st Qu.:-0.72213  
##  Median : 0.09527   Median : 0.008141   Median :-0.09328  
##  Mean   : 0.00000   Mean   : 0.000000   Mean   : 0.00000  
##  3rd Qu.: 0.70771   3rd Qu.: 0.536019   3rd Qu.: 0.86047  
##  Max.   : 2.59828   Max.   : 2.968415   Max.   : 2.38438  
##  t.TEAM_BASERUN_SB t.TEAM_BASERUN_CS t.TEAM_BATTING_HBP t.TEAM_PITCHING_HR
##  Min.   :-1.3407   Min.   :-1.8973   Min.   :-1.3581    Min.   :-1.77169  
##  1st Qu.:-0.7154   1st Qu.:-0.6870   1st Qu.:-0.8018    1st Qu.:-0.86977  
##  Median :-0.3339   Median :-0.2441   Median :-0.1349    Median : 0.03214  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000    Mean   : 0.00000  
##  3rd Qu.: 0.3126   3rd Qu.: 0.3167   3rd Qu.: 0.9305    3rd Qu.: 0.69991  
##  Max.   : 4.8062   Max.   : 2.6489   Max.   : 2.1308    Max.   : 4.05609  
##  t.TEAM_PITCHING_SO t.TEAM_FIELDING_E t.TEAM_FIELDING_DP
##  Min.   :-1.31288   Min.   :-3.1354   Min.   :-2.1966   
##  1st Qu.:-0.30198   1st Qu.:-0.7317   1st Qu.:-0.7065   
##  Median :-0.04804   Median :-0.1378   Median : 0.1451   
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000   
##  3rd Qu.: 0.23846   3rd Qu.: 0.7208   3rd Qu.: 0.6745   
##  Max.   :14.90538   Max.   : 2.0409   Max.   : 2.4054
# Predict from model3 for the preprocessed evaluation rows. The result has one
# row per evaluation record with the point estimate (fit) and the lower/upper
# bounds (lwr, upr) of a 95% prediction interval.
# The values are on the scale of model3's response, t.TARGET_WINS.
# NOTE(review): presumably these need back-transforming to obtain win counts.
eval_data <- predict(
  object = model3,
  newdata = med_final,
  interval = "prediction",
  level = 0.95
)
eval_data
##              fit          lwr         upr
## 1   -1.188703912 -2.801631098  0.42422327
## 2   -0.975852411 -2.587405318  0.63570050
## 3   -0.585833502 -2.196742233  1.02507523
## 4    0.282214067 -1.329046252  1.89347439
## 5   -1.237586845 -2.851812788  0.37663910
## 6   -1.018227203 -2.632402562  0.59594816
## 7    0.159187661 -1.455138991  1.77351431
## 8   -0.671020146 -2.282636111  0.94059582
## 9   -0.674110632 -2.286045016  0.93782375
## 10  -0.539014186 -2.149696146  1.07166777
## 11  -0.908734347 -2.521206765  0.70373807
## 12  -0.008168950 -1.621093522  1.60475562
## 13   0.104906055 -1.509380166  1.71919228
## 14   0.029780950 -1.583319259  1.64288116
## 15   0.368902624 -1.245713510  1.98351876
## 16  -0.479207877 -2.091109885  1.13269413
## 17  -0.704877184 -2.316855438  0.90710107
## 18  -0.122074980 -1.733100331  1.48895037
## 19  -0.745128405 -2.357311318  0.86705451
## 20   0.371914983 -1.240272455  1.98410242
## 21   0.309404144 -1.302542559  1.92135085
## 22   0.184283641 -1.427554877  1.79612216
## 23   0.089960494 -1.521718674  1.70163966
## 24  -0.696034436 -2.307681203  0.91561233
## 25   0.152415144 -1.459800645  1.76463093
## 26   0.471168682 -1.142109313  2.08444668
## 27  -0.782595772 -2.406150933  0.84095939
## 28  -0.504244973 -2.115090957  1.10660101
## 29   0.300364895 -1.312401842  1.91313163
## 30  -0.521175943 -2.134012595  1.09166071
## 31   0.685047403 -0.927620223  2.29771503
## 32   0.343697759 -1.267330046  1.95472556
## 33   0.311271780 -1.300909395  1.92345295
## 34   0.178522852 -1.435962660  1.79300837
## 35   0.052959132 -1.558334131  1.66425240
## 36   0.130007751 -1.484071831  1.74408733
## 37  -0.246329940 -1.856506149  1.36384627
## 38   0.477451381 -1.136248844  2.09115161
## 39   0.061566387 -1.550002204  1.67313498
## 40   0.495184111 -1.117337090  2.10770531
## 41   0.170942521 -1.441633769  1.78351881
## 42   1.327030319 -0.287826145  2.94188678
## 43  -1.436912856 -3.063628955  0.18980324
## 44   1.657339515  0.033738046  3.28094098
## 45   0.604521774 -1.010360386  2.21940393
## 46   0.930697121 -0.682412772  2.54380701
## 47   1.060283382 -0.552726934  2.67329370
## 48  -0.423778854 -2.034879371  1.18732166
## 49  -0.805048810 -2.416346923  0.80624930
## 50  -0.111199123 -1.721984002  1.49958576
## 51  -0.327496116 -1.938572804  1.28358057
## 52   0.209763524 -1.401988813  1.82151586
## 53  -0.426070507 -2.038206655  1.18606564
## 54  -0.247908468 -1.860061345  1.36424441
## 55  -0.611197155 -2.222117708  0.99972340
## 56   0.093144994 -1.518654246  1.70494423
## 57   0.617773553 -0.994801981  2.23034909
## 58  -0.399511598 -2.011106929  1.21208373
## 59  -1.067474921 -2.680453621  0.54550378
## 60  -0.271351571 -1.882430395  1.33972725
## 61   0.411150533 -1.200001886  2.02230295
## 62   0.008971026 -1.607084423  1.62502647
## 63   0.411557802 -1.199516053  2.02263166
## 64   0.308548785 -1.305488232  1.92258580
## 65   0.377231494 -1.236981598  1.99144459
## 66   1.291564181 -0.324872671  2.90800103
## 67  -0.607540694 -2.219195749  1.00411436
## 68  -0.339456176 -1.951510021  1.27259767
## 69  -0.167039064 -1.779040882  1.44496275
## 70   0.369686503 -1.243483304  1.98285631
## 71   0.274779556 -1.338855523  1.88841464
## 72  -0.384771750 -1.999982594  1.23043909
## 73  -0.207396877 -1.820855855  1.40606210
## 74   0.543432183 -1.071547012  2.15841138
## 75  -0.243530667 -1.856619020  1.36955769
## 76  -0.237590376 -1.850688313  1.37550756
## 77   0.391944421 -1.219345745  2.00323459
## 78   0.071845465 -1.539334619  1.68302555
## 79  -0.707455699 -2.319003057  0.90409166
## 80  -0.390873679 -2.001894870  1.22014751
## 81   0.191662483 -1.419630971  1.80295594
## 82   0.331875727 -1.279386092  1.94313755
## 83   0.798494850 -0.813761737  2.41075144
## 84  -0.487835474 -2.100862202  1.12519126
## 85   0.236059114 -1.375813599  1.84793183
## 86  -0.190661342 -1.803785449  1.42246277
## 87   0.172866215 -1.439663619  1.78539605
## 88   0.315253610 -1.295081481  1.92558870
## 89   0.771876896 -0.841297820  2.38505161
## 90   0.738058813 -0.873620461  2.34973809
## 91   0.127221950 -1.484646099  1.73909000
## 92   1.234894749 -0.381657668  2.85144717
## 93  -0.500930117 -2.112020003  1.11015977
## 94   0.067349892 -1.544344140  1.67904392
## 95   0.128406958 -1.483255817  1.74006973
## 96   0.177921646 -1.433556245  1.78939954
## 97   0.551033507 -1.063476455  2.16554347
## 98   1.129842456 -0.484313978  2.74399889
## 99   0.409059169 -1.203653915  2.02177225
## 100  0.334082664 -1.279320296  1.94748562
## 101 -0.105622652 -1.717374185  1.50612888
## 102 -0.473931456 -2.085319528  1.13745662
## 103  0.264393691 -1.346043946  1.87483133
## 104  0.269591258 -1.342156128  1.88133864
## 105 -0.487187224 -2.101228479  1.12685403
## 106 -0.844473452 -2.457586146  0.76863924
## 107 -1.529110955 -3.145785251  0.08756334
## 108 -0.066268807 -1.678720758  1.54618314
## 109  0.755330938 -0.856375893  2.36703777
## 110 -1.179629839 -2.794534176  0.43527450
## 111  0.336937738 -1.273928047  1.94780352
## 112  0.392931981 -1.218538450  2.00440241
## 113  0.744048052 -0.866884817  2.35498092
## 114  0.719973528 -0.891541003  2.33148806
## 115  0.060156800 -1.551268931  1.67158253
## 116  0.036204928 -1.574962926  1.64737278
## 117  0.253173653 -1.359456518  1.86580382
## 118  0.091442900 -1.518999191  1.70188499
## 119 -0.418907136 -2.030599210  1.19278494
## 120 -0.119719802 -1.732957060  1.49351746
## 121  0.599920829 -1.012434195  2.21227585
## 122 -0.928559816 -2.540774595  0.68365496
## 123 -0.664519205 -2.276994938  0.94795653
## 124 -0.975013516 -2.590351807  0.64032477
## 125 -0.830676284 -2.442893865  0.78154130
## 126  0.212659690 -1.399078127  1.82439751
## 127  0.400676007 -1.211411108  2.01276312
## 128 -0.340533499 -1.951532930  1.27046593
## 129  0.652151992 -0.959436715  2.26374070
## 130  0.438046767 -1.173961470  2.05005500
## 131  0.202610691 -1.408627104  1.81384849
## 132  0.117699268 -1.494461220  1.72985976
## 133 -0.666937111 -2.283235679  0.94936146
## 134 -0.063562489 -1.675769877  1.54864490
## 135  1.237544478 -0.379729562  2.85481852
## 136 -0.424598881 -2.037540603  1.18834284
## 137 -0.263794569 -1.875293359  1.34770422
## 138 -0.208344168 -1.819028616  1.40234028
## 139  1.062013491 -0.557749619  2.68177660
## 140 -0.077031170 -1.688201456  1.53413911
## 141 -1.202017518 -2.815325291  0.41129025
## 142 -0.543669952 -2.155525586  1.06818568
## 143  0.574721140 -1.037450267  2.18689255
## 144 -0.580114855 -2.191990481  1.03176077
## 145 -0.201476211 -1.813540101  1.41058768
## 146 -0.396270486 -2.006953811  1.21441284
## 147 -0.424079955 -2.035563519  1.18740361
## 148  0.012235815 -1.598735746  1.62320738
## 149 -0.134094498 -1.746418526  1.47822953
## 150  0.358206900 -1.252692253  1.96910605
## 151  0.121373157 -1.490687994  1.73343431
## 152  0.458047931 -1.156271021  2.07236688
## 153 -1.091101387 -2.715111868  0.53290909
## 154 -1.030449584 -2.642707851  0.58180868
## 155 -0.019741481 -1.631551499  1.59206854
## 156 -0.988478370 -2.601162038  0.62420530
## 157  0.833438637 -0.779314968  2.44619224
## 158 -0.737249321 -2.349351697  0.87485306
## 159  0.544022039 -1.067788981  2.15583306
## 160 -0.464209492 -2.075181647  1.14676266
## 161  1.192763045 -0.422709568  2.80823566
## 162  1.608078209 -0.007961502  3.22411792
## 163  0.974806731 -0.637808023  2.58742148
## 164  1.350078792 -0.265986965  2.96614455
## 165  1.077591667 -0.538343607  2.69352694
## 166  0.933343064 -0.680910433  2.54759656
## 167  0.144754363 -1.467547100  1.75705583
## 168  0.168398304 -1.444503557  1.78130016
## 169 -0.712666089 -2.325087477  0.89975530
## 170 -0.022671288 -1.634706482  1.58936391
## 171  0.570223658 -1.041554070  2.18200139
## 172  0.449488461 -1.161771373  2.06074829
## 173  0.120243507 -1.490742331  1.73122935
## 174  0.786653766 -0.825308821  2.39861635
## 175  0.025263833 -1.585580290  1.63610796
## 176 -0.167176497 -1.779147178  1.44479418
## 177  0.086591307 -1.526353437  1.69953605
## 178 -0.865603070 -2.478121732  0.74691559
## 179 -0.309436974 -1.919781098  1.30090715
## 180 -0.190839227 -1.801770266  1.42009181
## 181  0.399170213 -1.216207970  2.01454840
## 182  0.318134717 -1.294556657  1.93082609
## 183  0.440513332 -1.171394752  2.05242142
## 184  0.552131984 -1.059480847  2.16374482
## 185  1.103391260 -0.512298711  2.71908123
## 186  0.919819856 -0.700394214  2.54003393
## 187  0.530320633 -1.084560485  2.14520175
## 188 -0.469796995 -2.083396926  1.14380294
## 189 -1.002246473 -2.615017425  0.61052448
## 190  1.783289245  0.165378614  3.40119988
## 191 -0.603967482 -2.215200207  1.00726524
## 192 -0.126928397 -1.737999734  1.48414294
## 193 -0.551775195 -2.162777298  1.05922691
## 194 -0.444329737 -2.055526802  1.16686733
## 195 -0.384215468 -1.996769074  1.22833814
## 196 -1.074285278 -2.687088523  0.53851797
## 197 -0.409355606 -2.020136132  1.20142492
## 198  0.785419352 -0.828584646  2.39942335
## 199  0.069152915 -1.542005578  1.68031141
## 200  0.310159887 -1.301198400  1.92151817
## 201 -0.547262893 -2.160594645  1.06606886
## 202  0.162547089 -1.449273480  1.77436766
## 203 -0.063598367 -1.677399033  1.55020230
## 204  0.631162164 -0.980125780  2.24245011
## 205  0.072799475 -1.538671300  1.68427025
## 206  0.212851230 -1.398388817  1.82409128
## 207  0.108981150 -1.503185549  1.72114785
## 208  0.175126729 -1.436887358  1.78714082
## 209 -0.091293215 -1.702897446  1.52031102
## 210 -0.178430965 -1.790354750  1.43349282
## 211  1.592836102 -0.021588851  3.20726105
## 212  0.502322744 -1.109370883  2.11401637
## 213  0.016167798 -1.595723216  1.62805881
## 214 -1.194431949 -2.806461259  0.41759736
## 215 -0.778398648 -2.391184974  0.83438768
## 216  0.160704441 -1.450461237  1.77187012
## 217 -0.208031505 -1.822180420  1.40611741
## 218  0.643374006 -0.968575555  2.25532357
## 219 -0.170686526 -1.781556247  1.44018320
## 220  0.091191750 -1.519729181  1.70211268
## 221 -0.356844303 -1.968468330  1.25477972
## 222 -0.588014687 -2.200756089  1.02472671
## 223 -0.082170362 -1.693459299  1.52911858
## 224 -0.304005419 -1.918049883  1.31003905
## 225  0.456648669 -1.167714862  2.08101220
## 226 -0.203040390 -1.813841849  1.40776107
## 227 -0.128023986 -1.739091865  1.48304389
## 228 -0.197459618 -1.809863054  1.41494382
## 229  0.429681174 -1.181445213  2.04080756
## 230 -0.275925665 -1.888942923  1.33709159
## 231 -0.034969388 -1.647818378  1.57787960
## 232  0.569322662 -1.042239613  2.18088494
## 233  0.006375436 -1.606298315  1.61904919
## 234  0.271694288 -1.340869527  1.88425810
## 235 -0.206591382 -1.817363591  1.40418083
## 236 -0.346525083 -1.957093605  1.26404344
## 237 -0.303938405 -1.917120874  1.30924406
## 238  0.102396872 -1.510068137  1.71486188
## 239  0.749829744 -0.862960205  2.36261969
## 240 -0.688796106 -2.300046550  0.92245434
## 241  0.326092224 -1.284814139  1.93699859
## 242  0.745860360 -0.866678820  2.35839954
## 243  0.278486626 -1.333026401  1.88999965
## 244  0.167037606 -1.444900660  1.77897587
## 245 -1.527306256 -3.142697271  0.08808476
## 246  0.114727967 -1.497666595  1.72712253
## 247 -0.163295057 -1.774180029  1.44758991
## 248  0.196121627 -1.415252092  1.80749535
## 249 -0.340190856 -1.951442301  1.27106059
## 250  0.377720712 -1.236738070  1.99217949
## 251  0.188325815 -1.423531914  1.80018354
## 252 -0.751336980 -2.364881246  0.86220729
## 253  0.854123364 -0.759483249  2.46772998
## 254 -3.088483902 -4.717317539 -1.45965026
## 255 -0.796919418 -2.408282250  0.81444341
## 256 -0.364218157 -1.977809973  1.24937366
## 257  0.201884030 -1.410076170  1.81384423
## 258  0.038612801 -1.572571842  1.64979744
## 259 -0.303912358 -1.915901856  1.30807714
# Quartiles, mean and range of the fit, lwr and upr columns of the predictions.
summary(eval_data)
##       fit                lwr               upr        
##  Min.   :-3.08848   Min.   :-4.7173   Min.   :-1.460  
##  1st Qu.:-0.40443   1st Qu.:-2.0156   1st Qu.: 1.207  
##  Median : 0.06157   Median :-1.5500   Median : 1.673  
##  Mean   : 0.00000   Mean   :-1.6128   Mean   : 1.613  
##  3rd Qu.: 0.37457   3rd Qu.:-1.2386   3rd Qu.: 1.988  
##  Max.   : 1.78329   Max.   : 0.1654   Max.   : 3.401