#HW #1 Assignment - Moneyball Model
Overview: In this homework assignment, you will explore, analyze and model a data set containing approximately 2200 records. Each record represents a professional baseball team from the years 1871 to 2006 inclusive. Each record has the performance of the team for the given year, with all of the statistics adjusted to match the performance of a 162 game season.
Your objective is to build a multiple linear regression model on the training data to predict the number of wins for the team. You can only use the variables given to you (or variables that you derive from the variables provided). Below is a short description of the variables of interest in the data set:
library(knitr)
library(stringr)
## Warning: package 'stringr' was built under R version 3.2.5
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.2.5
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.2.5
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.5
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(reshape)
## Warning: package 'reshape' was built under R version 3.2.5
##
## Attaching package: 'reshape'
## The following object is masked from 'package:dplyr':
##
## rename
## The following objects are masked from 'package:tidyr':
##
## expand, smiths
library(mice)
## Warning: package 'mice' was built under R version 3.2.5
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.2.5
##
## Attaching package: 'mice'
## The following object is masked from 'package:tidyr':
##
## complete
library(caret)
## Warning: package 'caret' was built under R version 3.2.5
## Warning: replacing previous import by 'plyr::ddply' when loading 'caret'
## Warning in as.POSIXlt.POSIXct(Sys.time()): unknown timezone 'zone/tz/2018c.
## 1.0/zoneinfo/America/New_York'
## Warning: replacing previous import by 'tidyr::%>%' when loading 'broom'
## Warning: replacing previous import by 'tidyr::gather' when loading 'broom'
## Warning: replacing previous import by 'tidyr::spread' when loading 'broom'
## Warning: replacing previous import by 'rlang::!!' when loading 'recipes'
## Warning: replacing previous import by 'rlang::expr' when loading 'recipes'
## Warning: replacing previous import by 'rlang::f_lhs' when loading 'recipes'
## Warning: replacing previous import by 'rlang::f_rhs' when loading 'recipes'
## Warning: replacing previous import by 'rlang::invoke' when loading
## 'recipes'
## Warning: replacing previous import by 'rlang::is_empty' when loading
## 'recipes'
## Warning: replacing previous import by 'rlang::lang' when loading 'recipes'
## Warning: replacing previous import by 'rlang::na_dbl' when loading
## 'recipes'
## Warning: replacing previous import by 'rlang::names2' when loading
## 'recipes'
## Warning: replacing previous import by 'rlang::quos' when loading 'recipes'
## Warning: replacing previous import by 'rlang::sym' when loading 'recipes'
## Warning: replacing previous import by 'rlang::syms' when loading 'recipes'
library(e1071)
## Warning: package 'e1071' was built under R version 3.2.5
#DATA EXPLORATION:
Load the data and explore it using summary statistics and plots.
# Read the training set directly from the course repository.
mtd <- read.csv(
  file = "https://raw.githubusercontent.com/Riteshlohiya/Data621-Assignment-1/master/moneyball-training-data.csv"
)
# Number of records in the training set.
count(mtd)
## # A tibble: 1 x 1
## n
## <int>
## 1 2276
# Column names, followed by a per-column summary that also reports NA counts.
names(mtd)
## [1] "INDEX" "TARGET_WINS" "TEAM_BATTING_H"
## [4] "TEAM_BATTING_2B" "TEAM_BATTING_3B" "TEAM_BATTING_HR"
## [7] "TEAM_BATTING_BB" "TEAM_BATTING_SO" "TEAM_BASERUN_SB"
## [10] "TEAM_BASERUN_CS" "TEAM_BATTING_HBP" "TEAM_PITCHING_H"
## [13] "TEAM_PITCHING_HR" "TEAM_PITCHING_BB" "TEAM_PITCHING_SO"
## [16] "TEAM_FIELDING_E" "TEAM_FIELDING_DP"
summary(mtd)
## INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B
## Min. : 1.0 Min. : 0.00 Min. : 891 Min. : 69.0
## 1st Qu.: 630.8 1st Qu.: 71.00 1st Qu.:1383 1st Qu.:208.0
## Median :1270.5 Median : 82.00 Median :1454 Median :238.0
## Mean :1268.5 Mean : 80.79 Mean :1469 Mean :241.2
## 3rd Qu.:1915.5 3rd Qu.: 92.00 3rd Qu.:1537 3rd Qu.:273.0
## Max. :2535.0 Max. :146.00 Max. :2554 Max. :458.0
##
## TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO
## Min. : 0.00 Min. : 0.00 Min. : 0.0 Min. : 0.0
## 1st Qu.: 34.00 1st Qu.: 42.00 1st Qu.:451.0 1st Qu.: 548.0
## Median : 47.00 Median :102.00 Median :512.0 Median : 750.0
## Mean : 55.25 Mean : 99.61 Mean :501.6 Mean : 735.6
## 3rd Qu.: 72.00 3rd Qu.:147.00 3rd Qu.:580.0 3rd Qu.: 930.0
## Max. :223.00 Max. :264.00 Max. :878.0 Max. :1399.0
## NA's :102
## TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H
## Min. : 0.0 Min. : 0.0 Min. :29.00 Min. : 1137
## 1st Qu.: 66.0 1st Qu.: 38.0 1st Qu.:50.50 1st Qu.: 1419
## Median :101.0 Median : 49.0 Median :58.00 Median : 1518
## Mean :124.8 Mean : 52.8 Mean :59.36 Mean : 1779
## 3rd Qu.:156.0 3rd Qu.: 62.0 3rd Qu.:67.00 3rd Qu.: 1682
## Max. :697.0 Max. :201.0 Max. :95.00 Max. :30132
## NA's :131 NA's :772 NA's :2085
## TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 65.0
## 1st Qu.: 50.0 1st Qu.: 476.0 1st Qu.: 615.0 1st Qu.: 127.0
## Median :107.0 Median : 536.5 Median : 813.5 Median : 159.0
## Mean :105.7 Mean : 553.0 Mean : 817.7 Mean : 246.5
## 3rd Qu.:150.0 3rd Qu.: 611.0 3rd Qu.: 968.0 3rd Qu.: 249.2
## Max. :343.0 Max. :3645.0 Max. :19278.0 Max. :1898.0
## NA's :102
## TEAM_FIELDING_DP
## Min. : 52.0
## 1st Qu.:131.0
## Median :149.0
## Mean :146.4
## 3rd Qu.:164.0
## Max. :228.0
## NA's :286
The dataset consists of 17 variables and 2276 cases. Several variables have missing (NA) values; TEAM_BATTING_HBP has the most, with 2085 NAs.
Checking for outliers:
# Box plot of every column on one shared axis. stack() reshapes the frame to
# long form: the readings go to `values`, the originating column name to `ind`.
# The y axis is zoomed to 0-1000 so the bulk of the boxes stays readable.
ggplot(data = stack(mtd), mapping = aes(x = ind, y = values)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 1000)) +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.background = element_rect(fill = "grey")
  )
## Warning: Removed 3478 rows containing non-finite values (stat_boxplot).
Checking for skewness in the data
# Long-format copy of the training data (no id variables), used for faceting.
mtd1 <- melt(mtd)
## Using as id variables
# One density curve per variable; every panel gets its own axis ranges.
ggplot(data = mtd1, mapping = aes(x = value)) +
  geom_density(fill = "red") +
  facet_wrap(~variable, scales = "free")
## Warning: Removed 3478 rows containing non-finite values (stat_density).
As the plots show, several variables are skewed and there are also outliers.
Finding correlations:
# The correlation analysis uses every column except the first one (INDEX).
mtd2 <- mtd[-1]
names(mtd2)
## [1] "TARGET_WINS" "TEAM_BATTING_H" "TEAM_BATTING_2B"
## [4] "TEAM_BATTING_3B" "TEAM_BATTING_HR" "TEAM_BATTING_BB"
## [7] "TEAM_BATTING_SO" "TEAM_BASERUN_SB" "TEAM_BASERUN_CS"
## [10] "TEAM_BATTING_HBP" "TEAM_PITCHING_H" "TEAM_PITCHING_HR"
## [13] "TEAM_PITCHING_BB" "TEAM_PITCHING_SO" "TEAM_FIELDING_E"
## [16] "TEAM_FIELDING_DP"
# drop_na() keeps only the rows that have no NA in any column. The summary of
# mtd reports 2085 NAs in TEAM_BATTING_HBP out of 2276 rows, so at most 191
# rows contribute to this correlation matrix.
cor(x = drop_na(mtd2))
## TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B
## TARGET_WINS 1.00000000 0.46994665 0.31298400
## TEAM_BATTING_H 0.46994665 1.00000000 0.56177286
## TEAM_BATTING_2B 0.31298400 0.56177286 1.00000000
## TEAM_BATTING_3B -0.12434586 0.21391883 0.04203441
## TEAM_BATTING_HR 0.42241683 0.39627593 0.25099045
## TEAM_BATTING_BB 0.46868793 0.19735234 0.19749256
## TEAM_BATTING_SO -0.22889273 -0.34174328 -0.06415123
## TEAM_BASERUN_SB 0.01483639 0.07167495 -0.18768279
## TEAM_BASERUN_CS -0.17875598 -0.09377545 -0.20413884
## TEAM_BATTING_HBP 0.07350424 -0.02911218 0.04608475
## TEAM_PITCHING_H 0.47123431 0.99919269 0.56045355
## TEAM_PITCHING_HR 0.42246683 0.39495630 0.24999875
## TEAM_PITCHING_BB 0.46839882 0.19529071 0.19592157
## TEAM_PITCHING_SO -0.22936481 -0.34445001 -0.06616615
## TEAM_FIELDING_E -0.38668800 -0.25381638 -0.19427027
## TEAM_FIELDING_DP -0.19586601 0.01776946 -0.02488808
## TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB
## TARGET_WINS -0.12434586 0.42241683 0.46868793
## TEAM_BATTING_H 0.21391883 0.39627593 0.19735234
## TEAM_BATTING_2B 0.04203441 0.25099045 0.19749256
## TEAM_BATTING_3B 1.00000000 -0.21879927 -0.20584392
## TEAM_BATTING_HR -0.21879927 1.00000000 0.45638161
## TEAM_BATTING_BB -0.20584392 0.45638161 1.00000000
## TEAM_BATTING_SO -0.19291841 0.21045444 0.21833871
## TEAM_BASERUN_SB 0.16946086 -0.19021893 -0.08806123
## TEAM_BASERUN_CS 0.23213978 -0.27579838 -0.20878051
## TEAM_BATTING_HBP -0.17424715 0.10618116 0.04746007
## TEAM_PITCHING_H 0.21250322 0.39549390 0.19848687
## TEAM_PITCHING_HR -0.21973263 0.99993259 0.45659283
## TEAM_PITCHING_BB -0.20675383 0.45542468 0.99988140
## TEAM_PITCHING_SO -0.19386654 0.20829574 0.21793253
## TEAM_FIELDING_E -0.06513145 0.01567397 -0.07847126
## TEAM_FIELDING_DP 0.13314758 -0.06182222 -0.07929078
## TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS
## TARGET_WINS -0.22889273 0.01483639 -0.178755979
## TEAM_BATTING_H -0.34174328 0.07167495 -0.093775445
## TEAM_BATTING_2B -0.06415123 -0.18768279 -0.204138837
## TEAM_BATTING_3B -0.19291841 0.16946086 0.232139777
## TEAM_BATTING_HR 0.21045444 -0.19021893 -0.275798375
## TEAM_BATTING_BB 0.21833871 -0.08806123 -0.208780510
## TEAM_BATTING_SO 1.00000000 -0.07475974 -0.056130355
## TEAM_BASERUN_SB -0.07475974 1.00000000 0.624737808
## TEAM_BASERUN_CS -0.05613035 0.62473781 1.000000000
## TEAM_BATTING_HBP 0.22094219 -0.06400498 -0.070513896
## TEAM_PITCHING_H -0.34145321 0.07395373 -0.092977893
## TEAM_PITCHING_HR 0.21111617 -0.18948057 -0.275471495
## TEAM_PITCHING_BB 0.21895783 -0.08741902 -0.208470154
## TEAM_PITCHING_SO 0.99976835 -0.07351325 -0.055308336
## TEAM_FIELDING_E 0.30814540 0.04292341 0.207701189
## TEAM_FIELDING_DP -0.12319072 -0.13023054 -0.006764233
## TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## TARGET_WINS 0.07350424 0.47123431 0.42246683
## TEAM_BATTING_H -0.02911218 0.99919269 0.39495630
## TEAM_BATTING_2B 0.04608475 0.56045355 0.24999875
## TEAM_BATTING_3B -0.17424715 0.21250322 -0.21973263
## TEAM_BATTING_HR 0.10618116 0.39549390 0.99993259
## TEAM_BATTING_BB 0.04746007 0.19848687 0.45659283
## TEAM_BATTING_SO 0.22094219 -0.34145321 0.21111617
## TEAM_BASERUN_SB -0.06400498 0.07395373 -0.18948057
## TEAM_BASERUN_CS -0.07051390 -0.09297789 -0.27547150
## TEAM_BATTING_HBP 1.00000000 -0.02769699 0.10675878
## TEAM_PITCHING_H -0.02769699 1.00000000 0.39463199
## TEAM_PITCHING_HR 0.10675878 0.39463199 1.00000000
## TEAM_PITCHING_BB 0.04785137 0.19703302 0.45580983
## TEAM_PITCHING_SO 0.22157375 -0.34330646 0.20920115
## TEAM_FIELDING_E 0.04178971 -0.25073028 0.01689330
## TEAM_FIELDING_DP -0.07120824 0.01416807 -0.06292475
## TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E
## TARGET_WINS 0.46839882 -0.22936481 -0.38668800
## TEAM_BATTING_H 0.19529071 -0.34445001 -0.25381638
## TEAM_BATTING_2B 0.19592157 -0.06616615 -0.19427027
## TEAM_BATTING_3B -0.20675383 -0.19386654 -0.06513145
## TEAM_BATTING_HR 0.45542468 0.20829574 0.01567397
## TEAM_BATTING_BB 0.99988140 0.21793253 -0.07847126
## TEAM_BATTING_SO 0.21895783 0.99976835 0.30814540
## TEAM_BASERUN_SB -0.08741902 -0.07351325 0.04292341
## TEAM_BASERUN_CS -0.20847015 -0.05530834 0.20770119
## TEAM_BATTING_HBP 0.04785137 0.22157375 0.04178971
## TEAM_PITCHING_H 0.19703302 -0.34330646 -0.25073028
## TEAM_PITCHING_HR 0.45580983 0.20920115 0.01689330
## TEAM_PITCHING_BB 1.00000000 0.21887700 -0.07692315
## TEAM_PITCHING_SO 0.21887700 1.00000000 0.31008407
## TEAM_FIELDING_E -0.07692315 0.31008407 1.00000000
## TEAM_FIELDING_DP -0.08040645 -0.12492321 0.04020581
## TEAM_FIELDING_DP
## TARGET_WINS -0.195866006
## TEAM_BATTING_H 0.017769456
## TEAM_BATTING_2B -0.024888081
## TEAM_BATTING_3B 0.133147578
## TEAM_BATTING_HR -0.061822219
## TEAM_BATTING_BB -0.079290775
## TEAM_BATTING_SO -0.123190715
## TEAM_BASERUN_SB -0.130230537
## TEAM_BASERUN_CS -0.006764233
## TEAM_BATTING_HBP -0.071208241
## TEAM_PITCHING_H 0.014168073
## TEAM_PITCHING_HR -0.062924751
## TEAM_PITCHING_BB -0.080406452
## TEAM_PITCHING_SO -0.124923213
## TEAM_FIELDING_E 0.040205814
## TEAM_FIELDING_DP 1.000000000
# Scatter-plot matrices, split into two halves of eight columns each so the
# panels stay legible.
pairs.panels(mtd2[, 1:8])
pairs.panels(mtd2[, 9:16])
We can see there are some positively and some negatively correlated variables.
#DATA PREPARATION
Removing the variables:
# The modelling frame starts as the raw data minus its first column (INDEX).
mtd_f <- mtd[-1]
names(mtd_f)
## [1] "TARGET_WINS" "TEAM_BATTING_H" "TEAM_BATTING_2B"
## [4] "TEAM_BATTING_3B" "TEAM_BATTING_HR" "TEAM_BATTING_BB"
## [7] "TEAM_BATTING_SO" "TEAM_BASERUN_SB" "TEAM_BASERUN_CS"
## [10] "TEAM_BATTING_HBP" "TEAM_PITCHING_H" "TEAM_PITCHING_HR"
## [13] "TEAM_PITCHING_BB" "TEAM_PITCHING_SO" "TEAM_FIELDING_E"
## [16] "TEAM_FIELDING_DP"
The variable TEAM_BATTING_HBP consists mostly of missing values, so it will be removed completely.
# Remove TEAM_BATTING_HBP by name instead of by position, so this step keeps
# dropping the intended column even if the column order of the input changes.
# drop = FALSE guarantees the result is still a data frame.
mtd_f <- mtd_f[, names(mtd_f) != "TEAM_BATTING_HBP", drop = FALSE]
names(mtd_f)
## [1] "TARGET_WINS" "TEAM_BATTING_H" "TEAM_BATTING_2B"
## [4] "TEAM_BATTING_3B" "TEAM_BATTING_HR" "TEAM_BATTING_BB"
## [7] "TEAM_BATTING_SO" "TEAM_BASERUN_SB" "TEAM_BASERUN_CS"
## [10] "TEAM_PITCHING_H" "TEAM_PITCHING_HR" "TEAM_PITCHING_BB"
## [13] "TEAM_PITCHING_SO" "TEAM_FIELDING_E" "TEAM_FIELDING_DP"
TEAM_PITCHING_HR and TEAM_BATTING_HR are highly correlated, so we can remove one of them.
# Remove TEAM_PITCHING_HR by name instead of by position, so this step does not
# depend on how many columns were dropped before it.
# drop = FALSE guarantees the result is still a data frame.
mtd_f <- mtd_f[, names(mtd_f) != "TEAM_PITCHING_HR", drop = FALSE]
names(mtd_f)
## [1] "TARGET_WINS" "TEAM_BATTING_H" "TEAM_BATTING_2B"
## [4] "TEAM_BATTING_3B" "TEAM_BATTING_HR" "TEAM_BATTING_BB"
## [7] "TEAM_BATTING_SO" "TEAM_BASERUN_SB" "TEAM_BASERUN_CS"
## [10] "TEAM_PITCHING_H" "TEAM_PITCHING_BB" "TEAM_PITCHING_SO"
## [13] "TEAM_FIELDING_E" "TEAM_FIELDING_DP"
Imputing the NAs using mice (pmm, predictive mean matching):
# Multiple imputation with predictive mean matching: m = 5 imputed data sets,
# maxit = 5 iterations each. The imputation draws random numbers, so a fixed
# seed is passed to make the imputed values (and every model fitted on them)
# reproducible from run to run.
imputed_mtd_Data <- mice(mtd_f, m = 5, maxit = 5, method = "pmm", seed = 500)
##
## iter imp variable
## 1 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_PITCHING_SO TEAM_FIELDING_DP
## 1 2 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_PITCHING_SO TEAM_FIELDING_DP
## 1 3 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_PITCHING_SO TEAM_FIELDING_DP
## 1 4 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_PITCHING_SO TEAM_FIELDING_DP
## 1 5 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_PITCHING_SO TEAM_FIELDING_DP
## 2 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_PITCHING_SO TEAM_FIELDING_DP
## 2 2 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_PITCHING_SO TEAM_FIELDING_DP
## 2 3 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_PITCHING_SO TEAM_FIELDING_DP
## 2 4 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_PITCHING_SO TEAM_FIELDING_DP
## 2 5 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_PITCHING_SO TEAM_FIELDING_DP
## 3 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_PITCHING_SO TEAM_FIELDING_DP
## 3 2 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_PITCHING_SO TEAM_FIELDING_DP
## 3 3 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_PITCHING_SO TEAM_FIELDING_DP
## 3 4 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_PITCHING_SO TEAM_FIELDING_DP
## 3 5 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_PITCHING_SO TEAM_FIELDING_DP
## 4 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_PITCHING_SO TEAM_FIELDING_DP
## 4 2 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_PITCHING_SO TEAM_FIELDING_DP
## 4 3 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_PITCHING_SO TEAM_FIELDING_DP
## 4 4 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_PITCHING_SO TEAM_FIELDING_DP
## 4 5 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_PITCHING_SO TEAM_FIELDING_DP
## 5 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_PITCHING_SO TEAM_FIELDING_DP
## 5 2 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_PITCHING_SO TEAM_FIELDING_DP
## 5 3 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_PITCHING_SO TEAM_FIELDING_DP
## 5 4 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_PITCHING_SO TEAM_FIELDING_DP
## 5 5 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_PITCHING_SO TEAM_FIELDING_DP
# Replace the mice result with a filled-in data frame. The call is qualified
# with mice:: because tidyr exports a complete() as well (see the masking
# message printed when mice was attached); an unqualified call resolves to
# whichever of the two packages happened to be attached last.
# NOTE(review): with the default `action`, only one of the m = 5 imputed data
# sets is extracted -- confirm that using a single imputation is intended.
imputed_mtd_Data <- mice::complete(imputed_mtd_Data)
summary(imputed_mtd_Data)
## TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## Min. : 0.00 Min. : 891 Min. : 69.0 Min. : 0.00
## 1st Qu.: 71.00 1st Qu.:1383 1st Qu.:208.0 1st Qu.: 34.00
## Median : 82.00 Median :1454 Median :238.0 Median : 47.00
## Mean : 80.79 Mean :1469 Mean :241.2 Mean : 55.25
## 3rd Qu.: 92.00 3rd Qu.:1537 3rd Qu.:273.0 3rd Qu.: 72.00
## Max. :146.00 Max. :2554 Max. :458.0 Max. :223.00
## TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 42.00 1st Qu.:451.0 1st Qu.: 544.0 1st Qu.: 67.0
## Median :102.00 Median :512.0 Median : 733.0 Median :106.0
## Mean : 99.61 Mean :501.6 Mean : 728.2 Mean :135.1
## 3rd Qu.:147.00 3rd Qu.:580.0 3rd Qu.: 925.0 3rd Qu.:170.0
## Max. :264.00 Max. :878.0 Max. :1399.0 Max. :697.0
## TEAM_BASERUN_CS TEAM_PITCHING_H TEAM_PITCHING_BB TEAM_PITCHING_SO
## Min. : 0.00 Min. : 1137 Min. : 0.0 Min. : 0.0
## 1st Qu.: 42.00 1st Qu.: 1419 1st Qu.: 476.0 1st Qu.: 617.0
## Median : 57.00 Median : 1518 Median : 536.5 Median : 807.0
## Mean : 74.93 Mean : 1779 Mean : 553.0 Mean : 813.6
## 3rd Qu.: 89.00 3rd Qu.: 1682 3rd Qu.: 611.0 3rd Qu.: 958.2
## Max. :201.00 Max. :30132 Max. :3645.0 Max. :19278.0
## TEAM_FIELDING_E TEAM_FIELDING_DP
## Min. : 65.0 Min. : 52
## 1st Qu.: 127.0 1st Qu.:125
## Median : 159.0 Median :146
## Mean : 246.5 Mean :142
## 3rd Qu.: 249.2 3rd Qu.:162
## Max. :1898.0 Max. :228
Box-Cox transformation (where applicable), centering and scaling were applied to every column of the imputed dataset, including TARGET_WINS, using the caret library.
# Estimate the caret pre-processing (Box-Cox, centering, scaling) on the
# imputed training frame. The whole frame is passed in, so TARGET_WINS is
# transformed together with the predictors.
# NOTE(review): the object is called `t`, the same name as base R's transpose
# function; the name is kept because later code may refer to it.
t <- preProcess(
  imputed_mtd_Data,
  method = c("BoxCox", "center", "scale")
)
# Apply the transformation. Passing the result as the argument named `t` is
# what gives every column of mtd_final its "t." prefix.
mtd_final <- data.frame(t = predict(t, imputed_mtd_Data))
summary(mtd_final)
## t.TARGET_WINS t.TEAM_BATTING_H t.TEAM_BATTING_2B
## Min. :-5.12888 Min. :-7.537074 Min. :-4.48108
## 1st Qu.:-0.62156 1st Qu.:-0.573089 1st Qu.:-0.68949
## Median : 0.07676 Median :-0.003988 Median :-0.03019
## Mean : 0.00000 Mean : 0.000000 Mean : 0.00000
## 3rd Qu.: 0.71159 3rd Qu.: 0.586908 3rd Qu.: 0.69827
## Max. : 4.13970 Max. : 4.390097 Max. : 4.05391
## t.TEAM_BATTING_3B t.TEAM_BATTING_HR t.TEAM_BATTING_BB
## Min. :-1.9776 Min. :-1.64521 Min. :-4.08866
## 1st Qu.:-0.7606 1st Qu.:-0.95153 1st Qu.:-0.41215
## Median :-0.2953 Median : 0.03944 Median : 0.08511
## Mean : 0.0000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 0.5995 3rd Qu.: 0.78267 3rd Qu.: 0.63944
## Max. : 6.0042 Max. : 2.71505 Max. : 3.06871
## t.TEAM_BATTING_SO t.TEAM_BASERUN_SB t.TEAM_BASERUN_CS t.TEAM_PITCHING_H
## Min. :-2.95639 Min. :-1.3564 Min. :-1.5197 Min. :-2.8556
## 1st Qu.:-0.74796 1st Qu.:-0.6840 1st Qu.:-0.6678 1st Qu.:-0.6710
## Median : 0.01931 Median :-0.2925 Median :-0.3636 Median :-0.1765
## Mean : 0.00000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.79876 3rd Qu.: 0.3498 3rd Qu.: 0.2854 3rd Qu.: 0.4602
## Max. : 2.72302 Max. : 5.6391 Max. : 2.5569 Max. : 3.2387
## t.TEAM_PITCHING_BB t.TEAM_PITCHING_SO t.TEAM_FIELDING_E
## Min. :-3.32422 Min. :-1.50192 Min. :-3.3092
## 1st Qu.:-0.46291 1st Qu.:-0.36293 1st Qu.:-0.7163
## Median :-0.09923 Median :-0.01219 Median :-0.1424
## Mean : 0.00000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.34860 3rd Qu.: 0.26701 3rd Qu.: 0.7096
## Max. :18.58645 Max. :34.08525 Max. : 2.1432
## t.TEAM_FIELDING_DP
## Min. :-2.6139
## 1st Qu.:-0.6257
## Median : 0.0870
## Mean : 0.0000
## 3rd Qu.: 0.6660
## Max. : 3.3506
# Long-format copy of the transformed data (no id variables) for faceting.
mtd_final1 <- melt(mtd_final)
## Using as id variables
# Density of every transformed variable, each panel on its own axis ranges.
ggplot(data = mtd_final1, mapping = aes(x = value)) +
  geom_density(fill = "red") +
  facet_wrap(~variable, scales = "free")
#BUILD MODELS:
Model1:
With all variables:
# Model 1: regress the transformed target on every other column of mtd_final.
model1 <- lm(formula = t.TARGET_WINS ~ ., data = mtd_final)
summary(model1)
##
## Call:
## lm(formula = t.TARGET_WINS ~ ., data = mtd_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.5760 -0.5031 -0.0020 0.5213 3.5486
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.189e-11 1.711e-02 0.000 1.000000
## t.TEAM_BATTING_H 3.977e-01 3.666e-02 10.851 < 2e-16 ***
## t.TEAM_BATTING_2B -4.161e-02 2.763e-02 -1.506 0.132248
## t.TEAM_BATTING_3B 1.842e-01 3.010e-02 6.121 1.09e-09 ***
## t.TEAM_BATTING_HR 2.141e-01 3.839e-02 5.578 2.72e-08 ***
## t.TEAM_BATTING_BB 1.568e-01 3.487e-02 4.497 7.23e-06 ***
## t.TEAM_BATTING_SO -3.254e-01 4.033e-02 -8.067 1.16e-15 ***
## t.TEAM_BASERUN_SB 2.284e-01 3.196e-02 7.146 1.20e-12 ***
## t.TEAM_BASERUN_CS -1.912e-02 3.420e-02 -0.559 0.576128
## t.TEAM_PITCHING_H -1.405e-01 3.808e-02 -3.691 0.000229 ***
## t.TEAM_PITCHING_BB -2.218e-02 3.307e-02 -0.671 0.502582
## t.TEAM_PITCHING_SO 1.114e-01 2.925e-02 3.807 0.000144 ***
## t.TEAM_FIELDING_E -4.698e-01 3.859e-02 -12.174 < 2e-16 ***
## t.TEAM_FIELDING_DP -2.248e-01 2.328e-02 -9.656 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8162 on 2262 degrees of freedom
## Multiple R-squared: 0.3376, Adjusted R-squared: 0.3338
## F-statistic: 88.69 on 13 and 2262 DF, p-value: < 2.2e-16
Model2:
With only the significant variables:
# Model 2: keep only the predictors that were significant in model 1.
# t.TEAM_PITCHING_SO used to be listed twice in this formula. A repeated term
# contributes a single coefficient, so removing the duplicate leaves the fit
# unchanged and only cleans up the formula.
model2 <- lm(
  formula = t.TARGET_WINS ~ t.TEAM_BATTING_H + t.TEAM_BATTING_3B +
    t.TEAM_BATTING_HR + t.TEAM_BATTING_BB + t.TEAM_BATTING_SO +
    t.TEAM_BASERUN_SB + t.TEAM_PITCHING_SO + t.TEAM_PITCHING_H +
    t.TEAM_FIELDING_E + t.TEAM_FIELDING_DP,
  data = mtd_final
)
summary(model2)
##
## Call:
## lm(formula = t.TARGET_WINS ~ t.TEAM_BATTING_H + t.TEAM_BATTING_3B +
## t.TEAM_BATTING_HR + t.TEAM_BATTING_BB + t.TEAM_BATTING_SO +
## t.TEAM_BASERUN_SB + t.TEAM_PITCHING_SO + t.TEAM_PITCHING_H +
## t.TEAM_PITCHING_SO + t.TEAM_FIELDING_E + t.TEAM_FIELDING_DP,
## data = mtd_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.7306 -0.5073 -0.0023 0.5243 3.5123
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.282e-11 1.711e-02 0.000 1
## t.TEAM_BATTING_H 3.723e-01 3.060e-02 12.166 < 2e-16 ***
## t.TEAM_BATTING_3B 1.829e-01 2.963e-02 6.175 7.83e-10 ***
## t.TEAM_BATTING_HR 2.221e-01 3.792e-02 5.856 5.42e-09 ***
## t.TEAM_BATTING_BB 1.351e-01 2.243e-02 6.025 1.97e-09 ***
## t.TEAM_BATTING_SO -3.318e-01 3.877e-02 -8.558 < 2e-16 ***
## t.TEAM_BASERUN_SB 2.200e-01 2.571e-02 8.557 < 2e-16 ***
## t.TEAM_PITCHING_SO 9.408e-02 2.180e-02 4.316 1.66e-05 ***
## t.TEAM_PITCHING_H -1.556e-01 3.491e-02 -4.457 8.71e-06 ***
## t.TEAM_FIELDING_E -4.619e-01 3.764e-02 -12.269 < 2e-16 ***
## t.TEAM_FIELDING_DP -2.268e-01 2.287e-02 -9.916 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8162 on 2265 degrees of freedom
## Multiple R-squared: 0.3368, Adjusted R-squared: 0.3339
## F-statistic: 115 on 10 and 2265 DF, p-value: < 2.2e-16
Model3:
Further reducing the variables (TEAM_PITCHING_SO and TEAM_BATTING_SO are highly correlated, and TEAM_BATTING_H and TEAM_PITCHING_H are also highly correlated):
# Model 3: model 2 without the two pitching predictors (t.TEAM_PITCHING_SO and
# t.TEAM_PITCHING_H), leaving eight predictors.
model3 <- lm(
  formula = t.TARGET_WINS ~ t.TEAM_BATTING_H + t.TEAM_BATTING_3B +
    t.TEAM_BATTING_HR + t.TEAM_BATTING_BB + t.TEAM_BATTING_SO +
    t.TEAM_BASERUN_SB + t.TEAM_FIELDING_E + t.TEAM_FIELDING_DP,
  data = mtd_final
)
summary(model3)
##
## Call:
## lm(formula = t.TARGET_WINS ~ t.TEAM_BATTING_H + t.TEAM_BATTING_3B +
## t.TEAM_BATTING_HR + t.TEAM_BATTING_BB + t.TEAM_BATTING_SO +
## t.TEAM_BASERUN_SB + t.TEAM_FIELDING_E + t.TEAM_FIELDING_DP,
## data = mtd_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.6903 -0.5124 0.0009 0.5278 4.0952
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.432e-12 1.720e-02 0.000 1
## t.TEAM_BATTING_H 2.869e-01 2.454e-02 11.692 < 2e-16 ***
## t.TEAM_BATTING_3B 1.858e-01 2.968e-02 6.259 4.61e-10 ***
## t.TEAM_BATTING_HR 1.917e-01 3.757e-02 5.103 3.62e-07 ***
## t.TEAM_BATTING_BB 1.625e-01 2.106e-02 7.719 1.74e-14 ***
## t.TEAM_BATTING_SO -2.454e-01 3.508e-02 -6.996 3.45e-12 ***
## t.TEAM_BASERUN_SB 2.001e-01 2.489e-02 8.041 1.42e-15 ***
## t.TEAM_FIELDING_E -4.901e-01 3.647e-02 -13.439 < 2e-16 ***
## t.TEAM_FIELDING_DP -2.275e-01 2.286e-02 -9.953 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8207 on 2267 degrees of freedom
## Multiple R-squared: 0.3288, Adjusted R-squared: 0.3265
## F-statistic: 138.8 on 8 and 2267 DF, p-value: < 2.2e-16
#SELECT MODELS AND PREDICTION:
# Print the model 1 summary (all predictors) again for the model comparison.
summary(model1)
##
## Call:
## lm(formula = t.TARGET_WINS ~ ., data = mtd_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.5760 -0.5031 -0.0020 0.5213 3.5486
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.189e-11 1.711e-02 0.000 1.000000
## t.TEAM_BATTING_H 3.977e-01 3.666e-02 10.851 < 2e-16 ***
## t.TEAM_BATTING_2B -4.161e-02 2.763e-02 -1.506 0.132248
## t.TEAM_BATTING_3B 1.842e-01 3.010e-02 6.121 1.09e-09 ***
## t.TEAM_BATTING_HR 2.141e-01 3.839e-02 5.578 2.72e-08 ***
## t.TEAM_BATTING_BB 1.568e-01 3.487e-02 4.497 7.23e-06 ***
## t.TEAM_BATTING_SO -3.254e-01 4.033e-02 -8.067 1.16e-15 ***
## t.TEAM_BASERUN_SB 2.284e-01 3.196e-02 7.146 1.20e-12 ***
## t.TEAM_BASERUN_CS -1.912e-02 3.420e-02 -0.559 0.576128
## t.TEAM_PITCHING_H -1.405e-01 3.808e-02 -3.691 0.000229 ***
## t.TEAM_PITCHING_BB -2.218e-02 3.307e-02 -0.671 0.502582
## t.TEAM_PITCHING_SO 1.114e-01 2.925e-02 3.807 0.000144 ***
## t.TEAM_FIELDING_E -4.698e-01 3.859e-02 -12.174 < 2e-16 ***
## t.TEAM_FIELDING_DP -2.248e-01 2.328e-02 -9.656 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8162 on 2262 degrees of freedom
## Multiple R-squared: 0.3376, Adjusted R-squared: 0.3338
## F-statistic: 88.69 on 13 and 2262 DF, p-value: < 2.2e-16
# Print the model 2 summary (significant predictors only) again for the
# model comparison.
summary(model2)
##
## Call:
## lm(formula = t.TARGET_WINS ~ t.TEAM_BATTING_H + t.TEAM_BATTING_3B +
## t.TEAM_BATTING_HR + t.TEAM_BATTING_BB + t.TEAM_BATTING_SO +
## t.TEAM_BASERUN_SB + t.TEAM_PITCHING_SO + t.TEAM_PITCHING_H +
## t.TEAM_PITCHING_SO + t.TEAM_FIELDING_E + t.TEAM_FIELDING_DP,
## data = mtd_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.7306 -0.5073 -0.0023 0.5243 3.5123
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.282e-11 1.711e-02 0.000 1
## t.TEAM_BATTING_H 3.723e-01 3.060e-02 12.166 < 2e-16 ***
## t.TEAM_BATTING_3B 1.829e-01 2.963e-02 6.175 7.83e-10 ***
## t.TEAM_BATTING_HR 2.221e-01 3.792e-02 5.856 5.42e-09 ***
## t.TEAM_BATTING_BB 1.351e-01 2.243e-02 6.025 1.97e-09 ***
## t.TEAM_BATTING_SO -3.318e-01 3.877e-02 -8.558 < 2e-16 ***
## t.TEAM_BASERUN_SB 2.200e-01 2.571e-02 8.557 < 2e-16 ***
## t.TEAM_PITCHING_SO 9.408e-02 2.180e-02 4.316 1.66e-05 ***
## t.TEAM_PITCHING_H -1.556e-01 3.491e-02 -4.457 8.71e-06 ***
## t.TEAM_FIELDING_E -4.619e-01 3.764e-02 -12.269 < 2e-16 ***
## t.TEAM_FIELDING_DP -2.268e-01 2.287e-02 -9.916 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8162 on 2265 degrees of freedom
## Multiple R-squared: 0.3368, Adjusted R-squared: 0.3339
## F-statistic: 115 on 10 and 2265 DF, p-value: < 2.2e-16
# Print the model 3 summary (eight predictors) again for the model comparison.
summary(model3)
##
## Call:
## lm(formula = t.TARGET_WINS ~ t.TEAM_BATTING_H + t.TEAM_BATTING_3B +
## t.TEAM_BATTING_HR + t.TEAM_BATTING_BB + t.TEAM_BATTING_SO +
## t.TEAM_BASERUN_SB + t.TEAM_FIELDING_E + t.TEAM_FIELDING_DP,
## data = mtd_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.6903 -0.5124 0.0009 0.5278 4.0952
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.432e-12 1.720e-02 0.000 1
## t.TEAM_BATTING_H 2.869e-01 2.454e-02 11.692 < 2e-16 ***
## t.TEAM_BATTING_3B 1.858e-01 2.968e-02 6.259 4.61e-10 ***
## t.TEAM_BATTING_HR 1.917e-01 3.757e-02 5.103 3.62e-07 ***
## t.TEAM_BATTING_BB 1.625e-01 2.106e-02 7.719 1.74e-14 ***
## t.TEAM_BATTING_SO -2.454e-01 3.508e-02 -6.996 3.45e-12 ***
## t.TEAM_BASERUN_SB 2.001e-01 2.489e-02 8.041 1.42e-15 ***
## t.TEAM_FIELDING_E -4.901e-01 3.647e-02 -13.439 < 2e-16 ***
## t.TEAM_FIELDING_DP -2.275e-01 2.286e-02 -9.953 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8207 on 2267 degrees of freedom
## Multiple R-squared: 0.3288, Adjusted R-squared: 0.3265
## F-statistic: 138.8 on 8 and 2267 DF, p-value: < 2.2e-16
Of the three models, I decided to use model3 for the predictions because it is the most parsimonious. There is no significant difference in R2, adjusted R2 and RMSE even after treating the multicollinearity.
#PREDICTION:
The same preprocessing steps are applied to the evaluation dataset.
# Read the evaluation data set from its raw GitHub URL into a data frame.
eval_url <- "https://raw.githubusercontent.com/Riteshlohiya/Data621-Assignment-1/master/moneyball-evaluation-data.csv"
med <- read.csv(file = eval_url)
Removing the variables that are not needed (the first column, TEAM_PITCHING_H and TEAM_PITCHING_BB):
# Drop the first column of the evaluation data, then list what remains.
med_f <- med[-1]
names(med_f)
## [1] "TEAM_BATTING_H" "TEAM_BATTING_2B" "TEAM_BATTING_3B"
## [4] "TEAM_BATTING_HR" "TEAM_BATTING_BB" "TEAM_BATTING_SO"
## [7] "TEAM_BASERUN_SB" "TEAM_BASERUN_CS" "TEAM_BATTING_HBP"
## [10] "TEAM_PITCHING_H" "TEAM_PITCHING_HR" "TEAM_PITCHING_BB"
## [13] "TEAM_PITCHING_SO" "TEAM_FIELDING_E" "TEAM_FIELDING_DP"
# Drop TEAM_PITCHING_H. Selecting by name instead of by position (it is
# column 10 here) keeps this step correct if the column order ever changes.
med_f <- med_f[names(med_f) != "TEAM_PITCHING_H"]
names(med_f)
## [1] "TEAM_BATTING_H" "TEAM_BATTING_2B" "TEAM_BATTING_3B"
## [4] "TEAM_BATTING_HR" "TEAM_BATTING_BB" "TEAM_BATTING_SO"
## [7] "TEAM_BASERUN_SB" "TEAM_BASERUN_CS" "TEAM_BATTING_HBP"
## [10] "TEAM_PITCHING_HR" "TEAM_PITCHING_BB" "TEAM_PITCHING_SO"
## [13] "TEAM_FIELDING_E" "TEAM_FIELDING_DP"
# Drop TEAM_PITCHING_BB. Selecting by name instead of by position keeps this
# step correct regardless of which columns were removed before it.
med_f <- med_f[names(med_f) != "TEAM_PITCHING_BB"]
names(med_f)
## [1] "TEAM_BATTING_H" "TEAM_BATTING_2B" "TEAM_BATTING_3B"
## [4] "TEAM_BATTING_HR" "TEAM_BATTING_BB" "TEAM_BATTING_SO"
## [7] "TEAM_BASERUN_SB" "TEAM_BASERUN_CS" "TEAM_BATTING_HBP"
## [10] "TEAM_PITCHING_HR" "TEAM_PITCHING_SO" "TEAM_FIELDING_E"
## [13] "TEAM_FIELDING_DP"
Imputing the NAs using mice (method pmm, predictive mean matching):
# Impute missing values with predictive mean matching: 5 imputed data sets,
# 5 iterations each. pmm draws are random, so a fixed seed is supplied to make
# the imputation (and everything computed from it) reproducible between runs.
imputed_med_Data <- mice(med_f, m = 5, maxit = 5, method = "pmm", seed = 500)
##
## iter imp variable
## 1 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_SO TEAM_FIELDING_DP
## 1 2 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_SO TEAM_FIELDING_DP
## 1 3 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_SO TEAM_FIELDING_DP
## 1 4 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_SO TEAM_FIELDING_DP
## 1 5 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_SO TEAM_FIELDING_DP
## 2 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_SO TEAM_FIELDING_DP
## 2 2 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_SO TEAM_FIELDING_DP
## 2 3 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_SO TEAM_FIELDING_DP
## 2 4 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_SO TEAM_FIELDING_DP
## 2 5 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_SO TEAM_FIELDING_DP
## 3 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_SO TEAM_FIELDING_DP
## 3 2 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_SO TEAM_FIELDING_DP
## 3 3 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_SO TEAM_FIELDING_DP
## 3 4 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_SO TEAM_FIELDING_DP
## 3 5 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_SO TEAM_FIELDING_DP
## 4 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_SO TEAM_FIELDING_DP
## 4 2 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_SO TEAM_FIELDING_DP
## 4 3 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_SO TEAM_FIELDING_DP
## 4 4 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_SO TEAM_FIELDING_DP
## 4 5 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_SO TEAM_FIELDING_DP
## 5 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_SO TEAM_FIELDING_DP
## 5 2 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_SO TEAM_FIELDING_DP
## 5 3 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_SO TEAM_FIELDING_DP
## 5 4 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_SO TEAM_FIELDING_DP
## 5 5 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_SO TEAM_FIELDING_DP
# Replace the mice result with a completed (NA-free) data frame and summarise
# it. complete() is called with its defaults. The function is namespace-qualified
# because both tidyr and mice export a `complete`; an unqualified call only
# reaches the mice version when mice happens to be attached after tidyr.
imputed_med_Data <- mice::complete(imputed_med_Data)
summary(imputed_med_Data)
## TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B TEAM_BATTING_HR
## Min. : 819 Min. : 44.0 Min. : 14.00 Min. : 0.00
## 1st Qu.:1387 1st Qu.:210.0 1st Qu.: 35.00 1st Qu.: 44.50
## Median :1455 Median :239.0 Median : 52.00 Median :101.00
## Mean :1469 Mean :241.3 Mean : 55.91 Mean : 95.63
## 3rd Qu.:1548 3rd Qu.:278.5 3rd Qu.: 72.00 3rd Qu.:135.50
## Max. :2170 Max. :376.0 Max. :155.00 Max. :242.00
## TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS
## Min. : 15.0 Min. : 0.0 Min. : 0.0 Min. : 0.00
## 1st Qu.:436.5 1st Qu.: 527.0 1st Qu.: 59.0 1st Qu.: 41.00
## Median :509.0 Median : 677.0 Median : 95.0 Median : 56.00
## Mean :499.0 Mean : 699.3 Mean :126.5 Mean : 64.27
## 3rd Qu.:565.5 3rd Qu.: 904.5 3rd Qu.:156.0 3rd Qu.: 75.00
## Max. :792.0 Max. :1268.0 Max. :580.0 Max. :154.00
## TEAM_BATTING_HBP TEAM_PITCHING_HR TEAM_PITCHING_SO TEAM_FIELDING_E
## Min. :42.00 Min. : 0.0 Min. : 0.0 Min. : 73.0
## 1st Qu.:46.00 1st Qu.: 52.0 1st Qu.: 621.0 1st Qu.: 131.0
## Median :52.00 Median :104.0 Median : 777.0 Median : 163.0
## Mean :56.02 Mean :102.1 Mean : 806.5 Mean : 249.7
## 3rd Qu.:66.00 3rd Qu.:142.5 3rd Qu.: 953.0 3rd Qu.: 252.0
## Max. :96.00 Max. :336.0 Max. :9963.0 Max. :1568.0
## TEAM_FIELDING_DP
## Min. : 69.0
## 1st Qu.:121.0
## Median :146.0
## Mean :140.4
## 3rd Qu.:160.5
## Max. :204.0
A Box-Cox transformation, centering and scaling were used to transform the individual predictors in the dataset using the caret library.
# Fit a Box-Cox + center + scale transformation on the imputed evaluation
# data and apply it to that same data.
# NOTE(review): the transformation parameters are estimated from the
# evaluation set itself here; if model3 was trained on data transformed with
# parameters estimated from the training set, the two scales differ and the
# training-set transformation should be reused instead -- confirm.
t <- preProcess(imputed_med_Data, method = c("BoxCox", "center", "scale"))
# Wrapping the result as `t = ...` prefixes every column name with "t.",
# which is the naming used by the predictors in model3's formula.
med_final <- data.frame(t = predict(t, imputed_med_Data))
summary(med_final)
## t.TEAM_BATTING_H t.TEAM_BATTING_2B t.TEAM_BATTING_3B
## Min. :-5.07603 Min. :-3.26217 Min. :-2.64215
## 1st Qu.:-0.52836 1st Qu.:-0.67016 1st Qu.:-0.73771
## Median :-0.06571 Median :-0.09057 Median : 0.08513
## Mean : 0.00000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 0.54648 3rd Qu.: 0.74495 3rd Qu.: 0.76149
## Max. : 4.16429 Max. : 3.00896 Max. : 2.35514
## t.TEAM_BATTING_HR t.TEAM_BATTING_BB t.TEAM_BATTING_SO
## Min. :-1.69766 Min. :-2.859388 Min. :-2.93148
## 1st Qu.:-0.90771 1st Qu.:-0.619108 1st Qu.:-0.72213
## Median : 0.09527 Median : 0.008141 Median :-0.09328
## Mean : 0.00000 Mean : 0.000000 Mean : 0.00000
## 3rd Qu.: 0.70771 3rd Qu.: 0.536019 3rd Qu.: 0.86047
## Max. : 2.59828 Max. : 2.968415 Max. : 2.38438
## t.TEAM_BASERUN_SB t.TEAM_BASERUN_CS t.TEAM_BATTING_HBP t.TEAM_PITCHING_HR
## Min. :-1.3407 Min. :-1.8973 Min. :-1.3581 Min. :-1.77169
## 1st Qu.:-0.7154 1st Qu.:-0.6870 1st Qu.:-0.8018 1st Qu.:-0.86977
## Median :-0.3339 Median :-0.2441 Median :-0.1349 Median : 0.03214
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.3126 3rd Qu.: 0.3167 3rd Qu.: 0.9305 3rd Qu.: 0.69991
## Max. : 4.8062 Max. : 2.6489 Max. : 2.1308 Max. : 4.05609
## t.TEAM_PITCHING_SO t.TEAM_FIELDING_E t.TEAM_FIELDING_DP
## Min. :-1.31288 Min. :-3.1354 Min. :-2.1966
## 1st Qu.:-0.30198 1st Qu.:-0.7317 1st Qu.:-0.7065
## Median :-0.04804 Median :-0.1378 Median : 0.1451
## Mean : 0.00000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.23846 3rd Qu.: 0.7208 3rd Qu.: 0.6745
## Max. :14.90538 Max. : 2.0409 Max. : 2.4054
# Predict the response for every evaluation row with model3, returning the
# point estimate (fit) together with prediction-interval bounds (lwr, upr).
# model3's response is t.TARGET_WINS, so these values are on that
# transformed scale.
eval_data <- predict(
  object = model3,
  newdata = med_final,
  interval = "prediction"
)
print(eval_data)
## fit lwr upr
## 1 -1.188703912 -2.801631098 0.42422327
## 2 -0.975852411 -2.587405318 0.63570050
## 3 -0.585833502 -2.196742233 1.02507523
## 4 0.282214067 -1.329046252 1.89347439
## 5 -1.237586845 -2.851812788 0.37663910
## 6 -1.018227203 -2.632402562 0.59594816
## 7 0.159187661 -1.455138991 1.77351431
## 8 -0.671020146 -2.282636111 0.94059582
## 9 -0.674110632 -2.286045016 0.93782375
## 10 -0.539014186 -2.149696146 1.07166777
## 11 -0.908734347 -2.521206765 0.70373807
## 12 -0.008168950 -1.621093522 1.60475562
## 13 0.104906055 -1.509380166 1.71919228
## 14 0.029780950 -1.583319259 1.64288116
## 15 0.368902624 -1.245713510 1.98351876
## 16 -0.479207877 -2.091109885 1.13269413
## 17 -0.704877184 -2.316855438 0.90710107
## 18 -0.122074980 -1.733100331 1.48895037
## 19 -0.745128405 -2.357311318 0.86705451
## 20 0.371914983 -1.240272455 1.98410242
## 21 0.309404144 -1.302542559 1.92135085
## 22 0.184283641 -1.427554877 1.79612216
## 23 0.089960494 -1.521718674 1.70163966
## 24 -0.696034436 -2.307681203 0.91561233
## 25 0.152415144 -1.459800645 1.76463093
## 26 0.471168682 -1.142109313 2.08444668
## 27 -0.782595772 -2.406150933 0.84095939
## 28 -0.504244973 -2.115090957 1.10660101
## 29 0.300364895 -1.312401842 1.91313163
## 30 -0.521175943 -2.134012595 1.09166071
## 31 0.685047403 -0.927620223 2.29771503
## 32 0.343697759 -1.267330046 1.95472556
## 33 0.311271780 -1.300909395 1.92345295
## 34 0.178522852 -1.435962660 1.79300837
## 35 0.052959132 -1.558334131 1.66425240
## 36 0.130007751 -1.484071831 1.74408733
## 37 -0.246329940 -1.856506149 1.36384627
## 38 0.477451381 -1.136248844 2.09115161
## 39 0.061566387 -1.550002204 1.67313498
## 40 0.495184111 -1.117337090 2.10770531
## 41 0.170942521 -1.441633769 1.78351881
## 42 1.327030319 -0.287826145 2.94188678
## 43 -1.436912856 -3.063628955 0.18980324
## 44 1.657339515 0.033738046 3.28094098
## 45 0.604521774 -1.010360386 2.21940393
## 46 0.930697121 -0.682412772 2.54380701
## 47 1.060283382 -0.552726934 2.67329370
## 48 -0.423778854 -2.034879371 1.18732166
## 49 -0.805048810 -2.416346923 0.80624930
## 50 -0.111199123 -1.721984002 1.49958576
## 51 -0.327496116 -1.938572804 1.28358057
## 52 0.209763524 -1.401988813 1.82151586
## 53 -0.426070507 -2.038206655 1.18606564
## 54 -0.247908468 -1.860061345 1.36424441
## 55 -0.611197155 -2.222117708 0.99972340
## 56 0.093144994 -1.518654246 1.70494423
## 57 0.617773553 -0.994801981 2.23034909
## 58 -0.399511598 -2.011106929 1.21208373
## 59 -1.067474921 -2.680453621 0.54550378
## 60 -0.271351571 -1.882430395 1.33972725
## 61 0.411150533 -1.200001886 2.02230295
## 62 0.008971026 -1.607084423 1.62502647
## 63 0.411557802 -1.199516053 2.02263166
## 64 0.308548785 -1.305488232 1.92258580
## 65 0.377231494 -1.236981598 1.99144459
## 66 1.291564181 -0.324872671 2.90800103
## 67 -0.607540694 -2.219195749 1.00411436
## 68 -0.339456176 -1.951510021 1.27259767
## 69 -0.167039064 -1.779040882 1.44496275
## 70 0.369686503 -1.243483304 1.98285631
## 71 0.274779556 -1.338855523 1.88841464
## 72 -0.384771750 -1.999982594 1.23043909
## 73 -0.207396877 -1.820855855 1.40606210
## 74 0.543432183 -1.071547012 2.15841138
## 75 -0.243530667 -1.856619020 1.36955769
## 76 -0.237590376 -1.850688313 1.37550756
## 77 0.391944421 -1.219345745 2.00323459
## 78 0.071845465 -1.539334619 1.68302555
## 79 -0.707455699 -2.319003057 0.90409166
## 80 -0.390873679 -2.001894870 1.22014751
## 81 0.191662483 -1.419630971 1.80295594
## 82 0.331875727 -1.279386092 1.94313755
## 83 0.798494850 -0.813761737 2.41075144
## 84 -0.487835474 -2.100862202 1.12519126
## 85 0.236059114 -1.375813599 1.84793183
## 86 -0.190661342 -1.803785449 1.42246277
## 87 0.172866215 -1.439663619 1.78539605
## 88 0.315253610 -1.295081481 1.92558870
## 89 0.771876896 -0.841297820 2.38505161
## 90 0.738058813 -0.873620461 2.34973809
## 91 0.127221950 -1.484646099 1.73909000
## 92 1.234894749 -0.381657668 2.85144717
## 93 -0.500930117 -2.112020003 1.11015977
## 94 0.067349892 -1.544344140 1.67904392
## 95 0.128406958 -1.483255817 1.74006973
## 96 0.177921646 -1.433556245 1.78939954
## 97 0.551033507 -1.063476455 2.16554347
## 98 1.129842456 -0.484313978 2.74399889
## 99 0.409059169 -1.203653915 2.02177225
## 100 0.334082664 -1.279320296 1.94748562
## 101 -0.105622652 -1.717374185 1.50612888
## 102 -0.473931456 -2.085319528 1.13745662
## 103 0.264393691 -1.346043946 1.87483133
## 104 0.269591258 -1.342156128 1.88133864
## 105 -0.487187224 -2.101228479 1.12685403
## 106 -0.844473452 -2.457586146 0.76863924
## 107 -1.529110955 -3.145785251 0.08756334
## 108 -0.066268807 -1.678720758 1.54618314
## 109 0.755330938 -0.856375893 2.36703777
## 110 -1.179629839 -2.794534176 0.43527450
## 111 0.336937738 -1.273928047 1.94780352
## 112 0.392931981 -1.218538450 2.00440241
## 113 0.744048052 -0.866884817 2.35498092
## 114 0.719973528 -0.891541003 2.33148806
## 115 0.060156800 -1.551268931 1.67158253
## 116 0.036204928 -1.574962926 1.64737278
## 117 0.253173653 -1.359456518 1.86580382
## 118 0.091442900 -1.518999191 1.70188499
## 119 -0.418907136 -2.030599210 1.19278494
## 120 -0.119719802 -1.732957060 1.49351746
## 121 0.599920829 -1.012434195 2.21227585
## 122 -0.928559816 -2.540774595 0.68365496
## 123 -0.664519205 -2.276994938 0.94795653
## 124 -0.975013516 -2.590351807 0.64032477
## 125 -0.830676284 -2.442893865 0.78154130
## 126 0.212659690 -1.399078127 1.82439751
## 127 0.400676007 -1.211411108 2.01276312
## 128 -0.340533499 -1.951532930 1.27046593
## 129 0.652151992 -0.959436715 2.26374070
## 130 0.438046767 -1.173961470 2.05005500
## 131 0.202610691 -1.408627104 1.81384849
## 132 0.117699268 -1.494461220 1.72985976
## 133 -0.666937111 -2.283235679 0.94936146
## 134 -0.063562489 -1.675769877 1.54864490
## 135 1.237544478 -0.379729562 2.85481852
## 136 -0.424598881 -2.037540603 1.18834284
## 137 -0.263794569 -1.875293359 1.34770422
## 138 -0.208344168 -1.819028616 1.40234028
## 139 1.062013491 -0.557749619 2.68177660
## 140 -0.077031170 -1.688201456 1.53413911
## 141 -1.202017518 -2.815325291 0.41129025
## 142 -0.543669952 -2.155525586 1.06818568
## 143 0.574721140 -1.037450267 2.18689255
## 144 -0.580114855 -2.191990481 1.03176077
## 145 -0.201476211 -1.813540101 1.41058768
## 146 -0.396270486 -2.006953811 1.21441284
## 147 -0.424079955 -2.035563519 1.18740361
## 148 0.012235815 -1.598735746 1.62320738
## 149 -0.134094498 -1.746418526 1.47822953
## 150 0.358206900 -1.252692253 1.96910605
## 151 0.121373157 -1.490687994 1.73343431
## 152 0.458047931 -1.156271021 2.07236688
## 153 -1.091101387 -2.715111868 0.53290909
## 154 -1.030449584 -2.642707851 0.58180868
## 155 -0.019741481 -1.631551499 1.59206854
## 156 -0.988478370 -2.601162038 0.62420530
## 157 0.833438637 -0.779314968 2.44619224
## 158 -0.737249321 -2.349351697 0.87485306
## 159 0.544022039 -1.067788981 2.15583306
## 160 -0.464209492 -2.075181647 1.14676266
## 161 1.192763045 -0.422709568 2.80823566
## 162 1.608078209 -0.007961502 3.22411792
## 163 0.974806731 -0.637808023 2.58742148
## 164 1.350078792 -0.265986965 2.96614455
## 165 1.077591667 -0.538343607 2.69352694
## 166 0.933343064 -0.680910433 2.54759656
## 167 0.144754363 -1.467547100 1.75705583
## 168 0.168398304 -1.444503557 1.78130016
## 169 -0.712666089 -2.325087477 0.89975530
## 170 -0.022671288 -1.634706482 1.58936391
## 171 0.570223658 -1.041554070 2.18200139
## 172 0.449488461 -1.161771373 2.06074829
## 173 0.120243507 -1.490742331 1.73122935
## 174 0.786653766 -0.825308821 2.39861635
## 175 0.025263833 -1.585580290 1.63610796
## 176 -0.167176497 -1.779147178 1.44479418
## 177 0.086591307 -1.526353437 1.69953605
## 178 -0.865603070 -2.478121732 0.74691559
## 179 -0.309436974 -1.919781098 1.30090715
## 180 -0.190839227 -1.801770266 1.42009181
## 181 0.399170213 -1.216207970 2.01454840
## 182 0.318134717 -1.294556657 1.93082609
## 183 0.440513332 -1.171394752 2.05242142
## 184 0.552131984 -1.059480847 2.16374482
## 185 1.103391260 -0.512298711 2.71908123
## 186 0.919819856 -0.700394214 2.54003393
## 187 0.530320633 -1.084560485 2.14520175
## 188 -0.469796995 -2.083396926 1.14380294
## 189 -1.002246473 -2.615017425 0.61052448
## 190 1.783289245 0.165378614 3.40119988
## 191 -0.603967482 -2.215200207 1.00726524
## 192 -0.126928397 -1.737999734 1.48414294
## 193 -0.551775195 -2.162777298 1.05922691
## 194 -0.444329737 -2.055526802 1.16686733
## 195 -0.384215468 -1.996769074 1.22833814
## 196 -1.074285278 -2.687088523 0.53851797
## 197 -0.409355606 -2.020136132 1.20142492
## 198 0.785419352 -0.828584646 2.39942335
## 199 0.069152915 -1.542005578 1.68031141
## 200 0.310159887 -1.301198400 1.92151817
## 201 -0.547262893 -2.160594645 1.06606886
## 202 0.162547089 -1.449273480 1.77436766
## 203 -0.063598367 -1.677399033 1.55020230
## 204 0.631162164 -0.980125780 2.24245011
## 205 0.072799475 -1.538671300 1.68427025
## 206 0.212851230 -1.398388817 1.82409128
## 207 0.108981150 -1.503185549 1.72114785
## 208 0.175126729 -1.436887358 1.78714082
## 209 -0.091293215 -1.702897446 1.52031102
## 210 -0.178430965 -1.790354750 1.43349282
## 211 1.592836102 -0.021588851 3.20726105
## 212 0.502322744 -1.109370883 2.11401637
## 213 0.016167798 -1.595723216 1.62805881
## 214 -1.194431949 -2.806461259 0.41759736
## 215 -0.778398648 -2.391184974 0.83438768
## 216 0.160704441 -1.450461237 1.77187012
## 217 -0.208031505 -1.822180420 1.40611741
## 218 0.643374006 -0.968575555 2.25532357
## 219 -0.170686526 -1.781556247 1.44018320
## 220 0.091191750 -1.519729181 1.70211268
## 221 -0.356844303 -1.968468330 1.25477972
## 222 -0.588014687 -2.200756089 1.02472671
## 223 -0.082170362 -1.693459299 1.52911858
## 224 -0.304005419 -1.918049883 1.31003905
## 225 0.456648669 -1.167714862 2.08101220
## 226 -0.203040390 -1.813841849 1.40776107
## 227 -0.128023986 -1.739091865 1.48304389
## 228 -0.197459618 -1.809863054 1.41494382
## 229 0.429681174 -1.181445213 2.04080756
## 230 -0.275925665 -1.888942923 1.33709159
## 231 -0.034969388 -1.647818378 1.57787960
## 232 0.569322662 -1.042239613 2.18088494
## 233 0.006375436 -1.606298315 1.61904919
## 234 0.271694288 -1.340869527 1.88425810
## 235 -0.206591382 -1.817363591 1.40418083
## 236 -0.346525083 -1.957093605 1.26404344
## 237 -0.303938405 -1.917120874 1.30924406
## 238 0.102396872 -1.510068137 1.71486188
## 239 0.749829744 -0.862960205 2.36261969
## 240 -0.688796106 -2.300046550 0.92245434
## 241 0.326092224 -1.284814139 1.93699859
## 242 0.745860360 -0.866678820 2.35839954
## 243 0.278486626 -1.333026401 1.88999965
## 244 0.167037606 -1.444900660 1.77897587
## 245 -1.527306256 -3.142697271 0.08808476
## 246 0.114727967 -1.497666595 1.72712253
## 247 -0.163295057 -1.774180029 1.44758991
## 248 0.196121627 -1.415252092 1.80749535
## 249 -0.340190856 -1.951442301 1.27106059
## 250 0.377720712 -1.236738070 1.99217949
## 251 0.188325815 -1.423531914 1.80018354
## 252 -0.751336980 -2.364881246 0.86220729
## 253 0.854123364 -0.759483249 2.46772998
## 254 -3.088483902 -4.717317539 -1.45965026
## 255 -0.796919418 -2.408282250 0.81444341
## 256 -0.364218157 -1.977809973 1.24937366
## 257 0.201884030 -1.410076170 1.81384423
## 258 0.038612801 -1.572571842 1.64979744
## 259 -0.303912358 -1.915901856 1.30807714
# Column-wise summary of the predictions: point estimate (fit) and the
# lower/upper prediction-interval bounds (lwr, upr).
summary(eval_data)
## fit lwr upr
## Min. :-3.08848 Min. :-4.7173 Min. :-1.460
## 1st Qu.:-0.40443 1st Qu.:-2.0156 1st Qu.: 1.207
## Median : 0.06157 Median :-1.5500 Median : 1.673
## Mean : 0.00000 Mean :-1.6128 Mean : 1.613
## 3rd Qu.: 0.37457 3rd Qu.:-1.2386 3rd Qu.: 1.988
## Max. : 1.78329 Max. : 0.1654 Max. : 3.401