# loading libraries
library(tidyr)
library(tidyverse)
library(knitr)
library(kableExtra)
library(DT)
library(reshape2)
library(naniar) # for missing values
library(corrplot) # for plotting correlation matrix
library(imputeTS)
library(e1071) # To check skewness
library(caret)
library(jtools)
library(VIM) # Prints out the summary table of lm in a pretty way using summ() function
library(performance) # This package is to use check_model() for performance of model
library(see) # This packages is also used to support check_model()

Introduction

knitr::include_graphics('hw1.PNG')

train <- read_csv('https://raw.githubusercontent.com/habibkhan89/Data621/master/Homework%201/moneyball-training-data.csv', col_names = TRUE)[,-1] 
test <- read_csv('https://raw.githubusercontent.com/habibkhan89/Data621/master/Homework%201/moneyball-evaluation-data.csv',col_names=TRUE)[,-1] 

# Renaming the column names to make them less complicated
colnames(train) <- gsub("TEAM_", "", colnames(train))
colnames(test) <- gsub("TEAM_", "", colnames(test))

Data Exploration

print(train)
## # A tibble: 2,276 x 16
##    TARGET_WINS BATTING_H BATTING_2B BATTING_3B BATTING_HR BATTING_BB BATTING_SO
##          <dbl>     <dbl>      <dbl>      <dbl>      <dbl>      <dbl>      <dbl>
##  1          39      1445        194         39         13        143        842
##  2          70      1339        219         22        190        685       1075
##  3          86      1377        232         35        137        602        917
##  4          70      1387        209         38         96        451        922
##  5          82      1297        186         27        102        472        920
##  6          75      1279        200         36         92        443        973
##  7          80      1244        179         54        122        525       1062
##  8          85      1273        171         37        115        456       1027
##  9          86      1391        197         40        114        447        922
## 10          76      1271        213         18         96        441        827
## # ... with 2,266 more rows, and 9 more variables: BASERUN_SB <dbl>,
## #   BASERUN_CS <dbl>, BATTING_HBP <dbl>, PITCHING_H <dbl>, PITCHING_HR <dbl>,
## #   PITCHING_BB <dbl>, PITCHING_SO <dbl>, FIELDING_E <dbl>, FIELDING_DP <dbl>
summary(train)
##   TARGET_WINS       BATTING_H      BATTING_2B      BATTING_3B    
##  Min.   :  0.00   Min.   : 891   Min.   : 69.0   Min.   :  0.00  
##  1st Qu.: 71.00   1st Qu.:1383   1st Qu.:208.0   1st Qu.: 34.00  
##  Median : 82.00   Median :1454   Median :238.0   Median : 47.00  
##  Mean   : 80.79   Mean   :1469   Mean   :241.2   Mean   : 55.25  
##  3rd Qu.: 92.00   3rd Qu.:1537   3rd Qu.:273.0   3rd Qu.: 72.00  
##  Max.   :146.00   Max.   :2554   Max.   :458.0   Max.   :223.00  
##                                                                  
##    BATTING_HR       BATTING_BB      BATTING_SO       BASERUN_SB   
##  Min.   :  0.00   Min.   :  0.0   Min.   :   0.0   Min.   :  0.0  
##  1st Qu.: 42.00   1st Qu.:451.0   1st Qu.: 548.0   1st Qu.: 66.0  
##  Median :102.00   Median :512.0   Median : 750.0   Median :101.0  
##  Mean   : 99.61   Mean   :501.6   Mean   : 735.6   Mean   :124.8  
##  3rd Qu.:147.00   3rd Qu.:580.0   3rd Qu.: 930.0   3rd Qu.:156.0  
##  Max.   :264.00   Max.   :878.0   Max.   :1399.0   Max.   :697.0  
##                                   NA's   :102      NA's   :131    
##    BASERUN_CS     BATTING_HBP      PITCHING_H     PITCHING_HR   
##  Min.   :  0.0   Min.   :29.00   Min.   : 1137   Min.   :  0.0  
##  1st Qu.: 38.0   1st Qu.:50.50   1st Qu.: 1419   1st Qu.: 50.0  
##  Median : 49.0   Median :58.00   Median : 1518   Median :107.0  
##  Mean   : 52.8   Mean   :59.36   Mean   : 1779   Mean   :105.7  
##  3rd Qu.: 62.0   3rd Qu.:67.00   3rd Qu.: 1682   3rd Qu.:150.0  
##  Max.   :201.0   Max.   :95.00   Max.   :30132   Max.   :343.0  
##  NA's   :772     NA's   :2085                                   
##   PITCHING_BB      PITCHING_SO        FIELDING_E      FIELDING_DP   
##  Min.   :   0.0   Min.   :    0.0   Min.   :  65.0   Min.   : 52.0  
##  1st Qu.: 476.0   1st Qu.:  615.0   1st Qu.: 127.0   1st Qu.:131.0  
##  Median : 536.5   Median :  813.5   Median : 159.0   Median :149.0  
##  Mean   : 553.0   Mean   :  817.7   Mean   : 246.5   Mean   :146.4  
##  3rd Qu.: 611.0   3rd Qu.:  968.0   3rd Qu.: 249.2   3rd Qu.:164.0  
##  Max.   :3645.0   Max.   :19278.0   Max.   :1898.0   Max.   :228.0  
##                   NA's   :102                        NA's   :286
# Boxplot
ggplot(data = melt(train), aes(x=variable, y=value)) + geom_boxplot(aes(color=variable)) + coord_flip() + labs(title="BoxPlot for all variables")

# Skewness and outliers
ggplot(melt(train), aes(x=value))+geom_density()+facet_wrap(~variable, scales='free')

# CHecking for missing values
sapply(train, function(x) sum(is.na(x)))
## TARGET_WINS   BATTING_H  BATTING_2B  BATTING_3B  BATTING_HR  BATTING_BB 
##           0           0           0           0           0           0 
##  BATTING_SO  BASERUN_SB  BASERUN_CS BATTING_HBP  PITCHING_H PITCHING_HR 
##         102         131         772        2085           0           0 
## PITCHING_BB PITCHING_SO  FIELDING_E FIELDING_DP 
##           0         102           0         286
vis_miss(train) # From naniar library

gg_miss_var(train)+ labs(title="Visual Summaries of Missing Data")

# Creating Correlation Plot among the variables
train %>% cor(., use="pairwise.complete.obs", method = "pearson") %>% corrplot(.,method = "color", type= "upper", tl.col="black", diag=TRUE , number.cex = 0.5, addCoef.col = 'black', tl.srt=90)

train %>% gather(variable, value, -TARGET_WINS) %>% ggplot(., aes(value, TARGET_WINS))+geom_point()+geom_smooth(method="lm")+
  facet_wrap(~variable, scales="free")+ labs(title="Relationship between Predictors and TARGET_WINS")

Data Preparation

BATTIMT_HBP has 92 percent missing values out of it’s overall data that’s why it is essential to remove it out of model.

Model 1 - Basic Model

This model is created through removing basic extreme outliers from PITCHING_H and PITCHING_SO. Also, BATTING_HBP had 92% missing values that’s why it was removed from the basic model.

In this model, we are simply going to replace the missing values for each column with their median.

# Replacing extreme values with median for training & evaluation datasets and removing BATTING_HBP which has 92% missing values
train2 <- train %>% mutate(PITCHING_H = if_else(PITCHING_H > 5000, median(PITCHING_H), PITCHING_H),
                            PITCHING_SO = if_else(PITCHING_SO > 1500, median(PITCHING_SO), PITCHING_SO)) %>% 
  select(-BATTING_HBP)

test2 <- test %>% mutate(PITCHING_H = if_else(PITCHING_H > 5000, median(PITCHING_H), PITCHING_H),
                            PITCHING_SO = if_else(PITCHING_SO > 1500, median(PITCHING_SO), PITCHING_SO)) %>% 
  select(-BATTING_HBP)

All the missing values were replaced by the median of it’s column. Most of the variables don’t have extreme outliers but still it has some sort of skewness but I won’t remove them because it might change the model’s fitness. We will fix the model in next steps and check which model performs better. Variables like PITCHING_H and PITCHING_BB are skewed significantly which we have to transform in next models.

# Replacing missing values with median
train2[] <- lapply(train2, function(x) ifelse(is.na(x), median(x, na.rm=TRUE), x))
test2[] <- lapply(test2, function(x) ifelse(is.na(x), median(x, na.rm=TRUE), x))


# Verifying for missing values
sapply(train2, function(x) sum(is.na(x)))
## TARGET_WINS   BATTING_H  BATTING_2B  BATTING_3B  BATTING_HR  BATTING_BB 
##           0           0           0           0           0           0 
##  BATTING_SO  BASERUN_SB  BASERUN_CS  PITCHING_H PITCHING_HR PITCHING_BB 
##           0           0           0           0           0           0 
## PITCHING_SO  FIELDING_E FIELDING_DP 
##           0           0           0
vis_miss(train2) # From naniar library

# Checking skewness
sapply(train2, function(x) skewness(x))
## TARGET_WINS   BATTING_H  BATTING_2B  BATTING_3B  BATTING_HR  BATTING_BB 
##  -0.3987232   1.5713335   0.2151018   1.1094652   0.1860421  -1.0257599 
##  BATTING_SO  BASERUN_SB  BASERUN_CS  PITCHING_H PITCHING_HR PITCHING_BB 
##  -0.3126012   2.0658282   2.6021722   4.1130912   0.2877877   6.7438995 
## PITCHING_SO  FIELDING_E FIELDING_DP 
##  -0.1600733   2.9904656  -0.4551750

Model 2 - Log Transformation

In this model, we are going to use log transformation method which makes skewed distributed into normally distributed shape. Skewness got comparatively much better after applying log transformation on highly skewed variables. We have to apply log transformation in both training and test datasets to make sure consistency in both datasets. We used log10 from base model and added 1 on each skewed variable because there was lots of 0s in the data and log transformation cannot be applied on dataset if it has 0s. For that reason, it is recommended to add 1 to avoid that issue. Following 5 variables were highly skewed that’s why we applied log transformation on these. Skewness is comparatively much better now as it was in previous model.

train_log <- train2 # Model 2
test_log <- test2

# Applying log transformation for highly skewed variables
#training
train_log$PITCHING_BB <- log10(train_log$PITCHING_BB + 1)
train_log$PITCHING_H <- log10(train_log$PITCHING_H + 1)
train_log$FIELDING_E <- log10(train_log$FIELDING_E + 1)
train_log$BASERUN_SB <- log10(train_log$BASERUN_SB + 1)
train_log$BASERUN_CS <- log10(train_log$BASERUN_CS + 1)
#test
test_log$PITCHING_BB <- log10(test_log$PITCHING_BB + 1)
test_log$PITCHING_H <- log10(test_log$PITCHING_H + 1)
test_log$FIELDING_E <- log10(test_log$FIELDING_E + 1)
test_log$BASERUN_SB <- log10(test_log$BASERUN_SB + 1)
test_log$BASERUN_CS <- log10(test_log$BASERUN_CS + 1)


# CHecking skewness
sapply(train_log, function(x) skewness(x))
## TARGET_WINS   BATTING_H  BATTING_2B  BATTING_3B  BATTING_HR  BATTING_BB 
##  -0.3987232   1.5713335   0.2151018   1.1094652   0.1860421  -1.0257599 
##  BATTING_SO  BASERUN_SB  BASERUN_CS  PITCHING_H PITCHING_HR PITCHING_BB 
##  -0.3126012  -0.2565572  -0.7870606   2.5388764   0.2877877  -5.0646903 
## PITCHING_SO  FIELDING_E FIELDING_DP 
##  -0.1600733   1.2547515  -0.4551750
# Printing summary statistics
summary(train_log)
##   TARGET_WINS       BATTING_H      BATTING_2B      BATTING_3B    
##  Min.   :  0.00   Min.   : 891   Min.   : 69.0   Min.   :  0.00  
##  1st Qu.: 71.00   1st Qu.:1383   1st Qu.:208.0   1st Qu.: 34.00  
##  Median : 82.00   Median :1454   Median :238.0   Median : 47.00  
##  Mean   : 80.79   Mean   :1469   Mean   :241.2   Mean   : 55.25  
##  3rd Qu.: 92.00   3rd Qu.:1537   3rd Qu.:273.0   3rd Qu.: 72.00  
##  Max.   :146.00   Max.   :2554   Max.   :458.0   Max.   :223.00  
##    BATTING_HR       BATTING_BB      BATTING_SO       BASERUN_SB   
##  Min.   :  0.00   Min.   :  0.0   Min.   :   0.0   Min.   :0.000  
##  1st Qu.: 42.00   1st Qu.:451.0   1st Qu.: 556.8   1st Qu.:1.833  
##  Median :102.00   Median :512.0   Median : 750.0   Median :2.009  
##  Mean   : 99.61   Mean   :501.6   Mean   : 736.3   Mean   :2.010  
##  3rd Qu.:147.00   3rd Qu.:580.0   3rd Qu.: 925.0   3rd Qu.:2.182  
##  Max.   :264.00   Max.   :878.0   Max.   :1399.0   Max.   :2.844  
##    BASERUN_CS      PITCHING_H     PITCHING_HR     PITCHING_BB   
##  Min.   :0.000   Min.   :3.056   Min.   :  0.0   Min.   :0.000  
##  1st Qu.:1.653   1st Qu.:3.152   1st Qu.: 50.0   1st Qu.:2.679  
##  Median :1.699   Median :3.182   Median :107.0   Median :2.730  
##  Mean   :1.697   Mean   :3.201   Mean   :105.7   Mean   :2.729  
##  3rd Qu.:1.742   3rd Qu.:3.221   3rd Qu.:150.0   3rd Qu.:2.787  
##  Max.   :2.305   Max.   :3.696   Max.   :343.0   Max.   :3.562  
##   PITCHING_SO       FIELDING_E     FIELDING_DP   
##  Min.   :   0.0   Min.   :1.820   Min.   : 52.0  
##  1st Qu.: 626.0   1st Qu.:2.107   1st Qu.:134.0  
##  Median : 811.0   Median :2.204   Median :149.0  
##  Mean   : 793.5   Mean   :2.291   Mean   :146.7  
##  3rd Qu.: 952.0   3rd Qu.:2.398   3rd Qu.:161.2  
##  Max.   :1491.0   Max.   :3.279   Max.   :228.0
# Skewness
ggplot(melt(train_log), aes(x=value))+geom_density()+facet_wrap(~variable, scales='free')
## Warning in melt(train_log): The melt generic in data.table has been passed a
## tbl_df and will attempt to redirect to the relevant reshape2 method; please note
## that reshape2 is deprecated, and this redirection is now deprecated as well.
## To continue using melt methods from reshape2 while both libraries are attached,
## e.g. melt.list, you can prepend the namespace like reshape2::melt(train_log). In
## the next version, this warning will become an error.
## No id variables; using all as measure variables

Model 3 - BoxCox Transformation

In this model, we are going to use preProcess() function from caret’s package which not only applies BoxCox transformation but also makes the data much better through different methods but in this case we specified to apply boxcox along with center and scale. It simply means that we want to subtract the mean of predictor’s data from predictor’s values while scale divides by the standard deviation. I was hoping this model to perform better but seems like skewness did not dropped absolutely as good as it did with log transformation. We will see performance of the model in next step.

# Converting tibble to df
train_data_bx <- data.frame(train2) 
test_data_bx <- data.frame(test2)

dim(test2)
## [1] 259  14
# PreProcess function for boxcox transformation
preproc_value <- preProcess(train_data_bx[,-1], c("BoxCox", "center", "scale"))

# Transformation
train_bx_transformed <- predict(preproc_value, train_data_bx)
test_bx_transformed <- predict(preproc_value, test_data_bx)

# Normality and skewness
ggplot(melt(train_bx_transformed), aes(x=value))+geom_density()+facet_wrap(~variable, scales='free')
## Warning in melt(train_bx_transformed): The melt generic in data.table has
## been passed a data.frame and will attempt to redirect to the relevant reshape2
## method; please note that reshape2 is deprecated, and this redirection is now
## deprecated as well. To continue using melt methods from reshape2 while both
## libraries are attached, e.g. melt.list, you can prepend the namespace like
## reshape2::melt(train_bx_transformed). In the next version, this warning will
## become an error.
## No id variables; using all as measure variables

sapply(train_bx_transformed, function(x) skewness(x))
## TARGET_WINS   BATTING_H  BATTING_2B  BATTING_3B  BATTING_HR  BATTING_BB 
## -0.39872320 -0.08517001 -0.01738229  1.10946519  0.18604214 -1.02575989 
##  BATTING_SO  BASERUN_SB  BASERUN_CS  PITCHING_H PITCHING_HR PITCHING_BB 
## -0.31260120  2.06582822  2.60217224  0.77405438  0.28778767  6.74389947 
## PITCHING_SO  FIELDING_E FIELDING_DP 
## -0.16007329  0.21740959  0.05371277

Model 4 - knn Imputation on missing values

In model 4, we are going to replace the missing values with knn using Caret library’s preProcess function. We decided to remove BATTING_HBP and BASERUN_CS as they had 92% and 33% missing values respectively. We want to see if knnImputation with other selected variables might give better results. After imputation, we will use log transformation again with knn imputed values to see if it will cause any difference in the model. Few data scientists suggest to use knn imputation to replace missing values rather than median or mean. Again for consistency, we have to apply all transformation on both training and datasets

set.seed(1100)
library(VIM)

# knn imputation for missing values
# training
train_data_knn <- train %>% # TRAINING 
  select(-BATTING_HBP, -BASERUN_CS) 
train_knn <- kNN(train_data_knn, variable = c("BATTING_SO", "BASERUN_SB", "PITCHING_SO","FIELDING_DP"),k=6)

#test
test_data_knn <- test %>% # TEST
  select(-BATTING_HBP, -BASERUN_CS) 
test_knn <- kNN(test_data_knn, variable = c("BATTING_SO", "BASERUN_SB", "PITCHING_SO","FIELDING_DP"),k=6)

# CHecking for missing values
colSums(is.na(train_knn))
##     TARGET_WINS       BATTING_H      BATTING_2B      BATTING_3B      BATTING_HR 
##               0               0               0               0               0 
##      BATTING_BB      BATTING_SO      BASERUN_SB      PITCHING_H     PITCHING_HR 
##               0               0               0               0               0 
##     PITCHING_BB     PITCHING_SO      FIELDING_E     FIELDING_DP  BATTING_SO_imp 
##               0               0               0               0               0 
##  BASERUN_SB_imp PITCHING_SO_imp FIELDING_DP_imp 
##               0               0               0
# Log transformation on knn imputed dataset
train_knn_log <- train_data_knn # Model 2
test_knn_log <- test_data_knn
#test_log <- test2

# Applying log transformation for highly skewed variables
#training
train_knn_log$PITCHING_BB <- log10(train_knn_log$PITCHING_BB + 1)
train_knn_log$PITCHING_H <- log10(train_knn_log$PITCHING_H + 1)
train_knn_log$FIELDING_E <- log10(train_knn_log$FIELDING_E + 1)
train_knn_log$BASERUN_SB <- log10(train_knn_log$BASERUN_SB + 1)

# TEST DATASET TRANSFORMATION
test_knn_log$PITCHING_BB <- log10(test_knn_log$PITCHING_BB + 1)
test_knn_log$PITCHING_H <- log10(test_knn_log$PITCHING_H + 1)
test_knn_log$FIELDING_E <- log10(test_knn_log$FIELDING_E + 1)
test_knn_log$BASERUN_SB <- log10(test_knn_log$BASERUN_SB + 1)

# CHecking skewness and normality
sapply(train_knn_log, function(x) skewness(x))
## TARGET_WINS   BATTING_H  BATTING_2B  BATTING_3B  BATTING_HR  BATTING_BB 
##  -0.3987232   1.5713335   0.2151018   1.1094652   0.1860421  -1.0257599 
##  BATTING_SO  BASERUN_SB  PITCHING_H PITCHING_HR PITCHING_BB PITCHING_SO 
##          NA          NA   4.2997637   0.2877877  -5.0646903          NA 
##  FIELDING_E FIELDING_DP 
##   1.2547515          NA
ggplot(melt(train_knn_log), aes(x=value))+geom_density()+facet_wrap(~variable, scales='free')
## Warning in melt(train_knn_log): The melt generic in data.table has been
## passed a tbl_df and will attempt to redirect to the relevant reshape2
## method; please note that reshape2 is deprecated, and this redirection is now
## deprecated as well. To continue using melt methods from reshape2 while both
## libraries are attached, e.g. melt.list, you can prepend the namespace like
## reshape2::melt(train_knn_log). In the next version, this warning will become an
## error.
## No id variables; using all as measure variables
## Warning: Removed 621 rows containing non-finite values (stat_density).

Build Models

Model 1

model1 <- lm(TARGET_WINS ~ ., train2)
#summary(model1)
summ(model1)
Observations 2276
Dependent variable TARGET_WINS
Type OLS linear regression
F(14,2261) 77.29
0.32
Adj. R² 0.32
Est. S.E. t val. p
(Intercept) 19.02 5.42 3.51 0.00
BATTING_H 0.04 0.00 11.97 0.00
BATTING_2B -0.02 0.01 -2.64 0.01
BATTING_3B 0.07 0.02 4.37 0.00
BATTING_HR 0.09 0.03 3.16 0.00
BATTING_BB 0.01 0.00 1.44 0.15
BATTING_SO -0.00 0.00 -0.74 0.46
BASERUN_SB 0.03 0.00 6.41 0.00
BASERUN_CS -0.01 0.02 -0.75 0.45
PITCHING_H 0.01 0.00 6.26 0.00
PITCHING_HR -0.03 0.03 -1.15 0.25
PITCHING_BB 0.01 0.00 2.72 0.01
PITCHING_SO -0.00 0.00 -0.14 0.89
FIELDING_E -0.03 0.00 -11.55 0.00
FIELDING_DP -0.11 0.01 -8.60 0.00
Standard errors: OLS

First of all the significance of the model is significant as per the p-value of F-statistics. It means that overall the model is significant. Adjusted r2 is 0.32 which means these variables accounts 32% of TARGET_WINS. All the variables other than BASERUN_CS and PITCHING_SO are insignificant. BASERUN_CS has 33% missing values which should have negative impact and it has negative but insignificant. It means Caught Stealing and it does not impact the chances of winning significantly. PITCHING_SO means strikeouts by pitchers which also has insigificant impact on TARGET_WINS. BATTING_2B should have positive impact but in model it shows significant but with very little coefficient. BATTING_HR means homeruns by batters should’ve and have positive impact on chances of winning. BATTING_BB means walks by batter. It should have positive impact but in our model it has insignificant impact. BATTING_SO means strikeouts by batters which shoul’ve negative impact but in our model it does not play significant role. BASERUN_SB should’ve significant impact and it has positive role. BASERUN_CS, PITCHING_HR and PITCHING_SO have insignificant impact while PITCHING_H, PITCHING_BB, FIELDING_E and FIELDING_DP have significant roles.

\[ TARGET_WINS = 19.02 + BATTING_H(0.04) - BATTING_2B(0.02) + BATTING_3B(0.07) + BATTING_HR(0.09) + BASERUN_SB(0.03) + PITCHING_H(0.01) + PITCHING_BB(0.01) - FIELDING_E (0.03) - FIELDING_DP(0.11) \]

Model 2

model2 <- lm(TARGET_WINS ~ ., train_log)
summ(model2)
Observations 2276
Dependent variable TARGET_WINS
Type OLS linear regression
F(14,2261) 78.96
0.33
Adj. R² 0.32
Est. S.E. t val. p
(Intercept) -21.40 18.05 -1.19 0.24
BATTING_H 0.04 0.00 11.30 0.00
BATTING_2B -0.03 0.01 -3.50 0.00
BATTING_3B 0.11 0.02 6.38 0.00
BATTING_HR 0.06 0.03 2.06 0.04
BATTING_BB 0.02 0.00 3.57 0.00
BATTING_SO -0.00 0.00 -0.96 0.34
BASERUN_SB 10.30 1.42 7.24 0.00
BASERUN_CS -5.59 2.30 -2.44 0.01
PITCHING_H 31.36 5.02 6.25 0.00
PITCHING_HR -0.02 0.02 -0.65 0.52
PITCHING_BB 1.02 3.81 0.27 0.79
PITCHING_SO -0.00 0.00 -0.31 0.76
FIELDING_E -27.31 2.30 -11.90 0.00
FIELDING_DP -0.12 0.01 -8.86 0.00
Standard errors: OLS

Adjusted-r2 did not improve overall as compared with model 1 and intercept is insignificant in this model which shows that the other variables which were not included in this model are not important. If that’s the case the adjusted r-square should’ve improved as compared with model 1. We are not going to prefer this model and also some of variables have become insignificant which were not before.

Model 3

model3 <- lm(TARGET_WINS ~ ., train_bx_transformed)
summ(model3)
Observations 2276
Dependent variable TARGET_WINS
Type OLS linear regression
F(14,2261) 70.38
0.30
Adj. R² 0.30
Est. S.E. t val. p
(Intercept) 80.79 0.28 292.29 0.00
BATTING_H 5.40 0.55 9.76 0.00
BATTING_2B -1.03 0.44 -2.33 0.02
BATTING_3B 3.31 0.49 6.81 0.00
BATTING_HR 2.16 1.77 1.22 0.22
BATTING_BB 2.10 0.46 4.60 0.00
BATTING_SO -0.27 1.06 -0.26 0.80
BASERUN_SB 2.01 0.36 5.51 0.00
BASERUN_CS -0.01 0.30 -0.04 0.97
PITCHING_H 1.15 0.47 2.43 0.02
PITCHING_HR 0.11 1.57 0.07 0.94
PITCHING_BB 0.30 0.39 0.77 0.44
PITCHING_SO -1.06 0.82 -1.29 0.20
FIELDING_E -5.98 0.60 -9.91 0.00
FIELDING_DP -3.12 0.32 -9.77 0.00
Standard errors: OLS

With boxcox transformation, intercept is now again significant but adjusted r-square dropped a little. I would prefer to look at the model at this time rather than merely depending on r-square or adjusted r-square’s values. BASERUN_CS, PITCHING_HR, PITCHING_BB and PITCHING_SO are still insignificant here which are almost consistent with model 2.

Model 4

model4 <- lm(TARGET_WINS ~ ., train_knn_log)
summ(model4)
Observations 1835 (441 missing obs. deleted)
Dependent variable TARGET_WINS
Type OLS linear regression
F(13,1821) 91.86
0.40
Adj. R² 0.39
Est. S.E. t val. p
(Intercept) -83.38 101.46 -0.82 0.41
BATTING_H -0.01 0.01 -0.70 0.48
BATTING_2B -0.06 0.01 -6.44 0.00
BATTING_3B 0.19 0.02 10.07 0.00
BATTING_HR 0.10 0.08 1.26 0.21
BATTING_BB 0.08 0.02 3.57 0.00
BATTING_SO 0.01 0.02 0.43 0.67
BASERUN_SB 12.22 1.23 9.90 0.00
PITCHING_H 128.45 42.86 3.00 0.00
PITCHING_HR -0.01 0.08 -0.14 0.89
PITCHING_BB -59.42 28.85 -2.06 0.04
PITCHING_SO -0.03 0.02 -1.40 0.16
FIELDING_E -49.22 3.07 -16.04 0.00
FIELDING_DP -0.11 0.01 -9.33 0.00
Standard errors: OLS

Seems like knn-imputation along with log transformation did improve the adjusted r-square from 0.33 to 0.42 which means it’s better now. As I discussed merely depending upon adjusted r-sq is not a good idea and we have to dig in the model if results actually mean in reality or not. According to new model BATTING_HR is insignificant all the other variables are significant at 10 % confidence interval which means result did improve here. BATTING_2B is negative though but consistent with all models but to very low extent. BATTING_SO is negative impacting which makes sense. PITCHING_H, PITCHING_BB and PITCHING_E have also negative impact which totally makes sense.

Our model will be: \[ TARGET_WINS = 154.048 + BATTING_H (0.05) - BATTING_2B (0.04) + BATTING_3B (0.10) + BATTING_BB (0.04) - BATTING_SO (0.02) + BASERUN_SB (12.89) - PITCHING_H (9.25) + PITCHING_HR (0.08) - PITCHING_BB (9.83) + PITCHING_SO (0.00) - FIELDING_E (46.98) - FIELDING_DP (0.14) \]

Select Model

As discussed I would prefer to selecting Model 4 because not only it improved adjusted r-square but also the intercept also has become significant which was not the case in model 3. Also, the significance and directions of each variable’s impact on TARGET_WINS make totally sense. We came to find that knn imputation along with log transformation improved the model significantly as compared with other variables. It also shows that we cannot randomly just replace the missing values with mean or median. We hvae to check our model with different criteria to see which transformation techniques do well overall in terms of not only r-square but also we have to see if the result actually makes sense or not. At this point we are convinced about performance of model 4 but we’ll take one step extra and double check.

# Checking the overall performance of model
model_test <- lm(TARGET_WINS ~ BATTING_H + BATTING_2B + BATTING_3B + BATTING_BB + BATTING_SO + BASERUN_SB + PITCHING_H + PITCHING_HR + PITCHING_BB + PITCHING_SO + FIELDING_E + FIELDING_DP, data=train_knn_log)
check_model(model_test)

Data looks normally distributed but looks like there are high collinearity among some of the variables as shown below which might be affect their significance or coefficients in final model. According to Jim Frost, multicollinearity does not influence the predictions, precision and goodness of fit. He adds that if primary goal is to make predictions then you do not have to understand the role of each independent variable. There is some homoskedasticity in residuals as shown in graph 2 in second row.

# CHecking multicollinearity
check_collinearity(model_test)
## # Check for Multicollinearity
## 
## Low Correlation
## 
##    Parameter  VIF Increased SE
##    BATTING_H 2.37         1.54
##   BATTING_2B 1.07         1.03
##   BATTING_3B 1.33         1.15
##   BATTING_SO 3.31         1.82
##   BASERUN_SB 1.23         1.11
##  PITCHING_HR 1.15         1.07
##  PITCHING_SO 3.05         1.75
##   FIELDING_E 1.45         1.20
##  FIELDING_DP 1.17         1.08
## 
## Moderate Correlation
## 
##    Parameter  VIF Increased SE
##   BATTING_BB 6.35         2.52
##   PITCHING_H 8.71         2.95
##  PITCHING_BB 9.56         3.09
# Checking model performance
model_performance(model_test)
## # Indices of model performance
## 
##      AIC |      BIC |   R2 | R2_adjusted |  RMSE
## ------------------------------------------------
## 13768.59 | 13845.80 | 0.40 |        0.39 | 10.23

It seems that model 4’s performance is almost consistent with other models in terms of collinearity and RMSE but adjusted r2 is comparatively better than others. Also, the results make sense too practically so I would select model 4.

References

https://easystats.github.io/performance/

https://statisticsbyjim.com/regression/multicollinearity-in-regression-analysis/#:~:text=Multicollinearity%20makes%20it%20hard%20to,a%20way%20to%20fix%20multicollinearity.