Multiple Linear Regression

library(MASS)
library(caret)
library(car)
library(corrplot)
library(knitr)
library(mice)

Load the dataset

Load the data set that was curated after the preliminary explanatory analysis.

Plotted a correlation matrix on the original data set

df = read.csv('https://raw.githubusercontent.com/mkivenson/Business-Analytics-Data-Mining/master/Moneyball%20Regression/baseball_output.csv')
corrplot(cor(df, use = "complete.obs"), method="color", type="lower", tl.col = "black", tl.srt = 25)

Found that there is a data point with 0 value which needs to be corrected to a non-zero val for BOX-COX transformation later

df[df$WINS == 0,]
##         X WINS bt_H bt_2B bt_3B bt_HR bt_BB bt_SO br_SB br_CS  ph_H ph_HR ph_BB
## 1211 1211    0  891   135     0     0     0     0     0     0 24057     0     0
##      ph_SO fd_E fd_DP bt_1B BB
## 1211     0 1890   219   756 NA

Remove the index column that got added in the preliminary step. Also lift each datapoints by 1 to fix the zero data point.

df <- subset(df, select = -c(X))
df <- df + 1  

Split the data into training and test data set(80% training and 20% testing)

# Splitting the dataset into the Training set and Test set
# install.packages('caTools')
library(caTools)
set.seed(123)
split = sample.split(df$WINS, SplitRatio = 0.8)
training_set = subset(df, split == TRUE)
test_set = subset(df, split == FALSE)

Assumption of Ordinary Least square regression

Before building and trying out different linear regression models, we will review the assumptions for the OLS algorithm to make it perform well.

  1. Residual Error should be normally distributed
  2. Absence of hetroschdasticity
  3. Absence of Colinearity

We will check these assumption/factors while reviewing the results of each model.

Full Model

Fitting a full model with all remaining independent variables( “bt_H” “bt_2B” “bt_3B” “bt_HR” “bt_BB” “bt_SO” “br_SB” “br_CS” “ph_H” “ph_HR” “ph_BB” “ph_SO” “fd_E” “fd_DP” “bt_1B” “BB”) and the response variable WINS

colnames(training_set)
##  [1] "WINS"  "bt_H"  "bt_2B" "bt_3B" "bt_HR" "bt_BB" "bt_SO" "br_SB" "br_CS"
## [10] "ph_H"  "ph_HR" "ph_BB" "ph_SO" "fd_E"  "fd_DP" "bt_1B" "BB"

create a dataframe for holding the regression metrics.

regressor_metrics <- data.frame(matrix(ncol = 6, nrow = 0) ,stringsAsFactors = FALSE)
# Fitting Multiple Linear Regression to the Training set
fullregressor = lm(formula = WINS ~ . ,
               data = training_set)

Full model Stats

summary(fullregressor)
## 
## Call:
## lm(formula = WINS ~ ., data = training_set)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -48.405  -8.181   0.329   7.759  59.627 
## 
## Coefficients: (1 not defined because of singularities)
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.115e+02  1.330e+01   8.383  < 2e-16 ***
## bt_H         4.365e-02  4.012e-03  10.882  < 2e-16 ***
## bt_2B       -2.487e-02  9.817e-03  -2.533  0.01139 *  
## bt_3B        6.384e-02  1.844e-02   3.461  0.00055 ***
## bt_HR        1.558e-01  3.360e-02   4.638 3.78e-06 ***
## bt_BB        4.178e-02  6.877e-03   6.076 1.50e-09 ***
## bt_SO       -1.176e-02  2.726e-03  -4.316 1.68e-05 ***
## br_SB        4.624e-02  5.487e-03   8.426  < 2e-16 ***
## br_CS        1.364e-02  1.122e-02   1.216  0.22433    
## ph_H         1.479e-03  4.608e-04   3.210  0.00135 ** 
## ph_HR       -7.223e-02  3.069e-02  -2.354  0.01869 *  
## ph_BB       -2.114e-02  4.834e-03  -4.373 1.30e-05 ***
## ph_SO        8.737e-04  9.125e-04   0.958  0.33841    
## fd_E        -5.399e-02  3.684e-03 -14.655  < 2e-16 ***
## fd_DP       -1.150e-01  1.381e-02  -8.326  < 2e-16 ***
## bt_1B               NA         NA      NA       NA    
## BB          -4.238e+01  6.263e+00  -6.766 1.78e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.47 on 1809 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.3918, Adjusted R-squared:  0.3868 
## F-statistic: 77.69 on 15 and 1809 DF,  p-value: < 2.2e-16
plot(fullregressor$residuals)
abline(h = 0, lty = 3)

par(mfrow=c(2,2))
plot(fullregressor)

Test evaluation Metrics and prediction results

we see the two independent variables has p-value > 0.05. we will remove this independent variable from the model and try its performance.

since the R-square and RMSE of the model is not that great and it shows a possible underfitting problem. We will try out some transformation like backward elimination, square, logarithmic and BOX-COX transformations and review the results.

predictions = predict(fullregressor, newdata = test_set)
## Warning in predict.lm(fullregressor, newdata = test_set): prediction from a
## rank-deficient fit may be misleading
head(data.frame(
  predictions = predictions,
  actual = test_set$WINS
))
##    predictions actual
## 5     66.72792     83
## 10    66.58775     77
## 21    74.98691     71
## 29    75.99609     82
## 46    87.60120     86
## 48    91.52018     99
rmse_test <- round(RMSE(predictions, test_set$WINS), digits=4)
r2_test <- round(R2(predictions, test_set$WINS), digits = 4)
adj_r2_train <- round(summary(fullregressor)$adj.r.squared, digits = 4)
r2_train <- round(summary(fullregressor)$r.squared, digits = 4)
rmse_train <- round(sqrt(mean(fullregressor$residuals^2)), digits = 4)

data.frame(
  rmse_test = rmse_test,
  rmse_train = rmse_train,
  r2_train = r2_train,
  r2_test = r2_test,
  adj_r2_train = adj_r2_train
  
)
##   rmse_test rmse_train r2_train r2_test adj_r2_train
## 1   12.8927    12.4108   0.3918   0.241       0.3868
regressor_metrics <- rbind(regressor_metrics, c("Full-Model", r2_train , adj_r2_train, rmse_train  , rmse_test , r2_test), stringsAsFactors = FALSE)

Backward Elimination

Through backward elimination process, some independent variables(ph_SO,br_CS, bt_1B) that has pvalue more than the significance level of 0.05 were removed from the full model

regressor_backward_E1 = lm(formula = WINS ~  bt_H+ bt_2B+ bt_3B+ bt_HR+ bt_BB+ bt_SO+ br_SB+ ph_H+ ph_HR+ ph_BB + fd_E+ fd_DP+ BB ,data = training_set)

Backward elimination Model Stats

summary(regressor_backward_E1)
## 
## Call:
## lm(formula = WINS ~ bt_H + bt_2B + bt_3B + bt_HR + bt_BB + bt_SO + 
##     br_SB + ph_H + ph_HR + ph_BB + fd_E + fd_DP + BB, data = training_set)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -47.845  -8.187   0.450   7.839  60.727 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.139e+02  1.319e+01   8.637  < 2e-16 ***
## bt_H         4.346e-02  3.973e-03  10.937  < 2e-16 ***
## bt_2B       -2.517e-02  9.732e-03  -2.587 0.009765 ** 
## bt_3B        6.687e-02  1.833e-02   3.648 0.000272 ***
## bt_HR        1.623e-01  3.267e-02   4.967 7.42e-07 ***
## bt_BB        3.854e-02  6.228e-03   6.188 7.51e-10 ***
## bt_SO       -1.088e-02  2.511e-03  -4.335 1.54e-05 ***
## br_SB        5.046e-02  4.592e-03  10.990  < 2e-16 ***
## ph_H         1.441e-03  4.589e-04   3.139 0.001722 ** 
## ph_HR       -8.049e-02  2.969e-02  -2.711 0.006778 ** 
## ph_BB       -1.847e-02  4.188e-03  -4.411 1.09e-05 ***
## fd_E        -5.440e-02  3.631e-03 -14.982  < 2e-16 ***
## fd_DP       -1.171e-01  1.344e-02  -8.710  < 2e-16 ***
## BB          -4.282e+01  6.236e+00  -6.866 9.05e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.47 on 1811 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.391,  Adjusted R-squared:  0.3866 
## F-statistic: 89.42 on 13 and 1811 DF,  p-value: < 2.2e-16
plot(regressor_backward_E1$residuals)
abline(h = 0, lty = 3)

par(mfrow=c(2,2))
plot(regressor_backward_E1)

Test evaluation Metrics and prediction results

predictions = predict(regressor_backward_E1, newdata = test_set)
head(data.frame(
  predictions = predictions,
  actual = test_set$WINS
))
##    predictions actual
## 5     66.80905     83
## 10    66.85463     77
## 21    74.79397     71
## 29    76.03111     82
## 46    87.70131     86
## 48    91.57863     99
rmse_test <- round(RMSE(predictions, test_set$WINS), digits=4)
r2_test <- round(R2(predictions, test_set$WINS), digits = 4)
adj_r2_train <- round(summary(regressor_backward_E1)$adj.r.squared, digits = 4)
r2_train <- round(summary(regressor_backward_E1)$r.squared, digits = 4)
rmse_train <- round(sqrt(mean(regressor_backward_E1$residuals^2)), digits = 4)

data.frame(
  rmse_test = rmse_test,
  rmse_train = rmse_train,
  r2_train = r2_train,
  r2_test = r2_test,
  adj_r2_train = adj_r2_train
  
)
##   rmse_test rmse_train r2_train r2_test adj_r2_train
## 1   12.9301    12.4195    0.391  0.2381       0.3866
regressor_metrics <- rbind(regressor_metrics, c("Backward elimination-1", r2_train , adj_r2_train, rmse_train  , rmse_test , r2_test), stringsAsFactors = FALSE)

Backward Elimination + removal of colinear Variables

Performing a VIF Test on all variables to remove some independent variables which are colinear( VIF has more than 5)

model1 <- lm(WINS ~ bt_H+ bt_2B+ bt_3B+ bt_HR+ bt_BB+ bt_SO+ br_SB+ ph_H+ ph_HR+ ph_BB + fd_E+ fd_DP+ BB, data = df)
car::vif(model1)
##      bt_H     bt_2B     bt_3B     bt_HR     bt_BB     bt_SO     br_SB      ph_H 
##  4.014112  2.464797  3.033585 46.653155  5.981965  4.528813  2.695233  4.896792 
##     ph_HR     ph_BB      fd_E     fd_DP        BB 
## 39.716223  4.674980  8.358823  1.898830 10.757738

remove Colinear variables bt_HR + ph_HR + BB + bt_BB. Also remove variables with p-value > 0.05 (bt_3B, bt_SO, ph_H)

regressor_backward_E2 = lm(formula = WINS ~  bt_H+ bt_2B + br_SB + ph_BB + fd_E+ fd_DP  ,data = training_set)

Backward elimination Model Stats( with removal of colinear variables)

summary(regressor_backward_E2)
## 
## Call:
## lm(formula = WINS ~ bt_H + bt_2B + br_SB + ph_BB + fd_E + fd_DP, 
##     data = training_set)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -53.579  -8.577   0.122   8.482  49.278 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 12.207764   3.593651   3.397 0.000696 ***
## bt_H         0.059449   0.002877  20.662  < 2e-16 ***
## bt_2B       -0.021329   0.009053  -2.356 0.018579 *  
## br_SB        0.040128   0.004117   9.747  < 2e-16 ***
## ph_BB        0.006392   0.001998   3.200 0.001399 ** 
## fd_E        -0.038440   0.001792 -21.450  < 2e-16 ***
## fd_DP       -0.084982   0.013246  -6.416 1.78e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.93 on 1819 degrees of freedom
## Multiple R-squared:  0.3511, Adjusted R-squared:  0.349 
## F-statistic: 164.1 on 6 and 1819 DF,  p-value: < 2.2e-16
plot(regressor_backward_E2$residuals)
abline(h = 0, lty = 3)

par(mfrow=c(2,2))
plot(regressor_backward_E2)

predictions = predict(regressor_backward_E2, newdata = test_set)
head(data.frame(
  predictions = predictions,
  actual = test_set$WINS
))
##    predictions actual
## 5     70.70862     83
## 10    70.34577     77
## 21    77.07268     71
## 29    79.39163     82
## 46    84.77783     86
## 48    88.10494     99
rmse_test <- round(RMSE(predictions, test_set$WINS), digits=4)
r2_test <- round(R2(predictions, test_set$WINS), digits = 4)
adj_r2_train <- round(summary(regressor_backward_E2)$adj.r.squared, digits = 4)
r2_train <- round(summary(regressor_backward_E2)$r.squared, digits = 4)
rmse_train <- round(sqrt(mean(regressor_backward_E2$residuals^2)), digits = 4)

data.frame(
  rmse_test = rmse_test,
  rmse_train = rmse_train,
  r2_train = r2_train,
  r2_test = r2_test,
  adj_r2_train = adj_r2_train
  
)
##   rmse_test rmse_train r2_train r2_test adj_r2_train
## 1   12.7212    12.9056   0.3511  0.2413        0.349
regressor_metrics <- rbind(regressor_metrics, c("Backward elimination-2", r2_train , adj_r2_train, rmse_train  , rmse_test , r2_test), stringsAsFactors = FALSE)

Square transformation Model

# Fitting Multiple Linear Regression to the Training set
#"bt_H"  "bt_2B" "bt_3B" "br_SB" "br_CS" "ph_H"  "fd_E"  "fd_DP"
regressor_sq = lm(WINS ~ I(bt_H^2)+ I(bt_2B^2) + (br_SB^2) + (ph_BB^2) + (fd_E^2)+ (fd_DP^2) ,
               data = training_set)

Square transformation Model Stats

summary(regressor_sq)
## 
## Call:
## lm(formula = WINS ~ I(bt_H^2) + I(bt_2B^2) + (br_SB^2) + (ph_BB^2) + 
##     (fd_E^2) + (fd_DP^2), data = training_set)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -52.831  -8.729   0.132   8.529  49.738 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.641e+01  2.486e+00  22.694  < 2e-16 ***
## I(bt_H^2)    1.858e-05  8.849e-07  21.002  < 2e-16 ***
## I(bt_2B^2)  -3.519e-05  1.734e-05  -2.029  0.04256 *  
## br_SB        4.152e-02  4.116e-03  10.088  < 2e-16 ***
## ph_BB        5.967e-03  1.998e-03   2.987  0.00286 ** 
## fd_E        -3.987e-02  1.795e-03 -22.219  < 2e-16 ***
## fd_DP       -8.490e-02  1.322e-02  -6.424 1.69e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.93 on 1819 degrees of freedom
## Multiple R-squared:  0.3508, Adjusted R-squared:  0.3487 
## F-statistic: 163.8 on 6 and 1819 DF,  p-value: < 2.2e-16
plot(regressor_sq$residuals)
abline(h = 0, lty = 3)

par(mfrow=c(2,2))
plot(regressor_sq)

Test evaluation Metrics and prediction results

RMSE(test) has improved, but R-square is reduced slightly.

predictions = predict(regressor_sq, newdata = test_set)
head(data.frame(
  predictions = predictions,
  actual = test_set$WINS
))
##    predictions actual
## 5     71.49189     83
## 10    71.68233     77
## 21    76.78273     71
## 29    79.73048     82
## 46    84.73991     86
## 48    88.01546     99
rmse_test <- round(RMSE(predictions, test_set$WINS), digits=4)
r2_test <- round(R2(predictions, test_set$WINS), digits = 4)
adj_r2_train <- round(summary(regressor_sq)$adj.r.squared, digits = 4)
r2_train <- round(summary(regressor_sq)$r.squared, digits = 4)
rmse_train <- round(sqrt(mean(regressor_sq$residuals^2)), digits = 4)

data.frame(
  rmse_test = rmse_test,
  rmse_train = rmse_train,
  r2_train = r2_train,
  r2_test = r2_test,
  adj_r2_train = adj_r2_train
  
)
##   rmse_test rmse_train r2_train r2_test adj_r2_train
## 1   12.6774    12.9088   0.3508  0.2459       0.3487
regressor_metrics <- rbind(regressor_metrics, c("Square Transformation", r2_train , adj_r2_train, rmse_train  , rmse_test , r2_test), stringsAsFactors = FALSE)

Logarithmic transformation

# Fitting Multiple Linear Regresion to the Training set
regressor_log = lm(WINS ~  log1p(bt_H)+ log1p(bt_2B) + log1p(br_SB) + log1p(ph_BB) + log1p(fd_E)+ log1p(fd_DP),
               data = training_set)

Logarithmic transformation Stats

summary(regressor_log)
## 
## Call:
## lm(formula = WINS ~ log1p(bt_H) + log1p(bt_2B) + log1p(br_SB) + 
##     log1p(ph_BB) + log1p(fd_E) + log1p(fd_DP), data = training_set)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -54.379  -8.525  -0.049   8.348  50.265 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -453.2839    26.5293 -17.086  < 2e-16 ***
## log1p(bt_H)    97.0853     4.7161  20.586  < 2e-16 ***
## log1p(bt_2B)   -9.6544     2.3418  -4.123 3.91e-05 ***
## log1p(br_SB)    4.8338     0.5874   8.229 3.56e-16 ***
## log1p(ph_BB)    3.5732     1.1583   3.085  0.00207 ** 
## log1p(fd_E)   -14.5104     0.7717 -18.804  < 2e-16 ***
## log1p(fd_DP)  -17.8524     1.8683  -9.555  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.02 on 1819 degrees of freedom
## Multiple R-squared:  0.3419, Adjusted R-squared:  0.3397 
## F-statistic: 157.5 on 6 and 1819 DF,  p-value: < 2.2e-16
plot(regressor_log$residuals)
abline(h = 0, lty = 3)

par(mfrow=c(2,2))
plot(regressor_log)

Test evaluation Metrics and prediction results

Residual Error plot developed a slight cure and OLS assumptions are not met.

predictions = predict(regressor_log, newdata = test_set)
head(data.frame(
  predictions = predictions,
  actual = test_set$WINS
))
##    predictions actual
## 5     69.82943     83
## 10    69.84392     77
## 21    74.99316     71
## 29    82.81690     82
## 46    88.47094     86
## 48    94.84237     99
rmse_test <- round(RMSE(predictions, test_set$WINS), digits=4)
r2_test <- round(R2(predictions, test_set$WINS), digits = 4)
adj_r2_train <- round(summary(regressor_log)$adj.r.squared, digits = 4)
r2_train <- round(summary(regressor_log)$r.squared, digits = 4)
rmse_train <- round(sqrt(mean(regressor_log$residuals^2)), digits = 4)

data.frame(
  rmse_test = rmse_test,
  rmse_train = rmse_train,
  r2_train = r2_train,
  r2_test = r2_test,
  adj_r2_train = adj_r2_train
  
)
##   rmse_test rmse_train r2_train r2_test adj_r2_train
## 1   13.0509    12.9974   0.3419  0.2117       0.3397
regressor_metrics <- rbind(regressor_metrics, c("Log Transformation", r2_train , adj_r2_train, rmse_train  , rmse_test , r2_test), stringsAsFactors = FALSE)

box cox transformation

Trying out a box-cox transformation. Used the best model so far which is backward elimination model to apply BOX-COX transformation. Lambda comes close to 1. So it doesnt make any difference and there is no need to apply the BOX-COX tranformation.

bc = boxcox(regressor_backward_E2)

Cross validation

Performing a cross validation algorithm if it make some improvement.

library(caret)
set.seed(123)

regression_cv <- train(
  WINS ~ bt_H+ bt_2B + br_SB + ph_BB + fd_E+ fd_DP , training_set,
  method = "lm",
  trControl = trainControl(
    method = "cv", 
    number =10,
    verboseIter = TRUE
  )
)
summary(regression_cv)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -53.579  -8.577   0.122   8.482  49.278 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 12.207764   3.593651   3.397 0.000696 ***
## bt_H         0.059449   0.002877  20.662  < 2e-16 ***
## bt_2B       -0.021329   0.009053  -2.356 0.018579 *  
## br_SB        0.040128   0.004117   9.747  < 2e-16 ***
## ph_BB        0.006392   0.001998   3.200 0.001399 ** 
## fd_E        -0.038440   0.001792 -21.450  < 2e-16 ***
## fd_DP       -0.084982   0.013246  -6.416 1.78e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.93 on 1819 degrees of freedom
## Multiple R-squared:  0.3511, Adjusted R-squared:  0.349 
## F-statistic: 164.1 on 6 and 1819 DF,  p-value: < 2.2e-16
predictions = predict(regression_cv, newdata = test_set)
head(data.frame(
  predictions = predictions,
  actual = test_set$WINS
))
##    predictions actual
## 5     70.70862     83
## 10    70.34577     77
## 21    77.07268     71
## 29    79.39163     82
## 46    84.77783     86
## 48    88.10494     99
rmse_test <- round(RMSE(predictions, test_set$WINS), digits=4)
r2_test <- round(R2(predictions, test_set$WINS), digits = 4)
adj_r2_train <- round(summary(regression_cv)$adj.r.squared, digits = 4)
r2_train <- round(summary(regression_cv)$r.squared, digits = 4)
rmse_train <- round(sqrt(mean(regression_cv$residuals^2)), digits = 4)

data.frame(
  rmse_test = rmse_test,
  rmse_train = rmse_train,
  r2_train = r2_train,
  r2_test = r2_test,
  adj_r2_train = adj_r2_train
  
)
##   rmse_test rmse_train r2_train r2_test adj_r2_train
## 1   12.7212        NaN   0.3511  0.2413        0.349
regressor_metrics <- rbind(regressor_metrics, c("Cross Validation", r2_train , adj_r2_train, rmse_train  , rmse_test , r2_test), stringsAsFactors = FALSE)

metrics <- c("regressor", "Rsquare(Train-set)", "Adjusted-RSquare(Training-set)","RMSE(Train-set)" ,  "RMSE(Test-set)", "R-Square(Test)")
colnames(regressor_metrics) <- metrics
kable(regressor_metrics)
regressor Rsquare(Train-set) Adjusted-RSquare(Training-set) RMSE(Train-set) RMSE(Test-set) R-Square(Test)
Full-Model 0.3918 0.3868 12.4108 12.8927 0.241
Backward elimination-1 0.391 0.3866 12.4195 12.9301 0.2381
Backward elimination-2 0.3511 0.349 12.9056 12.7212 0.2413
Square Transformation 0.3508 0.3487 12.9088 12.6774 0.2459
Log Transformation 0.3419 0.3397 12.9974 13.0509 0.2117
Cross Validation 0.3511 0.349 NaN 12.7212 0.2413

Summary of findings

  1. Model built using Backward Elimination-2 and Square transformation looks to be the best among all models considering comparetively low RMSE(test) and comparetively good R-square values.

  2. Less R^2 and high RMSE shows an underfitting problem. The dataset doesnt follow a linear relationship with the response variable.

  3. Although the model exhibits an underfitting problem, it slightly met the ordinary least square assumptions.
  1. Residuals doesnt have high variance.
  2. Residual QQ plots gives a slight straight line.
  1. Residual Error plot shows a slight cure and OLS assumptions are not met.

  2. Metrics shows RMSE(Train) and RMSE(TEST) is almost same and dont have much differences. But the R^2 has some difference for all the models. We will see some overfitting solution and check the model gets improved further in next step.

Does overfitting exist ?

RMSE(train) and RMSE(test) doesnt indicate there is a problem of overfitting, but R^2 has some difference. We want to see if the model gets improved using some of the underfitting solutions.

Ridge Regression

Lets try out the ridge regression with cross validation.

library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 3.0-2
lambda <- 10^seq(-3, 3, length = 100)

# Build the model
set.seed(123)
ridge <- train(
  WINS ~ bt_H+ bt_2B + br_SB + ph_BB + fd_E+ fd_DP , data = training_set, method = "glmnet",
  trControl = trainControl("cv", number = 10),
  tuneGrid = expand.grid(alpha = 0, lambda = lambda)
  )
# Model coefficients
coef(ridge$finalModel, ridge$bestTune$lambda)
## 7 x 1 sparse Matrix of class "dgCMatrix"
##                        1
## (Intercept) 16.789041438
## bt_H         0.053359100
## bt_2B       -0.007399118
## br_SB        0.035809230
## ph_BB        0.006233650
## fd_E        -0.034062969
## fd_DP       -0.080906289

R^2(test) and RMSE doesn’t get improved

# Make predictions
predictions <- predict(ridge, test_set)
# Model prediction performance
head(data.frame(
  predictions = predictions,
  actual = test_set$WINS
))
##    predictions actual
## 5     70.99658     83
## 10    71.00643     77
## 21    77.27990     71
## 29    79.50153     82
## 46    84.89866     86
## 48    88.04531     99
data.frame(
  RMSE = RMSE(predictions, test_set$WINS),
  Rsquare = R2(predictions, test_set$WINS)
)
##       RMSE   Rsquare
## 1 12.71836 0.2403934

lassso

Lets try out the Lasso regression with cross validation.

# Build the model
set.seed(123)
lasso <- train(
  WINS ~ bt_H+ bt_2B + br_SB + ph_BB + fd_E+ fd_DP , data = training_set, method = "glmnet",
  trControl = trainControl("cv", number = 10),
  tuneGrid = expand.grid(alpha = 1, lambda = lambda)
  )
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
# Model coefficients
coef(lasso$finalModel, lasso$bestTune$lambda)
## 7 x 1 sparse Matrix of class "dgCMatrix"
##                        1
## (Intercept) 12.536563485
## bt_H         0.058912750
## bt_2B       -0.019708708
## br_SB        0.039811044
## ph_BB        0.006247125
## fd_E        -0.038077537
## fd_DP       -0.084283559

R^2 and RMSE doesn’t get improved

# Make predictions
predictions <-  predict(lasso, test_set)
# Model prediction performance
data.frame(
  RMSE = RMSE(predictions, test_set$WINS),
  Rsquare = R2(predictions, test_set$WINS)
)
##       RMSE   Rsquare
## 1 12.72148 0.2410859

Elastic net

Lets try out the Elastic net regression with cross validation.

# Build the model
set.seed(123)
elastic <- train(
  WINS  ~bt_H+ bt_2B + br_SB + ph_BB + fd_E+ fd_DP , data = training_set, method = "glmnet",
  trControl = trainControl("cv", number = 10),
  tuneLength = 10
  )
# Model coefficients
coef(elastic$finalModel, elastic$bestTune$lambda)
## 7 x 1 sparse Matrix of class "dgCMatrix"
##                        1
## (Intercept) 12.567573359
## bt_H         0.058888868
## bt_2B       -0.019710177
## br_SB        0.039783371
## ph_BB        0.006269394
## fd_E        -0.038058422
## fd_DP       -0.084345388

R^2 and RMSE doesn’t get improved

# Make predictions
predictions <- predict(elastic, test_set)
# Model prediction performance
head(data.frame(
  predictions = predictions,
  actual = test_set$WINS
))
##    predictions actual
## 5     70.72962     83
## 10    70.41252     77
## 21    77.10584     71
## 29    79.41875     82
## 46    84.80441     86
## 48    88.09797     99
data.frame(
  RMSE = RMSE(predictions, test_set$WINS),
  Rsquare = R2(predictions, test_set$WINS)
)
##       RMSE   Rsquare
## 1 12.72105 0.2411239

overfitting Solution didnt take any effect, so there is no improvement to the best model we identified above.

Prediction of evaluation data set

trimColumn <- function(df) {
  names(df) <- sub("TEAM_", "", names(df))
names(df) <- sub("BATTING_", "bt_", names(df))
names(df) <- sub("BASERUN_", "br_", names(df))
names(df) <- sub("FIELDING_", "fd_", names(df))
names(df) <- sub("PITCHING_", "ph_", names(df))
names(df) <- sub("TARGET_", "", names(df))
head(df)
df
}

LoadandpreprocessEvaluationSet <- function () {
  # load training set 
  df_train <- read.csv("https://raw.githubusercontent.com/mkivenson/Business-Analytics-Data-Mining/master/Moneyball%20Regression/moneyball-training-data.csv")[-1]
  # trim the column name and drop the response variable from training set 
  df_train <- trimColumn(df_train)
  #drop the response variable from training set. we have to combine it with the evaluation data set and needs have same columns. 
  
  df_train <- subset(df_train , select = -c(WINS))

  #create a new column to indidcate it is training set or evaluation set
  df_train$type = 0
  # seprate out the evaluation set and return 
  df_eval <- read.csv("https://raw.githubusercontent.com/mkivenson/Business-Analytics-Data-Mining/master/Moneyball%20Regression/moneyball-evaluation-data.csv")[-1]
  #trim the evaluation columns 
  df_eval <- trimColumn(df_eval)

  #create a new column to indidcate it is training set or evaluation set, Set to 1
  df_eval$type =1
  # combine training and evaluation in one df and apply the required tranformation for missing data 
  df_full <- rbind(df_train, df_eval)
  df_full$bt_1B <- df_full$bt_H - df_full$bt_2B - df_full$bt_3B - df_full$bt_HR
  df_full$BB <- df_full$bt_BB / df_full$ph_BB

  # transform the missing data using impute function
  df_full <- imputeMissingData(df_full)
  #filter only the evaluation set
  df_eval <- df_full[df_full$type == 1, ]
  #return the evaluation set 
  df_eval
}
df_eval <- LoadandpreprocessEvaluationSet()
## Warning: Number of logged events: 1750
df_eval<- subset(df_eval , select = -c(type))
#lift the data points by 1 to fix the zero data points( did the same transformation for training set while building the model)

df_eval<-df_eval + 1
eval_data <- read.csv("https://raw.githubusercontent.com/mkivenson/Business-Analytics-Data-Mining/master/Moneyball%20Regression/moneyball-evaluation-data.csv")

Evaluating using Square transformation model

predictions = predict(regressor_sq, newdata = df_eval)
eval_data$WINS <- ceiling(predictions- 1) 
kable(eval_data[, c("INDEX", "WINS")])
INDEX WINS
9 68
10 69
14 77
47 88
60 65
63 69
74 83
83 79
98 73
120 76
123 74
135 79
138 76
140 79
151 83
153 77
171 75
184 79
193 74
213 90
217 83
226 86
230 79
241 72
291 84
294 88
300 48
348 77
350 85
357 80
367 88
368 84
372 82
382 82
388 79
396 81
398 76
403 86
407 88
410 89
412 81
414 90
436 17
440 101
476 91
479 92
481 102
501 77
503 72
506 80
519 78
522 88
550 77
554 74
566 78
578 81
596 91
599 78
605 67
607 81
614 90
644 77
692 87
699 82
700 83
716 106
721 75
722 84
729 80
731 89
746 86
763 75
774 75
776 82
788 86
789 85
792 82
811 83
835 76
837 81
861 85
862 89
863 96
871 78
879 81
887 81
892 81
904 84
909 88
925 88
940 85
951 91
976 74
981 88
983 89
984 85
989 88
995 104
1000 88
1001 87
1007 81
1016 74
1027 84
1033 80
1070 79
1081 64
1084 58
1098 79
1150 84
1160 62
1169 82
1172 86
1174 95
1176 92
1178 83
1184 82
1193 87
1196 81
1199 78
1207 78
1218 92
1223 72
1226 71
1227 69
1229 72
1241 89
1244 92
1246 77
1248 90
1249 92
1253 88
1261 81
1305 81
1314 86
1323 85
1328 70
1353 75
1363 79
1371 92
1372 83
1389 70
1393 75
1421 92
1431 75
1437 75
1442 72
1450 77
1463 81
1464 81
1470 82
1471 80
1484 85
1495 27
1507 73
1514 80
1526 75
1549 86
1552 66
1556 91
1564 76
1585 98
1586 103
1590 90
1591 98
1592 91
1603 84
1612 80
1634 79
1645 75
1647 81
1673 91
1674 90
1687 80
1688 95
1700 82
1708 75
1713 77
1717 72
1721 74
1730 79
1737 88
1748 85
1749 86
1763 81
1768 91
1778 98
1780 91
1782 52
1784 65
1794 123
1803 73
1804 84
1819 78
1832 80
1833 84
1844 70
1847 78
1854 80
1855 77
1857 84
1864 76
1865 81
1869 76
1880 91
1881 83
1882 82
1894 82
1896 80
1916 81
1918 73
1921 104
1926 95
1938 80
1979 67
1982 68
1987 84
1997 80
2004 94
2011 80
2015 80
2022 81
2025 79
2027 82
2031 77
2036 93
2066 76
2073 83
2087 80
2092 82
2125 70
2148 82
2162 93
2191 76
2203 85
2218 79
2221 77
2225 86
2232 77
2267 86
2291 74
2299 88
2317 86
2318 86
2353 85
2403 68
2411 89
2415 82
2424 83
2441 74
2464 84
2465 80
2472 55
2481 94
2487 42
2500 72
2501 76
2520 85
2521 84
2525 77

Evaluating using Backward elimination model

predictions = predict(regressor_backward_E2, newdata = df_eval)
eval_data$WINS <- ceiling(predictions- 1) 
kable(eval_data[, c("INDEX", "WINS")])
INDEX WINS
9 67
10 67
14 77
47 88
60 66
63 69
74 83
83 79
98 72
120 76
123 74
135 80
138 77
140 80
151 83
153 77
171 74
184 79
193 74
213 90
217 83
226 86
230 79
241 72
291 84
294 89
300 51
348 77
350 86
357 80
367 88
368 85
372 82
382 82
388 79
396 81
398 75
403 86
407 88
410 89
412 82
414 91
436 16
440 102
476 92
479 93
481 103
501 76
503 71
506 80
519 79
522 89
550 77
554 73
566 78
578 81
596 91
599 77
605 65
607 81
614 90
644 77
692 88
699 82
700 83
716 106
721 76
722 85
729 80
731 89
746 86
763 75
774 75
776 83
788 86
789 86
792 82
811 83
835 75
837 81
861 85
862 89
863 96
871 79
879 82
887 81
892 81
904 85
909 88
925 89
940 85
951 92
976 74
981 89
983 89
984 85
989 88
995 103
1000 89
1001 88
1007 82
1016 74
1027 85
1033 80
1070 79
1081 65
1084 57
1098 79
1150 84
1160 61
1169 82
1172 86
1174 96
1176 92
1178 83
1184 82
1193 88
1196 81
1199 78
1207 78
1218 93
1223 70
1226 70
1227 66
1229 70
1241 89
1244 92
1246 77
1248 91
1249 92
1253 88
1261 82
1305 81
1314 87
1323 85
1328 71
1353 74
1363 79
1371 91
1372 83
1389 69
1393 74
1421 92
1431 75
1437 74
1442 71
1450 76
1463 81
1464 82
1470 82
1471 80
1484 85
1495 31
1507 73
1514 80
1526 74
1549 86
1552 67
1556 92
1564 76
1585 98
1586 103
1590 91
1591 98
1592 91
1603 85
1612 80
1634 79
1645 75
1647 81
1673 92
1674 90
1687 80
1688 95
1700 82
1708 74
1713 77
1717 72
1721 74
1730 79
1737 87
1748 85
1749 86
1763 81
1768 89
1778 98
1780 91
1782 53
1784 65
1794 120
1803 73
1804 84
1819 78
1832 81
1833 85
1844 70
1847 78
1854 80
1855 77
1857 84
1864 76
1865 82
1869 75
1880 92
1881 83
1882 82
1894 82
1896 80
1916 81
1918 73
1921 104
1926 96
1938 81
1979 66
1982 68
1987 85
1997 81
2004 94
2011 80
2015 79
2022 81
2025 79
2027 82
2031 76
2036 89
2066 75
2073 83
2087 80
2092 82
2125 71
2148 81
2162 93
2191 76
2203 85
2218 78
2221 76
2225 86
2232 77
2267 87
2291 74
2299 89
2317 86
2318 87
2353 85
2403 65
2411 89
2415 83
2424 83
2441 73
2464 84
2465 80
2472 56
2481 95
2487 34
2500 71
2501 76
2520 85
2521 84
2525 77