library(MASS)
library(caret)
library(car)
library(corrplot)
library(knitr)
library(mice)
Load the data set that was curated during the preliminary exploratory analysis.
Plot a correlation matrix of the loaded data set.
df = read.csv('https://raw.githubusercontent.com/mkivenson/Business-Analytics-Data-Mining/master/Moneyball%20Regression/baseball_output.csv')
corrplot(cor(df, use = "complete.obs"), method="color", type="lower", tl.col = "black", tl.srt = 25)
Found an observation with a WINS value of 0, which needs to be shifted to a non-zero value because the Box-Cox transformation applied later requires a strictly positive response.
df[df$WINS == 0,]
## X WINS bt_H bt_2B bt_3B bt_HR bt_BB bt_SO br_SB br_CS ph_H ph_HR ph_BB
## 1211 1211 0 891 135 0 0 0 0 0 0 24057 0 0
## ph_SO fd_E fd_DP bt_1B BB
## 1211 0 1890 219 756 NA
Remove the index column that was added in the preliminary step, and shift every data point up by 1 so that the zero value noted above becomes positive.
df <- subset(df, select = -c(X))
df <- df + 1
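A quick sanity check (not part of the original analysis) can confirm that no zero values remain after the shift:
# hedged sanity check: TRUE would mean some zeros survived the +1 shift
any(df == 0, na.rm = TRUE)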
Split the data into training and test sets (80% training, 20% testing).
# Splitting the dataset into the Training set and Test set
# install.packages('caTools')
library(caTools)
set.seed(123)
split = sample.split(df$WINS, SplitRatio = 0.8)
training_set = subset(df, split == TRUE)
test_set = subset(df, split == FALSE)
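As a hedged sanity check, the resulting split proportions can be verified; they should come out near 80% and 20%:
# verify the approximate 80/20 split produced by sample.split
nrow(training_set) / nrow(df)
nrow(test_set) / nrow(df)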
Before building and trying out different linear regression models, we review the assumptions that ordinary least squares (OLS) relies on: a linear relationship between the predictors and the response, independent errors with constant variance, approximately normal residuals, and no severe multicollinearity among the predictors.
We will check these assumptions while reviewing the results of each model.
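The block below is a hedged sketch (the helper check_ols_assumptions is hypothetical, not part of the original analysis) of how these assumptions can be probed for any fitted lm object:
# hedged sketch of the diagnostic checks applied to the fitted models below
check_ols_assumptions <- function(model) {
  par(mfrow = c(2, 2))
  plot(model)                          # residuals vs fitted, Q-Q, scale-location, leverage
  print(car::ncvTest(model))           # score test for non-constant error variance
  print(car::durbinWatsonTest(model))  # autocorrelation / independence of residuals
  print(car::vif(model))               # multicollinearity among predictors
}
# example usage, once a model without aliased terms is fitted below:
# check_ols_assumptions(regressor_backward_E1)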
Fit a full model with the response variable WINS and all remaining independent variables (bt_H, bt_2B, bt_3B, bt_HR, bt_BB, bt_SO, br_SB, br_CS, ph_H, ph_HR, ph_BB, ph_SO, fd_E, fd_DP, bt_1B, BB).
colnames(training_set)
## [1] "WINS" "bt_H" "bt_2B" "bt_3B" "bt_HR" "bt_BB" "bt_SO" "br_SB" "br_CS"
## [10] "ph_H" "ph_HR" "ph_BB" "ph_SO" "fd_E" "fd_DP" "bt_1B" "BB"
Create a data frame to hold the regression metrics for each model.
regressor_metrics <- data.frame(matrix(ncol = 6, nrow = 0) ,stringsAsFactors = FALSE)
# Fitting Multiple Linear Regression to the Training set
fullregressor = lm(formula = WINS ~ . ,
data = training_set)
summary(fullregressor)
##
## Call:
## lm(formula = WINS ~ ., data = training_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -48.405 -8.181 0.329 7.759 59.627
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.115e+02 1.330e+01 8.383 < 2e-16 ***
## bt_H 4.365e-02 4.012e-03 10.882 < 2e-16 ***
## bt_2B -2.487e-02 9.817e-03 -2.533 0.01139 *
## bt_3B 6.384e-02 1.844e-02 3.461 0.00055 ***
## bt_HR 1.558e-01 3.360e-02 4.638 3.78e-06 ***
## bt_BB 4.178e-02 6.877e-03 6.076 1.50e-09 ***
## bt_SO -1.176e-02 2.726e-03 -4.316 1.68e-05 ***
## br_SB 4.624e-02 5.487e-03 8.426 < 2e-16 ***
## br_CS 1.364e-02 1.122e-02 1.216 0.22433
## ph_H 1.479e-03 4.608e-04 3.210 0.00135 **
## ph_HR -7.223e-02 3.069e-02 -2.354 0.01869 *
## ph_BB -2.114e-02 4.834e-03 -4.373 1.30e-05 ***
## ph_SO 8.737e-04 9.125e-04 0.958 0.33841
## fd_E -5.399e-02 3.684e-03 -14.655 < 2e-16 ***
## fd_DP -1.150e-01 1.381e-02 -8.326 < 2e-16 ***
## bt_1B NA NA NA NA
## BB -4.238e+01 6.263e+00 -6.766 1.78e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.47 on 1809 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.3918, Adjusted R-squared: 0.3868
## F-statistic: 77.69 on 15 and 1809 DF, p-value: < 2.2e-16
plot(fullregressor$residuals)
abline(h = 0, lty = 3)
par(mfrow=c(2,2))
plot(fullregressor)
Two independent variables (br_CS and ph_SO) have p-values above 0.05, and bt_1B is not estimated because it is linearly dependent on the other batting variables; we will remove these from the model and compare performance.
Since the R-squared and RMSE of the full model are not great and suggest possible underfitting, we will also try backward elimination along with square, logarithmic, and Box-Cox transformations, and review the results.
predictions = predict(fullregressor, newdata = test_set)
## Warning in predict.lm(fullregressor, newdata = test_set): prediction from a
## rank-deficient fit may be misleading
head(data.frame(
predictions = predictions,
actual = test_set$WINS
))
## predictions actual
## 5 66.72792 83
## 10 66.58775 77
## 21 74.98691 71
## 29 75.99609 82
## 46 87.60120 86
## 48 91.52018 99
rmse_test <- round(RMSE(predictions, test_set$WINS), digits=4)
r2_test <- round(R2(predictions, test_set$WINS), digits = 4)
adj_r2_train <- round(summary(fullregressor)$adj.r.squared, digits = 4)
r2_train <- round(summary(fullregressor)$r.squared, digits = 4)
rmse_train <- round(sqrt(mean(fullregressor$residuals^2)), digits = 4)
data.frame(
rmse_test = rmse_test,
rmse_train = rmse_train,
r2_train = r2_train,
r2_test = r2_test,
adj_r2_train = adj_r2_train
)
## rmse_test rmse_train r2_train r2_test adj_r2_train
## 1 12.8927 12.4108 0.3918 0.241 0.3868
regressor_metrics <- rbind(regressor_metrics, c("Full-Model", r2_train , adj_r2_train, rmse_train , rmse_test , r2_test), stringsAsFactors = FALSE)
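Since this metrics block is repeated for every model below, a small helper (hypothetical, not used in the original analysis) could compute the same quantities in one call:
# hypothetical helper: train/test metrics for a fitted lm, in the same format as above
collect_metrics <- function(model, label, test_set) {
  preds <- predict(model, newdata = test_set)
  data.frame(
    regressor    = label,
    r2_train     = round(summary(model)$r.squared, 4),
    adj_r2_train = round(summary(model)$adj.r.squared, 4),
    rmse_train   = round(sqrt(mean(model$residuals^2)), 4),
    rmse_test    = round(RMSE(preds, test_set$WINS), 4),
    r2_test      = round(R2(preds, test_set$WINS), 4),
    stringsAsFactors = FALSE
  )
}
# example usage: collect_metrics(fullregressor, "Full-Model", test_set)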
Through the backward elimination process, the independent variables with p-values above the 0.05 significance level, plus the singular term (ph_SO, br_CS, bt_1B), were removed from the full model.
regressor_backward_E1 = lm(formula = WINS ~ bt_H+ bt_2B+ bt_3B+ bt_HR+ bt_BB+ bt_SO+ br_SB+ ph_H+ ph_HR+ ph_BB + fd_E+ fd_DP+ BB ,data = training_set)
summary(regressor_backward_E1)
##
## Call:
## lm(formula = WINS ~ bt_H + bt_2B + bt_3B + bt_HR + bt_BB + bt_SO +
## br_SB + ph_H + ph_HR + ph_BB + fd_E + fd_DP + BB, data = training_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -47.845 -8.187 0.450 7.839 60.727
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.139e+02 1.319e+01 8.637 < 2e-16 ***
## bt_H 4.346e-02 3.973e-03 10.937 < 2e-16 ***
## bt_2B -2.517e-02 9.732e-03 -2.587 0.009765 **
## bt_3B 6.687e-02 1.833e-02 3.648 0.000272 ***
## bt_HR 1.623e-01 3.267e-02 4.967 7.42e-07 ***
## bt_BB 3.854e-02 6.228e-03 6.188 7.51e-10 ***
## bt_SO -1.088e-02 2.511e-03 -4.335 1.54e-05 ***
## br_SB 5.046e-02 4.592e-03 10.990 < 2e-16 ***
## ph_H 1.441e-03 4.589e-04 3.139 0.001722 **
## ph_HR -8.049e-02 2.969e-02 -2.711 0.006778 **
## ph_BB -1.847e-02 4.188e-03 -4.411 1.09e-05 ***
## fd_E -5.440e-02 3.631e-03 -14.982 < 2e-16 ***
## fd_DP -1.171e-01 1.344e-02 -8.710 < 2e-16 ***
## BB -4.282e+01 6.236e+00 -6.866 9.05e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.47 on 1811 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.391, Adjusted R-squared: 0.3866
## F-statistic: 89.42 on 13 and 1811 DF, p-value: < 2.2e-16
plot(regressor_backward_E1$residuals)
abline(h = 0, lty = 3)
par(mfrow=c(2,2))
plot(regressor_backward_E1)
predictions = predict(regressor_backward_E1, newdata = test_set)
head(data.frame(
predictions = predictions,
actual = test_set$WINS
))
## predictions actual
## 5 66.80905 83
## 10 66.85463 77
## 21 74.79397 71
## 29 76.03111 82
## 46 87.70131 86
## 48 91.57863 99
rmse_test <- round(RMSE(predictions, test_set$WINS), digits=4)
r2_test <- round(R2(predictions, test_set$WINS), digits = 4)
adj_r2_train <- round(summary(regressor_backward_E1)$adj.r.squared, digits = 4)
r2_train <- round(summary(regressor_backward_E1)$r.squared, digits = 4)
rmse_train <- round(sqrt(mean(regressor_backward_E1$residuals^2)), digits = 4)
data.frame(
rmse_test = rmse_test,
rmse_train = rmse_train,
r2_train = r2_train,
r2_test = r2_test,
adj_r2_train = adj_r2_train
)
## rmse_test rmse_train r2_train r2_test adj_r2_train
## 1 12.9301 12.4195 0.391 0.2381 0.3866
regressor_metrics <- rbind(regressor_metrics, c("Backward elimination-1", r2_train , adj_r2_train, rmse_train , rmse_test , r2_test), stringsAsFactors = FALSE)
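As a hedged alternative to removing terms by hand, backward elimination can also be automated with MASS::stepAIC; the sketch below (not part of the original workflow) first drops the linearly dependent bt_1B column and the incomplete row so that the stepwise comparisons use a fixed data set:
# hedged sketch: automated backward elimination by AIC
train_complete <- na.omit(subset(training_set, select = -bt_1B))
step_model <- stepAIC(lm(WINS ~ ., data = train_complete),
                      direction = "backward", trace = FALSE)
formula(step_model)  # terms retained by the AIC-based search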
Performing a VIF test on the predictors to identify collinear independent variables (VIF greater than 5).
model1 <- lm(WINS ~ bt_H+ bt_2B+ bt_3B+ bt_HR+ bt_BB+ bt_SO+ br_SB+ ph_H+ ph_HR+ ph_BB + fd_E+ fd_DP+ BB, data = df)
car::vif(model1)
## bt_H bt_2B bt_3B bt_HR bt_BB bt_SO br_SB ph_H
## 4.014112 2.464797 3.033585 46.653155 5.981965 4.528813 2.695233 4.896792
## ph_HR ph_BB fd_E fd_DP BB
## 39.716223 4.674980 8.358823 1.898830 10.757738
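This pruning can also be automated; the helper below is a hedged sketch (vif_prune is hypothetical, not part of the original analysis) that repeatedly drops the predictor with the largest VIF until every value falls below a chosen threshold:
# hedged sketch: iteratively drop the predictor with the highest VIF
vif_prune <- function(model, threshold = 5) {
  repeat {
    vifs <- car::vif(model)
    if (max(vifs) < threshold) break
    worst <- names(which.max(vifs))
    model <- update(model, as.formula(paste(". ~ . -", worst)))
  }
  model
}
car::vif(vif_prune(model1))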
Remove the collinear variables bt_HR, ph_HR, BB, and bt_BB. Also drop bt_3B, bt_SO, and ph_H, whose p-values rise above 0.05 once the collinear terms are removed.
regressor_backward_E2 = lm(formula = WINS ~ bt_H+ bt_2B + br_SB + ph_BB + fd_E+ fd_DP ,data = training_set)
summary(regressor_backward_E2)
##
## Call:
## lm(formula = WINS ~ bt_H + bt_2B + br_SB + ph_BB + fd_E + fd_DP,
## data = training_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -53.579 -8.577 0.122 8.482 49.278
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.207764 3.593651 3.397 0.000696 ***
## bt_H 0.059449 0.002877 20.662 < 2e-16 ***
## bt_2B -0.021329 0.009053 -2.356 0.018579 *
## br_SB 0.040128 0.004117 9.747 < 2e-16 ***
## ph_BB 0.006392 0.001998 3.200 0.001399 **
## fd_E -0.038440 0.001792 -21.450 < 2e-16 ***
## fd_DP -0.084982 0.013246 -6.416 1.78e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.93 on 1819 degrees of freedom
## Multiple R-squared: 0.3511, Adjusted R-squared: 0.349
## F-statistic: 164.1 on 6 and 1819 DF, p-value: < 2.2e-16
plot(regressor_backward_E2$residuals)
abline(h = 0, lty = 3)
par(mfrow=c(2,2))
plot(regressor_backward_E2)
predictions = predict(regressor_backward_E2, newdata = test_set)
head(data.frame(
predictions = predictions,
actual = test_set$WINS
))
## predictions actual
## 5 70.70862 83
## 10 70.34577 77
## 21 77.07268 71
## 29 79.39163 82
## 46 84.77783 86
## 48 88.10494 99
rmse_test <- round(RMSE(predictions, test_set$WINS), digits=4)
r2_test <- round(R2(predictions, test_set$WINS), digits = 4)
adj_r2_train <- round(summary(regressor_backward_E2)$adj.r.squared, digits = 4)
r2_train <- round(summary(regressor_backward_E2)$r.squared, digits = 4)
rmse_train <- round(sqrt(mean(regressor_backward_E2$residuals^2)), digits = 4)
data.frame(
rmse_test = rmse_test,
rmse_train = rmse_train,
r2_train = r2_train,
r2_test = r2_test,
adj_r2_train = adj_r2_train
)
## rmse_test rmse_train r2_train r2_test adj_r2_train
## 1 12.7212 12.9056 0.3511 0.2413 0.349
regressor_metrics <- rbind(regressor_metrics, c("Backward elimination-2", r2_train , adj_r2_train, rmse_train , rmse_test , r2_test), stringsAsFactors = FALSE)
# Fitting a Multiple Linear Regression with squared predictors to the Training set
# Note: inside an R formula, (x^2) without I() is treated as plain x, so only
# bt_H and bt_2B are actually squared here (see the coefficient names in the summary below).
regressor_sq = lm(WINS ~ I(bt_H^2) + I(bt_2B^2) + (br_SB^2) + (ph_BB^2) + (fd_E^2) + (fd_DP^2),
data = training_set)
summary(regressor_sq)
##
## Call:
## lm(formula = WINS ~ I(bt_H^2) + I(bt_2B^2) + (br_SB^2) + (ph_BB^2) +
## (fd_E^2) + (fd_DP^2), data = training_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -52.831 -8.729 0.132 8.529 49.738
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.641e+01 2.486e+00 22.694 < 2e-16 ***
## I(bt_H^2) 1.858e-05 8.849e-07 21.002 < 2e-16 ***
## I(bt_2B^2) -3.519e-05 1.734e-05 -2.029 0.04256 *
## br_SB 4.152e-02 4.116e-03 10.088 < 2e-16 ***
## ph_BB 5.967e-03 1.998e-03 2.987 0.00286 **
## fd_E -3.987e-02 1.795e-03 -22.219 < 2e-16 ***
## fd_DP -8.490e-02 1.322e-02 -6.424 1.69e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.93 on 1819 degrees of freedom
## Multiple R-squared: 0.3508, Adjusted R-squared: 0.3487
## F-statistic: 163.8 on 6 and 1819 DF, p-value: < 2.2e-16
plot(regressor_sq$residuals)
abline(h = 0, lty = 3)
par(mfrow=c(2,2))
plot(regressor_sq)
As shown below, RMSE(test) improves slightly over the previous model, while the training R-squared drops marginally.
predictions = predict(regressor_sq, newdata = test_set)
head(data.frame(
predictions = predictions,
actual = test_set$WINS
))
## predictions actual
## 5 71.49189 83
## 10 71.68233 77
## 21 76.78273 71
## 29 79.73048 82
## 46 84.73991 86
## 48 88.01546 99
rmse_test <- round(RMSE(predictions, test_set$WINS), digits=4)
r2_test <- round(R2(predictions, test_set$WINS), digits = 4)
adj_r2_train <- round(summary(regressor_sq)$adj.r.squared, digits = 4)
r2_train <- round(summary(regressor_sq)$r.squared, digits = 4)
rmse_train <- round(sqrt(mean(regressor_sq$residuals^2)), digits = 4)
data.frame(
rmse_test = rmse_test,
rmse_train = rmse_train,
r2_train = r2_train,
r2_test = r2_test,
adj_r2_train = adj_r2_train
)
## rmse_test rmse_train r2_train r2_test adj_r2_train
## 1 12.6774 12.9088 0.3508 0.2459 0.3487
regressor_metrics <- rbind(regressor_metrics, c("Square Transformation", r2_train , adj_r2_train, rmse_train , rmse_test , r2_test), stringsAsFactors = FALSE)
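If every term were meant to be squared, each one would need to be wrapped in I(); a hedged sketch of that variant (regressor_sq_all is hypothetical, and its results would differ from those reported above):
# hedged sketch: the fully squared version of the model above
regressor_sq_all = lm(WINS ~ I(bt_H^2) + I(bt_2B^2) + I(br_SB^2) + I(ph_BB^2) + I(fd_E^2) + I(fd_DP^2),
data = training_set)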
# Fitting a Multiple Linear Regression with log-transformed predictors to the Training set
regressor_log = lm(WINS ~ log1p(bt_H)+ log1p(bt_2B) + log1p(br_SB) + log1p(ph_BB) + log1p(fd_E)+ log1p(fd_DP),
data = training_set)
summary(regressor_log)
##
## Call:
## lm(formula = WINS ~ log1p(bt_H) + log1p(bt_2B) + log1p(br_SB) +
## log1p(ph_BB) + log1p(fd_E) + log1p(fd_DP), data = training_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -54.379 -8.525 -0.049 8.348 50.265
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -453.2839 26.5293 -17.086 < 2e-16 ***
## log1p(bt_H) 97.0853 4.7161 20.586 < 2e-16 ***
## log1p(bt_2B) -9.6544 2.3418 -4.123 3.91e-05 ***
## log1p(br_SB) 4.8338 0.5874 8.229 3.56e-16 ***
## log1p(ph_BB) 3.5732 1.1583 3.085 0.00207 **
## log1p(fd_E) -14.5104 0.7717 -18.804 < 2e-16 ***
## log1p(fd_DP) -17.8524 1.8683 -9.555 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.02 on 1819 degrees of freedom
## Multiple R-squared: 0.3419, Adjusted R-squared: 0.3397
## F-statistic: 157.5 on 6 and 1819 DF, p-value: < 2.2e-16
plot(regressor_log$residuals)
abline(h = 0, lty = 3)
par(mfrow=c(2,2))
plot(regressor_log)
The residual plots develop a slight curve, and the OLS assumptions are not fully met.
predictions = predict(regressor_log, newdata = test_set)
head(data.frame(
predictions = predictions,
actual = test_set$WINS
))
## predictions actual
## 5 69.82943 83
## 10 69.84392 77
## 21 74.99316 71
## 29 82.81690 82
## 46 88.47094 86
## 48 94.84237 99
rmse_test <- round(RMSE(predictions, test_set$WINS), digits=4)
r2_test <- round(R2(predictions, test_set$WINS), digits = 4)
adj_r2_train <- round(summary(regressor_log)$adj.r.squared, digits = 4)
r2_train <- round(summary(regressor_log)$r.squared, digits = 4)
rmse_train <- round(sqrt(mean(regressor_log$residuals^2)), digits = 4)
data.frame(
rmse_test = rmse_test,
rmse_train = rmse_train,
r2_train = r2_train,
r2_test = r2_test,
adj_r2_train = adj_r2_train
)
## rmse_test rmse_train r2_train r2_test adj_r2_train
## 1 13.0509 12.9974 0.3419 0.2117 0.3397
regressor_metrics <- rbind(regressor_metrics, c("Log Transformation", r2_train , adj_r2_train, rmse_train , rmse_test , r2_test), stringsAsFactors = FALSE)
Trying out a Box-Cox transformation, applied to the best model so far (the Backward Elimination-2 model). The estimated lambda comes out close to 1, so the transformation would make little difference and there is no need to apply it.
bc = boxcox(regressor_backward_E2)
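The lambda that maximizes the profile log-likelihood can be read off the object returned by MASS::boxcox (a hedged one-liner; per the plot above, it should come out close to 1):
# lambda at the maximum of the Box-Cox profile log-likelihood
lambda_hat <- bc$x[which.max(bc$y)]
lambda_hat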
Performing 10-fold cross-validation on the same predictors to see whether it yields any improvement.
library(caret)
set.seed(123)
regression_cv <- train(
WINS ~ bt_H+ bt_2B + br_SB + ph_BB + fd_E+ fd_DP , training_set,
method = "lm",
trControl = trainControl(
method = "cv",
number =10,
verboseIter = TRUE
)
)
summary(regression_cv)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -53.579 -8.577 0.122 8.482 49.278
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.207764 3.593651 3.397 0.000696 ***
## bt_H 0.059449 0.002877 20.662 < 2e-16 ***
## bt_2B -0.021329 0.009053 -2.356 0.018579 *
## br_SB 0.040128 0.004117 9.747 < 2e-16 ***
## ph_BB 0.006392 0.001998 3.200 0.001399 **
## fd_E -0.038440 0.001792 -21.450 < 2e-16 ***
## fd_DP -0.084982 0.013246 -6.416 1.78e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.93 on 1819 degrees of freedom
## Multiple R-squared: 0.3511, Adjusted R-squared: 0.349
## F-statistic: 164.1 on 6 and 1819 DF, p-value: < 2.2e-16
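The cross-validated resampling estimates (as opposed to the in-sample summary above) are stored on the train object itself; a hedged sketch of how to inspect them:
# resampled (10-fold CV) estimates of RMSE, R-squared and MAE
regression_cv$results
head(regression_cv$resample)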
predictions = predict(regression_cv, newdata = test_set)
head(data.frame(
predictions = predictions,
actual = test_set$WINS
))
## predictions actual
## 5 70.70862 83
## 10 70.34577 77
## 21 77.07268 71
## 29 79.39163 82
## 46 84.77783 86
## 48 88.10494 99
rmse_test <- round(RMSE(predictions, test_set$WINS), digits=4)
r2_test <- round(R2(predictions, test_set$WINS), digits = 4)
adj_r2_train <- round(summary(regression_cv)$adj.r.squared, digits = 4)
r2_train <- round(summary(regression_cv)$r.squared, digits = 4)
# Note: a caret train object has no $residuals element (the fitted lm is stored in
# regression_cv$finalModel), so this expression evaluates to NaN below;
# regression_cv$finalModel$residuals would give the training residuals.
rmse_train <- round(sqrt(mean(regression_cv$residuals^2)), digits = 4)
data.frame(
rmse_test = rmse_test,
rmse_train = rmse_train,
r2_train = r2_train,
r2_test = r2_test,
adj_r2_train = adj_r2_train
)
## rmse_test rmse_train r2_train r2_test adj_r2_train
## 1 12.7212 NaN 0.3511 0.2413 0.349
regressor_metrics <- rbind(regressor_metrics, c("Cross Validation", r2_train , adj_r2_train, rmse_train , rmse_test , r2_test), stringsAsFactors = FALSE)
metrics <- c("Regressor", "R-Square(Train)", "Adjusted R-Square(Train)", "RMSE(Train)", "RMSE(Test)", "R-Square(Test)")
colnames(regressor_metrics) <- metrics
kable(regressor_metrics)
| Regressor | R-Square(Train) | Adjusted R-Square(Train) | RMSE(Train) | RMSE(Test) | R-Square(Test) |
|---|---|---|---|---|---|
| Full-Model | 0.3918 | 0.3868 | 12.4108 | 12.8927 | 0.241 |
| Backward elimination-1 | 0.391 | 0.3866 | 12.4195 | 12.9301 | 0.2381 |
| Backward elimination-2 | 0.3511 | 0.349 | 12.9056 | 12.7212 | 0.2413 |
| Square Transformation | 0.3508 | 0.3487 | 12.9088 | 12.6774 | 0.2459 |
| Log Transformation | 0.3419 | 0.3397 | 12.9974 | 13.0509 | 0.2117 |
| Cross Validation | 0.3511 | 0.349 | NaN | 12.7212 | 0.2413 |
The models built using Backward Elimination-2 and the square transformation look the best overall, with comparatively low RMSE(test) and comparatively good R-squared values.
The low R-squared and high RMSE indicate an underfitting problem: the predictors do not have a strong linear relationship with the response variable.
The residual plots show a slight curve, so the OLS assumptions are not fully met.
RMSE(train) and RMSE(test) are nearly the same for every model, so the metrics do not point to overfitting, although the train and test R-squared values differ somewhat. We will still apply some regularization techniques next and check whether the model improves.
Let's try ridge regression with cross-validation.
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 3.0-2
lambda <- 10^seq(-3, 3, length = 100)
# Build the model
set.seed(123)
ridge <- train(
WINS ~ bt_H+ bt_2B + br_SB + ph_BB + fd_E+ fd_DP , data = training_set, method = "glmnet",
trControl = trainControl("cv", number = 10),
tuneGrid = expand.grid(alpha = 0, lambda = lambda)
)
# Model coefficients
coef(ridge$finalModel, ridge$bestTune$lambda)
## 7 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 16.789041438
## bt_H 0.053359100
## bt_2B -0.007399118
## br_SB 0.035809230
## ph_BB 0.006233650
## fd_E -0.034062969
## fd_DP -0.080906289
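The penalty selected by cross-validation can be inspected through caret's bestTune slot (a hedged check, not shown in the original):
# lambda chosen by 10-fold CV from the grid defined above
ridge$bestTune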
As shown below, R-squared(test) and RMSE(test) show no meaningful improvement.
# Make predictions
predictions <- predict(ridge, test_set)
# Model prediction performance
head(data.frame(
predictions = predictions,
actual = test_set$WINS
))
## predictions actual
## 5 70.99658 83
## 10 71.00643 77
## 21 77.27990 71
## 29 79.50153 82
## 46 84.89866 86
## 48 88.04531 99
data.frame(
RMSE = RMSE(predictions, test_set$WINS),
Rsquare = R2(predictions, test_set$WINS)
)
## RMSE Rsquare
## 1 12.71836 0.2403934
Let's try lasso regression with cross-validation.
# Build the model
set.seed(123)
lasso <- train(
WINS ~ bt_H+ bt_2B + br_SB + ph_BB + fd_E+ fd_DP , data = training_set, method = "glmnet",
trControl = trainControl("cv", number = 10),
tuneGrid = expand.grid(alpha = 1, lambda = lambda)
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
# Model coefficients
coef(lasso$finalModel, lasso$bestTune$lambda)
## 7 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 12.536563485
## bt_H 0.058912750
## bt_2B -0.019708708
## br_SB 0.039811044
## ph_BB 0.006247125
## fd_E -0.038077537
## fd_DP -0.084283559
R-squared(test) and RMSE(test) show no meaningful improvement.
# Make predictions
predictions <- predict(lasso, test_set)
# Model prediction performance
data.frame(
RMSE = RMSE(predictions, test_set$WINS),
Rsquare = R2(predictions, test_set$WINS)
)
## RMSE Rsquare
## 1 12.72148 0.2410859
Let's try elastic net regression with cross-validation.
# Build the model
set.seed(123)
elastic <- train(
WINS ~bt_H+ bt_2B + br_SB + ph_BB + fd_E+ fd_DP , data = training_set, method = "glmnet",
trControl = trainControl("cv", number = 10),
tuneLength = 10
)
# Model coefficients
coef(elastic$finalModel, elastic$bestTune$lambda)
## 7 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 12.567573359
## bt_H 0.058888868
## bt_2B -0.019710177
## br_SB 0.039783371
## ph_BB 0.006269394
## fd_E -0.038058422
## fd_DP -0.084345388
R-squared(test) and RMSE(test) show no meaningful improvement.
# Make predictions
predictions <- predict(elastic, test_set)
# Model prediction performance
head(data.frame(
predictions = predictions,
actual = test_set$WINS
))
## predictions actual
## 5 70.72962 83
## 10 70.41252 77
## 21 77.10584 71
## 29 79.41875 82
## 46 84.80441 86
## 48 88.09797 99
data.frame(
RMSE = RMSE(predictions, test_set$WINS),
Rsquare = R2(predictions, test_set$WINS)
)
## RMSE Rsquare
## 1 12.72105 0.2411239
The regularization (overfitting) remedies had no noticeable effect, so there is no improvement over the best model identified above. As a final step, the chosen models are applied to the evaluation data set.
# Rename columns: strip the TEAM_/TARGET_ prefixes and shorten the category prefixes
trimColumn <- function(df) {
  names(df) <- sub("TEAM_", "", names(df))
  names(df) <- sub("BATTING_", "bt_", names(df))
  names(df) <- sub("BASERUN_", "br_", names(df))
  names(df) <- sub("FIELDING_", "fd_", names(df))
  names(df) <- sub("PITCHING_", "ph_", names(df))
  names(df) <- sub("TARGET_", "", names(df))
  df
}
LoadandpreprocessEvaluationSet <- function () {
  # load the training set
  df_train <- read.csv("https://raw.githubusercontent.com/mkivenson/Business-Analytics-Data-Mining/master/Moneyball%20Regression/moneyball-training-data.csv")[-1]
  # trim the column names
  df_train <- trimColumn(df_train)
  # drop the response variable from the training set; it has to be combined with the evaluation set, which requires matching columns
  df_train <- subset(df_train, select = -c(WINS))
  # create a new column to indicate whether a row belongs to the training set (0) or the evaluation set (1)
  df_train$type = 0
  # load the evaluation set
  df_eval <- read.csv("https://raw.githubusercontent.com/mkivenson/Business-Analytics-Data-Mining/master/Moneyball%20Regression/moneyball-evaluation-data.csv")[-1]
  # trim the evaluation column names
  df_eval <- trimColumn(df_eval)
  # flag the evaluation rows
  df_eval$type = 1
  # combine training and evaluation rows into one data frame and apply the required derived-variable transformations before imputing missing data
  df_full <- rbind(df_train, df_eval)
  df_full$bt_1B <- df_full$bt_H - df_full$bt_2B - df_full$bt_3B - df_full$bt_HR
  df_full$BB <- df_full$bt_BB / df_full$ph_BB
  # impute the missing data (imputeMissingData is defined in the preliminary analysis)
  df_full <- imputeMissingData(df_full)
  # keep only the evaluation rows and return them
  df_eval <- df_full[df_full$type == 1, ]
  df_eval
}
df_eval <- LoadandpreprocessEvaluationSet()
## Warning: Number of logged events: 1750
df_eval <- subset(df_eval, select = -c(type))
# shift the data points up by 1 to remove zeros (the same transformation was applied to the training set when building the models)
df_eval <- df_eval + 1
eval_data <- read.csv("https://raw.githubusercontent.com/mkivenson/Business-Analytics-Data-Mining/master/Moneyball%20Regression/moneyball-evaluation-data.csv")
# predict with the square-transformation model, undo the +1 shift, and round up to whole wins
predictions = predict(regressor_sq, newdata = df_eval)
eval_data$WINS <- ceiling(predictions - 1)
kable(eval_data[, c("INDEX", "WINS")])
| INDEX | WINS |
|---|---|
| 9 | 68 |
| 10 | 69 |
| 14 | 77 |
| 47 | 88 |
| 60 | 65 |
| 63 | 69 |
| 74 | 83 |
| 83 | 79 |
| 98 | 73 |
| 120 | 76 |
| 123 | 74 |
| 135 | 79 |
| 138 | 76 |
| 140 | 79 |
| 151 | 83 |
| 153 | 77 |
| 171 | 75 |
| 184 | 79 |
| 193 | 74 |
| 213 | 90 |
| 217 | 83 |
| 226 | 86 |
| 230 | 79 |
| 241 | 72 |
| 291 | 84 |
| 294 | 88 |
| 300 | 48 |
| 348 | 77 |
| 350 | 85 |
| 357 | 80 |
| 367 | 88 |
| 368 | 84 |
| 372 | 82 |
| 382 | 82 |
| 388 | 79 |
| 396 | 81 |
| 398 | 76 |
| 403 | 86 |
| 407 | 88 |
| 410 | 89 |
| 412 | 81 |
| 414 | 90 |
| 436 | 17 |
| 440 | 101 |
| 476 | 91 |
| 479 | 92 |
| 481 | 102 |
| 501 | 77 |
| 503 | 72 |
| 506 | 80 |
| 519 | 78 |
| 522 | 88 |
| 550 | 77 |
| 554 | 74 |
| 566 | 78 |
| 578 | 81 |
| 596 | 91 |
| 599 | 78 |
| 605 | 67 |
| 607 | 81 |
| 614 | 90 |
| 644 | 77 |
| 692 | 87 |
| 699 | 82 |
| 700 | 83 |
| 716 | 106 |
| 721 | 75 |
| 722 | 84 |
| 729 | 80 |
| 731 | 89 |
| 746 | 86 |
| 763 | 75 |
| 774 | 75 |
| 776 | 82 |
| 788 | 86 |
| 789 | 85 |
| 792 | 82 |
| 811 | 83 |
| 835 | 76 |
| 837 | 81 |
| 861 | 85 |
| 862 | 89 |
| 863 | 96 |
| 871 | 78 |
| 879 | 81 |
| 887 | 81 |
| 892 | 81 |
| 904 | 84 |
| 909 | 88 |
| 925 | 88 |
| 940 | 85 |
| 951 | 91 |
| 976 | 74 |
| 981 | 88 |
| 983 | 89 |
| 984 | 85 |
| 989 | 88 |
| 995 | 104 |
| 1000 | 88 |
| 1001 | 87 |
| 1007 | 81 |
| 1016 | 74 |
| 1027 | 84 |
| 1033 | 80 |
| 1070 | 79 |
| 1081 | 64 |
| 1084 | 58 |
| 1098 | 79 |
| 1150 | 84 |
| 1160 | 62 |
| 1169 | 82 |
| 1172 | 86 |
| 1174 | 95 |
| 1176 | 92 |
| 1178 | 83 |
| 1184 | 82 |
| 1193 | 87 |
| 1196 | 81 |
| 1199 | 78 |
| 1207 | 78 |
| 1218 | 92 |
| 1223 | 72 |
| 1226 | 71 |
| 1227 | 69 |
| 1229 | 72 |
| 1241 | 89 |
| 1244 | 92 |
| 1246 | 77 |
| 1248 | 90 |
| 1249 | 92 |
| 1253 | 88 |
| 1261 | 81 |
| 1305 | 81 |
| 1314 | 86 |
| 1323 | 85 |
| 1328 | 70 |
| 1353 | 75 |
| 1363 | 79 |
| 1371 | 92 |
| 1372 | 83 |
| 1389 | 70 |
| 1393 | 75 |
| 1421 | 92 |
| 1431 | 75 |
| 1437 | 75 |
| 1442 | 72 |
| 1450 | 77 |
| 1463 | 81 |
| 1464 | 81 |
| 1470 | 82 |
| 1471 | 80 |
| 1484 | 85 |
| 1495 | 27 |
| 1507 | 73 |
| 1514 | 80 |
| 1526 | 75 |
| 1549 | 86 |
| 1552 | 66 |
| 1556 | 91 |
| 1564 | 76 |
| 1585 | 98 |
| 1586 | 103 |
| 1590 | 90 |
| 1591 | 98 |
| 1592 | 91 |
| 1603 | 84 |
| 1612 | 80 |
| 1634 | 79 |
| 1645 | 75 |
| 1647 | 81 |
| 1673 | 91 |
| 1674 | 90 |
| 1687 | 80 |
| 1688 | 95 |
| 1700 | 82 |
| 1708 | 75 |
| 1713 | 77 |
| 1717 | 72 |
| 1721 | 74 |
| 1730 | 79 |
| 1737 | 88 |
| 1748 | 85 |
| 1749 | 86 |
| 1763 | 81 |
| 1768 | 91 |
| 1778 | 98 |
| 1780 | 91 |
| 1782 | 52 |
| 1784 | 65 |
| 1794 | 123 |
| 1803 | 73 |
| 1804 | 84 |
| 1819 | 78 |
| 1832 | 80 |
| 1833 | 84 |
| 1844 | 70 |
| 1847 | 78 |
| 1854 | 80 |
| 1855 | 77 |
| 1857 | 84 |
| 1864 | 76 |
| 1865 | 81 |
| 1869 | 76 |
| 1880 | 91 |
| 1881 | 83 |
| 1882 | 82 |
| 1894 | 82 |
| 1896 | 80 |
| 1916 | 81 |
| 1918 | 73 |
| 1921 | 104 |
| 1926 | 95 |
| 1938 | 80 |
| 1979 | 67 |
| 1982 | 68 |
| 1987 | 84 |
| 1997 | 80 |
| 2004 | 94 |
| 2011 | 80 |
| 2015 | 80 |
| 2022 | 81 |
| 2025 | 79 |
| 2027 | 82 |
| 2031 | 77 |
| 2036 | 93 |
| 2066 | 76 |
| 2073 | 83 |
| 2087 | 80 |
| 2092 | 82 |
| 2125 | 70 |
| 2148 | 82 |
| 2162 | 93 |
| 2191 | 76 |
| 2203 | 85 |
| 2218 | 79 |
| 2221 | 77 |
| 2225 | 86 |
| 2232 | 77 |
| 2267 | 86 |
| 2291 | 74 |
| 2299 | 88 |
| 2317 | 86 |
| 2318 | 86 |
| 2353 | 85 |
| 2403 | 68 |
| 2411 | 89 |
| 2415 | 82 |
| 2424 | 83 |
| 2441 | 74 |
| 2464 | 84 |
| 2465 | 80 |
| 2472 | 55 |
| 2481 | 94 |
| 2487 | 42 |
| 2500 | 72 |
| 2501 | 76 |
| 2520 | 85 |
| 2521 | 84 |
| 2525 | 77 |
# repeat the predictions with the Backward Elimination-2 model
predictions = predict(regressor_backward_E2, newdata = df_eval)
eval_data$WINS <- ceiling(predictions - 1)
kable(eval_data[, c("INDEX", "WINS")])
| INDEX | WINS |
|---|---|
| 9 | 67 |
| 10 | 67 |
| 14 | 77 |
| 47 | 88 |
| 60 | 66 |
| 63 | 69 |
| 74 | 83 |
| 83 | 79 |
| 98 | 72 |
| 120 | 76 |
| 123 | 74 |
| 135 | 80 |
| 138 | 77 |
| 140 | 80 |
| 151 | 83 |
| 153 | 77 |
| 171 | 74 |
| 184 | 79 |
| 193 | 74 |
| 213 | 90 |
| 217 | 83 |
| 226 | 86 |
| 230 | 79 |
| 241 | 72 |
| 291 | 84 |
| 294 | 89 |
| 300 | 51 |
| 348 | 77 |
| 350 | 86 |
| 357 | 80 |
| 367 | 88 |
| 368 | 85 |
| 372 | 82 |
| 382 | 82 |
| 388 | 79 |
| 396 | 81 |
| 398 | 75 |
| 403 | 86 |
| 407 | 88 |
| 410 | 89 |
| 412 | 82 |
| 414 | 91 |
| 436 | 16 |
| 440 | 102 |
| 476 | 92 |
| 479 | 93 |
| 481 | 103 |
| 501 | 76 |
| 503 | 71 |
| 506 | 80 |
| 519 | 79 |
| 522 | 89 |
| 550 | 77 |
| 554 | 73 |
| 566 | 78 |
| 578 | 81 |
| 596 | 91 |
| 599 | 77 |
| 605 | 65 |
| 607 | 81 |
| 614 | 90 |
| 644 | 77 |
| 692 | 88 |
| 699 | 82 |
| 700 | 83 |
| 716 | 106 |
| 721 | 76 |
| 722 | 85 |
| 729 | 80 |
| 731 | 89 |
| 746 | 86 |
| 763 | 75 |
| 774 | 75 |
| 776 | 83 |
| 788 | 86 |
| 789 | 86 |
| 792 | 82 |
| 811 | 83 |
| 835 | 75 |
| 837 | 81 |
| 861 | 85 |
| 862 | 89 |
| 863 | 96 |
| 871 | 79 |
| 879 | 82 |
| 887 | 81 |
| 892 | 81 |
| 904 | 85 |
| 909 | 88 |
| 925 | 89 |
| 940 | 85 |
| 951 | 92 |
| 976 | 74 |
| 981 | 89 |
| 983 | 89 |
| 984 | 85 |
| 989 | 88 |
| 995 | 103 |
| 1000 | 89 |
| 1001 | 88 |
| 1007 | 82 |
| 1016 | 74 |
| 1027 | 85 |
| 1033 | 80 |
| 1070 | 79 |
| 1081 | 65 |
| 1084 | 57 |
| 1098 | 79 |
| 1150 | 84 |
| 1160 | 61 |
| 1169 | 82 |
| 1172 | 86 |
| 1174 | 96 |
| 1176 | 92 |
| 1178 | 83 |
| 1184 | 82 |
| 1193 | 88 |
| 1196 | 81 |
| 1199 | 78 |
| 1207 | 78 |
| 1218 | 93 |
| 1223 | 70 |
| 1226 | 70 |
| 1227 | 66 |
| 1229 | 70 |
| 1241 | 89 |
| 1244 | 92 |
| 1246 | 77 |
| 1248 | 91 |
| 1249 | 92 |
| 1253 | 88 |
| 1261 | 82 |
| 1305 | 81 |
| 1314 | 87 |
| 1323 | 85 |
| 1328 | 71 |
| 1353 | 74 |
| 1363 | 79 |
| 1371 | 91 |
| 1372 | 83 |
| 1389 | 69 |
| 1393 | 74 |
| 1421 | 92 |
| 1431 | 75 |
| 1437 | 74 |
| 1442 | 71 |
| 1450 | 76 |
| 1463 | 81 |
| 1464 | 82 |
| 1470 | 82 |
| 1471 | 80 |
| 1484 | 85 |
| 1495 | 31 |
| 1507 | 73 |
| 1514 | 80 |
| 1526 | 74 |
| 1549 | 86 |
| 1552 | 67 |
| 1556 | 92 |
| 1564 | 76 |
| 1585 | 98 |
| 1586 | 103 |
| 1590 | 91 |
| 1591 | 98 |
| 1592 | 91 |
| 1603 | 85 |
| 1612 | 80 |
| 1634 | 79 |
| 1645 | 75 |
| 1647 | 81 |
| 1673 | 92 |
| 1674 | 90 |
| 1687 | 80 |
| 1688 | 95 |
| 1700 | 82 |
| 1708 | 74 |
| 1713 | 77 |
| 1717 | 72 |
| 1721 | 74 |
| 1730 | 79 |
| 1737 | 87 |
| 1748 | 85 |
| 1749 | 86 |
| 1763 | 81 |
| 1768 | 89 |
| 1778 | 98 |
| 1780 | 91 |
| 1782 | 53 |
| 1784 | 65 |
| 1794 | 120 |
| 1803 | 73 |
| 1804 | 84 |
| 1819 | 78 |
| 1832 | 81 |
| 1833 | 85 |
| 1844 | 70 |
| 1847 | 78 |
| 1854 | 80 |
| 1855 | 77 |
| 1857 | 84 |
| 1864 | 76 |
| 1865 | 82 |
| 1869 | 75 |
| 1880 | 92 |
| 1881 | 83 |
| 1882 | 82 |
| 1894 | 82 |
| 1896 | 80 |
| 1916 | 81 |
| 1918 | 73 |
| 1921 | 104 |
| 1926 | 96 |
| 1938 | 81 |
| 1979 | 66 |
| 1982 | 68 |
| 1987 | 85 |
| 1997 | 81 |
| 2004 | 94 |
| 2011 | 80 |
| 2015 | 79 |
| 2022 | 81 |
| 2025 | 79 |
| 2027 | 82 |
| 2031 | 76 |
| 2036 | 89 |
| 2066 | 75 |
| 2073 | 83 |
| 2087 | 80 |
| 2092 | 82 |
| 2125 | 71 |
| 2148 | 81 |
| 2162 | 93 |
| 2191 | 76 |
| 2203 | 85 |
| 2218 | 78 |
| 2221 | 76 |
| 2225 | 86 |
| 2232 | 77 |
| 2267 | 87 |
| 2291 | 74 |
| 2299 | 89 |
| 2317 | 86 |
| 2318 | 87 |
| 2353 | 85 |
| 2403 | 65 |
| 2411 | 89 |
| 2415 | 83 |
| 2424 | 83 |
| 2441 | 73 |
| 2464 | 84 |
| 2465 | 80 |
| 2472 | 56 |
| 2481 | 95 |
| 2487 | 34 |
| 2500 | 71 |
| 2501 | 76 |
| 2520 | 85 |
| 2521 | 84 |
| 2525 | 77 |