library(corrplot)
library(psych)
library(ggplot2)
library(gridExtra)
library(car)
library(mice)
library(VIM)
library(caret)
library(dplyr)
library(MASS)
Here, we read the dataset and shorten the feature names for better readability in visualizations.
df <- read.csv("https://raw.githubusercontent.com/mkivenson/Business-Analytics-Data-Mining/master/Moneyball%20Regression/moneyball-training-data.csv")[-1]
names(df) <- sub("TEAM_", "", names(df))
names(df) <- sub("BATTING_", "bt_", names(df))
names(df) <- sub("BASERUN_", "br_", names(df))
names(df) <- sub("FIELDING_", "fd_", names(df))
names(df) <- sub("PITCHING_", "ph_", names(df))
names(df) <- sub("TARGET_", "", names(df))
head(df)
## WINS bt_H bt_2B bt_3B bt_HR bt_BB bt_SO br_SB br_CS bt_HBP ph_H ph_HR
## 1 39 1445 194 39 13 143 842 NA NA NA 9364 84
## 2 70 1339 219 22 190 685 1075 37 28 NA 1347 191
## 3 86 1377 232 35 137 602 917 46 27 NA 1377 137
## 4 70 1387 209 38 96 451 922 43 30 NA 1396 97
## 5 82 1297 186 27 102 472 920 49 39 NA 1297 102
## 6 75 1279 200 36 92 443 973 107 59 NA 1279 92
## ph_BB ph_SO fd_E fd_DP
## 1 927 5456 1011 NA
## 2 689 1082 193 155
## 3 602 917 175 153
## 4 454 928 164 156
## 5 472 920 138 168
## 6 443 973 123 149
First, we take a look at a summary of the data. A few things of interest are revealed: several features (bt_SO, br_SB, br_CS, bt_HBP, ph_SO, fd_DP) contain missing values, and the maximums of ph_H, ph_BB, ph_SO, and fd_E sit far above their third quartiles.
summary(df)
## WINS bt_H bt_2B bt_3B
## Min. : 0.00 Min. : 891 Min. : 69.0 Min. : 0.00
## 1st Qu.: 71.00 1st Qu.:1383 1st Qu.:208.0 1st Qu.: 34.00
## Median : 82.00 Median :1454 Median :238.0 Median : 47.00
## Mean : 80.79 Mean :1469 Mean :241.2 Mean : 55.25
## 3rd Qu.: 92.00 3rd Qu.:1537 3rd Qu.:273.0 3rd Qu.: 72.00
## Max. :146.00 Max. :2554 Max. :458.0 Max. :223.00
##
## bt_HR bt_BB bt_SO br_SB
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 42.00 1st Qu.:451.0 1st Qu.: 548.0 1st Qu.: 66.0
## Median :102.00 Median :512.0 Median : 750.0 Median :101.0
## Mean : 99.61 Mean :501.6 Mean : 735.6 Mean :124.8
## 3rd Qu.:147.00 3rd Qu.:580.0 3rd Qu.: 930.0 3rd Qu.:156.0
## Max. :264.00 Max. :878.0 Max. :1399.0 Max. :697.0
## NA's :102 NA's :131
## br_CS bt_HBP ph_H ph_HR
## Min. : 0.0 Min. :29.00 Min. : 1137 Min. : 0.0
## 1st Qu.: 38.0 1st Qu.:50.50 1st Qu.: 1419 1st Qu.: 50.0
## Median : 49.0 Median :58.00 Median : 1518 Median :107.0
## Mean : 52.8 Mean :59.36 Mean : 1779 Mean :105.7
## 3rd Qu.: 62.0 3rd Qu.:67.00 3rd Qu.: 1682 3rd Qu.:150.0
## Max. :201.0 Max. :95.00 Max. :30132 Max. :343.0
## NA's :772 NA's :2085
## ph_BB ph_SO fd_E fd_DP
## Min. : 0.0 Min. : 0.0 Min. : 65.0 Min. : 52.0
## 1st Qu.: 476.0 1st Qu.: 615.0 1st Qu.: 127.0 1st Qu.:131.0
## Median : 536.5 Median : 813.5 Median : 159.0 Median :149.0
## Mean : 553.0 Mean : 817.7 Mean : 246.5 Mean :146.4
## 3rd Qu.: 611.0 3rd Qu.: 968.0 3rd Qu.: 249.2 3rd Qu.:164.0
## Max. :3645.0 Max. :19278.0 Max. :1898.0 Max. :228.0
## NA's :102 NA's :286
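The missingness hinted at in the summary can be tallied directly (a quick check computing the proportion of NA values per column):
round(colMeans(is.na(df)), 3)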
Next, we create histograms of each feature and the target variable.
grid.arrange(ggplot(df, aes(bt_H)) + geom_histogram(binwidth = 30),
ggplot(df, aes(bt_2B)) + geom_histogram(binwidth = 10),
ggplot(df, aes(bt_3B)) + geom_histogram(binwidth = 10),
ggplot(df, aes(bt_HR)) + geom_histogram(binwidth = 10),
ggplot(df, aes(bt_BB)) + geom_histogram(binwidth = 30),
ggplot(df, aes(bt_SO)) + geom_histogram(binwidth = 50),
ggplot(df, aes(br_SB)) + geom_histogram(binwidth = 30),
ggplot(df, aes(br_CS)) + geom_histogram(binwidth = 10),
ggplot(df, aes(bt_HBP)) + geom_histogram(binwidth = 3),
ggplot(df, aes(ph_H)) + geom_histogram(binwidth = 100),
ggplot(df, aes(ph_HR)) + geom_histogram(binwidth = 10),
ggplot(df, aes(ph_BB)) + geom_histogram(binwidth = 100),
ggplot(df, aes(ph_SO)) + geom_histogram(binwidth = 30),
ggplot(df, aes(fd_E)) + geom_histogram(binwidth = 30),
ggplot(df, aes(fd_DP)) + geom_histogram(binwidth = 10),
ggplot(df, aes(WINS)) + geom_histogram(binwidth = 5),
ncol=4)
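An equivalent, more compact way to build such panel grids (a sketch using tidyr::pivot_longer and faceting; not how the grid above was produced) is:
library(tidyr)
df %>%
  pivot_longer(everything(), names_to = "feature", values_to = "value") %>%
  ggplot(aes(value)) +
  geom_histogram(bins = 30) +
  facet_wrap(~ feature, scales = "free")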
grid.arrange(ggplot(df, aes(x = "bt_H", y = bt_H))+geom_boxplot(),
ggplot(df, aes(x = "bt_2B", y = bt_2B))+geom_boxplot(),
ggplot(df, aes(x = "bt_3B", y = bt_3B))+geom_boxplot(),
ggplot(df, aes(x = "bt_HR", y = bt_HR))+geom_boxplot(),
ggplot(df, aes(x = "bt_BB", y = bt_BB))+geom_boxplot(),
ggplot(df, aes(x = "bt_SO", y = bt_SO))+geom_boxplot(),
ggplot(df, aes(x = "br_SB", y = br_SB))+geom_boxplot(),
ggplot(df, aes(x = "br_CS", y = br_CS))+geom_boxplot(),
ggplot(df, aes(x = "bt_HBP", y = bt_HBP))+geom_boxplot(),
ggplot(df, aes(x = "ph_H", y = ph_H))+geom_boxplot(),
ggplot(df, aes(x = "ph_HR", y = ph_HR))+geom_boxplot(),
ggplot(df, aes(x = "ph_BB", y = ph_BB))+geom_boxplot(),
ggplot(df, aes(x = "ph_SO", y = ph_SO))+geom_boxplot(),
ggplot(df, aes(x = "fd_E", y = fd_E))+geom_boxplot(),
ggplot(df, aes(x = "fd_DP", y = fd_DP))+geom_boxplot(),
ggplot(df, aes(x = "WINS", y = WINS))+geom_boxplot(),
ncol=4)
Next, we examine the pairwise correlations among all variables.
corrplot(cor(df, use = "complete.obs"), method="color", type="lower", tl.col = "black", tl.srt = 25)
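To read the strongest relationships with the target numerically rather than off the color grid (a quick sketch):
cors <- cor(df, use = "complete.obs")[, "WINS"]
sort(cors[names(cors) != "WINS"], decreasing = TRUE)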
Here, we see scatter plots of each feature variable against the target variable.
grid.arrange(ggplot(df, aes(bt_H, WINS)) + geom_point(),
ggplot(df, aes(bt_2B, WINS)) + geom_point(),
ggplot(df, aes(bt_3B, WINS)) + geom_point(),
ggplot(df, aes(bt_HR, WINS)) + geom_point(),
ggplot(df, aes(bt_BB, WINS)) + geom_point(),
ggplot(df, aes(bt_SO, WINS)) + geom_point(),
ggplot(df, aes(br_SB, WINS)) + geom_point(),
ggplot(df, aes(br_CS, WINS)) + geom_point(),
ggplot(df, aes(bt_HBP, WINS)) + geom_point(),
ggplot(df, aes(ph_H, WINS)) + geom_point(),
ggplot(df, aes(ph_HR, WINS)) + geom_point(),
ggplot(df, aes(ph_BB, WINS)) + geom_point(),
ggplot(df, aes(ph_SO, WINS)) + geom_point(),
ggplot(df, aes(fd_E, WINS)) + geom_point(),
ggplot(df, aes(fd_DP, WINS)) + geom_point(),
ncol=4)
While exploring the data, we noticed that the maximum values of ph_H, ph_BB, ph_SO, and fd_E seem abnormally high.
The record for most hits allowed in a season by a team (ph_H) was set at 1,724 in 1921, and even though the datapoints were normalized to a 162-game season, that cannot account for values like the observed maximum of 30,132. To take a moderate approach, we will remove only the most egregious outliers in these variables.
grid.arrange(ggplot(df, aes(x = "ph_H", y = ph_H))+geom_boxplot(),
ggplot(df, aes(x = "ph_BB", y = ph_BB))+geom_boxplot(),
ggplot(df, aes(x = "ph_SO", y = ph_SO))+geom_boxplot(),
ggplot(df, aes(x = "fd_E", y = fd_E))+geom_boxplot(),
ncol=4)
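Before filtering, we can count how many rows each cutoff flags (a quick check; the thresholds themselves are judgment calls):
sapply(list(ph_H = df$ph_H >= 15000, ph_BB = df$ph_BB >= 1500,
            ph_SO = df$ph_SO >= 3000, fd_E = df$fd_E >= 1500),
       sum, na.rm = TRUE)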
# Keep rows below all four cutoffs; & is needed here, since with | a row
# survives whenever any single condition holds and almost nothing is removed
df <- filter(df, ph_H < 15000 & ph_BB < 1500 & ph_SO < 3000 & fd_E < 1500)
We will also remove influential outliers using Cook's distance.
mod <- lm(WINS ~ ., data=df)
cooksd <- cooks.distance(mod)
plot(cooksd, pch="*", cex=2, main="Influential Outliers by Cook's distance")
abline(h = 4*mean(cooksd, na.rm=TRUE), col="red")  # add cutoff line
text(x=seq_along(cooksd)+1, y=cooksd, labels=ifelse(cooksd>4*mean(cooksd, na.rm=TRUE), names(cooksd), ""), col="red")  # label points above the cutoff
We remove the influential outliers.
influential <- as.numeric(names(cooksd)[cooksd > 4*mean(cooksd, na.rm=TRUE)])
df <- df[-influential, ]
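As a quick sanity check on how aggressive this cutoff is (a sketch):
length(influential)  # rows flagged by the cutoff
nrow(df)             # rows remaining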
The following features have missing values. Since most values in bt_HBP are missing (about 90%), we will drop this feature. We will use Multivariate Imputation by Chained Equations (mice) to fill in the remaining missing values.
df <- subset(df, select = -c(bt_HBP))
aggr_plot <- aggr(df, col=c('navyblue','red'), numbers=TRUE, sortVars=TRUE, labels=names(df), cex.axis=.7, gap=3, ylab=c("Histogram of missing data","Pattern"))
##
## Variables sorted by number of missings:
## Variable Count
## br_CS 0.33934066
## fd_DP 0.12571429
## br_SB 0.05758242
## bt_SO 0.04483516
## ph_SO 0.04483516
## WINS 0.00000000
## bt_H 0.00000000
## bt_2B 0.00000000
## bt_3B 0.00000000
## bt_HR 0.00000000
## bt_BB 0.00000000
## ph_H 0.00000000
## ph_HR 0.00000000
## ph_BB 0.00000000
## fd_E 0.00000000
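The imputation itself is not shown above; a minimal sketch of the mice step the text describes (m = 5 imputations and predictive mean matching are mice defaults; the seed is arbitrary):
imp <- mice(df, m = 5, method = "pmm", seed = 123, printFlag = FALSE)
df <- mice::complete(imp, 1)  # take the first completed dataset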
We will begin with all independent variables and use backward elimination to remove the non-significant ones.
be_lm1 <- lm(WINS ~., data = df)
summary(be_lm1)
##
## Call:
## lm(formula = WINS ~ ., data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -47.840 -8.261 0.255 8.063 58.803
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.126e+01 7.397e+00 9.634 < 2e-16 ***
## bt_H 4.312e-02 3.685e-03 11.700 < 2e-16 ***
## bt_2B -2.292e-02 8.895e-03 -2.576 0.010051 *
## bt_3B 6.010e-02 1.661e-02 3.619 0.000302 ***
## bt_HR 1.705e-01 3.091e-02 5.514 3.90e-08 ***
## bt_BB 2.336e-02 6.080e-03 3.843 0.000125 ***
## bt_SO -1.179e-02 2.504e-03 -4.708 2.66e-06 ***
## br_SB 4.750e-02 5.313e-03 8.940 < 2e-16 ***
## br_CS -8.480e-04 1.071e-02 -0.079 0.936878
## ph_H 1.000e-03 4.370e-04 2.289 0.022175 *
## ph_HR -8.838e-02 2.837e-02 -3.116 0.001859 **
## ph_BB -8.976e-03 4.329e-03 -2.073 0.038249 *
## ph_SO 1.158e-03 8.935e-04 1.296 0.195121
## fd_E -5.340e-02 3.370e-03 -15.845 < 2e-16 ***
## fd_DP -1.128e-01 1.233e-02 -9.147 < 2e-16 ***
## bt_1B NA NA NA NA
## BB -4.075e+01 5.829e+00 -6.991 3.58e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.54 on 2258 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.3634, Adjusted R-squared: 0.3592
## F-statistic: 85.94 on 15 and 2258 DF, p-value: < 2.2e-16
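Backward elimination can also be automated with stepAIC from MASS (a sketch under the assumption that AIC-based selection is acceptable; the manual elimination below keeps its own subset of variables):
be_step <- stepAIC(lm(WINS ~ ., data = na.omit(df)),
                   direction = "backward", trace = FALSE)
summary(be_step)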
We start by eliminating the variables with high p-values and the lowest significance from the model. Let's take a look at the resulting model:
be_lm1_1 <- lm(WINS ~ bt_H + bt_SO + br_SB + ph_HR + fd_E + fd_DP + BB, data = df)
summary(be_lm1_1)
##
## Call:
## lm(formula = WINS ~ bt_H + bt_SO + br_SB + ph_HR + fd_E + fd_DP +
## BB, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -50.406 -8.681 0.188 8.571 50.992
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 55.032320 6.046549 9.101 < 2e-16 ***
## bt_H 0.041900 0.002537 16.517 < 2e-16 ***
## bt_SO -0.012711 0.002075 -6.126 1.06e-09 ***
## br_SB 0.049784 0.003897 12.774 < 2e-16 ***
## ph_HR 0.064094 0.008003 8.009 1.83e-15 ***
## fd_E -0.045279 0.002865 -15.804 < 2e-16 ***
## fd_DP -0.104410 0.011864 -8.800 < 2e-16 ***
## BB -15.324908 3.851413 -3.979 7.14e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.71 on 2266 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.3433, Adjusted R-squared: 0.3413
## F-statistic: 169.2 on 7 and 2266 DF, p-value: < 2.2e-16
We see high significance indicators and near-zero p-values across all seven remaining variables; however, our R-squared value is rather low at about 0.34.
The next step is to examine the residual plot and Q-Q plot to check the validity of our model.
plot(be_lm1_1$residuals)
abline(h = 0, lty = 3)
qqnorm(be_lm1_1$residuals)
qqline(be_lm1_1$residuals)
Both plots suggest the model is reasonable: no pattern is evident in the residuals, and the normality assumption holds approximately, despite some outliers.
We use a Box-Cox transformation to determine whether a transformation of the response is required.
boxcox(be_lm1_1, plotit = TRUE)
Lambda is close to 1, so no transformation is needed.
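To read the optimal lambda off numerically rather than from the plot (a quick sketch):
bc <- boxcox(be_lm1_1, plotit = FALSE)
bc$x[which.max(bc$y)]  # lambda maximizing the profile log-likelihood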
This linear model will be built using the variables we believe have the highest correlation with WINS. The following variables will be used:
- Base hits by batters (1B, 2B, 3B, HR)
- Walks by batters
- Stolen bases
- Strikeouts by batters
- Errors
- Strikeouts by pitchers
- Double plays
- Hits allowed
be_lm2b <- lm(WINS ~ bt_H + bt_BB + br_SB + bt_SO + fd_E + ph_SO + fd_DP + ph_H, df)
summary(be_lm2b)
##
## Call:
## lm(formula = WINS ~ bt_H + bt_BB + br_SB + bt_SO + fd_E + ph_SO +
## fd_DP + ph_H, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.371 -8.782 0.096 8.404 48.478
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.7559399 4.0363642 3.656 0.000262 ***
## bt_H 0.0516053 0.0022156 23.292 < 2e-16 ***
## bt_BB 0.0148564 0.0030068 4.941 8.34e-07 ***
## br_SB 0.0402606 0.0040219 10.010 < 2e-16 ***
## bt_SO -0.0023137 0.0016423 -1.409 0.159035
## fd_E -0.0368034 0.0026176 -14.060 < 2e-16 ***
## ph_SO 0.0007058 0.0006636 1.063 0.287674
## fd_DP -0.1018142 0.0124835 -8.156 5.68e-16 ***
## ph_H 0.0011261 0.0003389 3.323 0.000904 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.88 on 2266 degrees of freedom
## Multiple R-squared: 0.3335, Adjusted R-squared: 0.3312
## F-statistic: 141.7 on 8 and 2266 DF, p-value: < 2.2e-16
Let's remove the two variables with low significance (bt_SO and ph_SO). Note that the I(x^1/2) terms below do not take square roots: in R, ^ binds tighter than /, so x^1/2 parses as (x^1)/2 and each predictor is simply halved, which rescales the coefficients without changing the fit.
# I(x^1/2) is x/2, not sqrt(x); a true square root would be I(x^(1/2)) or sqrt(x)
be_lm2b <- lm(WINS ~ I(bt_H^1/2) + I(bt_BB^1/2) + I(br_SB^1/2) + I(fd_E^1/2) + I(fd_DP^1/2) + I(ph_H^1/2), df)
summary(be_lm2b)
##
## Call:
## lm(formula = WINS ~ I(bt_H^1/2) + I(bt_BB^1/2) + I(br_SB^1/2) +
## I(fd_E^1/2) + I(fd_DP^1/2) + I(ph_H^1/2), data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -48.801 -8.782 0.074 8.372 47.873
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.9481124 3.3773778 3.834 0.00013 ***
## I(bt_H^1/2) 0.1039919 0.0040143 25.906 < 2e-16 ***
## I(bt_BB^1/2) 0.0292321 0.0060052 4.868 1.21e-06 ***
## I(br_SB^1/2) 0.0814860 0.0077994 10.448 < 2e-16 ***
## I(fd_E^1/2) -0.0726319 0.0048402 -15.006 < 2e-16 ***
## I(fd_DP^1/2) -0.2063788 0.0248440 -8.307 < 2e-16 ***
## I(ph_H^1/2) 0.0025359 0.0005943 4.267 2.06e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.88 on 2268 degrees of freedom
## Multiple R-squared: 0.3329, Adjusted R-squared: 0.3311
## F-statistic: 188.6 on 6 and 2268 DF, p-value: < 2.2e-16
Again, we check the residual and Q-Q plots for this model:
plot(be_lm2b$residuals)
abline(h = 0, lty = 3)
qqnorm(be_lm2b$residuals)
qqline(be_lm2b$residuals)
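If a square-root transform had actually been intended, the formula would need I(x^(1/2)) or sqrt(); a sketch of that version (the outputs above were produced with the halved terms):
be_lm2b_sqrt <- lm(WINS ~ sqrt(bt_H) + sqrt(bt_BB) + sqrt(br_SB) +
                     sqrt(fd_E) + sqrt(fd_DP) + sqrt(ph_H), data = df)
summary(be_lm2b_sqrt)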
Finally, we read the evaluation dataset and apply the same feature renaming:
dfeval <- read.csv("https://raw.githubusercontent.com/mkivenson/Business-Analytics-Data-Mining/master/Moneyball%20Regression/moneyball-evaluation-data.csv")[-1]
names(dfeval) <- sub("TEAM_", "", names(dfeval))
names(dfeval) <- sub("BATTING_", "bt_", names(dfeval))
names(dfeval) <- sub("BASERUN_", "br_", names(dfeval))
names(dfeval) <- sub("FIELDING_", "fd_", names(dfeval))
names(dfeval) <- sub("PITCHING_", "ph_", names(dfeval))
names(dfeval) <- sub("TARGET_", "", names(dfeval))
head(dfeval)
## bt_H bt_2B bt_3B bt_HR bt_BB bt_SO br_SB br_CS bt_HBP ph_H ph_HR ph_BB
## 1 1209 170 33 83 447 1080 62 50 NA 1209 83 447
## 2 1221 151 29 88 516 929 54 39 NA 1221 88 516
## 3 1395 183 29 93 509 816 59 47 NA 1395 93 509
## 4 1539 309 29 159 486 914 148 57 42 1539 159 486
## 5 1445 203 68 5 95 416 NA NA NA 3902 14 257
## 6 1431 236 53 10 215 377 NA NA NA 2793 20 420
## ph_SO fd_E fd_DP
## 1 1080 140 156
## 2 929 135 164
## 3 816 156 153
## 4 914 124 154
## 5 1123 616 130
## 6 736 572 105
We apply the selected model to the evaluation data. Predictions come back as NA wherever an input value is missing, so we fill those entries with the mean prediction.
pred <- predict(be_lm2b, dfeval)
final <- data.frame(Win_Pred = pred)
mean_pred <- mean(final$Win_Pred, na.rm = TRUE)
final$Win_Pred[is.na(final$Win_Pred)] <- mean_pred  # fill NA predictions with the mean
#write.csv(final,"Baseball_pred.csv", row.names = FALSE)
We close with the standard diagnostic plots for the selected model.
plot(be_lm2b)
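Since the evaluation file has no WINS column, out-of-sample accuracy can only be estimated from the training data. A cross-validated check with caret (loaded above) might look like this (a sketch; the seed, folds, and formula are choices, not part of the original analysis):
set.seed(42)  # arbitrary seed for reproducible folds
ctrl <- trainControl(method = "cv", number = 10)
cv_fit <- train(WINS ~ bt_H + bt_BB + br_SB + fd_E + fd_DP + ph_H,
                data = na.omit(df), method = "lm", trControl = ctrl)
cv_fit$results  # RMSE, R-squared, and MAE averaged across folds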