library(tidyverse)
library(MASS)
library(PerformanceAnalytics)
library(mice)
library(DMwR)
library(HDCI)
library(caret)
library(repr)
setwd("~/CUNY/DATA 621/HW 1")
mbeval <- read_csv('moneyball-evaluation-data.csv')
mbtrain <- read_csv('moneyball-training-data.csv')
glimpse(mbtrain)
## Observations: 2,276
## Variables: 17
## $ INDEX <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 11, 12, 13, 15, 16, 1...
## $ TARGET_WINS <dbl> 39, 70, 86, 70, 82, 75, 80, 85, 86, 76, 78, 6...
## $ TEAM_BATTING_H <dbl> 1445, 1339, 1377, 1387, 1297, 1279, 1244, 127...
## $ TEAM_BATTING_2B <dbl> 194, 219, 232, 209, 186, 200, 179, 171, 197, ...
## $ TEAM_BATTING_3B <dbl> 39, 22, 35, 38, 27, 36, 54, 37, 40, 18, 27, 3...
## $ TEAM_BATTING_HR <dbl> 13, 190, 137, 96, 102, 92, 122, 115, 114, 96,...
## $ TEAM_BATTING_BB <dbl> 143, 685, 602, 451, 472, 443, 525, 456, 447, ...
## $ TEAM_BATTING_SO <dbl> 842, 1075, 917, 922, 920, 973, 1062, 1027, 92...
## $ TEAM_BASERUN_SB <dbl> NA, 37, 46, 43, 49, 107, 80, 40, 69, 72, 60, ...
## $ TEAM_BASERUN_CS <dbl> NA, 28, 27, 30, 39, 59, 54, 36, 27, 34, 39, 7...
## $ TEAM_BATTING_HBP <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ TEAM_PITCHING_H <dbl> 9364, 1347, 1377, 1396, 1297, 1279, 1244, 128...
## $ TEAM_PITCHING_HR <dbl> 84, 191, 137, 97, 102, 92, 122, 116, 114, 96,...
## $ TEAM_PITCHING_BB <dbl> 927, 689, 602, 454, 472, 443, 525, 459, 447, ...
## $ TEAM_PITCHING_SO <dbl> 5456, 1082, 917, 928, 920, 973, 1062, 1033, 9...
## $ TEAM_FIELDING_E <dbl> 1011, 193, 175, 164, 138, 123, 136, 112, 127,...
## $ TEAM_FIELDING_DP <dbl> NA, 155, 153, 156, 168, 149, 186, 136, 169, 1...
skimr::skim(mbtrain)
Name | mbtrain |
Number of rows | 2276 |
Number of columns | 17 |
_______________________ | |
Column type frequency: | |
numeric | 17 |
________________________ | |
Group variables | None |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
INDEX | 0 | 1.00 | 1268.46 | 736.35 | 1 | 630.75 | 1270.5 | 1915.50 | 2535 | ▇▇▇▇▇ |
TARGET_WINS | 0 | 1.00 | 80.79 | 15.75 | 0 | 71.00 | 82.0 | 92.00 | 146 | ▁▁▇▅▁ |
TEAM_BATTING_H | 0 | 1.00 | 1469.27 | 144.59 | 891 | 1383.00 | 1454.0 | 1537.25 | 2554 | ▁▇▂▁▁ |
TEAM_BATTING_2B | 0 | 1.00 | 241.25 | 46.80 | 69 | 208.00 | 238.0 | 273.00 | 458 | ▁▆▇▂▁ |
TEAM_BATTING_3B | 0 | 1.00 | 55.25 | 27.94 | 0 | 34.00 | 47.0 | 72.00 | 223 | ▇▇▂▁▁ |
TEAM_BATTING_HR | 0 | 1.00 | 99.61 | 60.55 | 0 | 42.00 | 102.0 | 147.00 | 264 | ▇▆▇▅▁ |
TEAM_BATTING_BB | 0 | 1.00 | 501.56 | 122.67 | 0 | 451.00 | 512.0 | 580.00 | 878 | ▁▁▇▇▁ |
TEAM_BATTING_SO | 102 | 0.96 | 735.61 | 248.53 | 0 | 548.00 | 750.0 | 930.00 | 1399 | ▁▆▇▇▁ |
TEAM_BASERUN_SB | 131 | 0.94 | 124.76 | 87.79 | 0 | 66.00 | 101.0 | 156.00 | 697 | ▇▃▁▁▁ |
TEAM_BASERUN_CS | 772 | 0.66 | 52.80 | 22.96 | 0 | 38.00 | 49.0 | 62.00 | 201 | ▃▇▁▁▁ |
TEAM_BATTING_HBP | 2085 | 0.08 | 59.36 | 12.97 | 29 | 50.50 | 58.0 | 67.00 | 95 | ▂▇▇▅▁ |
TEAM_PITCHING_H | 0 | 1.00 | 1779.21 | 1406.84 | 1137 | 1419.00 | 1518.0 | 1682.50 | 30132 | ▇▁▁▁▁ |
TEAM_PITCHING_HR | 0 | 1.00 | 105.70 | 61.30 | 0 | 50.00 | 107.0 | 150.00 | 343 | ▇▇▆▁▁ |
TEAM_PITCHING_BB | 0 | 1.00 | 553.01 | 166.36 | 0 | 476.00 | 536.5 | 611.00 | 3645 | ▇▁▁▁▁ |
TEAM_PITCHING_SO | 102 | 0.96 | 817.73 | 553.09 | 0 | 615.00 | 813.5 | 968.00 | 19278 | ▇▁▁▁▁ |
TEAM_FIELDING_E | 0 | 1.00 | 246.48 | 227.77 | 65 | 127.00 | 159.0 | 249.25 | 1898 | ▇▁▁▁▁ |
TEAM_FIELDING_DP | 286 | 0.87 | 146.39 | 26.23 | 52 | 131.00 | 149.0 | 164.00 | 228 | ▁▂▇▆▁ |
chart.Correlation(mbtrain, histogram = TRUE)
From skim() we can see there is a high percentage of missing data for baserun_cs and batting_hbp. Also from the correlation matrix we can see that those two fields do not have a statistically significant relationship with wins. For these reasons we are going to remove those columns.
mbtrain1 <- mbtrain %>% subset(select = -c(TEAM_BASERUN_CS, TEAM_BATTING_HBP))
We still have missing data in Batting_SO, Baserun_SB, Pitching_SO, and Fielding_DP. Lets see how different methods of imputation effect our model. We can omit rows with missing values, replace them with the median, use kNN, or mice.
foo<-mbtrain1 %>%
dplyr::select(TEAM_BATTING_SO, TEAM_BASERUN_SB, TEAM_PITCHING_SO, TEAM_FIELDING_DP)
ggplot(gather(foo),aes(value)) +
geom_histogram(bins = 500) +
facet_wrap(~key)
Let’s impute the data.
#Since lm() uses case complete by default, let's start with replacing NA's with the column median.
mbtrainmed <- mbtrain1 %>%
mutate_all(~ifelse(is.na(.), median(., na.rm = TRUE), .))
#Next lets use kNN
mbtrainnkk <- knnImputation(as.data.frame(mbtrain1))
#And finally, mice based on random forrest
mbtrainmice <- complete(mice(mbtrain1, method = "rf"))
##
## iter imp variable
## 1 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_PITCHING_SO TEAM_FIELDING_DP
## 1 2 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_PITCHING_SO TEAM_FIELDING_DP
## 1 3 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_PITCHING_SO TEAM_FIELDING_DP
## 1 4 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_PITCHING_SO TEAM_FIELDING_DP
## 1 5 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_PITCHING_SO TEAM_FIELDING_DP
## 2 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_PITCHING_SO TEAM_FIELDING_DP
## 2 2 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_PITCHING_SO TEAM_FIELDING_DP
## 2 3 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_PITCHING_SO TEAM_FIELDING_DP
## 2 4 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_PITCHING_SO TEAM_FIELDING_DP
## 2 5 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_PITCHING_SO TEAM_FIELDING_DP
## 3 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_PITCHING_SO TEAM_FIELDING_DP
## 3 2 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_PITCHING_SO TEAM_FIELDING_DP
## 3 3 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_PITCHING_SO TEAM_FIELDING_DP
## 3 4 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_PITCHING_SO TEAM_FIELDING_DP
## 3 5 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_PITCHING_SO TEAM_FIELDING_DP
## 4 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_PITCHING_SO TEAM_FIELDING_DP
## 4 2 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_PITCHING_SO TEAM_FIELDING_DP
## 4 3 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_PITCHING_SO TEAM_FIELDING_DP
## 4 4 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_PITCHING_SO TEAM_FIELDING_DP
## 4 5 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_PITCHING_SO TEAM_FIELDING_DP
## 5 1 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_PITCHING_SO TEAM_FIELDING_DP
## 5 2 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_PITCHING_SO TEAM_FIELDING_DP
## 5 3 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_PITCHING_SO TEAM_FIELDING_DP
## 5 4 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_PITCHING_SO TEAM_FIELDING_DP
## 5 5 TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_PITCHING_SO TEAM_FIELDING_DP
Let’s pick the imputation method we will use for our regressions.
nonnacols <- mbtrain1[complete.cases(mbtrain1),]
nonnacols <- nonnacols %>%
dplyr::select(INDEX, TEAM_BATTING_SO, TEAM_BASERUN_SB, TEAM_PITCHING_SO, TEAM_FIELDING_DP)
nacols <- mbtrain1[!complete.cases(mbtrain1),]
nacols <- nacols %>%
dplyr::select(INDEX)
nacolsmed <- merge(nacols, mbtrainmed, by = "INDEX")
nacolsmed <- nacolsmed %>%
dplyr::select(INDEX, TEAM_BATTING_SO, TEAM_BASERUN_SB, TEAM_PITCHING_SO, TEAM_FIELDING_DP)
nacolsnkk <- merge(nacols, mbtrainnkk, by = "INDEX")
nacolsnkk <- nacolsnkk %>%
dplyr::select(INDEX, TEAM_BATTING_SO, TEAM_BASERUN_SB, TEAM_PITCHING_SO, TEAM_FIELDING_DP)
nacolsmice <- merge(nacols, mbtrainmice, by = "INDEX")
nacolsmice <- nacolsmice %>%
dplyr::select(INDEX, TEAM_BATTING_SO, TEAM_BASERUN_SB, TEAM_PITCHING_SO, TEAM_FIELDING_DP)
#I discovered I can't test the dataset as a whole so instead I'm testing the column with the most missing values.
naevalmed <- regr.eval(nonnacols$TEAM_FIELDING_DP, nacolsmed$TEAM_FIELDING_DP)
naevalnkk <- regr.eval(nonnacols$TEAM_FIELDING_DP, nacolsnkk$TEAM_FIELDING_DP)
naevalmice <- regr.eval(nonnacols$TEAM_FIELDING_DP, nacolsmice$TEAM_FIELDING_DP)
base::rbind(naevalmed, naevalnkk, naevalmice)
## mae mse rmse mape
## naevalmed 30.84578 1646.843 40.58131 0.2040520
## naevalnkk 38.55577 2130.493 46.15727 0.2454723
## naevalmice 45.01853 2789.778 52.81835 0.2901541
The root mean squared error and mean absolute percentage error are lowest when values are imputed with the median of TEAM_FIELDING_DP. For this reason we will use this method for out models.
For our models, first we’re going to run a linear regression. Variables that don’t reach statistical significance will be removed.
lmmed <- lm(TARGET_WINS~., mbtrainmed)
#summary(lmmed)
#I will now removed one variable at a time based on which is the least significant and retested the model until all variables are significant.
lmmed <- lm(TARGET_WINS~. -TEAM_PITCHING_BB -TEAM_PITCHING_HR -INDEX, mbtrainmed)
summary(lmmed)
##
## Call:
## lm(formula = TARGET_WINS ~ . - TEAM_PITCHING_BB - TEAM_PITCHING_HR -
## INDEX, data = mbtrainmed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.598 -8.593 0.085 8.445 58.582
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.3440443 5.2338369 4.269 2.04e-05 ***
## TEAM_BATTING_H 0.0490922 0.0036699 13.377 < 2e-16 ***
## TEAM_BATTING_2B -0.0213744 0.0091626 -2.333 0.019746 *
## TEAM_BATTING_3B 0.0665763 0.0166230 4.005 6.40e-05 ***
## TEAM_BATTING_HR 0.0674046 0.0096315 6.998 3.40e-12 ***
## TEAM_BATTING_BB 0.0115464 0.0033748 3.421 0.000634 ***
## TEAM_BATTING_SO -0.0085211 0.0024529 -3.474 0.000523 ***
## TEAM_BASERUN_SB 0.0249207 0.0042092 5.920 3.70e-09 ***
## TEAM_PITCHING_H -0.0007770 0.0003209 -2.421 0.015552 *
## TEAM_PITCHING_SO 0.0029662 0.0006719 4.415 1.06e-05 ***
## TEAM_FIELDING_E -0.0190100 0.0023919 -7.948 2.97e-15 ***
## TEAM_FIELDING_DP -0.1217894 0.0129296 -9.419 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.07 on 2264 degrees of freedom
## Multiple R-squared: 0.3151, Adjusted R-squared: 0.3117
## F-statistic: 94.68 on 11 and 2264 DF, p-value: < 2.2e-16
#Imputing median values in eval data.
mbeval1 <- mbeval %>%
mutate_all(~ifelse(is.na(.), median(., na.rm = TRUE), .))
lmpred<-predict(lmmed, mbeval1)
Let’s see what we got:
summary(lmmed)
##
## Call:
## lm(formula = TARGET_WINS ~ . - TEAM_PITCHING_BB - TEAM_PITCHING_HR -
## INDEX, data = mbtrainmed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.598 -8.593 0.085 8.445 58.582
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.3440443 5.2338369 4.269 2.04e-05 ***
## TEAM_BATTING_H 0.0490922 0.0036699 13.377 < 2e-16 ***
## TEAM_BATTING_2B -0.0213744 0.0091626 -2.333 0.019746 *
## TEAM_BATTING_3B 0.0665763 0.0166230 4.005 6.40e-05 ***
## TEAM_BATTING_HR 0.0674046 0.0096315 6.998 3.40e-12 ***
## TEAM_BATTING_BB 0.0115464 0.0033748 3.421 0.000634 ***
## TEAM_BATTING_SO -0.0085211 0.0024529 -3.474 0.000523 ***
## TEAM_BASERUN_SB 0.0249207 0.0042092 5.920 3.70e-09 ***
## TEAM_PITCHING_H -0.0007770 0.0003209 -2.421 0.015552 *
## TEAM_PITCHING_SO 0.0029662 0.0006719 4.415 1.06e-05 ***
## TEAM_FIELDING_E -0.0190100 0.0023919 -7.948 2.97e-15 ***
## TEAM_FIELDING_DP -0.1217894 0.0129296 -9.419 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.07 on 2264 degrees of freedom
## Multiple R-squared: 0.3151, Adjusted R-squared: 0.3117
## F-statistic: 94.68 on 11 and 2264 DF, p-value: < 2.2e-16
plot(lmmed)
regr.eval(mbtrain$TARGET_WINS, lmpred)
## mae mse rmse mape
## 14.59687 356.40579 18.87871 Inf
The residual plots are not evenly varried and the QQ plot is not linear. This is not the right model for our predictions.
Let’s try a Lasso regression instead.
#Removing index column. Not of predictive value.
mbtrainmed1 <- mbtrainmed[,c(-1,-2)]
mblasso <- Lasso(mbtrainmed1, mbtrainmed$TARGET_WINS, fix.lambda = FALSE)
#Removing same columns from eval that were removed from training.
mbeval2 <- mbeval[,c(-1, -9,-10)]
#Imputing median values in eval data.
mbeval2 <- mbeval2 %>%
mutate_all(~ifelse(is.na(.), median(., na.rm = TRUE), .))
lassopred<-mypredict(mblasso, mbeval2)
Let’s analyze our results. Acceptance criteria is a lower rmse and valid residual plots.
regr.eval(mbtrain$TARGET_WINS, lassopred)
## mae mse rmse mape
## 14.25809 338.25174 18.39162 Inf
It appears that the lasso prediction has a lower rmse but it is very close. Plots are not working for the Lasso regression but with such a close rmse, the accuracy does not appear to have improved. In the future, I’ll try to create new values that are more predictive or use BoxCox transformations to make existing variables more evenly distributed to better fit the model.