library(tidyverse)
library(MASS)
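#Note: MASS masks dplyr::select(), so select() is called as dplyr::select() below.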
library(PerformanceAnalytics)
library(mice)
library(DMwR)
library(HDCI)
library(caret)
library(repr)
setwd("~/CUNY/DATA 621/HW 1")
mbeval <- read_csv('moneyball-evaluation-data.csv')
mbtrain <- read_csv('moneyball-training-data.csv')
glimpse(mbtrain)
## Observations: 2,276
## Variables: 17
## $ INDEX            <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 11, 12, 13, 15, 16, 1...
## $ TARGET_WINS      <dbl> 39, 70, 86, 70, 82, 75, 80, 85, 86, 76, 78, 6...
## $ TEAM_BATTING_H   <dbl> 1445, 1339, 1377, 1387, 1297, 1279, 1244, 127...
## $ TEAM_BATTING_2B  <dbl> 194, 219, 232, 209, 186, 200, 179, 171, 197, ...
## $ TEAM_BATTING_3B  <dbl> 39, 22, 35, 38, 27, 36, 54, 37, 40, 18, 27, 3...
## $ TEAM_BATTING_HR  <dbl> 13, 190, 137, 96, 102, 92, 122, 115, 114, 96,...
## $ TEAM_BATTING_BB  <dbl> 143, 685, 602, 451, 472, 443, 525, 456, 447, ...
## $ TEAM_BATTING_SO  <dbl> 842, 1075, 917, 922, 920, 973, 1062, 1027, 92...
## $ TEAM_BASERUN_SB  <dbl> NA, 37, 46, 43, 49, 107, 80, 40, 69, 72, 60, ...
## $ TEAM_BASERUN_CS  <dbl> NA, 28, 27, 30, 39, 59, 54, 36, 27, 34, 39, 7...
## $ TEAM_BATTING_HBP <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ TEAM_PITCHING_H  <dbl> 9364, 1347, 1377, 1396, 1297, 1279, 1244, 128...
## $ TEAM_PITCHING_HR <dbl> 84, 191, 137, 97, 102, 92, 122, 116, 114, 96,...
## $ TEAM_PITCHING_BB <dbl> 927, 689, 602, 454, 472, 443, 525, 459, 447, ...
## $ TEAM_PITCHING_SO <dbl> 5456, 1082, 917, 928, 920, 973, 1062, 1033, 9...
## $ TEAM_FIELDING_E  <dbl> 1011, 193, 175, 164, 138, 123, 136, 112, 127,...
## $ TEAM_FIELDING_DP <dbl> NA, 155, 153, 156, 168, 149, 186, 136, 169, 1...
skimr::skim(mbtrain)
Data summary
Name mbtrain
Number of rows 2276
Number of columns 17
_______________________
Column type frequency:
numeric 17
________________________
Group variables None

Variable type: numeric

skim_variable     n_missing  complete_rate     mean       sd    p0      p25     p50      p75   p100  hist
INDEX                     0           1.00  1268.46   736.35     1   630.75  1270.5  1915.50   2535  ▇▇▇▇▇
TARGET_WINS               0           1.00    80.79    15.75     0    71.00    82.0    92.00    146  ▁▁▇▅▁
TEAM_BATTING_H            0           1.00  1469.27   144.59   891  1383.00  1454.0  1537.25   2554  ▁▇▂▁▁
TEAM_BATTING_2B           0           1.00   241.25    46.80    69   208.00   238.0   273.00    458  ▁▆▇▂▁
TEAM_BATTING_3B           0           1.00    55.25    27.94     0    34.00    47.0    72.00    223  ▇▇▂▁▁
TEAM_BATTING_HR           0           1.00    99.61    60.55     0    42.00   102.0   147.00    264  ▇▆▇▅▁
TEAM_BATTING_BB           0           1.00   501.56   122.67     0   451.00   512.0   580.00    878  ▁▁▇▇▁
TEAM_BATTING_SO         102           0.96   735.61   248.53     0   548.00   750.0   930.00   1399  ▁▆▇▇▁
TEAM_BASERUN_SB         131           0.94   124.76    87.79     0    66.00   101.0   156.00    697  ▇▃▁▁▁
TEAM_BASERUN_CS         772           0.66    52.80    22.96     0    38.00    49.0    62.00    201  ▃▇▁▁▁
TEAM_BATTING_HBP       2085           0.08    59.36    12.97    29    50.50    58.0    67.00     95  ▂▇▇▅▁
TEAM_PITCHING_H           0           1.00  1779.21  1406.84  1137  1419.00  1518.0  1682.50  30132  ▇▁▁▁▁
TEAM_PITCHING_HR          0           1.00   105.70    61.30     0    50.00   107.0   150.00    343  ▇▇▆▁▁
TEAM_PITCHING_BB          0           1.00   553.01   166.36     0   476.00   536.5   611.00   3645  ▇▁▁▁▁
TEAM_PITCHING_SO        102           0.96   817.73   553.09     0   615.00   813.5   968.00  19278  ▇▁▁▁▁
TEAM_FIELDING_E           0           1.00   246.48   227.77    65   127.00   159.0   249.25   1898  ▇▁▁▁▁
TEAM_FIELDING_DP        286           0.87   146.39    26.23    52   131.00   149.0   164.00    228  ▁▂▇▆▁
chart.Correlation(mbtrain, histogram = TRUE)

From skim() we can see a high percentage of missing data in TEAM_BASERUN_CS (34% missing) and TEAM_BATTING_HBP (92% missing). The correlation matrix also shows that neither field has a statistically significant relationship with wins. For these reasons we will remove both columns.

mbtrain1 <- mbtrain %>% dplyr::select(-c(TEAM_BASERUN_CS, TEAM_BATTING_HBP))

We still have missing data in TEAM_BATTING_SO, TEAM_BASERUN_SB, TEAM_PITCHING_SO, and TEAM_FIELDING_DP. Let's see how different methods of imputation affect our model. We can omit rows with missing values, replace them with the column median, use kNN, or use mice.
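
The first option, omitting incomplete rows, is a one-liner; a minimal sketch of that baseline (not used below, since it would discard every row with any missing value):

#Listwise-deletion baseline: keep only the complete rows.
mbtraincc <- na.omit(mbtrain1)
nrow(mbtraincc)

First, though, let's look at the distributions of the four affected columns: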

foo<-mbtrain1 %>% 
  dplyr::select(TEAM_BATTING_SO, TEAM_BASERUN_SB, TEAM_PITCHING_SO, TEAM_FIELDING_DP)
  
ggplot(gather(foo),aes(value)) +
  geom_histogram(bins = 500) +
  facet_wrap(~key)

Let’s impute the data.

#Since lm() uses complete cases by default, let's start by replacing NAs with the column median.
mbtrainmed <- mbtrain1 %>% 
   mutate_all(~ifelse(is.na(.), median(., na.rm = TRUE), .))
#Next, let's use kNN imputation from DMwR.
mbtrainnkk <- knnImputation(as.data.frame(mbtrain1))
#And finally, mice with random-forest imputation.
mbtrainmice <- complete(mice(mbtrain1, method = "rf"))
## 
##  iter imp variable
##   1   1  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   1   2  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_PITCHING_SO  TEAM_FIELDING_DP
##   (... identical logging lines truncated: 5 iterations x 5 imputations ...)
##   5   5  TEAM_BATTING_SO  TEAM_BASERUN_SB  TEAM_PITCHING_SO  TEAM_FIELDING_DP
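
As an optional diagnostic (a sketch, not part of the original run): keeping the mids object from mice() instead of calling complete() inline lets us compare imputed and observed distributions, and setting a seed makes the stochastic imputation reproducible.

#Hypothetical refinement: keep the mids object for diagnostics.
imp <- mice(mbtrain1, method = "rf", seed = 123)
densityplot(imp)  #imputed (red) vs. observed (blue) densities
mbtrainmice <- complete(imp)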

Let’s pick the imputation method for our regressions by comparing each method’s imputed values against the observed values from the complete cases.

nonnacols <- mbtrain1[complete.cases(mbtrain1),]
nonnacols <- nonnacols %>% 
  dplyr::select(INDEX, TEAM_BATTING_SO, TEAM_BASERUN_SB, TEAM_PITCHING_SO, TEAM_FIELDING_DP)

nacols <- mbtrain1[!complete.cases(mbtrain1),]
nacols <- nacols %>% 
  dplyr::select(INDEX)

nacolsmed <- merge(nacols, mbtrainmed, by = "INDEX")
nacolsmed <- nacolsmed %>% 
  dplyr::select(INDEX, TEAM_BATTING_SO, TEAM_BASERUN_SB, TEAM_PITCHING_SO, TEAM_FIELDING_DP)
nacolsnkk <- merge(nacols, mbtrainnkk, by = "INDEX")
nacolsnkk <- nacolsnkk %>% 
  dplyr::select(INDEX, TEAM_BATTING_SO, TEAM_BASERUN_SB, TEAM_PITCHING_SO, TEAM_FIELDING_DP)
nacolsmice <- merge(nacols, mbtrainmice, by = "INDEX")
nacolsmice <- nacolsmice %>% 
  dplyr::select(INDEX, TEAM_BATTING_SO, TEAM_BASERUN_SB, TEAM_PITCHING_SO, TEAM_FIELDING_DP)

#We can't score the whole dataset at once, so instead we test the column with the most missing values, TEAM_FIELDING_DP.
naevalmed <- regr.eval(nonnacols$TEAM_FIELDING_DP, nacolsmed$TEAM_FIELDING_DP)
naevalnkk <- regr.eval(nonnacols$TEAM_FIELDING_DP, nacolsnkk$TEAM_FIELDING_DP)
naevalmice <- regr.eval(nonnacols$TEAM_FIELDING_DP, nacolsmice$TEAM_FIELDING_DP)
base::rbind(naevalmed, naevalnkk, naevalmice)
##                 mae      mse     rmse      mape
## naevalmed  30.84578 1646.843 40.58131 0.2040520
## naevalnkk  38.55577 2130.493 46.15727 0.2454723
## naevalmice 45.01853 2789.778 52.81835 0.2901541
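
One caveat: the two vectors passed to regr.eval() above have different lengths (complete vs. incomplete rows), so R recycles the shorter one and the metrics are a rough distributional comparison rather than a per-value error. A more direct check, sketched here as a hypothetical alternative, would mask a sample of known values, impute them, and score against the held-out truth:

#Hypothetical mask-and-score check (not the method used above).
set.seed(42)
obs <- which(!is.na(mbtrain1$TEAM_FIELDING_DP))
hide <- sample(obs, length(obs) %/% 10)  #hold out 10% of the observed values
masked <- mbtrain1
masked$TEAM_FIELDING_DP[hide] <- NA
medimp <- median(masked$TEAM_FIELDING_DP, na.rm = TRUE)
regr.eval(mbtrain1$TEAM_FIELDING_DP[hide], rep(medimp, length(hide)))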

The root mean squared error and mean absolute percentage error are lowest when missing values are replaced with the median of TEAM_FIELDING_DP, so we will use median imputation for our models.

For our models, we will first fit a linear regression and remove variables that do not reach statistical significance.
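
The backward elimination we perform next can also be automated; a minimal sketch, using a hypothetical back_eliminate() helper that repeatedly drops the least significant predictor until every remaining p-value clears a threshold:

#Hypothetical helper: automated backward elimination by p-value.
back_eliminate <- function(fit, alpha = 0.05) {
  repeat {
    p <- summary(fit)$coefficients[-1, "Pr(>|t|)"]  #p-values, intercept dropped
    if (all(p < alpha)) return(fit)
    worst <- names(which.max(p))  #least significant remaining term
    fit <- update(fit, as.formula(paste(". ~ . -", worst)))
  }
}
#e.g. back_eliminate(lm(TARGET_WINS ~ . - INDEX, mbtrainmed))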

lmmed <- lm(TARGET_WINS~., mbtrainmed)
#summary(lmmed)
#Remove the least significant variable one at a time and refit until all remaining variables are significant.
lmmed <- lm(TARGET_WINS~. -TEAM_PITCHING_BB -TEAM_PITCHING_HR -INDEX, mbtrainmed)
#Imputing median values in eval data.
mbeval1 <- mbeval %>% 
   mutate_all(~ifelse(is.na(.), median(., na.rm = TRUE), .))
lmpred<-predict(lmmed, mbeval1)
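
One caveat: the imputation above uses the evaluation set's own medians. A sketch of an alternative (an assumption about best practice, not what was run here) that reuses the training medians so both sets are filled consistently:

#Hypothetical alternative: impute eval columns with the training medians.
train_med <- purrr::map_dbl(mbtrain1, median, na.rm = TRUE)
mbeval1b <- mbeval %>% 
  mutate(across(any_of(names(train_med)),
                ~ifelse(is.na(.), train_med[cur_column()], .)))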

Let’s see what we got:

summary(lmmed)
## 
## Call:
## lm(formula = TARGET_WINS ~ . - TEAM_PITCHING_BB - TEAM_PITCHING_HR - 
##     INDEX, data = mbtrainmed)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -49.598  -8.593   0.085   8.445  58.582 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      22.3440443  5.2338369   4.269 2.04e-05 ***
## TEAM_BATTING_H    0.0490922  0.0036699  13.377  < 2e-16 ***
## TEAM_BATTING_2B  -0.0213744  0.0091626  -2.333 0.019746 *  
## TEAM_BATTING_3B   0.0665763  0.0166230   4.005 6.40e-05 ***
## TEAM_BATTING_HR   0.0674046  0.0096315   6.998 3.40e-12 ***
## TEAM_BATTING_BB   0.0115464  0.0033748   3.421 0.000634 ***
## TEAM_BATTING_SO  -0.0085211  0.0024529  -3.474 0.000523 ***
## TEAM_BASERUN_SB   0.0249207  0.0042092   5.920 3.70e-09 ***
## TEAM_PITCHING_H  -0.0007770  0.0003209  -2.421 0.015552 *  
## TEAM_PITCHING_SO  0.0029662  0.0006719   4.415 1.06e-05 ***
## TEAM_FIELDING_E  -0.0190100  0.0023919  -7.948 2.97e-15 ***
## TEAM_FIELDING_DP -0.1217894  0.0129296  -9.419  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.07 on 2264 degrees of freedom
## Multiple R-squared:  0.3151, Adjusted R-squared:  0.3117 
## F-statistic: 94.68 on 11 and 2264 DF,  p-value: < 2.2e-16
plot(lmmed)

regr.eval(mbtrain$TARGET_WINS, lmpred)
##       mae       mse      rmse      mape 
##  14.59687 356.40579  18.87871       Inf

The residuals do not have even variance and the Q-Q plot is not linear, so the model's assumptions look violated. (The infinite MAPE arises because TARGET_WINS contains zeros, and the metrics above compare evaluation-set predictions against training-set wins, so they are a rough gauge at best.) This is not the right model for our predictions.
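
The uneven variance could be checked formally with a Breusch-Pagan test; a sketch (assumes the lmtest package, which is not loaded above, is installed):

#Optional heteroscedasticity check (assumes lmtest is installed).
lmtest::bptest(lmmed)  #a small p-value rejects constant residual variance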

Let’s try a Lasso regression instead.

#Removing the INDEX column (no predictive value) and the TARGET_WINS response from the predictor set.
mbtrainmed1 <- mbtrainmed[,c(-1,-2)]
mblasso <- Lasso(mbtrainmed1, mbtrainmed$TARGET_WINS, fix.lambda = FALSE)
#Removing same columns from eval that were removed from training.
mbeval2 <- mbeval[,c(-1, -9,-10)]
#Imputing median values in eval data.
mbeval2 <- mbeval2 %>% 
   mutate_all(~ifelse(is.na(.), median(., na.rm = TRUE), .))
lassopred<-mypredict(mblasso, mbeval2)
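
For reference, a sketch of the same fit expressed directly with glmnet (an assumption: HDCI's Lasso() with fix.lambda = FALSE plays a role analogous to cv.glmnet's cross-validated lambda):

#glmnet-based sketch of the lasso fit above (assumes glmnet is installed).
library(glmnet)
x <- as.matrix(mbtrainmed1)
cvfit <- cv.glmnet(x, mbtrainmed$TARGET_WINS, alpha = 1)  #alpha = 1 is the lasso penalty
coef(cvfit, s = "lambda.min")  #coefficients at the CV-chosen lambda
glmnetpred <- predict(cvfit, newx = as.matrix(mbeval2), s = "lambda.min")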

Let’s analyze our results. Our acceptance criteria are a lower RMSE and well-behaved residual plots.

regr.eval(mbtrain$TARGET_WINS, lassopred)
##       mae       mse      rmse      mape 
##  14.25809 338.25174  18.39162       Inf

The lasso prediction has a slightly lower RMSE, but the two models are very close. Diagnostic plots are not available for the lasso fit, and with such a similar RMSE the accuracy does not appear to have meaningfully improved. In the future, I'll try to engineer new, more predictive variables or apply Box-Cox transformations to make the existing skewed variables more symmetric so they better fit the model. A sketch of that idea follows.
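
As a sketch of that Box-Cox idea (using caret, which is already loaded; preProcess() estimates a transformation only for strictly positive columns):

#Hypothetical Box-Cox preprocessing for a future iteration.
bc <- preProcess(as.data.frame(mbtrainmed1), method = "BoxCox")
mbtrainbc <- predict(bc, as.data.frame(mbtrainmed1))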