Load datasets

# Read the training CSV and keep only columns 2 through 17 (the first column
# of the file is discarded), then print per-column summary statistics.
train <- read.csv("datasets/moneyball-training-data.csv")[, 2:17]
summary(train)
##   TARGET_WINS     TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B 
##  Min.   :  0.00   Min.   : 891   Min.   : 69.0   Min.   :  0.00  
##  1st Qu.: 71.00   1st Qu.:1383   1st Qu.:208.0   1st Qu.: 34.00  
##  Median : 82.00   Median :1454   Median :238.0   Median : 47.00  
##  Mean   : 80.79   Mean   :1469   Mean   :241.2   Mean   : 55.25  
##  3rd Qu.: 92.00   3rd Qu.:1537   3rd Qu.:273.0   3rd Qu.: 72.00  
##  Max.   :146.00   Max.   :2554   Max.   :458.0   Max.   :223.00  
##                                                                  
##  TEAM_BATTING_HR  TEAM_BATTING_BB TEAM_BATTING_SO  TEAM_BASERUN_SB
##  Min.   :  0.00   Min.   :  0.0   Min.   :   0.0   Min.   :  0.0  
##  1st Qu.: 42.00   1st Qu.:451.0   1st Qu.: 548.0   1st Qu.: 66.0  
##  Median :102.00   Median :512.0   Median : 750.0   Median :101.0  
##  Mean   : 99.61   Mean   :501.6   Mean   : 735.6   Mean   :124.8  
##  3rd Qu.:147.00   3rd Qu.:580.0   3rd Qu.: 930.0   3rd Qu.:156.0  
##  Max.   :264.00   Max.   :878.0   Max.   :1399.0   Max.   :697.0  
##                                   NA's   :102      NA's   :131    
##  TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
##  Min.   :  0.0   Min.   :29.00    Min.   : 1137   Min.   :  0.0   
##  1st Qu.: 38.0   1st Qu.:50.50    1st Qu.: 1419   1st Qu.: 50.0   
##  Median : 49.0   Median :58.00    Median : 1518   Median :107.0   
##  Mean   : 52.8   Mean   :59.36    Mean   : 1779   Mean   :105.7   
##  3rd Qu.: 62.0   3rd Qu.:67.00    3rd Qu.: 1682   3rd Qu.:150.0   
##  Max.   :201.0   Max.   :95.00    Max.   :30132   Max.   :343.0   
##  NA's   :772     NA's   :2085                                     
##  TEAM_PITCHING_BB TEAM_PITCHING_SO  TEAM_FIELDING_E  TEAM_FIELDING_DP
##  Min.   :   0.0   Min.   :    0.0   Min.   :  65.0   Min.   : 52.0   
##  1st Qu.: 476.0   1st Qu.:  615.0   1st Qu.: 127.0   1st Qu.:131.0   
##  Median : 536.5   Median :  813.5   Median : 159.0   Median :149.0   
##  Mean   : 553.0   Mean   :  817.7   Mean   : 246.5   Mean   :146.4   
##  3rd Qu.: 611.0   3rd Qu.:  968.0   3rd Qu.: 249.2   3rd Qu.:164.0   
##  Max.   :3645.0   Max.   :19278.0   Max.   :1898.0   Max.   :228.0   
##                   NA's   :102                        NA's   :286
# Preview the first ten rows of the training data.
head(train, 10)
##    TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B TEAM_BATTING_HR
## 1           39           1445             194              39              13
## 2           70           1339             219              22             190
## 3           86           1377             232              35             137
## 4           70           1387             209              38              96
## 5           82           1297             186              27             102
## 6           75           1279             200              36              92
## 7           80           1244             179              54             122
## 8           85           1273             171              37             115
## 9           86           1391             197              40             114
## 10          76           1271             213              18              96
##    TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS
## 1              143             842              NA              NA
## 2              685            1075              37              28
## 3              602             917              46              27
## 4              451             922              43              30
## 5              472             920              49              39
## 6              443             973             107              59
## 7              525            1062              80              54
## 8              456            1027              40              36
## 9              447             922              69              27
## 10             441             827              72              34
##    TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR TEAM_PITCHING_BB
## 1                NA            9364               84              927
## 2                NA            1347              191              689
## 3                NA            1377              137              602
## 4                NA            1396               97              454
## 5                NA            1297              102              472
## 6                NA            1279               92              443
## 7                NA            1244              122              525
## 8                NA            1281              116              459
## 9                NA            1391              114              447
## 10               NA            1271               96              441
##    TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 1              5456            1011               NA
## 2              1082             193              155
## 3               917             175              153
## 4               928             164              156
## 5               920             138              168
## 6               973             123              149
## 7              1062             136              186
## 8              1033             112              136
## 9               922             127              169
## 10              827             131              159

Exploratory Data Analysis

Wins versus Team Hits

# Scatter plot of wins (y) against team batting hits (x). An explicit title
# and axis labels are supplied because the defaults would otherwise display
# the raw `train$...` expressions; the x label matches the hits histogram.
plot(train$TEAM_BATTING_H, train$TARGET_WINS,
     main = "wins vs. hits", xlab = "hits (season)", ylab = "wins")

# Draw the four batting-count histograms on one 2 x 2 panel. Each entry names
# the column to plot together with its title and x-axis label.
par(mfrow = c(2, 2))
hist_specs <- list(
  list(column = "TEAM_BATTING_H",
       main = "hits histogram", xlab = "hits (season)"),
  list(column = "TEAM_BATTING_2B",
       main = "doubles histogram", xlab = "doubles (season)"),
  list(column = "TEAM_BATTING_3B",
       main = "triples histogram", xlab = "triples (season)"),
  list(column = "TEAM_BATTING_HR",
       main = "homeruns histogram", xlab = "homeruns (season)")
)
for (spec in hist_specs) {
  hist(train[[spec$column]],
       main = spec$main, xlab = spec$xlab,
       breaks = 20)
}

# Switch back to a single-plot layout for subsequent figures.
par(mfrow = c(1, 1))

Addressing data quality issues

From the summary, we can observe that the following variables have a large number of missing (NA) values, which we need to address before we can fit any models.

TEAM_BATTING_SO - 102
TEAM_BASERUN_SB - 131
TEAM_BASERUN_CS - 772
TEAM_BATTING_HBP - 2085
TEAM_PITCHING_SO - 102
TEAM_FIELDING_DP - 286

TEAM_BATTING_HBP has the highest number of missing values: 2085 of the 2276 observations (about 92%). Therefore, we drop this variable from our dataset.

# Remove the TEAM_BATTING_HBP variable.
# The column is dropped by name rather than by position (it was column 10):
# a positional drop removes whatever happens to sit in that slot, so running
# this step twice, or changing the column selection upstream, would silently
# discard a different variable. Dropping by name is a no-op if the column is
# already gone.
train <- train[, setdiff(names(train), "TEAM_BATTING_HBP"), drop = FALSE]

Now let's count how many complete observations we have.

# Number of rows with no missing value in any column.
sum(complete.cases(train))
## [1] 1486
# Number of rows with at least one missing value.
sum(!complete.cases(train))
## [1] 790

Replace the missing values in each remaining column with that column's mean.

# Mean imputation: for each listed column, overwrite its NA entries with the
# mean of that column's observed (non-NA) values.
na_columns <- c(
  "TEAM_BATTING_SO", "TEAM_BASERUN_SB", "TEAM_BASERUN_CS",
  "TEAM_PITCHING_SO", "TEAM_FIELDING_DP"
)
train[na_columns] <- lapply(train[na_columns], function(values) {
  values[is.na(values)] <- mean(values, na.rm = TRUE)
  values
})

Before the mean imputation above, we had 1486 complete observations out of 2276, which is close to 65%. Therefore, we should handle the missing values in the other variables so that as many observations as possible can be used.

We have two options: either replace the missing values with the mean (or another substitute value), or identify proxy variables that are strongly correlated with the affected variables and can stand in for them.

Identify the most influential independent variables

# Regress TARGET_WINS on every other column of `train` and print the fit.
# The response is referenced by column name so that all terms are resolved
# inside `data`; writing it as `train$TARGET_WINS` would keep pulling the
# response from the global `train` even if the model were refit on other data.
# The fitted object keeps the name `lm`; calls such as `lm(...)` still reach
# the function because R skips non-function bindings when looking up a call.
lm <- lm(formula = TARGET_WINS ~ ., data = train)
summary(lm)
## 
## Call:
## lm(formula = TARGET_WINS ~ ., data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -49.994  -8.576   0.136   8.345  58.628 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       2.502e+01  5.397e+00   4.636 3.75e-06 ***
## TEAM_BATTING_H    4.824e-02  3.687e-03  13.085  < 2e-16 ***
## TEAM_BATTING_2B  -2.006e-02  9.152e-03  -2.192 0.028486 *  
## TEAM_BATTING_3B   6.047e-02  1.676e-02   3.608 0.000315 ***
## TEAM_BATTING_HR   5.299e-02  2.743e-02   1.932 0.053488 .  
## TEAM_BATTING_BB   1.042e-02  5.818e-03   1.790 0.073544 .  
## TEAM_BATTING_SO  -9.349e-03  2.551e-03  -3.665 0.000253 ***
## TEAM_BASERUN_SB   2.949e-02  4.462e-03   6.610 4.78e-11 ***
## TEAM_BASERUN_CS  -1.188e-02  1.614e-02  -0.736 0.461905    
## TEAM_PITCHING_H  -7.342e-04  3.676e-04  -1.997 0.045946 *  
## TEAM_PITCHING_HR  1.480e-02  2.432e-02   0.609 0.542877    
## TEAM_PITCHING_BB  8.891e-05  4.145e-03   0.021 0.982891    
## TEAM_PITCHING_SO  2.843e-03  9.187e-04   3.095 0.001994 ** 
## TEAM_FIELDING_E  -2.112e-02  2.480e-03  -8.516  < 2e-16 ***
## TEAM_FIELDING_DP -1.210e-01  1.302e-02  -9.297  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.04 on 2261 degrees of freedom
## Multiple R-squared:  0.3189, Adjusted R-squared:  0.3147 
## F-statistic: 75.63 on 14 and 2261 DF,  p-value: < 2.2e-16

Examine correlations between the covariates

# Pairwise correlation matrix of all columns in `train`.
# use = "na.or.complete" restricts the computation to rows without missing
# values instead of propagating NA into the result.
cor_train <- cor(train, use = "na.or.complete")

# Visualise the matrix. The call is namespace-qualified so it works whether
# or not the corrplot package has been attached earlier in the session.
corrplot::corrplot(cor_train)