# Read the training CSV and keep only columns 2 through 17 (the first column
# is discarded), then print summary statistics for every remaining column.
train <- read.csv(file = "datasets/moneyball-training-data.csv")[2:17]
summary(object = train)
## TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## Min. : 0.00 Min. : 891 Min. : 69.0 Min. : 0.00
## 1st Qu.: 71.00 1st Qu.:1383 1st Qu.:208.0 1st Qu.: 34.00
## Median : 82.00 Median :1454 Median :238.0 Median : 47.00
## Mean : 80.79 Mean :1469 Mean :241.2 Mean : 55.25
## 3rd Qu.: 92.00 3rd Qu.:1537 3rd Qu.:273.0 3rd Qu.: 72.00
## Max. :146.00 Max. :2554 Max. :458.0 Max. :223.00
##
## TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 42.00 1st Qu.:451.0 1st Qu.: 548.0 1st Qu.: 66.0
## Median :102.00 Median :512.0 Median : 750.0 Median :101.0
## Mean : 99.61 Mean :501.6 Mean : 735.6 Mean :124.8
## 3rd Qu.:147.00 3rd Qu.:580.0 3rd Qu.: 930.0 3rd Qu.:156.0
## Max. :264.00 Max. :878.0 Max. :1399.0 Max. :697.0
## NA's :102 NA's :131
## TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## Min. : 0.0 Min. :29.00 Min. : 1137 Min. : 0.0
## 1st Qu.: 38.0 1st Qu.:50.50 1st Qu.: 1419 1st Qu.: 50.0
## Median : 49.0 Median :58.00 Median : 1518 Median :107.0
## Mean : 52.8 Mean :59.36 Mean : 1779 Mean :105.7
## 3rd Qu.: 62.0 3rd Qu.:67.00 3rd Qu.: 1682 3rd Qu.:150.0
## Max. :201.0 Max. :95.00 Max. :30132 Max. :343.0
## NA's :772 NA's :2085
## TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## Min. : 0.0 Min. : 0.0 Min. : 65.0 Min. : 52.0
## 1st Qu.: 476.0 1st Qu.: 615.0 1st Qu.: 127.0 1st Qu.:131.0
## Median : 536.5 Median : 813.5 Median : 159.0 Median :149.0
## Mean : 553.0 Mean : 817.7 Mean : 246.5 Mean :146.4
## 3rd Qu.: 611.0 3rd Qu.: 968.0 3rd Qu.: 249.2 3rd Qu.:164.0
## Max. :3645.0 Max. :19278.0 Max. :1898.0 Max. :228.0
## NA's :102 NA's :286
# Preview the first ten observations of the training data.
head(x = train, n = 10L)
## TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B TEAM_BATTING_HR
## 1 39 1445 194 39 13
## 2 70 1339 219 22 190
## 3 86 1377 232 35 137
## 4 70 1387 209 38 96
## 5 82 1297 186 27 102
## 6 75 1279 200 36 92
## 7 80 1244 179 54 122
## 8 85 1273 171 37 115
## 9 86 1391 197 40 114
## 10 76 1271 213 18 96
## TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS
## 1 143 842 NA NA
## 2 685 1075 37 28
## 3 602 917 46 27
## 4 451 922 43 30
## 5 472 920 49 39
## 6 443 973 107 59
## 7 525 1062 80 54
## 8 456 1027 40 36
## 9 447 922 69 27
## 10 441 827 72 34
## TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR TEAM_PITCHING_BB
## 1 NA 9364 84 927
## 2 NA 1347 191 689
## 3 NA 1377 137 602
## 4 NA 1396 97 454
## 5 NA 1297 102 472
## 6 NA 1279 92 443
## 7 NA 1244 122 525
## 8 NA 1281 116 459
## 9 NA 1391 114 447
## 10 NA 1271 96 441
## TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 1 5456 1011 NA
## 2 1082 193 155
## 3 917 175 153
## 4 928 164 156
## 5 920 138 168
## 6 973 123 149
## 7 1062 136 186
## 8 1033 112 136
## 9 922 127 169
## 10 827 131 159
# Scatter plot of wins against season hits.
plot(train$TEAM_BATTING_H, train$TARGET_WINS)

# Draw the four batting histograms on a 2 x 2 grid. Each column is paired
# with its title and x-axis label; all histograms use breaks = 20.
par(mfrow = c(2, 2))
invisible(Map(
  function(values, title, axis_label) {
    hist(values, main = title, xlab = axis_label, breaks = 20)
  },
  train[c(
    "TEAM_BATTING_H", "TEAM_BATTING_2B",
    "TEAM_BATTING_3B", "TEAM_BATTING_HR"
  )],
  c(
    "hits histogram", "doubles histogram",
    "triples histogram", "homeruns histogram"
  ),
  c(
    "hits (season)", "doubles (season)",
    "triples (season)", "homeruns (season)"
  )
))
# Return to a single-panel layout for subsequent plots.
par(mfrow = c(1, 1))
From the summary, we can observe that the following variables have a high number of NA (missing) values, which we need to address before we can work on the models:
TEAM_BATTING_SO: 102; TEAM_BASERUN_SB: 131; TEAM_BASERUN_CS: 772; TEAM_BATTING_HBP: 2085; TEAM_PITCHING_SO: 102; TEAM_FIELDING_DP: 286.
TEAM_BATTING_HBP has the highest number of missing values: 2085 of the 2276 observations (about 92%). Therefore, we drop this variable from our dataset.
# Remove the TEAM_BATTING_HBP variable.
# The column is dropped by name rather than by position, so re-running this
# step (or changing the column selection upstream) cannot silently remove a
# different column. If the column is already absent, train is left unchanged.
train <- train[, setdiff(names(train), "TEAM_BATTING_HBP"), drop = FALSE]
Now let’s measure how many complete observations we have.
# Number of rows with no missing value in any column.
sum(complete.cases(train))
## [1] 1486
# Number of rows with at least one missing value.
sum(!complete.cases(train))
## [1] 790
# Mean imputation: in each listed column, replace every NA with the mean of
# that column's observed (non-missing) values.
train <- local({
  impute_cols <- c(
    "TEAM_BATTING_SO", "TEAM_BASERUN_SB", "TEAM_BASERUN_CS",
    "TEAM_PITCHING_SO", "TEAM_FIELDING_DP"
  )
  train[impute_cols] <- lapply(train[impute_cols], function(values) {
    replace(values, is.na(values), mean(values, na.rm = TRUE))
  })
  train
})
We have 1486 complete observations out of 2276, which is close to 65%. Therefore, we should handle the missing values in the remaining variables so that as many observations as possible can be used.
We have two options: either replace the missing values with the column mean (or another substitute value), or identify proxy variables that are strongly correlated with the affected variables and can stand in for them.
# Fit a linear regression of TARGET_WINS on every other column of train.
# The response is written as a bare column name so that it is resolved through
# `data`; writing train$TARGET_WINS bypasses `data` for the response and relies
# on how `.` is expanded to keep the response out of the predictors.
# NOTE(review): the fitted object is named `lm`, the same name as the fitting
# function. Calls to lm() still resolve to the function, but the name is kept
# only in case later code refers to this object; consider renaming it.
lm <- lm(formula = TARGET_WINS ~ ., data = train)
summary(lm)
##
## Call:
## lm(formula = TARGET_WINS ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.994 -8.576 0.136 8.345 58.628
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.502e+01 5.397e+00 4.636 3.75e-06 ***
## TEAM_BATTING_H 4.824e-02 3.687e-03 13.085 < 2e-16 ***
## TEAM_BATTING_2B -2.006e-02 9.152e-03 -2.192 0.028486 *
## TEAM_BATTING_3B 6.047e-02 1.676e-02 3.608 0.000315 ***
## TEAM_BATTING_HR 5.299e-02 2.743e-02 1.932 0.053488 .
## TEAM_BATTING_BB 1.042e-02 5.818e-03 1.790 0.073544 .
## TEAM_BATTING_SO -9.349e-03 2.551e-03 -3.665 0.000253 ***
## TEAM_BASERUN_SB 2.949e-02 4.462e-03 6.610 4.78e-11 ***
## TEAM_BASERUN_CS -1.188e-02 1.614e-02 -0.736 0.461905
## TEAM_PITCHING_H -7.342e-04 3.676e-04 -1.997 0.045946 *
## TEAM_PITCHING_HR 1.480e-02 2.432e-02 0.609 0.542877
## TEAM_PITCHING_BB 8.891e-05 4.145e-03 0.021 0.982891
## TEAM_PITCHING_SO 2.843e-03 9.187e-04 3.095 0.001994 **
## TEAM_FIELDING_E -2.112e-02 2.480e-03 -8.516 < 2e-16 ***
## TEAM_FIELDING_DP -1.210e-01 1.302e-02 -9.297 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.04 on 2261 degrees of freedom
## Multiple R-squared: 0.3189, Adjusted R-squared: 0.3147
## F-statistic: 75.63 on 14 and 2261 DF, p-value: < 2.2e-16
# Pairwise correlation matrix over all columns of train. With
# use = "na.or.complete", only rows without missing values contribute.
cor_train <- cor(x = train, use = "na.or.complete")
# Visualise the correlation matrix.
corrplot(cor_train)