Introduction

In professional sports, attaining the most amount of wins in a season is the ultimate goal. Player and team statistics are commonly used to project and predict the number of wins for an upcoming season. In this analysis, we will use team statistics from every Major League Baseball team from 1871 to 2006 to predict the number of wins for each team. We will address how we handled missing values, created new variables based on the data available to us, and transformed variables to help normalize the data. We will show how we selected our variables for each of the three multiple regression models, and compare the results of each before determining which model to use for our test data.

library(tidyverse)
library(gtExtras)
library(vtable)
library(kableExtra)
library(reactable)
library(GGally)
library(corrplot)
library(corrr)
library(caret)
library(leaps)
library(MASS)
library(mice)
library(dlookr)
library(VIM)
library(performance)
library(ggcorrplot)
library(rsample)
library(skimr)
library(performance)
library(car)
library(caret)
library(olsrr)
library(DataExplorer)
library(vip)

Data Exploration

mlb_training <- read.csv("https://raw.githubusercontent.com/moham6839/Data_621_HW1/main/moneyball-training-data.csv")
mlb_test <- read.csv("https://raw.githubusercontent.com/moham6839/Data_621_HW1/main/moneyball-evaluation-data.csv")

Since INDEX is not a variable we will be using, we dropped it from the training and test sets:

mlb_training <- mlb_training %>%
  dplyr::select(-INDEX)
mlb_test <- mlb_test %>%
  dplyr::select(-INDEX)

Glimpse and Summary of MLB Training Set

reactable(mlb_training)
mlb_training %>%
  glimpse()
## Rows: 2,276
## Columns: 16
## $ TARGET_WINS      <int> 39, 70, 86, 70, 82, 75, 80, 85, 86, 76, 78, 68, 72, 7…
## $ TEAM_BATTING_H   <int> 1445, 1339, 1377, 1387, 1297, 1279, 1244, 1273, 1391,…
## $ TEAM_BATTING_2B  <int> 194, 219, 232, 209, 186, 200, 179, 171, 197, 213, 179…
## $ TEAM_BATTING_3B  <int> 39, 22, 35, 38, 27, 36, 54, 37, 40, 18, 27, 31, 41, 2…
## $ TEAM_BATTING_HR  <int> 13, 190, 137, 96, 102, 92, 122, 115, 114, 96, 82, 95,…
## $ TEAM_BATTING_BB  <int> 143, 685, 602, 451, 472, 443, 525, 456, 447, 441, 374…
## $ TEAM_BATTING_SO  <int> 842, 1075, 917, 922, 920, 973, 1062, 1027, 922, 827, …
## $ TEAM_BASERUN_SB  <int> NA, 37, 46, 43, 49, 107, 80, 40, 69, 72, 60, 119, 221…
## $ TEAM_BASERUN_CS  <int> NA, 28, 27, 30, 39, 59, 54, 36, 27, 34, 39, 79, 109, …
## $ TEAM_BATTING_HBP <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ TEAM_PITCHING_H  <int> 9364, 1347, 1377, 1396, 1297, 1279, 1244, 1281, 1391,…
## $ TEAM_PITCHING_HR <int> 84, 191, 137, 97, 102, 92, 122, 116, 114, 96, 86, 95,…
## $ TEAM_PITCHING_BB <int> 927, 689, 602, 454, 472, 443, 525, 459, 447, 441, 391…
## $ TEAM_PITCHING_SO <int> 5456, 1082, 917, 928, 920, 973, 1062, 1033, 922, 827,…
## $ TEAM_FIELDING_E  <int> 1011, 193, 175, 164, 138, 123, 136, 112, 127, 131, 11…
## $ TEAM_FIELDING_DP <int> NA, 155, 153, 156, 168, 149, 186, 136, 169, 159, 141,…
  #kable() %>%
  #kable_styling()
mlb_training %>%
  summary() %>%
  kable() %>%
  kable_styling()
TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
Min. : 0.00 Min. : 891 Min. : 69.0 Min. : 0.00 Min. : 0.00 Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. :29.00 Min. : 1137 Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 65.0 Min. : 52.0
1st Qu.: 71.00 1st Qu.:1383 1st Qu.:208.0 1st Qu.: 34.00 1st Qu.: 42.00 1st Qu.:451.0 1st Qu.: 548.0 1st Qu.: 66.0 1st Qu.: 38.0 1st Qu.:50.50 1st Qu.: 1419 1st Qu.: 50.0 1st Qu.: 476.0 1st Qu.: 615.0 1st Qu.: 127.0 1st Qu.:131.0
Median : 82.00 Median :1454 Median :238.0 Median : 47.00 Median :102.00 Median :512.0 Median : 750.0 Median :101.0 Median : 49.0 Median :58.00 Median : 1518 Median :107.0 Median : 536.5 Median : 813.5 Median : 159.0 Median :149.0
Mean : 80.79 Mean :1469 Mean :241.2 Mean : 55.25 Mean : 99.61 Mean :501.6 Mean : 735.6 Mean :124.8 Mean : 52.8 Mean :59.36 Mean : 1779 Mean :105.7 Mean : 553.0 Mean : 817.7 Mean : 246.5 Mean :146.4
3rd Qu.: 92.00 3rd Qu.:1537 3rd Qu.:273.0 3rd Qu.: 72.00 3rd Qu.:147.00 3rd Qu.:580.0 3rd Qu.: 930.0 3rd Qu.:156.0 3rd Qu.: 62.0 3rd Qu.:67.00 3rd Qu.: 1682 3rd Qu.:150.0 3rd Qu.: 611.0 3rd Qu.: 968.0 3rd Qu.: 249.2 3rd Qu.:164.0
Max. :146.00 Max. :2554 Max. :458.0 Max. :223.00 Max. :264.00 Max. :878.0 Max. :1399.0 Max. :697.0 Max. :201.0 Max. :95.00 Max. :30132 Max. :343.0 Max. :3645.0 Max. :19278.0 Max. :1898.0 Max. :228.0
NA NA NA NA NA NA NA’s :102 NA’s :131 NA’s :772 NA’s :2085 NA NA NA NA’s :102 NA NA’s :286
skim(mlb_training)
Data summary
Name mlb_training
Number of rows 2276
Number of columns 16
_______________________
Column type frequency:
numeric 16
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
TARGET_WINS 0 1.00 80.79 15.75 0 71.0 82.0 92.00 146 ▁▁▇▅▁
TEAM_BATTING_H 0 1.00 1469.27 144.59 891 1383.0 1454.0 1537.25 2554 ▁▇▂▁▁
TEAM_BATTING_2B 0 1.00 241.25 46.80 69 208.0 238.0 273.00 458 ▁▆▇▂▁
TEAM_BATTING_3B 0 1.00 55.25 27.94 0 34.0 47.0 72.00 223 ▇▇▂▁▁
TEAM_BATTING_HR 0 1.00 99.61 60.55 0 42.0 102.0 147.00 264 ▇▆▇▅▁
TEAM_BATTING_BB 0 1.00 501.56 122.67 0 451.0 512.0 580.00 878 ▁▁▇▇▁
TEAM_BATTING_SO 102 0.96 735.61 248.53 0 548.0 750.0 930.00 1399 ▁▆▇▇▁
TEAM_BASERUN_SB 131 0.94 124.76 87.79 0 66.0 101.0 156.00 697 ▇▃▁▁▁
TEAM_BASERUN_CS 772 0.66 52.80 22.96 0 38.0 49.0 62.00 201 ▃▇▁▁▁
TEAM_BATTING_HBP 2085 0.08 59.36 12.97 29 50.5 58.0 67.00 95 ▂▇▇▅▁
TEAM_PITCHING_H 0 1.00 1779.21 1406.84 1137 1419.0 1518.0 1682.50 30132 ▇▁▁▁▁
TEAM_PITCHING_HR 0 1.00 105.70 61.30 0 50.0 107.0 150.00 343 ▇▇▆▁▁
TEAM_PITCHING_BB 0 1.00 553.01 166.36 0 476.0 536.5 611.00 3645 ▇▁▁▁▁
TEAM_PITCHING_SO 102 0.96 817.73 553.09 0 615.0 813.5 968.00 19278 ▇▁▁▁▁
TEAM_FIELDING_E 0 1.00 246.48 227.77 65 127.0 159.0 249.25 1898 ▇▁▁▁▁
TEAM_FIELDING_DP 286 0.87 146.39 26.23 52 131.0 149.0 164.00 228 ▁▂▇▆▁

Glimpse and Summary of MLB Test Set

reactable(mlb_test)
mlb_test %>%
  glimpse() 
## Rows: 259
## Columns: 15
## $ TEAM_BATTING_H   <int> 1209, 1221, 1395, 1539, 1445, 1431, 1430, 1385, 1259,…
## $ TEAM_BATTING_2B  <int> 170, 151, 183, 309, 203, 236, 219, 158, 177, 212, 243…
## $ TEAM_BATTING_3B  <int> 33, 29, 29, 29, 68, 53, 55, 42, 78, 42, 40, 55, 57, 2…
## $ TEAM_BATTING_HR  <int> 83, 88, 93, 159, 5, 10, 37, 33, 23, 58, 50, 164, 186,…
## $ TEAM_BATTING_BB  <int> 447, 516, 509, 486, 95, 215, 568, 356, 466, 452, 495,…
## $ TEAM_BATTING_SO  <int> 1080, 929, 816, 914, 416, 377, 527, 609, 689, 584, 64…
## $ TEAM_BASERUN_SB  <int> 62, 54, 59, 148, NA, NA, 365, 185, 150, 52, 64, 48, 3…
## $ TEAM_BASERUN_CS  <int> 50, 39, 47, 57, NA, NA, NA, NA, NA, NA, NA, 28, 21, 8…
## $ TEAM_BATTING_HBP <int> NA, NA, NA, 42, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ TEAM_PITCHING_H  <int> 1209, 1221, 1395, 1539, 3902, 2793, 1544, 1626, 1342,…
## $ TEAM_PITCHING_HR <int> 83, 88, 93, 159, 14, 20, 40, 39, 25, 62, 53, 173, 196…
## $ TEAM_PITCHING_BB <int> 447, 516, 509, 486, 257, 420, 613, 418, 497, 482, 521…
## $ TEAM_PITCHING_SO <int> 1080, 929, 816, 914, 1123, 736, 569, 715, 734, 622, 6…
## $ TEAM_FIELDING_E  <int> 140, 135, 156, 124, 616, 572, 490, 328, 226, 184, 200…
## $ TEAM_FIELDING_DP <int> 156, 164, 153, 154, 130, 105, NA, 104, 132, 145, 183,…
  # kable() %>%
  # kable_styling()
mlb_test %>%
  summary() %>%
  kable() %>%
  kable_styling()
TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
Min. : 819 Min. : 44.0 Min. : 14.00 Min. : 0.00 Min. : 15.0 Min. : 0.0 Min. : 0.0 Min. : 0.00 Min. :42.00 Min. : 1155 Min. : 0.0 Min. : 136.0 Min. : 0.0 Min. : 73.0 Min. : 69.0
1st Qu.:1387 1st Qu.:210.0 1st Qu.: 35.00 1st Qu.: 44.50 1st Qu.:436.5 1st Qu.: 545.0 1st Qu.: 59.0 1st Qu.: 38.00 1st Qu.:53.50 1st Qu.: 1426 1st Qu.: 52.0 1st Qu.: 471.0 1st Qu.: 613.0 1st Qu.: 131.0 1st Qu.:131.0
Median :1455 Median :239.0 Median : 52.00 Median :101.00 Median :509.0 Median : 686.0 Median : 92.0 Median : 49.50 Median :62.00 Median : 1515 Median :104.0 Median : 526.0 Median : 745.0 Median : 163.0 Median :148.0
Mean :1469 Mean :241.3 Mean : 55.91 Mean : 95.63 Mean :499.0 Mean : 709.3 Mean :123.7 Mean : 52.32 Mean :62.37 Mean : 1813 Mean :102.1 Mean : 552.4 Mean : 799.7 Mean : 249.7 Mean :146.1
3rd Qu.:1548 3rd Qu.:278.5 3rd Qu.: 72.00 3rd Qu.:135.50 3rd Qu.:565.5 3rd Qu.: 912.0 3rd Qu.:151.8 3rd Qu.: 63.00 3rd Qu.:67.50 3rd Qu.: 1681 3rd Qu.:142.5 3rd Qu.: 606.5 3rd Qu.: 938.0 3rd Qu.: 252.0 3rd Qu.:164.0
Max. :2170 Max. :376.0 Max. :155.00 Max. :242.00 Max. :792.0 Max. :1268.0 Max. :580.0 Max. :154.00 Max. :96.00 Max. :22768 Max. :336.0 Max. :2008.0 Max. :9963.0 Max. :1568.0 Max. :204.0
NA NA NA NA NA NA’s :18 NA’s :13 NA’s :87 NA’s :240 NA NA NA NA’s :18 NA NA’s :31
skim(mlb_test)
Data summary
Name mlb_test
Number of rows 259
Number of columns 15
_______________________
Column type frequency:
numeric 15
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
TEAM_BATTING_H 0 1.00 1469.39 150.66 819 1387.0 1455.0 1548.00 2170 ▁▂▇▁▁
TEAM_BATTING_2B 0 1.00 241.32 49.52 44 210.0 239.0 278.50 376 ▁▂▇▇▂
TEAM_BATTING_3B 0 1.00 55.91 27.14 14 35.0 52.0 72.00 155 ▇▇▃▁▁
TEAM_BATTING_HR 0 1.00 95.63 56.33 0 44.5 101.0 135.50 242 ▆▅▇▃▁
TEAM_BATTING_BB 0 1.00 498.96 120.59 15 436.5 509.0 565.50 792 ▁▁▅▇▁
TEAM_BATTING_SO 18 0.93 709.34 243.11 0 545.0 686.0 912.00 1268 ▁▃▇▇▂
TEAM_BASERUN_SB 13 0.95 123.70 93.39 0 59.0 92.0 151.75 580 ▇▃▁▁▁
TEAM_BASERUN_CS 87 0.66 52.32 23.10 0 38.0 49.5 63.00 154 ▂▇▃▁▁
TEAM_BATTING_HBP 240 0.07 62.37 12.71 42 53.5 62.0 67.50 96 ▃▇▅▁▁
TEAM_PITCHING_H 0 1.00 1813.46 1662.91 1155 1426.5 1515.0 1681.00 22768 ▇▁▁▁▁
TEAM_PITCHING_HR 0 1.00 102.15 57.65 0 52.0 104.0 142.50 336 ▇▇▆▁▁
TEAM_PITCHING_BB 0 1.00 552.42 172.95 136 471.0 526.0 606.50 2008 ▆▇▁▁▁
TEAM_PITCHING_SO 18 0.93 799.67 634.31 0 613.0 745.0 938.00 9963 ▇▁▁▁▁
TEAM_FIELDING_E 0 1.00 249.75 230.90 73 131.0 163.0 252.00 1568 ▇▁▁▁▁
TEAM_FIELDING_DP 31 0.88 146.06 25.88 69 131.0 148.0 164.00 204 ▁▂▇▇▂
mlb_training %>%
  gather(variable, value, TARGET_WINS:TEAM_FIELDING_DP) %>%
  ggplot(., aes(value)) + 
  geom_density(fill = "blue", color="blue") + 
  facet_wrap(~variable, scales ="free", ncol = 4) +
  labs(x = element_blank(), y = element_blank())

ggplot(gather(mlb_training), aes(value)) + 
    geom_histogram(bins = 8) + 
    facet_wrap(~key, scales = 'free_x')

The density and histogram show some of the features with positive skewness to the right. TEAM_PITCHING_BB, TEAM_PITCHING_H, TEAM_PITCHING_SO, and TEAM_FIELDING_E have distinct skewness to the right.TEAM_BATTING_SO, TEAM_BATTING_2B, and TEAM_FIELDING_DP appear to show a normal distribution.

mlb_training %>% 
  ggplot(aes(TARGET_WINS)) + 
  geom_histogram(bins = 50, fill = 'blue', color="black",) +
  geom_vline(aes(xintercept = mean(TARGET_WINS, na.rm = T)), col = "red", lty = 2) +
  geom_vline(aes(xintercept = median(TARGET_WINS, na.rm = T)), col = "yellow", lty = 2) +
  labs(x = element_blank(),
       y = "Count",
       title = "Distribution of Wins",
       caption = "* Red line is the mean value and yellow is the median") + 
  theme_classic()

The TARGET_WINS column follows a normal distribution. This will be important when deciding which transformation method to use. Since TARGET_WINS will be our dependent variable, using a Box-Cox transformation may not be the best method to use since our dependent variable already follows a normal distribution.

mlb_training %>%
  gather(-TARGET_WINS, key = "var", value = "value") %>% 
  ggplot(aes(x = value, y = TARGET_WINS)) +
    facet_wrap(~ var, scales = "free") +
    geom_point(fill = "blue", color="blue") +
    geom_smooth(method = "lm", se = FALSE, color = "black") + 
  labs(x = element_blank(), y = "Wins")

In terms of the relationship between each independent variable and the dependent variable TARGET_WINS, the variables that appear to show a linear relationship are TEAM_BATTING_BB, TEAM_BATTING_H, and TEAM_BATTING_2B. Variables

ggplot(stack(mlb_training), aes(x = ind, y = values)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 5000))+
  labs(x = element_blank(), y = element_blank()) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Measuring Correlation of Features

cor_matrix <- mlb_training %>% 
  cor(., use = "complete.obs") 
ggcorrplot::ggcorrplot(cor_matrix, type = "lower",
          lab = TRUE, lab_size = 2.1, tl.cex = 8)

cor_matrix[lower.tri(cor_matrix, diag=TRUE)] <- ""
cor_matrix <- cor_matrix %>%
  as.data.frame() %>%
  rownames_to_column() %>%
  gather(Variable, Correlation, -rowname) %>%
  filter(Variable != rowname) %>%
  filter(Correlation != "") %>%
  mutate(Correlation = as.numeric(Correlation)) %>%
  rename(` Variable` = rowname) %>%
  arrange(desc(abs(Correlation))) 
cor_matrix %>%
  filter(abs(Correlation) > .5) %>%
  kable() %>%
  kable_styling()
Variable Variable Correlation
TEAM_BATTING_HR TEAM_PITCHING_HR 0.9999326
TEAM_BATTING_BB TEAM_PITCHING_BB 0.9998814
TEAM_BATTING_SO TEAM_PITCHING_SO 0.9997684
TEAM_BATTING_H TEAM_PITCHING_H 0.9991927
TEAM_BASERUN_SB TEAM_BASERUN_CS 0.6247378
TEAM_BATTING_H TEAM_BATTING_2B 0.5617729
TEAM_BATTING_2B TEAM_PITCHING_H 0.5604535

There were high correlation coefficienys between batting and pitching stats that were measuring the same feature. For instance, homeruns (HR), Walks (BB), Strikeouts (SO), and Hits (H) for batting and pitching were highly correlated with their respective feature. With the exception of strikeouts, the same stats have the highest correlation coefficient with wins.

Missing Data

mlb_training %>% 
  gather(variable, value) %>%
  filter(is.na(value)) %>%
  group_by(variable) %>%
  tally() %>%
  mutate(percent = n / nrow(mlb_training) * 100) %>%
  mutate(percent = paste0(round(percent, ifelse(percent < 10, 1, 0)), "%")) %>%
  arrange(desc(n)) %>%
  rename(`Variable Missing Data` = variable,
         `Number of Records` = n,
         `Share of Total` = percent) %>%
  kable(caption="<center>Missing Training Data Count and Percentage", align = "c") %>% 
  kable_styling(latex_options="scale_down", c("striped", "hover", "condensed", full_width=F))
Missing Training Data Count and Percentage
Variable Missing Data Number of Records Share of Total
TEAM_BATTING_HBP 2085 92%
TEAM_BASERUN_CS 772 34%
TEAM_FIELDING_DP 286 13%
TEAM_BASERUN_SB 131 5.8%
TEAM_BATTING_SO 102 4.5%
TEAM_PITCHING_SO 102 4.5%
mlb_test %>% 
  gather(variable, value) %>%
  filter(is.na(value)) %>%
  group_by(variable) %>%
  tally() %>%
  mutate(percent = n / nrow(mlb_test) * 100) %>%
  mutate(percent = paste0(round(percent, ifelse(percent < 10, 1, 0)), "%")) %>%
  arrange(desc(n)) %>%
  rename(`Variable Missing Data` = variable,
         `Number of Records` = n,
         `Share of Total` = percent) %>%
  kable(caption="<center>Missing Test Data Count and Percentage", align = "c") %>% 
  kable_styling(latex_options="scale_down", c("striped", "hover", "condensed", full_width=F))
Missing Test Data Count and Percentage
Variable Missing Data Number of Records Share of Total
TEAM_BATTING_HBP 240 93%
TEAM_BASERUN_CS 87 34%
TEAM_FIELDING_DP 31 12%
TEAM_BATTING_SO 18 6.9%
TEAM_PITCHING_SO 18 6.9%
TEAM_BASERUN_SB 13 5%

The amount of missing values in the training and test datasets for TEAM_BATTING_HBP is exceptionally large, missing 92% and 93% of its values, respectively. If we imputed the missing values for an amount that large, the results would lack natural variation that could result in an effective model. Combined with their low correlation to TARGET_WINS, we decided to drop the column in the training and test data. For the other variables with missing values, we used a K-Nearest Neighbor function from the VIM package to impute the missing values.

Flaws of Imputing TEAM_BATTING_HBP

Dropping HBP was a difficult decision to make, considering HBP is used to calculate On-Base Percentage and At-Bats (AB), with AB also used to calculate Slugging Percentage and Batting Average. Having additional baseball statistics that are commonly used to evaluate team performance would have been helpful for this analysis. However, as you will see, the additional statistics that were produced when imputing HBP created higher than normal stats, particularly with Batting Average. The highest team batting average ever was accomplished by the Philadelphia Phillies in 1894, which had a Batting Average of .350. The max of the Batting Average column is 0.9428571. Therefore, we can infer that after imputing HBP, the averages of the other statistics become inflated, and therefore unreliable when predicting the number of wins.

set.seed(123)
mlb_train_imp2 <- mlb_training %>%
  kNN(variable = c("TEAM_BASERUN_CS", "TEAM_FIELDING_DP", "TEAM_BASERUN_SB", "TEAM_BATTING_SO", "TEAM_PITCHING_SO", "TEAM_BATTING_HBP"),
      k = 5, numFun = weighted.mean, weightDist = TRUE, imp_var = FALSE)
set.seed(123)
mlb_train_imp2 <- mlb_train_imp2 %>%
  dplyr::mutate(TEAM_BATTING_1B = TEAM_BATTING_H - dplyr::select(., TEAM_BATTING_2B:TEAM_BATTING_HR) %>% rowSums(na.rm = FALSE)) %>%
  dplyr::mutate(TEAM_BATTING_AB = TEAM_BATTING_H + TEAM_PITCHING_BB + TEAM_BATTING_SO + TEAM_BATTING_HBP) %>%
  dplyr::mutate(TEAM_BATTING_AVERAGE = TEAM_BATTING_H/TEAM_BATTING_AB) %>%
  dplyr::mutate(TEAM_BATTING_OBP = (TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_BATTING_HBP)/(TEAM_BATTING_AB + TEAM_BATTING_BB + TEAM_BATTING_HBP)) %>%
  dplyr::mutate(TEAM_BATTING_SLG = (TEAM_BATTING_1B + 2*TEAM_BATTING_2B + 3*TEAM_BATTING_3B + 4*TEAM_BATTING_HR)/TEAM_BATTING_AB) %>%
  relocate(TEAM_BATTING_1B, .before = TEAM_BATTING_2B) 
reactable(mlb_train_imp2)
max(mlb_train_imp2$TEAM_BATTING_AVERAGE)
## [1] 0.9428571

Dropping TEAM_BATTING_HBP and Imputing Missing Values

After deciding to drop HBP, we decided to use the K-Nearest Neighbors function kNN() from the VIM package to impute the missing data. Using KNN is advantageous because it considers the relationships between observations, leading to more accurate imputations than simpler methods like mean or mode imputation.

Training

set.seed(123)
mlb_training_no_hbp <- mlb_training %>%
  dplyr::select(-TEAM_BATTING_HBP)
set.seed(123)
mlb_train_imp <- mlb_training_no_hbp %>%
  kNN(variable = c("TEAM_BASERUN_CS", "TEAM_FIELDING_DP", "TEAM_BASERUN_SB", "TEAM_BATTING_SO", "TEAM_PITCHING_SO"),
      k = 5, numFun = weighted.mean, weightDist = TRUE, imp_var = FALSE)
reactable(mlb_train_imp)
sum(is.na(mlb_train_imp))
## [1] 0
sum(is.nan(as.matrix(mlb_train_imp)))
## [1] 0
sum(is.infinite(as.matrix(mlb_train_imp)))
## [1] 0

Test Set

set.seed(123)
mlb_test_no_hbp <- mlb_test %>%
  dplyr::select(-TEAM_BATTING_HBP)
set.seed(123)
mlb_test_imp <- mlb_test_no_hbp %>%
  kNN(variable = c("TEAM_BASERUN_CS", "TEAM_FIELDING_DP", "TEAM_BASERUN_SB", "TEAM_BATTING_SO", "TEAM_PITCHING_SO"),
      k = 5, numFun = weighted.mean, weightDist = TRUE, imp_var = FALSE)
reactable(mlb_test_imp)
sum(is.na(mlb_test_imp))
## [1] 0
sum(is.nan(as.matrix(mlb_test_imp)))
## [1] 0
sum(is.infinite(as.matrix(mlb_test_imp)))
## [1] 0

Data Preparation

When analyzing the dataset, we realized that the number of singles (1B) were not a feature. We can deduce that the total number of homeruns (HR), triples (3B), and doubles (2B) can be subtracted from the total number of hits in order to get the total amount of singles.

mlb_train_imp <- mlb_train_imp %>%
  dplyr::mutate(TEAM_BATTING_1B = TEAM_BATTING_H - dplyr::select(., TEAM_BATTING_2B:TEAM_BATTING_HR) %>% rowSums(na.rm = FALSE)) %>%
  relocate(TEAM_BATTING_1B, .before = TEAM_BATTING_2B)
mlb_train_imp %>% 
  ggplot(aes(TEAM_BATTING_1B)) + 
  geom_histogram(bins = 50, fill = 'blue', color="black",) +
  geom_vline(aes(xintercept = mean(TEAM_BATTING_1B, na.rm = T)), col = "red", lty = 2) +
  geom_vline(aes(xintercept = median(TEAM_BATTING_1B, na.rm = T)), col = "yellow", lty = 2) +
  labs(x = element_blank(),
       y = "Count",
       title = "Distribution of Singles",
       caption = "* Red line is the mean value and yellow is the median") + 
  theme_classic()

The number of singles shows a positive right skewness. Let’s take a look at the correlation matrix that includes the new feature as well as the imputed data:

cor_matrix2 <- mlb_train_imp %>% 
  cor(., use = "complete.obs") 
ggcorrplot::ggcorrplot(cor_matrix2, type = "lower",
          lab = TRUE, lab_size = 2.1, tl.cex = 8)

Batting hits had the highest correlation coefficient with TARGET_WINS, but had decreased from the initial correlation matrix. The new variable for singles had a correlation coefficient of 0.22 with TARGET_WINS, which was 4th-highest behind batting hits, batting doubles, and batting walks.

Log-Transforming Variables

Since transforming the variables before imputing missing values helps preserve the relationships between the variables in the regression model, we decided to transform the data first before using our KNN imputation method.

The independent variables that aren’t normally distributed show positive skewness, and for this reason we decided to use a log-transformation method.

Training Set

mlb_training_no_hbp <- mlb_training_no_hbp %>%
  dplyr::mutate(TEAM_BATTING_1B = TEAM_BATTING_H - dplyr::select(., TEAM_BATTING_2B:TEAM_BATTING_HR) %>% rowSums(na.rm = FALSE)) %>%
  relocate(TEAM_BATTING_1B, .before = TEAM_BATTING_2B)
mlb_train_log <- log(mlb_training_no_hbp)
set.seed(123)
mlb_train_log <- mlb_train_log%>%
  kNN(variable = c("TEAM_BASERUN_CS", "TEAM_FIELDING_DP", "TEAM_BASERUN_SB", "TEAM_BATTING_SO", "TEAM_PITCHING_SO"),
      k = 5, numFun = weighted.mean, weightDist = TRUE, imp_var = FALSE)
sum(is.na(mlb_train_log))
## [1] 0
sum(is.nan(as.matrix(mlb_train_log)))
## [1] 0
sum(is.infinite(as.matrix(mlb_train_log)))
## [1] 271

After log-transforming the data, there were a considerable amount of infinite values created. To address this, we turned those values into NA values and imputed the data:

mlb_train_log[sapply(mlb_train_log, is.infinite)] <- NA
set.seed(123)
mlb_train_log <- mlb_train_log %>%
  kNN(k = 5, numFun = weighted.mean, weightDist = TRUE, imp_var = FALSE)
mlb_train_log %>%
  gather(-TARGET_WINS, key = "var", value = "value") %>% 
  ggplot(aes(x = value, y = TARGET_WINS)) +
    facet_wrap(~ var, scales = "free") +
    geom_point(fill = "blue", color="blue") +
    geom_smooth(method = "lm", se = FALSE, color = "black") + 
  labs(x = element_blank(), y = "Wins")

ggplot(gather(mlb_train_log), aes(value)) + 
    geom_histogram(bins = 8) + 
    facet_wrap(~key, scales = 'free_x')

mlb_train_log %>%
  gather(variable, value, TARGET_WINS:TEAM_FIELDING_DP) %>%
  ggplot(., aes(value)) + 
  geom_density(fill = "blue", color="blue") + 
  facet_wrap(~variable, scales ="free", ncol = 4) +
  labs(x = element_blank(), y = element_blank())

After log-transforming and imputing the data,

Test Set

mlb_test_no_hbp <- mlb_test_no_hbp %>%
  dplyr::mutate(TEAM_BATTING_1B = TEAM_BATTING_H - dplyr::select(., TEAM_BATTING_2B:TEAM_BATTING_HR) %>% rowSums(na.rm = FALSE)) %>%
  relocate(TEAM_BATTING_1B, .before = TEAM_BATTING_2B)
mlb_test_log <- log(mlb_test_no_hbp)
set.seed(123)
mlb_test_log <- mlb_test_log%>%
  kNN(variable = c("TEAM_BASERUN_CS", "TEAM_FIELDING_DP", "TEAM_BASERUN_SB", "TEAM_BATTING_SO", "TEAM_PITCHING_SO"),
      k = 5, numFun = weighted.mean, weightDist = TRUE, imp_var = FALSE)
sum(is.na(mlb_test_log))
## [1] 0
sum(is.nan(as.matrix(mlb_test_log)))
## [1] 0
sum(is.infinite(as.matrix(mlb_test_log)))
## [1] 10
mlb_test_log[sapply(mlb_test_log, is.infinite)] <- NA
set.seed(123)
mlb_test_log <- mlb_test_log %>%
  kNN(k = 5, numFun = weighted.mean, weightDist = TRUE, imp_var = FALSE)
sum(is.na(mlb_test_log))
## [1] 0
sum(is.nan(as.matrix(mlb_test_log)))
## [1] 0
sum(is.infinite(as.matrix(mlb_test_log)))
## [1] 0
ggplot(gather(mlb_test_log), aes(value)) + 
    geom_histogram(bins = 10) + 
    facet_wrap(~key, scales = 'free_x')

Model Building

Model 1 - Using findCorrelation function from caret package

For our first model, we utilized the findCorrelation function to determine which variables to use in our model:

set.seed(123)
highlyCorDescr <- findCorrelation(cor(mlb_train_imp), cutoff = .50, verbose = TRUE, names=TRUE)
## Compare row 6  and column  12 with corr  0.969 
##   Means:  0.445 vs 0.299 so flagging column 6 
## Compare row 12  and column  8 with corr  0.664 
##   Means:  0.385 vs 0.28 so flagging column 12 
## Compare row 8  and column  15 with corr  0.585 
##   Means:  0.361 vs 0.264 so flagging column 8 
## Compare row 15  and column  3 with corr  0.548 
##   Means:  0.351 vs 0.25 so flagging column 15 
## Compare row 3  and column  5 with corr  0.6 
##   Means:  0.319 vs 0.234 so flagging column 3 
## Compare row 5  and column  9 with corr  0.545 
##   Means:  0.27 vs 0.223 so flagging column 5 
## Compare row 10  and column  16 with corr  0.537 
##   Means:  0.23 vs 0.21 so flagging column 10 
## Compare row 2  and column  4 with corr  0.563 
##   Means:  0.23 vs 0.205 so flagging column 2 
## All correlations <= 0.5
set.seed(123)
keep_these <- names(mlb_train_imp)[!(names(mlb_train_imp) %in% colnames(mlb_train_imp)[highlyCorDescr])]
mlb_train_features <- mlb_train_imp[, keep_these]
reactable(mlb_train_features)
set.seed(123)
m1 <- lm(TARGET_WINS ~., data = mlb_train_features)
summary(m1)
## 
## Call:
## lm(formula = TARGET_WINS ~ ., data = mlb_train_features)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -48.566  -8.345   0.303   8.189  52.052 
## 
## Coefficients: (1 not defined because of singularities)
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      28.6728210  5.3947533   5.315 1.17e-07 ***
## TEAM_BATTING_H    0.1237754  0.0266112   4.651 3.49e-06 ***
## TEAM_BATTING_1B  -0.0786938  0.0265437  -2.965 0.003062 ** 
## TEAM_BATTING_2B  -0.0942028  0.0280605  -3.357 0.000801 ***
## TEAM_BATTING_3B  -0.0566817  0.0283020  -2.003 0.045324 *  
## TEAM_BATTING_HR          NA         NA      NA       NA    
## TEAM_BATTING_BB   0.0161483  0.0056273   2.870 0.004147 ** 
## TEAM_BATTING_SO  -0.0171662  0.0025251  -6.798 1.35e-11 ***
## TEAM_BASERUN_SB   0.0453571  0.0044947  10.091  < 2e-16 ***
## TEAM_BASERUN_CS   0.0401036  0.0108571   3.694 0.000226 ***
## TEAM_PITCHING_H   0.0002165  0.0003664   0.591 0.554753    
## TEAM_PITCHING_HR  0.0221576  0.0234035   0.947 0.343859    
## TEAM_PITCHING_BB -0.0032305  0.0040037  -0.807 0.419823    
## TEAM_PITCHING_SO  0.0022881  0.0008826   2.592 0.009590 ** 
## TEAM_FIELDING_E  -0.0291447  0.0026099 -11.167  < 2e-16 ***
## TEAM_FIELDING_DP -0.1329713  0.0136476  -9.743  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.56 on 2261 degrees of freedom
## Multiple R-squared:  0.3677, Adjusted R-squared:  0.3638 
## F-statistic: 93.92 on 14 and 2261 DF,  p-value: < 2.2e-16

The initial results produced an Adjusted R-squared of 0.1857, meaning that only 0.1857 of the variance in wins can be explained by the variables in the model. Two variables, TEAM_PITCHING_H and TEAM_PITCHING_BB, had p-values greater than 0.05, which indicates that each feature does not have statistical significance when explaining the variance of wins. We removed both variables and re-ran the model:

set.seed(123)
m1_train_revised <- lm(TARGET_WINS ~ TEAM_BATTING_2B + TEAM_BATTING_BB + TEAM_BASERUN_SB + TEAM_PITCHING_SO + TEAM_FIELDING_DP, data = mlb_train_features)
summary(m1_train_revised)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_2B + TEAM_BATTING_BB + 
##     TEAM_BASERUN_SB + TEAM_PITCHING_SO + TEAM_FIELDING_DP, data = mlb_train_features)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -58.836  -8.987   0.189   9.450  64.404 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      50.9419986  2.4345521  20.925  < 2e-16 ***
## TEAM_BATTING_2B   0.1052782  0.0067785  15.531  < 2e-16 ***
## TEAM_BATTING_BB   0.0340953  0.0027860  12.238  < 2e-16 ***
## TEAM_BASERUN_SB   0.0308966  0.0036161   8.544  < 2e-16 ***
## TEAM_PITCHING_SO -0.0029254  0.0005523  -5.297 1.29e-07 ***
## TEAM_FIELDING_DP -0.0992912  0.0140364  -7.074 2.00e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.22 on 2270 degrees of freedom
## Multiple R-squared:  0.1873, Adjusted R-squared:  0.1856 
## F-statistic: 104.7 on 5 and 2270 DF,  p-value: < 2.2e-16

In the revised model, the Adjusted R-squared decreased by 0.01 to 0.1856. The p-value of the overall model is the same as the initial model, with a value below 0.05. The residual standard error increased by 0.01 to 14.22. The F-statistic increased from 75.12 to 104.7, which may indicate that removing the two variables with high p-values helped improve the ability of the predictors to explain the variability in wins

ggplot(m1_train_revised, aes(x = .fitted, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(title="Residual vs. Fitted Values Plot") +
  xlab("Fitted values") +
  ylab("Residuals")

ggplot(data = m1_train_revised, aes(x = m1_train_revised$residuals)) +
    geom_histogram(bins = 10, fill = 'steelblue', color = 'black') +
    labs(title = 'Histogram of Residuals', x = 'Residuals', y = 'Frequency')

ggplot(data = m1_train_revised, aes(x = .resid)) +
  geom_histogram(binwidth = 0.4) +
  xlab("Residuals")

qqnorm(resid(m1_train_revised))
qqline(resid(m1_train_revised))

Model 2 - Using findCorrelation function for Log-Transformed Train Set

set.seed(123)
highlyCorDescr2 <- findCorrelation(cor(mlb_train_log), cutoff = .50, verbose = TRUE)
## Compare row 6  and column  15 with corr  0.78 
##   Means:  0.489 vs 0.325 so flagging column 6 
## Compare row 15  and column  12 with corr  0.659 
##   Means:  0.455 vs 0.305 so flagging column 15 
## Compare row 12  and column  8 with corr  0.522 
##   Means:  0.396 vs 0.282 so flagging column 12 
## Compare row 8  and column  3 with corr  0.801 
##   Means:  0.392 vs 0.263 so flagging column 8 
## Compare row 3  and column  5 with corr  0.593 
##   Means:  0.355 vs 0.247 so flagging column 3 
## Compare row 5  and column  14 with corr  0.563 
##   Means:  0.28 vs 0.237 so flagging column 5 
## Compare row 7  and column  11 with corr  0.685 
##   Means:  0.313 vs 0.219 so flagging column 7 
## Compare row 16  and column  9 with corr  0.544 
##   Means:  0.248 vs 0.198 so flagging column 16 
## Compare row 2  and column  4 with corr  0.578 
##   Means:  0.309 vs 0.175 so flagging column 2 
## All correlations <= 0.5
set.seed(123)
keep_these2 <- names(mlb_train_log)[!(names(mlb_train_log) %in% colnames(mlb_train_log)[highlyCorDescr2])]
mlb_train_features2 <- mlb_train_log[, keep_these2]
reactable(mlb_train_features2)
set.seed(123)
m2_log <- lm(TARGET_WINS ~., data = mlb_train_features2)
summary(m2_log)
## 
## Call:
## lm(formula = TARGET_WINS ~ ., data = mlb_train_features2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.21655 -0.10935  0.02095  0.12652  0.65467 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       2.928902   0.206293  14.198  < 2e-16 ***
## TEAM_BATTING_2B   0.373551   0.022051  16.941  < 2e-16 ***
## TEAM_BASERUN_SB   0.064488   0.006880   9.373  < 2e-16 ***
## TEAM_BASERUN_CS   0.004051   0.009499   0.426     0.67    
## TEAM_PITCHING_H  -0.129390   0.014424  -8.971  < 2e-16 ***
## TEAM_PITCHING_BB  0.123832   0.017761   6.972 4.08e-12 ***
## TEAM_PITCHING_SO -0.111174   0.012877  -8.634  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1991 on 2269 degrees of freedom
## Multiple R-squared:  0.1906, Adjusted R-squared:  0.1884 
## F-statistic: 89.03 on 6 and 2269 DF,  p-value: < 2.2e-16

The initial results did not produce a trustworthy model. The Adjusted R-squared value was 18.84%, meaning the independent variables in the model can only account for 18.84% of the variance of TARGET_WINS. TEAM_BASERUN_CS was the only variable with a p-value greater than 0.05, which indicates that it is not statistically significant and could be affecting the model’s ability to determine the impact other variables may have on the explaining the variance of TARGET_WINS. We decided to re-run the model without the variable:

set.seed(123)
m2_log2 <- lm(TARGET_WINS ~ TEAM_BASERUN_SB + TEAM_BATTING_2B + TEAM_PITCHING_H + TEAM_PITCHING_BB + TEAM_PITCHING_SO, data = mlb_train_features2)
summary(m2_log2)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BASERUN_SB + TEAM_BATTING_2B + 
##     TEAM_PITCHING_H + TEAM_PITCHING_BB + TEAM_PITCHING_SO, data = mlb_train_features2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.22005 -0.10941  0.02093  0.12710  0.65297 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       2.968279   0.184439  16.094  < 2e-16 ***
## TEAM_BASERUN_SB   0.065409   0.006531  10.016  < 2e-16 ***
## TEAM_BATTING_2B   0.372693   0.021955  16.976  < 2e-16 ***
## TEAM_PITCHING_H  -0.130712   0.014084  -9.281  < 2e-16 ***
## TEAM_PITCHING_BB  0.122628   0.017532   6.994 3.49e-12 ***
## TEAM_PITCHING_SO -0.112025   0.012719  -8.808  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1991 on 2270 degrees of freedom
## Multiple R-squared:  0.1905, Adjusted R-squared:  0.1887 
## F-statistic: 106.8 on 5 and 2270 DF,  p-value: < 2.2e-16

Removing the variable had a slight impact the Adjusted R-squared value, which shows an increase of 0.03 to 18.87%.

ggplot(m2_log2, aes(x = .fitted, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(title="Residual vs. Fitted Values Plot") +
  xlab("Fitted values") +
  ylab("Residuals")

ggplot(data = m2_log2, aes(x = .resid)) +
  geom_histogram(binwidth = 0.04) +
  xlab("Residuals")

qqnorm(resid(m2_log2))
qqline(resid(m2_log2))

Model 3 - Using Recursive Feature Elimination on Imputed Train Set

set.seed(123)
filterCtrl <- rfeControl(functions=rfFuncs, method="cv", number=3)
results <- rfe(x= mlb_train_imp[,2:16],y= mlb_train_imp[,1], sizes=c(2:16), rfeControl=filterCtrl)
results
## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (3 fold) 
## 
## Resampling performance over subset size:
## 
##  Variables  RMSE Rsquared    MAE RMSESD RsquaredSD  MAESD Selected
##          2 14.17   0.2137 11.131 0.1632    0.03138 0.2706         
##          3 12.98   0.3209 10.263 0.2405    0.04370 0.2790         
##          4 12.73   0.3512 10.040 0.1634    0.03715 0.1218         
##          5 12.48   0.3842  9.825 0.2171    0.04292 0.1585         
##          6 12.00   0.4274  9.383 0.3385    0.05343 0.2741         
##          7 12.10   0.4205  9.515 0.2491    0.04685 0.2315         
##          8 12.08   0.4255  9.468 0.2882    0.05011 0.2251         
##          9 11.88   0.4419  9.290 0.2191    0.04224 0.2042         
##         10 11.90   0.4423  9.303 0.2775    0.04907 0.2164         
##         11 11.93   0.4430  9.347 0.2345    0.04506 0.2115         
##         12 11.85   0.4481  9.271 0.2436    0.04671 0.2161        *
##         13 11.90   0.4430  9.316 0.2755    0.04766 0.2605         
##         14 11.93   0.4416  9.327 0.2527    0.04601 0.2363         
##         15 11.87   0.4466  9.261 0.2487    0.04392 0.2278         
## 
## The top 5 variables (out of 12):
##    TEAM_FIELDING_E, TEAM_BATTING_H, TEAM_BASERUN_CS, TEAM_BATTING_BB, TEAM_PITCHING_SO
set.seed(123)
m3_train <- lm(TARGET_WINS ~ TEAM_FIELDING_E + TEAM_BATTING_H + TEAM_BASERUN_CS + TEAM_BATTING_BB + TEAM_PITCHING_SO, data = mlb_train_imp)
summary(m3_train)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_FIELDING_E + TEAM_BATTING_H + 
##     TEAM_BASERUN_CS + TEAM_BATTING_BB + TEAM_PITCHING_SO, data = mlb_train_imp)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -45.609  -8.786   0.103   9.000  50.032 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -1.347e+01  3.712e+00  -3.629 0.000291 ***
## TEAM_FIELDING_E  -1.618e-02  1.708e-03  -9.476  < 2e-16 ***
## TEAM_BATTING_H    5.447e-02  2.126e-03  25.624  < 2e-16 ***
## TEAM_BASERUN_CS   8.828e-02  8.222e-03  10.737  < 2e-16 ***
## TEAM_BATTING_BB   2.200e-02  3.109e-03   7.075 1.99e-12 ***
## TEAM_PITCHING_SO  1.731e-03  5.409e-04   3.199 0.001396 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.37 on 2270 degrees of freedom
## Multiple R-squared:  0.2809, Adjusted R-squared:  0.2793 
## F-statistic: 177.3 on 5 and 2270 DF,  p-value: < 2.2e-16
ggplot(m3_train, aes(x = .fitted, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(title="Residual vs. Fitted Values Plot") +
  xlab("Fitted values") +
  ylab("Residuals")

ggplot(data = m3_train, aes(x = m3_train$residuals)) +
    geom_histogram(bins = 10, fill = 'steelblue', color = 'black') +
    labs(title = 'Histogram of Residuals', x = 'Residuals', y = 'Frequency')

ggplot(data = m3_train, aes(x = .resid)) +
  geom_histogram(binwidth = 0.4) +
  xlab("Residuals")

qqnorm(resid(m3_train))
qqline(resid(m3_train))

Model 4 - Using Recursive Feature Elimination for Log-Transformed Imputed Train Set

set.seed(123)
filterCtrl2 <- rfeControl(functions=rfFuncs, method="cv", number=3)
results2 <- rfe(x= mlb_train_log[,2:16],y= mlb_train_log[,1], sizes=c(2:16), rfeControl=filterCtrl2)
results2
## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (3 fold) 
## 
## Resampling performance over subset size:
## 
##  Variables   RMSE Rsquared    MAE   RMSESD RsquaredSD    MAESD Selected
##          2 0.1980   0.2139 0.1480 0.008784    0.03466 0.005485         
##          3 0.1798   0.3368 0.1343 0.005996    0.08552 0.005938         
##          4 0.1751   0.3781 0.1298 0.001170    0.06430 0.002825         
##          5 0.1748   0.3855 0.1289 0.001654    0.06539 0.001581         
##          6 0.1705   0.4096 0.1256 0.003244    0.05195 0.003023         
##          7 0.1709   0.4084 0.1256 0.004063    0.05415 0.002838         
##          8 0.1701   0.4184 0.1251 0.003443    0.05773 0.002629         
##          9 0.1700   0.4153 0.1253 0.004262    0.06991 0.003601         
##         10 0.1700   0.4175 0.1252 0.004390    0.07165 0.003373         
##         11 0.1696   0.4225 0.1253 0.003666    0.07539 0.003654         
##         12 0.1694   0.4223 0.1245 0.003572    0.06565 0.002294        *
##         13 0.1697   0.4195 0.1245 0.003871    0.06149 0.002530         
##         14 0.1697   0.4202 0.1247 0.003687    0.07013 0.003204         
##         15 0.1698   0.4156 0.1249 0.003739    0.06493 0.002897         
## 
## The top 5 variables (out of 12):
##    TEAM_FIELDING_E, TEAM_BATTING_H, TEAM_BASERUN_SB, TEAM_PITCHING_SO, TEAM_BATTING_BB
set.seed(123)
m4_log <- lm(TARGET_WINS ~ TEAM_FIELDING_E + TEAM_BATTING_H + TEAM_BASERUN_SB + TEAM_PITCHING_SO + TEAM_BATTING_BB, data=mlb_train_log)
summary(m4_log)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_FIELDING_E + TEAM_BATTING_H + 
##     TEAM_BASERUN_SB + TEAM_PITCHING_SO + TEAM_BATTING_BB, data = mlb_train_log)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.19699 -0.10778  0.00665  0.11636  0.80712 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -2.67180    0.40736  -6.559 6.70e-11 ***
## TEAM_FIELDING_E  -0.14556    0.01060 -13.736  < 2e-16 ***
## TEAM_BATTING_H    0.96713    0.04543  21.289  < 2e-16 ***
## TEAM_BASERUN_SB   0.10406    0.00710  14.656  < 2e-16 ***
## TEAM_PITCHING_SO -0.04167    0.01371  -3.040  0.00239 ** 
## TEAM_BATTING_BB   0.08924    0.01447   6.167 8.21e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1823 on 2270 degrees of freedom
## Multiple R-squared:  0.3215, Adjusted R-squared:  0.3201 
## F-statistic: 215.2 on 5 and 2270 DF,  p-value: < 2.2e-16
set.seed(123)
vip(m4_log)

ggplot(m4_log, aes(x = .fitted, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(title="Residual vs. Fitted Values Plot") +
  xlab("Fitted values") +
  ylab("Residuals")

ggplot(data = m4_log, aes(x = m4_log$residuals)) +
    geom_histogram(bins = 10, fill = 'steelblue', color = 'black') +
    labs(title = 'Histogram of Residuals', x = 'Residuals', y = 'Frequency')

ggplot(data = m4_log, aes(x = .resid)) +
  geom_histogram(binwidth = 0.2) +
  xlab("Residuals")

qqnorm(resid(m4_log))
qqline(resid(m4_log))

Model Selection

set.seed(123)
compare_performance(m1_train_revised, m2_log2, m3_train, m4_log)
## # Comparison of Model Performance Indices
## 
## Name             | Model |   AIC (weights) |  AICc (weights) |   BIC (weights) |    R2 | R2 (adj.) |   RMSE |  Sigma
## --------------------------------------------------------------------------------------------------------------------
## m1_train_revised |    lm | 18549.6 (<.001) | 18549.7 (<.001) | 18589.7 (<.001) | 0.187 |     0.186 | 14.197 | 14.216
## m2_log2          |    lm |  -879.5 (<.001) |  -879.5 (<.001) |  -839.4 (<.001) | 0.190 |     0.189 |  0.199 |  0.199
## m3_train         |    lm | 18271.3 (<.001) | 18271.3 (<.001) | 18311.4 (<.001) | 0.281 |     0.279 | 13.355 | 13.373
## m4_log           |    lm | -1281.5 (>.999) | -1281.5 (>.999) | -1241.4 (>.999) | 0.322 |     0.320 |  0.182 |  0.182

Based on the results from each model, we decided to go with the 4th model, m4_log, which produced the highest Adjusted R-squared at 32% and the lowest Root Mean Squared Error at 0.182. The final variables for the model are TEAM_FIELDING_E, TEAM_BATTING_H TEAM_BASERUN_SB, TEAM_PITCHING_SO, and TEAM_BATTING_BB

set.seed(123)
predictions <- predict(m4_log, newdata = mlb_test_log)
summary(predictions)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.490   4.296   4.378   4.362   4.448   4.740
set.seed(123)
mlb_test_log$predicted_wins <- round(exp(predictions))
set.seed(123)
mlb_test_log_final <- mlb_test_log %>%
  relocate(predicted_wins) 
reactable(mlb_test_log_final)
write.csv(mlb_test_log_final, "/Users/mohamedhassan/Downloads/data_621_moneyball_pred.csv", row.names = FALSE)

References