Data Exploration

Here we load and explore the baseball data. Some key insights include:

  • The dataset’s variables show a wide range in means and standard deviations with some variables showing significant variation.

  • Several predictors present significant outliers, especially TEAM_PITCHING_H and TEAM_PITCHING_SO, indicating that some teams have extreme values in these categories.

  • 9.6% of the dataset has missing values, with TEAM_BATTING_HBP and TEAM_BASERUN_CS having the highest proportion of missing data. We will need to perform imputation to estimate these missing values and cannot simply drop the corresponding row.

  • TEAM_BATTING_HBP is missing 92% of its values; it may be too spare to impute reliably and will be dropped in modeling

  • TEAM_BATTING_H, TEAM_BATTING_2B, and TEAM_BATTING_BB have the strongest positive correlations with TARGET_WINS indicating singles, doubles, walks have the highest correlation to wins . TEAM_FIELDING_E shows a negative correlation, indicating that more errors reduce the chances of winning.

# Load necessary libraries
library(ggplot2)
library(dplyr)
library(tidyr)
library(corrplot)
library(naniar)
library(GGally)
library(knitr)
library(kableExtra)

# Load dataset
moneyball_training <- read.csv("https://raw.githubusercontent.com/johnnydrodriguez/data621/refs/heads/main/moneyball-training-data.csv", 
                               header = TRUE, sep = ',', na.strings="", fill = TRUE)
moneyball_training <- moneyball_training %>% select(-INDEX)

# View basic dataset information
str(moneyball_training)
## 'data.frame':    2276 obs. of  16 variables:
##  $ TARGET_WINS     : int  39 70 86 70 82 75 80 85 86 76 ...
##  $ TEAM_BATTING_H  : int  1445 1339 1377 1387 1297 1279 1244 1273 1391 1271 ...
##  $ TEAM_BATTING_2B : int  194 219 232 209 186 200 179 171 197 213 ...
##  $ TEAM_BATTING_3B : int  39 22 35 38 27 36 54 37 40 18 ...
##  $ TEAM_BATTING_HR : int  13 190 137 96 102 92 122 115 114 96 ...
##  $ TEAM_BATTING_BB : int  143 685 602 451 472 443 525 456 447 441 ...
##  $ TEAM_BATTING_SO : int  842 1075 917 922 920 973 1062 1027 922 827 ...
##  $ TEAM_BASERUN_SB : int  NA 37 46 43 49 107 80 40 69 72 ...
##  $ TEAM_BASERUN_CS : int  NA 28 27 30 39 59 54 36 27 34 ...
##  $ TEAM_BATTING_HBP: int  NA NA NA NA NA NA NA NA NA NA ...
##  $ TEAM_PITCHING_H : int  9364 1347 1377 1396 1297 1279 1244 1281 1391 1271 ...
##  $ TEAM_PITCHING_HR: int  84 191 137 97 102 92 122 116 114 96 ...
##  $ TEAM_PITCHING_BB: int  927 689 602 454 472 443 525 459 447 441 ...
##  $ TEAM_PITCHING_SO: int  5456 1082 917 928 920 973 1062 1033 922 827 ...
##  $ TEAM_FIELDING_E : int  1011 193 175 164 138 123 136 112 127 131 ...
##  $ TEAM_FIELDING_DP: int  NA 155 153 156 168 149 186 136 169 159 ...
# Summary statistics for all numeric predictors
summary_stats <- data.frame(
  Variable = names(moneyball_training %>% select(where(is.numeric))),
  Mean = sapply(moneyball_training %>% select(where(is.numeric)), mean, na.rm = TRUE),
  Median = sapply(moneyball_training %>% select(where(is.numeric)), median, na.rm = TRUE),
  SD = sapply(moneyball_training %>% select(where(is.numeric)), sd, na.rm = TRUE)) %>%
  mutate(across(where(is.numeric), ~round(., 2)))

# Display summary statistics
kable(summary_stats, caption = "Summary Statistics for Predictors", row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
  column_spec(1, bold = TRUE)
Summary Statistics for Predictors
Variable Mean Median SD
TARGET_WINS 80.79 82.0 15.75
TEAM_BATTING_H 1469.27 1454.0 144.59
TEAM_BATTING_2B 241.25 238.0 46.80
TEAM_BATTING_3B 55.25 47.0 27.94
TEAM_BATTING_HR 99.61 102.0 60.55
TEAM_BATTING_BB 501.56 512.0 122.67
TEAM_BATTING_SO 735.61 750.0 248.53
TEAM_BASERUN_SB 124.76 101.0 87.79
TEAM_BASERUN_CS 52.80 49.0 22.96
TEAM_BATTING_HBP 59.36 58.0 12.97
TEAM_PITCHING_H 1779.21 1518.0 1406.84
TEAM_PITCHING_HR 105.70 107.0 61.30
TEAM_PITCHING_BB 553.01 536.5 166.36
TEAM_PITCHING_SO 817.73 813.5 553.09
TEAM_FIELDING_E 246.48 159.0 227.77
TEAM_FIELDING_DP 146.39 149.0 26.23
# Boxplots for numeric predictors
moneyball_training %>%
  select(where(is.numeric)) %>%
  pivot_longer(cols = everything(), names_to = "Variable", values_to = "Value") %>%
  ggplot(aes(x = Variable, y = Value)) +
  geom_boxplot(fill = "darkblue") +
  ggtitle("Box Plot of All Predictors") +
  xlab("Variable") + ylab("Value") +
  theme_minimal(base_size = 8) + 
  coord_flip()

# Missing data visualization
vis_miss(moneyball_training) + 
  ggtitle("Missing Data Map") +
  theme(text = element_text(size = 8))

gg_miss_var(moneyball_training) +
  ggtitle("Missing Values per Variable") +
  theme(text = element_text(size = 8))

# Prepare data for correlation analysis
cor_data <- moneyball_training %>%
  select(where(is.numeric))

# Compute correlations with TARGET_WINS
correlation_values <- cor_data %>%
  summarise(across(everything(), 
                   ~ cor(., cor_data$TARGET_WINS, use = "complete.obs"), 
                   .names = "cor_{col}")) %>%
  pivot_longer(cols = everything(), names_to = "Predictor", values_to = "Correlation") %>%
  mutate(Predictor = gsub("cor_", "", Predictor)) %>%
  filter(Predictor != "TARGET_WINS") %>% 
  arrange(desc(abs(Correlation)))

# Display sorted correlations
kable(correlation_values, caption = "Correlations with TARGET_WINS", digits = 2) %>%
  kable_styling(full_width = TRUE)
Correlations with TARGET_WINS
Predictor Correlation
TEAM_BATTING_H 0.39
TEAM_BATTING_2B 0.29
TEAM_BATTING_BB 0.23
TEAM_PITCHING_HR 0.19
TEAM_FIELDING_E -0.18
TEAM_BATTING_HR 0.18
TEAM_BATTING_3B 0.14
TEAM_BASERUN_SB 0.14
TEAM_PITCHING_BB 0.12
TEAM_PITCHING_H -0.11
TEAM_PITCHING_SO -0.08
TEAM_BATTING_HBP 0.07
TEAM_FIELDING_DP -0.03
TEAM_BATTING_SO -0.03
TEAM_BASERUN_CS 0.02
# Correlation matrix visualization
corr_matrix <- cor(cor_data, use = "pairwise.complete.obs")
corrplot(corr_matrix, method = "color", type = "lower", diag = FALSE, 
         tl.col = "black", tl.cex = 0.5,
         col = colorRampPalette(c("blue", "white", "red"))(200))

# Scatterplot for correlated variables
top_corr_vars <- correlation_values %>%
  top_n(10, abs(Correlation)) %>% 
  pull(Predictor)

ggpairs(cor_data, columns = match(top_corr_vars, names(cor_data)), 
        upper = list(continuous = wrap("cor", size = 2)), 
        title = "Correlation Scatterplot Matrix for Top Predictors") +
  theme(
    axis.text.x = element_text(size = 6),
    axis.text.y = element_text(size = 6),
    strip.text = element_text(size = 6),
    plot.title = element_text(size = 10))


Data Preparation

After exploration, we prepare the data for modeling. The goal is convert the data set to make it suitable for modeling and prediction. To accomplish this, we:

  1. Dropped TEAM_BATTING_HBP; it mis missing 92% of its values - making it too sparse for reliable imputation. It may introduce bias or inaccuracies.

  2. Imputed Missing Values Using mice with Predictive Mean Matching (PMM) to replace missing values with realistic estimates.

  3. Verified No Missing Values Remain by checking the missing values count. All predictors are now complete.

  4. Recalculated Correlations with TARGET_WINS to identify variables most strongly associated with team wins. This selects teh top 6 correlated predictors.

# Load library
library(mice)

# Drop TEAM_BATTING_HBP predictor
moneyball_training <- moneyball_training %>% select(-TEAM_BATTING_HBP)

# Impute Missing Values
set.seed(123)
imputed_data <- mice(moneyball_training, method = "pmm", m = 5, maxit = 10, seed = 123, printFlag = FALSE)

# Convert imputed data to dataframe
moneyball_training <- complete(imputed_data)

# Verify No Missing Values Remain
missing_counts <- colSums(is.na(moneyball_training))

# Display number of missing values
kable(
  data.frame(
    Variable = names(missing_counts), 
    Missing_Values = missing_counts,
    row.names = NULL
  ) %>%
    filter(Variable != "INDEX"),
  caption = "Missing Values Count After Imputation"
) %>% kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Missing Values Count After Imputation
Variable Missing_Values
TARGET_WINS 0
TEAM_BATTING_H 0
TEAM_BATTING_2B 0
TEAM_BATTING_3B 0
TEAM_BATTING_HR 0
TEAM_BATTING_BB 0
TEAM_BATTING_SO 0
TEAM_BASERUN_SB 0
TEAM_BASERUN_CS 0
TEAM_PITCHING_H 0
TEAM_PITCHING_HR 0
TEAM_PITCHING_BB 0
TEAM_PITCHING_SO 0
TEAM_FIELDING_E 0
TEAM_FIELDING_DP 0
# Recalculate Correlations with TARGET_WINS
cor_data <- moneyball_training %>%
  select(where(is.numeric))

correlation_values <- cor_data %>%
  summarise(across(everything(), 
                   ~ cor(., cor_data$TARGET_WINS, use = "complete.obs"), 
                   .names = "cor_{col}")) %>%
  pivot_longer(cols = everything(), names_to = "Predictor", values_to = "Correlation") %>%
  mutate(Predictor = gsub("cor_", "", Predictor)) %>%
  filter(Predictor != "TARGET_WINS") %>% 
  arrange(desc(abs(Correlation)))

# Select Top Predictors
top_6_vars <- correlation_values %>%
  top_n(6, abs(Correlation)) %>% 
  arrange(desc(abs(Correlation)))  # Ensures proper ranking

# Display Top predictors
kable(top_6_vars, caption = "Top 6 Correlations with TARGET_WINS", digits = 2) %>%
  kable_styling(full_width = TRUE)
Top 6 Correlations with TARGET_WINS
Predictor Correlation
TEAM_BATTING_H 0.39
TEAM_BATTING_2B 0.29
TEAM_BATTING_BB 0.23
TEAM_PITCHING_HR 0.19
TEAM_FIELDING_E -0.18
TEAM_BATTING_HR 0.18
# Create new dataset with top predictors and target variable
moneyball_training_selected <- moneyball_training %>%
  select(all_of(c("TARGET_WINS", top_6_vars$Predictor)))


Build Models

Overall, we evaluated two types of models

  • in the first section, we fit single predictor regressions
  • in the second section, we fit multiple predictor regressions


Simple Linear Regression

In this section, we built simple linear regression models for each predictor to analyze the relationship between individual predictors and TARGET_WINS to determine the strength of each variable’s contribution. For each regression, we also generated the diagnostic plots.


MODEL RESULTS

While Hits/Single was the strongest predictor, the low R² values for individual models indicate that a multiple regression approach is necessary to capture interactions and improve predictive power. Diagnostic plots suggest some violations of linearity and normality assumptions, which might require transformations or alternative modeling techniques.

  • TEAM_BATTING_H has the strongest positive effect on TARGET_WINS with a high R² = 0.1511,
  • TEAM_BATTING_2B is also positively correlated, but its R² = 0.0836 indicates a weaker explanatory power.
  • TEAM_BATTING_BB has a positive effect but contributes only slightly to wins.
  • TEAM_PITCHING_HR has a weaker correlation.
  • TEAM_FIELDING_E is negatively correlated with wins, indicating that more errors lead to fewer wins.
  • TEAM_BATTING_HR has a positive effect, but the low R² (0.0310) suggests it alone is not a strong predictor.
library(gridExtra)

# Define small font theme for better readability
small_theme <- theme_minimal() +
  theme(
    plot.title = element_text(size = 9),
    axis.title.x = element_text(size = 7),
    axis.title.y = element_text(size = 7),
    axis.text.x = element_text(size = 5),
    axis.text.y = element_text(size = 5),
    legend.text = element_text(size = 5), 
    legend.title = element_text(size = 7), 
    strip.text = element_text(size = 7))

# List to store models
models <- list()

# Iterate through each predictor
for (predictor in top_6_vars$Predictor) {

  cat("\n\n### Running Model for Predictor:", predictor, "###\n")
  
  # Build simple linear regression model
  formula <- as.formula(paste("TARGET_WINS ~", predictor))
  model <- lm(formula, data = moneyball_training_selected)
  models[[predictor]] <- model

  # Print model summary
  print(summary(model))

  # Extract residuals and fitted values
  residuals <- resid(model)
  fitted_vals <- fitted(model)

  # Plot 1: Regression Plot
  p1 <- ggplot(moneyball_training_selected, aes_string(x = predictor, y = "TARGET_WINS")) +
    geom_point(alpha = 0.5, color = "blue") +
    geom_smooth(method = "lm", se = TRUE, color = "red") +
    ggtitle(paste("Regression Plot:", predictor)) +
    xlab(predictor) + ylab("TARGET_WINS") +
    small_theme

  # Plot 2: Residuals vs Fitted
  p2 <- ggplot(data = data.frame(Fitted = fitted_vals, Residuals = residuals), aes(x = Fitted, y = Residuals)) +
    geom_point(alpha = 0.5) + 
    geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
    ggtitle(paste("Residuals vs Fitted:", predictor)) + 
    xlab("Fitted Values") + ylab("Residuals") + 
    small_theme

  # Plot 3: Normal Q-Q Plot
  p3 <- ggplot(data = data.frame(SampleQuantiles = residuals), aes(sample = SampleQuantiles)) +
    stat_qq() + 
    stat_qq_line(color = "red") +
    ggtitle(paste("Q-Q Plot:", predictor)) + 
    xlab("Theoretical Quantiles") + ylab("Sample Quantiles") +
    small_theme

  # Plot 4: Scale-Location Plot
  p4 <- ggplot(data = data.frame(Fitted = fitted_vals, ScaledResiduals = sqrt(abs(residuals))), aes(x = Fitted, y = ScaledResiduals)) +
    geom_point(alpha = 0.5) + 
    geom_smooth(se = FALSE, color = "red") +
    ggtitle(paste("Scale-Location Plot:", predictor)) + 
    xlab("Fitted Values") + ylab("√|Standardized Residuals|") + 
    small_theme

  # Plot 5: Residuals Histogram
  p5 <- ggplot(data = data.frame(Residuals = residuals), aes(x = Residuals)) +
    geom_histogram(fill = "blue", color = "black", bins = 30, alpha = 0.7) +
    ggtitle(paste("Histogram of Residuals:", predictor)) + 
    xlab("Residuals") + ylab("Frequency") +
    small_theme

  # Display plots
  grid.arrange(p1, p2, p3, p4, p5, ncol = 2)
}
## 
## 
## ### Running Model for Predictor: TEAM_BATTING_H ###
## 
## Call:
## lm(formula = formula, data = moneyball_training_selected)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -71.768  -8.757   0.856   9.762  46.016 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    18.562326   3.107523   5.973 2.69e-09 ***
## TEAM_BATTING_H  0.042353   0.002105  20.122  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.52 on 2274 degrees of freedom
## Multiple R-squared:  0.1511, Adjusted R-squared:  0.1508 
## F-statistic: 404.9 on 1 and 2274 DF,  p-value: < 2.2e-16

## 
## 
## ### Running Model for Predictor: TEAM_BATTING_2B ###
## 
## Call:
## lm(formula = formula, data = moneyball_training_selected)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -70.453  -9.572   0.636  10.135  57.351 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     57.316365   1.660403   34.52   <2e-16 ***
## TEAM_BATTING_2B  0.097305   0.006757   14.40   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.08 on 2274 degrees of freedom
## Multiple R-squared:  0.08358,    Adjusted R-squared:  0.08318 
## F-statistic: 207.4 on 1 and 2274 DF,  p-value: < 2.2e-16

## 
## 
## ### Running Model for Predictor: TEAM_BATTING_BB ###
## 
## Call:
## lm(formula = formula, data = moneyball_training_selected)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -65.813  -9.747   0.509   9.766  78.276 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     65.812815   1.352265   48.67   <2e-16 ***
## TEAM_BATTING_BB  0.029863   0.002619   11.40   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.32 on 2274 degrees of freedom
## Multiple R-squared:  0.05408,    Adjusted R-squared:  0.05367 
## F-statistic:   130 on 1 and 2274 DF,  p-value: < 2.2e-16

## 
## 
## ### Running Model for Predictor: TEAM_PITCHING_HR ###
## 
## Call:
## lm(formula = formula, data = moneyball_training_selected)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -75.657  -9.956   0.636  10.055  67.477 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      75.656920   0.646540 117.018   <2e-16 ***
## TEAM_PITCHING_HR  0.048572   0.005292   9.179   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.47 on 2274 degrees of freedom
## Multiple R-squared:  0.03573,    Adjusted R-squared:  0.0353 
## F-statistic: 84.25 on 1 and 2274 DF,  p-value: < 2.2e-16

## 
## 
## ### Running Model for Predictor: TEAM_FIELDING_E ###
## 
## Call:
## lm(formula = formula, data = moneyball_training_selected)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -61.461 -10.078   0.697  10.318  73.808 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     83.799234   0.479030  174.94   <2e-16 ***
## TEAM_FIELDING_E -0.012205   0.001427   -8.55   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.51 on 2274 degrees of freedom
## Multiple R-squared:  0.03115,    Adjusted R-squared:  0.03072 
## F-statistic:  73.1 on 1 and 2274 DF,  p-value: < 2.2e-16

## 
## 
## ### Running Model for Predictor: TEAM_BATTING_HR ###
## 
## Call:
## lm(formula = formula, data = moneyball_training_selected)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -76.226  -9.909   0.520  10.218  68.445 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     76.22576    0.62599 121.768   <2e-16 ***
## TEAM_BATTING_HR  0.04583    0.00537   8.534   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.51 on 2274 degrees of freedom
## Multiple R-squared:  0.03103,    Adjusted R-squared:  0.0306 
## F-statistic: 72.82 on 1 and 2274 DF,  p-value: < 2.2e-16


Multple Linear Regression

In this section, we defined the multiple linear regression formula for the six correlated predictors: TEAM_BATTING_H, TEAM_BATTING_2B, TEAM_BATTING_BB, TEAM_PITCHING_HR, TEAM_FIELDING_E, and TEAM_BATTING_HR to model TARGET_WINS. We use regsubset to identify the optimal number of predictors for the multiple linear regression.

To visualize the results as predictors are added to the model, we use the AIC, Adjusted R-sqaured, and Mallow CP metrics to determine the number of predictors that balance predictability and complexity.


MODEL RESULTS

  • Adjusted R² vs. Number of Predictors: Shows that adding more predictors increases R² but levels off after 4 predictors.
  • BIC vs. Number of Predictors: The BIC decreases significantly until 4 predictors,
  • Mallows’ Cp vs. Number of Predictors: Cp stabilizes around 4 predictors – confirming this selection.
  • The model with 4 predictors (TEAM_BATTING_H, TEAM_BATTING_2B, TEAM_BATTING_BB, and TEAM_FIELDING_E) had the lowest BIC and is the best model choice.
# Load libraries
library(leaps)
library(ggplot2)
library(knitr)
library(kableExtra)

# Define small font theme
small_theme <- theme_minimal() +
  theme(
    plot.title = element_text(size = 9),
    axis.title.x = element_text(size = 7),
    axis.title.y = element_text(size = 7),
    axis.text.x = element_text(size = 5),
    axis.text.y = element_text(size = 5),
    legend.text = element_text(size = 5),
    legend.title = element_text(size = 7),
    strip.text = element_text(size = 7))

# Define multiple regression with 6 predictors
mlr_formula <- TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_BB +
                            TEAM_PITCHING_HR + TEAM_FIELDING_E + TEAM_BATTING_HR

# Run best subset
subset_model <- regsubsets(mlr_formula, data = moneyball_training_selected, nvmax = 6)

# Extract model summary
subset_summary <- summary(subset_model)

# Store selection metrics
metrics_df <- data.frame(
  Num_Predictors = 1:6,
  Adj_R2 = subset_summary$adjr2,
  Cp = subset_summary$cp,
  BIC = subset_summary$bic)  # Replaces manually computed AIC

# Display model selection metrics
kable(metrics_df, caption = "Model Selection Metrics for Multiple Linear Regression", digits = 2) %>%
  kable_styling(full_width = FALSE)
Model Selection Metrics for Multiple Linear Regression
Num_Predictors Adj_R2 Cp BIC
1 0.15 292.06 -357.49
2 0.23 40.34 -586.91
3 0.24 16.34 -604.94
4 0.25 4.26 -611.28
5 0.25 5.67 -604.14
6 0.25 7.00 -597.09
# Plot Adjusted R² vs. Number of Predictors
ggplot(metrics_df, aes(x = Num_Predictors, y = Adj_R2)) +
  geom_point(size = 3, color = "blue") +
  geom_line(size = 1, color = "blue") +
  ggtitle("Adjusted R² vs. Number of Predictors") +
  xlab("Number of Predictors") + ylab("Adjusted R²") +
  small_theme

# Plot BIC vs. Number of Predictors
ggplot(metrics_df, aes(x = Num_Predictors, y = BIC)) +
  geom_point(size = 3, color = "red") +
  geom_line(size = 1, color = "red") +
  ggtitle("BIC vs. Number of Predictors (Lower is Better)") +
  xlab("Number of Predictors") + ylab("BIC (Lower is Better)") +
  small_theme

# Plot Cp vs. Number of Predictors 
ggplot(metrics_df, aes(x = Num_Predictors, y = Cp)) +
  geom_point(size = 3, color = "green") +
  geom_line(size = 1, color = "green") +
  ggtitle("Mallows' Cp vs. Number of Predictors") +
  xlab("Number of Predictors") + ylab("Cp (Closer to Num Predictors is Better)") +
  small_theme

# Identify the best model
best_model_size <- which.min(subset_summary$bic)
cat("\n\n### Best Model Selected Based on BIC: Model with", best_model_size, "Predictors ###\n")
## 
## 
## ### Best Model Selected Based on BIC: Model with 4 Predictors ###


Select Model and Predict on Evaluation Data

We select the model with the best combination of BIC, Adjusted R-squared and Cp. Overally, this is not a strong linear regression model, it is the best of the single and multiple linear regression models that were fit.

  • The selected model has an Adjusted R² of 0.2471, explaining ~24.7% of the variance in wins.
  • The model with the lowest BIC (-611.28) was selected.
  • Hitting singles, doubles, walks, and fielding errors influence a team’s total wins and the predictors with strongest positive effect.

The model was used to predict the Target Wins in the unseen evaluation dataset (displayed below with the 4 predictors)

# Extract coefficients of the selected model
best_model_coefs <- coef(subset_model, best_model_size)

# Convert to dataframe
best_model_df <- data.frame(
  Predictor = names(best_model_coefs), 
  Coefficient = as.vector(best_model_coefs),
  row.names = NULL
) %>%
  filter(Predictor != "INDEX")

# Extract selection metrics
best_model_metrics <- data.frame(
  Adjusted_R2 = subset_summary$adjr2[best_model_size],
  Cp = subset_summary$cp[best_model_size],
  BIC = subset_summary$bic[best_model_size]
)

# Display model equation
model_equation <- paste0(
  "TARGET_WINS = ", round(best_model_coefs[1], 2), " ", 
  paste0("+ ", round(best_model_coefs[-1], 4), " * ", names(best_model_coefs[-1]), collapse = " "))

cat("\n### Selected Model Equation ###\n", model_equation, "\n")
## 
## ### Selected Model Equation ###
##  TARGET_WINS = 1.5 + 0.0564 * TEAM_BATTING_H + -0.0318 * TEAM_BATTING_2B + 0.0167 * TEAM_BATTING_BB + -0.0173 * TEAM_FIELDING_E
# Display the coefficients in a table
kable(best_model_df, caption = "Coefficients of Selected Model") %>%
  kable_styling(full_width = FALSE)
Coefficients of Selected Model
Predictor Coefficient
(Intercept) 1.5021479
TEAM_BATTING_H 0.0564057
TEAM_BATTING_2B -0.0317922
TEAM_BATTING_BB 0.0166626
TEAM_FIELDING_E -0.0173401
# Display the model selection metrics
kable(best_model_metrics, caption = "Selection Metrics of Best Model", digits = 4) %>%
  kable_styling(full_width = FALSE)
Selection Metrics of Best Model
Adjusted_R2 Cp BIC
0.2471 4.2625 -611.2815
# Load libraries
library(dplyr)
library(readr)
library(mice)

# Load the unseen evaluation dataset
moneyball_eval <- read.csv("https://raw.githubusercontent.com/johnnydrodriguez/data621/refs/heads/main/moneyball-evaluation-data.csv", header = TRUE, sep = ',', na.strings="", fill = TRUE)
moneyball_eval <- moneyball_eval %>% select(-INDEX)

# Ensure the evaluation dataset has only the predictors
selected_predictors <- names(best_model_coefs)[-1]
moneyball_eval_selected <- moneyball_eval %>% select(all_of(selected_predictors))

# Perform MICE imputation
set.seed(123)
imputed_eval_data <- mice(moneyball_eval_selected, method = "pmm", m = 5, maxit = 10, seed = 123, printFlag = FALSE)
moneyball_eval_selected <- complete(imputed_eval_data)

# Fit the best model on training data
best_model_formula <- as.formula(paste("TARGET_WINS ~", paste(selected_predictors, collapse = " + ")))
best_model <- lm(best_model_formula, data = moneyball_training_selected)

# Predict TARGET_WINS using the trained model
moneyball_eval$Predicted_TARGET_WINS <- predict(best_model, newdata = moneyball_eval_selected)

# Save full dataset to CSV file
write.csv(moneyball_eval, "moneyball_predictions_full.csv", row.names = FALSE)

# Select the 4 predictor columns, and the Predicted_TARGET_WINS column
results_df <- moneyball_eval %>%
  select(1, TEAM_BATTING_H, TEAM_BATTING_2B, TEAM_BATTING_BB, TEAM_FIELDING_E, Predicted_TARGET_WINS)

# Display the final results table
kable(head(results_df,10), caption = "Predictions with Selected Predictors (Sample 10)") %>%
  kable_styling(full_width = TRUE)
Predictions with Selected Predictors (Sample 10)
TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_BB TEAM_FIELDING_E Predicted_TARGET_WINS
1209 170 447 140 69.31255
1221 151 516 135 71.82989
1395 183 509 156 80.14635
1539 309 486 124 84.43460
1445 203 95 616 67.45603
1431 236 215 572 68.37969
1430 219 568 490 76.16754
1385 158 356 328 74.84523
1259 177 466 226 70.73563
1397 212 452 184 77.90190