Here we load and explore the baseball data. Some key insights include:
The dataset’s variables show a wide range in means and standard deviations with some variables showing significant variation.
Several predictors present significant outliers, especially TEAM_PITCHING_H and TEAM_PITCHING_SO, indicating that some teams have extreme values in these categories.
9.6% of the dataset has missing values, with TEAM_BATTING_HBP and TEAM_BASERUN_CS having the highest proportion of missing data. We will need to perform imputation to estimate these missing values and cannot simply drop the corresponding row.
TEAM_BATTING_HBP is missing 92% of its values; it may be too spare to impute reliably and will be dropped in modeling
TEAM_BATTING_H, TEAM_BATTING_2B, and TEAM_BATTING_BB have the strongest positive correlations with TARGET_WINS indicating singles, doubles, walks have the highest correlation to wins . TEAM_FIELDING_E shows a negative correlation, indicating that more errors reduce the chances of winning.
# Load necessary libraries
library(ggplot2)
library(dplyr)
library(tidyr)
library(corrplot)
library(naniar)
library(GGally)
library(knitr)
library(kableExtra)
# Load dataset
moneyball_training <- read.csv("https://raw.githubusercontent.com/johnnydrodriguez/data621/refs/heads/main/moneyball-training-data.csv",
header = TRUE, sep = ',', na.strings="", fill = TRUE)
moneyball_training <- moneyball_training %>% select(-INDEX)
# View basic dataset information
str(moneyball_training)
## 'data.frame': 2276 obs. of 16 variables:
## $ TARGET_WINS : int 39 70 86 70 82 75 80 85 86 76 ...
## $ TEAM_BATTING_H : int 1445 1339 1377 1387 1297 1279 1244 1273 1391 1271 ...
## $ TEAM_BATTING_2B : int 194 219 232 209 186 200 179 171 197 213 ...
## $ TEAM_BATTING_3B : int 39 22 35 38 27 36 54 37 40 18 ...
## $ TEAM_BATTING_HR : int 13 190 137 96 102 92 122 115 114 96 ...
## $ TEAM_BATTING_BB : int 143 685 602 451 472 443 525 456 447 441 ...
## $ TEAM_BATTING_SO : int 842 1075 917 922 920 973 1062 1027 922 827 ...
## $ TEAM_BASERUN_SB : int NA 37 46 43 49 107 80 40 69 72 ...
## $ TEAM_BASERUN_CS : int NA 28 27 30 39 59 54 36 27 34 ...
## $ TEAM_BATTING_HBP: int NA NA NA NA NA NA NA NA NA NA ...
## $ TEAM_PITCHING_H : int 9364 1347 1377 1396 1297 1279 1244 1281 1391 1271 ...
## $ TEAM_PITCHING_HR: int 84 191 137 97 102 92 122 116 114 96 ...
## $ TEAM_PITCHING_BB: int 927 689 602 454 472 443 525 459 447 441 ...
## $ TEAM_PITCHING_SO: int 5456 1082 917 928 920 973 1062 1033 922 827 ...
## $ TEAM_FIELDING_E : int 1011 193 175 164 138 123 136 112 127 131 ...
## $ TEAM_FIELDING_DP: int NA 155 153 156 168 149 186 136 169 159 ...
# Summary statistics for all numeric predictors
summary_stats <- data.frame(
Variable = names(moneyball_training %>% select(where(is.numeric))),
Mean = sapply(moneyball_training %>% select(where(is.numeric)), mean, na.rm = TRUE),
Median = sapply(moneyball_training %>% select(where(is.numeric)), median, na.rm = TRUE),
SD = sapply(moneyball_training %>% select(where(is.numeric)), sd, na.rm = TRUE)) %>%
mutate(across(where(is.numeric), ~round(., 2)))
# Display summary statistics
kable(summary_stats, caption = "Summary Statistics for Predictors", row.names = FALSE) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
column_spec(1, bold = TRUE)
Variable | Mean | Median | SD |
---|---|---|---|
TARGET_WINS | 80.79 | 82.0 | 15.75 |
TEAM_BATTING_H | 1469.27 | 1454.0 | 144.59 |
TEAM_BATTING_2B | 241.25 | 238.0 | 46.80 |
TEAM_BATTING_3B | 55.25 | 47.0 | 27.94 |
TEAM_BATTING_HR | 99.61 | 102.0 | 60.55 |
TEAM_BATTING_BB | 501.56 | 512.0 | 122.67 |
TEAM_BATTING_SO | 735.61 | 750.0 | 248.53 |
TEAM_BASERUN_SB | 124.76 | 101.0 | 87.79 |
TEAM_BASERUN_CS | 52.80 | 49.0 | 22.96 |
TEAM_BATTING_HBP | 59.36 | 58.0 | 12.97 |
TEAM_PITCHING_H | 1779.21 | 1518.0 | 1406.84 |
TEAM_PITCHING_HR | 105.70 | 107.0 | 61.30 |
TEAM_PITCHING_BB | 553.01 | 536.5 | 166.36 |
TEAM_PITCHING_SO | 817.73 | 813.5 | 553.09 |
TEAM_FIELDING_E | 246.48 | 159.0 | 227.77 |
TEAM_FIELDING_DP | 146.39 | 149.0 | 26.23 |
# Boxplots for numeric predictors
moneyball_training %>%
select(where(is.numeric)) %>%
pivot_longer(cols = everything(), names_to = "Variable", values_to = "Value") %>%
ggplot(aes(x = Variable, y = Value)) +
geom_boxplot(fill = "darkblue") +
ggtitle("Box Plot of All Predictors") +
xlab("Variable") + ylab("Value") +
theme_minimal(base_size = 8) +
coord_flip()
# Missing data visualization
vis_miss(moneyball_training) +
ggtitle("Missing Data Map") +
theme(text = element_text(size = 8))
gg_miss_var(moneyball_training) +
ggtitle("Missing Values per Variable") +
theme(text = element_text(size = 8))
# Prepare data for correlation analysis
cor_data <- moneyball_training %>%
select(where(is.numeric))
# Compute correlations with TARGET_WINS
correlation_values <- cor_data %>%
summarise(across(everything(),
~ cor(., cor_data$TARGET_WINS, use = "complete.obs"),
.names = "cor_{col}")) %>%
pivot_longer(cols = everything(), names_to = "Predictor", values_to = "Correlation") %>%
mutate(Predictor = gsub("cor_", "", Predictor)) %>%
filter(Predictor != "TARGET_WINS") %>%
arrange(desc(abs(Correlation)))
# Display sorted correlations
kable(correlation_values, caption = "Correlations with TARGET_WINS", digits = 2) %>%
kable_styling(full_width = TRUE)
Predictor | Correlation |
---|---|
TEAM_BATTING_H | 0.39 |
TEAM_BATTING_2B | 0.29 |
TEAM_BATTING_BB | 0.23 |
TEAM_PITCHING_HR | 0.19 |
TEAM_FIELDING_E | -0.18 |
TEAM_BATTING_HR | 0.18 |
TEAM_BATTING_3B | 0.14 |
TEAM_BASERUN_SB | 0.14 |
TEAM_PITCHING_BB | 0.12 |
TEAM_PITCHING_H | -0.11 |
TEAM_PITCHING_SO | -0.08 |
TEAM_BATTING_HBP | 0.07 |
TEAM_FIELDING_DP | -0.03 |
TEAM_BATTING_SO | -0.03 |
TEAM_BASERUN_CS | 0.02 |
# Correlation matrix visualization
corr_matrix <- cor(cor_data, use = "pairwise.complete.obs")
corrplot(corr_matrix, method = "color", type = "lower", diag = FALSE,
tl.col = "black", tl.cex = 0.5,
col = colorRampPalette(c("blue", "white", "red"))(200))
# Scatterplot for correlated variables
top_corr_vars <- correlation_values %>%
top_n(10, abs(Correlation)) %>%
pull(Predictor)
ggpairs(cor_data, columns = match(top_corr_vars, names(cor_data)),
upper = list(continuous = wrap("cor", size = 2)),
title = "Correlation Scatterplot Matrix for Top Predictors") +
theme(
axis.text.x = element_text(size = 6),
axis.text.y = element_text(size = 6),
strip.text = element_text(size = 6),
plot.title = element_text(size = 10))
After exploration, we prepare the data for modeling. The goal is convert the data set to make it suitable for modeling and prediction. To accomplish this, we:
Dropped TEAM_BATTING_HBP; it mis missing 92% of its values - making it too sparse for reliable imputation. It may introduce bias or inaccuracies.
Imputed Missing Values Using mice with Predictive Mean Matching (PMM) to replace missing values with realistic estimates.
Verified No Missing Values Remain by checking the missing values count. All predictors are now complete.
Recalculated Correlations with TARGET_WINS to identify variables most strongly associated with team wins. This selects teh top 6 correlated predictors.
# Load library
library(mice)
# Drop TEAM_BATTING_HBP predictor
moneyball_training <- moneyball_training %>% select(-TEAM_BATTING_HBP)
# Impute Missing Values
set.seed(123)
imputed_data <- mice(moneyball_training, method = "pmm", m = 5, maxit = 10, seed = 123, printFlag = FALSE)
# Convert imputed data to dataframe
moneyball_training <- complete(imputed_data)
# Verify No Missing Values Remain
missing_counts <- colSums(is.na(moneyball_training))
# Display number of missing values
kable(
data.frame(
Variable = names(missing_counts),
Missing_Values = missing_counts,
row.names = NULL
) %>%
filter(Variable != "INDEX"),
caption = "Missing Values Count After Imputation"
) %>% kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Variable | Missing_Values |
---|---|
TARGET_WINS | 0 |
TEAM_BATTING_H | 0 |
TEAM_BATTING_2B | 0 |
TEAM_BATTING_3B | 0 |
TEAM_BATTING_HR | 0 |
TEAM_BATTING_BB | 0 |
TEAM_BATTING_SO | 0 |
TEAM_BASERUN_SB | 0 |
TEAM_BASERUN_CS | 0 |
TEAM_PITCHING_H | 0 |
TEAM_PITCHING_HR | 0 |
TEAM_PITCHING_BB | 0 |
TEAM_PITCHING_SO | 0 |
TEAM_FIELDING_E | 0 |
TEAM_FIELDING_DP | 0 |
# Recalculate Correlations with TARGET_WINS
cor_data <- moneyball_training %>%
select(where(is.numeric))
correlation_values <- cor_data %>%
summarise(across(everything(),
~ cor(., cor_data$TARGET_WINS, use = "complete.obs"),
.names = "cor_{col}")) %>%
pivot_longer(cols = everything(), names_to = "Predictor", values_to = "Correlation") %>%
mutate(Predictor = gsub("cor_", "", Predictor)) %>%
filter(Predictor != "TARGET_WINS") %>%
arrange(desc(abs(Correlation)))
# Select Top Predictors
top_6_vars <- correlation_values %>%
top_n(6, abs(Correlation)) %>%
arrange(desc(abs(Correlation))) # Ensures proper ranking
# Display Top predictors
kable(top_6_vars, caption = "Top 6 Correlations with TARGET_WINS", digits = 2) %>%
kable_styling(full_width = TRUE)
Predictor | Correlation |
---|---|
TEAM_BATTING_H | 0.39 |
TEAM_BATTING_2B | 0.29 |
TEAM_BATTING_BB | 0.23 |
TEAM_PITCHING_HR | 0.19 |
TEAM_FIELDING_E | -0.18 |
TEAM_BATTING_HR | 0.18 |
Overall, we evaluated two types of models
In this section, we built simple linear regression models for each predictor to analyze the relationship between individual predictors and TARGET_WINS to determine the strength of each variable’s contribution. For each regression, we also generated the diagnostic plots.
MODEL RESULTS
While Hits/Single was the strongest predictor, the low R² values for individual models indicate that a multiple regression approach is necessary to capture interactions and improve predictive power. Diagnostic plots suggest some violations of linearity and normality assumptions, which might require transformations or alternative modeling techniques.
library(gridExtra)
# Define small font theme for better readability
small_theme <- theme_minimal() +
theme(
plot.title = element_text(size = 9),
axis.title.x = element_text(size = 7),
axis.title.y = element_text(size = 7),
axis.text.x = element_text(size = 5),
axis.text.y = element_text(size = 5),
legend.text = element_text(size = 5),
legend.title = element_text(size = 7),
strip.text = element_text(size = 7))
# List to store models
models <- list()
# Iterate through each predictor
for (predictor in top_6_vars$Predictor) {
cat("\n\n### Running Model for Predictor:", predictor, "###\n")
# Build simple linear regression model
formula <- as.formula(paste("TARGET_WINS ~", predictor))
model <- lm(formula, data = moneyball_training_selected)
models[[predictor]] <- model
# Print model summary
print(summary(model))
# Extract residuals and fitted values
residuals <- resid(model)
fitted_vals <- fitted(model)
# Plot 1: Regression Plot
p1 <- ggplot(moneyball_training_selected, aes_string(x = predictor, y = "TARGET_WINS")) +
geom_point(alpha = 0.5, color = "blue") +
geom_smooth(method = "lm", se = TRUE, color = "red") +
ggtitle(paste("Regression Plot:", predictor)) +
xlab(predictor) + ylab("TARGET_WINS") +
small_theme
# Plot 2: Residuals vs Fitted
p2 <- ggplot(data = data.frame(Fitted = fitted_vals, Residuals = residuals), aes(x = Fitted, y = Residuals)) +
geom_point(alpha = 0.5) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
ggtitle(paste("Residuals vs Fitted:", predictor)) +
xlab("Fitted Values") + ylab("Residuals") +
small_theme
# Plot 3: Normal Q-Q Plot
p3 <- ggplot(data = data.frame(SampleQuantiles = residuals), aes(sample = SampleQuantiles)) +
stat_qq() +
stat_qq_line(color = "red") +
ggtitle(paste("Q-Q Plot:", predictor)) +
xlab("Theoretical Quantiles") + ylab("Sample Quantiles") +
small_theme
# Plot 4: Scale-Location Plot
p4 <- ggplot(data = data.frame(Fitted = fitted_vals, ScaledResiduals = sqrt(abs(residuals))), aes(x = Fitted, y = ScaledResiduals)) +
geom_point(alpha = 0.5) +
geom_smooth(se = FALSE, color = "red") +
ggtitle(paste("Scale-Location Plot:", predictor)) +
xlab("Fitted Values") + ylab("√|Standardized Residuals|") +
small_theme
# Plot 5: Residuals Histogram
p5 <- ggplot(data = data.frame(Residuals = residuals), aes(x = Residuals)) +
geom_histogram(fill = "blue", color = "black", bins = 30, alpha = 0.7) +
ggtitle(paste("Histogram of Residuals:", predictor)) +
xlab("Residuals") + ylab("Frequency") +
small_theme
# Display plots
grid.arrange(p1, p2, p3, p4, p5, ncol = 2)
}
##
##
## ### Running Model for Predictor: TEAM_BATTING_H ###
##
## Call:
## lm(formula = formula, data = moneyball_training_selected)
##
## Residuals:
## Min 1Q Median 3Q Max
## -71.768 -8.757 0.856 9.762 46.016
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.562326 3.107523 5.973 2.69e-09 ***
## TEAM_BATTING_H 0.042353 0.002105 20.122 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.52 on 2274 degrees of freedom
## Multiple R-squared: 0.1511, Adjusted R-squared: 0.1508
## F-statistic: 404.9 on 1 and 2274 DF, p-value: < 2.2e-16
##
##
## ### Running Model for Predictor: TEAM_BATTING_2B ###
##
## Call:
## lm(formula = formula, data = moneyball_training_selected)
##
## Residuals:
## Min 1Q Median 3Q Max
## -70.453 -9.572 0.636 10.135 57.351
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 57.316365 1.660403 34.52 <2e-16 ***
## TEAM_BATTING_2B 0.097305 0.006757 14.40 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.08 on 2274 degrees of freedom
## Multiple R-squared: 0.08358, Adjusted R-squared: 0.08318
## F-statistic: 207.4 on 1 and 2274 DF, p-value: < 2.2e-16
##
##
## ### Running Model for Predictor: TEAM_BATTING_BB ###
##
## Call:
## lm(formula = formula, data = moneyball_training_selected)
##
## Residuals:
## Min 1Q Median 3Q Max
## -65.813 -9.747 0.509 9.766 78.276
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 65.812815 1.352265 48.67 <2e-16 ***
## TEAM_BATTING_BB 0.029863 0.002619 11.40 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.32 on 2274 degrees of freedom
## Multiple R-squared: 0.05408, Adjusted R-squared: 0.05367
## F-statistic: 130 on 1 and 2274 DF, p-value: < 2.2e-16
##
##
## ### Running Model for Predictor: TEAM_PITCHING_HR ###
##
## Call:
## lm(formula = formula, data = moneyball_training_selected)
##
## Residuals:
## Min 1Q Median 3Q Max
## -75.657 -9.956 0.636 10.055 67.477
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 75.656920 0.646540 117.018 <2e-16 ***
## TEAM_PITCHING_HR 0.048572 0.005292 9.179 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.47 on 2274 degrees of freedom
## Multiple R-squared: 0.03573, Adjusted R-squared: 0.0353
## F-statistic: 84.25 on 1 and 2274 DF, p-value: < 2.2e-16
##
##
## ### Running Model for Predictor: TEAM_FIELDING_E ###
##
## Call:
## lm(formula = formula, data = moneyball_training_selected)
##
## Residuals:
## Min 1Q Median 3Q Max
## -61.461 -10.078 0.697 10.318 73.808
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 83.799234 0.479030 174.94 <2e-16 ***
## TEAM_FIELDING_E -0.012205 0.001427 -8.55 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.51 on 2274 degrees of freedom
## Multiple R-squared: 0.03115, Adjusted R-squared: 0.03072
## F-statistic: 73.1 on 1 and 2274 DF, p-value: < 2.2e-16
##
##
## ### Running Model for Predictor: TEAM_BATTING_HR ###
##
## Call:
## lm(formula = formula, data = moneyball_training_selected)
##
## Residuals:
## Min 1Q Median 3Q Max
## -76.226 -9.909 0.520 10.218 68.445
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 76.22576 0.62599 121.768 <2e-16 ***
## TEAM_BATTING_HR 0.04583 0.00537 8.534 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.51 on 2274 degrees of freedom
## Multiple R-squared: 0.03103, Adjusted R-squared: 0.0306
## F-statistic: 72.82 on 1 and 2274 DF, p-value: < 2.2e-16
In this section, we defined the multiple linear regression formula for the six correlated predictors: TEAM_BATTING_H, TEAM_BATTING_2B, TEAM_BATTING_BB, TEAM_PITCHING_HR, TEAM_FIELDING_E, and TEAM_BATTING_HR to model TARGET_WINS. We use regsubset to identify the optimal number of predictors for the multiple linear regression.
To visualize the results as predictors are added to the model, we use the AIC, Adjusted R-sqaured, and Mallow CP metrics to determine the number of predictors that balance predictability and complexity.
MODEL RESULTS
# Load libraries
library(leaps)
library(ggplot2)
library(knitr)
library(kableExtra)
# Define small font theme
small_theme <- theme_minimal() +
theme(
plot.title = element_text(size = 9),
axis.title.x = element_text(size = 7),
axis.title.y = element_text(size = 7),
axis.text.x = element_text(size = 5),
axis.text.y = element_text(size = 5),
legend.text = element_text(size = 5),
legend.title = element_text(size = 7),
strip.text = element_text(size = 7))
# Define multiple regression with 6 predictors
mlr_formula <- TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_BB +
TEAM_PITCHING_HR + TEAM_FIELDING_E + TEAM_BATTING_HR
# Run best subset
subset_model <- regsubsets(mlr_formula, data = moneyball_training_selected, nvmax = 6)
# Extract model summary
subset_summary <- summary(subset_model)
# Store selection metrics
metrics_df <- data.frame(
Num_Predictors = 1:6,
Adj_R2 = subset_summary$adjr2,
Cp = subset_summary$cp,
BIC = subset_summary$bic) # Replaces manually computed AIC
# Display model selection metrics
kable(metrics_df, caption = "Model Selection Metrics for Multiple Linear Regression", digits = 2) %>%
kable_styling(full_width = FALSE)
Num_Predictors | Adj_R2 | Cp | BIC |
---|---|---|---|
1 | 0.15 | 292.06 | -357.49 |
2 | 0.23 | 40.34 | -586.91 |
3 | 0.24 | 16.34 | -604.94 |
4 | 0.25 | 4.26 | -611.28 |
5 | 0.25 | 5.67 | -604.14 |
6 | 0.25 | 7.00 | -597.09 |
# Plot Adjusted R² vs. Number of Predictors
ggplot(metrics_df, aes(x = Num_Predictors, y = Adj_R2)) +
geom_point(size = 3, color = "blue") +
geom_line(size = 1, color = "blue") +
ggtitle("Adjusted R² vs. Number of Predictors") +
xlab("Number of Predictors") + ylab("Adjusted R²") +
small_theme
# Plot BIC vs. Number of Predictors
ggplot(metrics_df, aes(x = Num_Predictors, y = BIC)) +
geom_point(size = 3, color = "red") +
geom_line(size = 1, color = "red") +
ggtitle("BIC vs. Number of Predictors (Lower is Better)") +
xlab("Number of Predictors") + ylab("BIC (Lower is Better)") +
small_theme
# Plot Cp vs. Number of Predictors
ggplot(metrics_df, aes(x = Num_Predictors, y = Cp)) +
geom_point(size = 3, color = "green") +
geom_line(size = 1, color = "green") +
ggtitle("Mallows' Cp vs. Number of Predictors") +
xlab("Number of Predictors") + ylab("Cp (Closer to Num Predictors is Better)") +
small_theme
# Identify the best model
best_model_size <- which.min(subset_summary$bic)
cat("\n\n### Best Model Selected Based on BIC: Model with", best_model_size, "Predictors ###\n")
##
##
## ### Best Model Selected Based on BIC: Model with 4 Predictors ###
We select the model with the best combination of BIC, Adjusted R-squared and Cp. Overally, this is not a strong linear regression model, it is the best of the single and multiple linear regression models that were fit.
The model was used to predict the Target Wins in the unseen evaluation dataset (displayed below with the 4 predictors)
# Extract coefficients of the selected model
best_model_coefs <- coef(subset_model, best_model_size)
# Convert to dataframe
best_model_df <- data.frame(
Predictor = names(best_model_coefs),
Coefficient = as.vector(best_model_coefs),
row.names = NULL
) %>%
filter(Predictor != "INDEX")
# Extract selection metrics
best_model_metrics <- data.frame(
Adjusted_R2 = subset_summary$adjr2[best_model_size],
Cp = subset_summary$cp[best_model_size],
BIC = subset_summary$bic[best_model_size]
)
# Display model equation
model_equation <- paste0(
"TARGET_WINS = ", round(best_model_coefs[1], 2), " ",
paste0("+ ", round(best_model_coefs[-1], 4), " * ", names(best_model_coefs[-1]), collapse = " "))
cat("\n### Selected Model Equation ###\n", model_equation, "\n")
##
## ### Selected Model Equation ###
## TARGET_WINS = 1.5 + 0.0564 * TEAM_BATTING_H + -0.0318 * TEAM_BATTING_2B + 0.0167 * TEAM_BATTING_BB + -0.0173 * TEAM_FIELDING_E
# Display the coefficients in a table
kable(best_model_df, caption = "Coefficients of Selected Model") %>%
kable_styling(full_width = FALSE)
Predictor | Coefficient |
---|---|
(Intercept) | 1.5021479 |
TEAM_BATTING_H | 0.0564057 |
TEAM_BATTING_2B | -0.0317922 |
TEAM_BATTING_BB | 0.0166626 |
TEAM_FIELDING_E | -0.0173401 |
# Display the model selection metrics
kable(best_model_metrics, caption = "Selection Metrics of Best Model", digits = 4) %>%
kable_styling(full_width = FALSE)
Adjusted_R2 | Cp | BIC |
---|---|---|
0.2471 | 4.2625 | -611.2815 |
# Load libraries
library(dplyr)
library(readr)
library(mice)
# Load the unseen evaluation dataset
moneyball_eval <- read.csv("https://raw.githubusercontent.com/johnnydrodriguez/data621/refs/heads/main/moneyball-evaluation-data.csv", header = TRUE, sep = ',', na.strings="", fill = TRUE)
moneyball_eval <- moneyball_eval %>% select(-INDEX)
# Ensure the evaluation dataset has only the predictors
selected_predictors <- names(best_model_coefs)[-1]
moneyball_eval_selected <- moneyball_eval %>% select(all_of(selected_predictors))
# Perform MICE imputation
set.seed(123)
imputed_eval_data <- mice(moneyball_eval_selected, method = "pmm", m = 5, maxit = 10, seed = 123, printFlag = FALSE)
moneyball_eval_selected <- complete(imputed_eval_data)
# Fit the best model on training data
best_model_formula <- as.formula(paste("TARGET_WINS ~", paste(selected_predictors, collapse = " + ")))
best_model <- lm(best_model_formula, data = moneyball_training_selected)
# Predict TARGET_WINS using the trained model
moneyball_eval$Predicted_TARGET_WINS <- predict(best_model, newdata = moneyball_eval_selected)
# Save full dataset to CSV file
write.csv(moneyball_eval, "moneyball_predictions_full.csv", row.names = FALSE)
# Select the 4 predictor columns, and the Predicted_TARGET_WINS column
results_df <- moneyball_eval %>%
select(1, TEAM_BATTING_H, TEAM_BATTING_2B, TEAM_BATTING_BB, TEAM_FIELDING_E, Predicted_TARGET_WINS)
# Display the final results table
kable(head(results_df,10), caption = "Predictions with Selected Predictors (Sample 10)") %>%
kable_styling(full_width = TRUE)
TEAM_BATTING_H | TEAM_BATTING_2B | TEAM_BATTING_BB | TEAM_FIELDING_E | Predicted_TARGET_WINS |
---|---|---|---|---|
1209 | 170 | 447 | 140 | 69.31255 |
1221 | 151 | 516 | 135 | 71.82989 |
1395 | 183 | 509 | 156 | 80.14635 |
1539 | 309 | 486 | 124 | 84.43460 |
1445 | 203 | 95 | 616 | 67.45603 |
1431 | 236 | 215 | 572 | 68.37969 |
1430 | 219 | 568 | 490 | 76.16754 |
1385 | 158 | 356 | 328 | 74.84523 |
1259 | 177 | 466 | 226 | 70.73563 |
1397 | 212 | 452 | 184 | 77.90190 |