Reading the file
# Load baseball_data.csv from the working directory into a data frame.
baseball_data <- read.csv(file = "baseball_data.csv")
Missing value analysis
# Record, before any imputation, which rows lacked each sparsely populated
# statistic: every *_MISSING column is 1 where its source column is NA and
# 0 otherwise. The map is flag column -> source column, in creation order.
missing_flag_sources <- c(
  CS_MISSING = "TEAM_BASERUN_CS",
  DP_MISSING = "TEAM_FIELDING_DP",
  SO_BAT_MISSING = "TEAM_BATTING_SO",
  SO_PITCH_MISSING = "TEAM_PITCHING_SO",
  SB_MISSING = "TEAM_BASERUN_SB",
  HBP_MISSING = "TEAM_BATTING_HBP"
)
for (flag_col in names(missing_flag_sources)) {
  source_col <- missing_flag_sources[[flag_col]]
  baseball_data[[flag_col]] <- as.numeric(is.na(baseball_data[[source_col]]))
}
# Plot the missingness pattern of every column, listing the variables sorted
# by their share of missing values.
VIM::aggr(baseball_data, sortVars = TRUE, numbers = TRUE,
          col = c('navyblue', 'red'))
##
## Variables sorted by number of missings:
## Variable Count
## TEAM_BATTING_HBP 0.91608084
## TEAM_BASERUN_CS 0.33919156
## TEAM_FIELDING_DP 0.12565905
## TEAM_BASERUN_SB 0.05755712
## TEAM_BATTING_SO 0.04481547
## TEAM_PITCHING_SO 0.04481547
## INDEX 0.00000000
## TARGET_WINS 0.00000000
## TEAM_BATTING_H 0.00000000
## TEAM_BATTING_2B 0.00000000
## TEAM_BATTING_3B 0.00000000
## TEAM_BATTING_HR 0.00000000
## TEAM_BATTING_BB 0.00000000
## TEAM_PITCHING_H 0.00000000
## TEAM_PITCHING_HR 0.00000000
## TEAM_PITCHING_BB 0.00000000
## TEAM_FIELDING_E 0.00000000
## CS_MISSING 0.00000000
## DP_MISSING 0.00000000
## SO_BAT_MISSING 0.00000000
## SO_PITCH_MISSING 0.00000000
## SB_MISSING 0.00000000
## HBP_MISSING 0.00000000
Outlier Detection
# Flag the values of a numeric vector that fall outside its IQR fences.
#
# Args:
#   x: Numeric vector. NA entries are ignored when computing the quartiles.
#   k: Fence multiplier applied to the interquartile range. Defaults to 1.5,
#      the value that used to be hard-coded.
#
# Returns:
#   Logical vector the same length as x: TRUE where
#   x < Q1 - k * (Q3 - Q1) or x > Q3 + k * (Q3 - Q1). NA entries of x
#   yield NA in the result.
identify_outliers <- function(x, k = 1.5) {
  stopifnot(is.numeric(k), length(k) == 1, !is.na(k), k >= 0)
  q1 <- quantile(x, 0.25, na.rm = TRUE)
  q3 <- quantile(x, 0.75, na.rm = TRUE)
  # Named `spread` rather than `IQR` so the local does not mask stats::IQR().
  spread <- q3 - q1
  lower <- q1 - k * spread
  upper <- q3 + k * spread
  x < lower | x > upper
}
# Choose the numeric columns to screen for outliers, skipping the INDEX
# column and the 0/1 *_MISSING indicator columns. vapply() is used instead of
# sapply() so the result is always a logical vector, whatever the input.
is_numeric_col <- vapply(baseball_data, is.numeric, logical(1))
numeric_cols <- names(baseball_data)[is_numeric_col]
numeric_cols <- numeric_cols[!grepl("INDEX|MISSING", numeric_cols)]
# Add one logical <column>_OUTLIER flag per selected column. Rows whose value
# is NA receive an NA flag, because the comparisons inside
# identify_outliers() propagate NA.
for (col in numeric_cols) {
  flag_name <- paste0(col, "_OUTLIER")
  baseball_data[[flag_name]] <- identify_outliers(baseball_data[[col]])
}
Missing Value Imputation
# Missing value imputation.
# Decision: drop TEAM_BATTING_HBP (about 91.6% missing according to the
# missingness table) together with its indicator column.
# NOTE(review): the TEAM_BATTING_HBP_OUTLIER flag created by the outlier step
# is not removed here and stays in the data set -- confirm whether intended.
baseball_data <- select(baseball_data, -TEAM_BATTING_HBP, -HBP_MISSING)
# The remaining gaps are filled with the column median (more robust than the
# mean), reporting the counts before and after for each column.
numeric_impute_cols <- c(
  "TEAM_BATTING_SO", "TEAM_BASERUN_SB", "TEAM_BASERUN_CS",
  "TEAM_PITCHING_SO", "TEAM_FIELDING_DP"
)
for (col in numeric_impute_cols) {
  values <- baseball_data[[col]]
  na_mask <- is.na(values)
  n_missing <- sum(na_mask)
  if (n_missing == 0) {
    cat("• No missing values found in", col, "\n")
    next
  }
  fill_value <- median(values, na.rm = TRUE)
  values[na_mask] <- fill_value
  baseball_data[[col]] <- values
  # Re-count after writing back; this should report 0.
  n_remaining <- sum(is.na(baseball_data[[col]]))
  cat("✓ Imputed", n_missing, "missing values in", col,
      "with median:", fill_value,
      "(", n_remaining, "remaining )\n")
}
## ✓ Imputed 102 missing values in TEAM_BATTING_SO with median: 750 ( 0 remaining )
## ✓ Imputed 131 missing values in TEAM_BASERUN_SB with median: 101 ( 0 remaining )
## ✓ Imputed 772 missing values in TEAM_BASERUN_CS with median: 49 ( 0 remaining )
## ✓ Imputed 102 missing values in TEAM_PITCHING_SO with median: 813.5 ( 0 remaining )
## ✓ Imputed 286 missing values in TEAM_FIELDING_DP with median: 149 ( 0 remaining )
Feature Engineering
# Divide num by den element-wise, returning NA wherever the denominator is
# not positive, so zero denominators cannot put Inf/NaN into a feature.
safe_ratio <- function(num, den) {
  ifelse(den > 0, num / den, NA_real_)
}
# Create meaningful baseball metrics
baseball_data <- baseball_data %>%
  mutate(
    # Offensive metrics: singles are hits that are not extra-base hits;
    # total bases weight each hit type by the bases it is worth.
    SINGLES = TEAM_BATTING_H - TEAM_BATTING_2B - TEAM_BATTING_3B -
      TEAM_BATTING_HR,
    TOTAL_BASES = SINGLES + (2 * TEAM_BATTING_2B) + (3 * TEAM_BATTING_3B) +
      (4 * TEAM_BATTING_HR),
    # Base running efficiency; defined as 0 when there were no attempts.
    SB_SUCCESS_RATE = ifelse(
      TEAM_BASERUN_SB + TEAM_BASERUN_CS > 0,
      TEAM_BASERUN_SB / (TEAM_BASERUN_SB + TEAM_BASERUN_CS),
      0
    ),
    # Pitching effectiveness (lower is better)
    WHIP_PROXY = (TEAM_PITCHING_H + TEAM_PITCHING_BB) / 162, # Assuming 162 games
    # Defensive efficiency (rough proxy)
    ERROR_RATE = safe_ratio(
      TEAM_FIELDING_E,
      TEAM_PITCHING_H + TEAM_PITCHING_BB
    ),
    # Power metrics
    POWER_RATIO = safe_ratio(
      TEAM_BATTING_2B + TEAM_BATTING_3B + TEAM_BATTING_HR,
      TEAM_BATTING_H
    ),
    HR_RATIO = safe_ratio(TEAM_BATTING_HR, TEAM_BATTING_H),
    # Discipline metrics. These used to be plain divisions, which produced
    # Inf (x / 0) and NaN (0 / 0) for rows with a zero denominator.
    BB_SO_RATIO = safe_ratio(TEAM_BATTING_BB, TEAM_BATTING_SO),
    PITCHING_K_BB_RATIO = safe_ratio(TEAM_PITCHING_SO, TEAM_PITCHING_BB)
  )
Log Transformation of Skewed Variables
# Skewness checks: absolute skewness of every numeric column except INDEX,
# reshaped to one row per variable, keeping only values above 1 and listing
# the most skewed first.
numeric_features <- select(select_if(baseball_data, is.numeric), -INDEX)
abs_skewness <- summarise_all(
  numeric_features,
  ~abs(psych::skew(., na.rm = TRUE))
)
skewed_vars <- gather(abs_skewness, key = "Variable", value = "Skewness")
skewed_vars <- filter(skewed_vars, Skewness > 1)
skewed_vars <- arrange(skewed_vars, desc(Skewness))
print("Highly skewed variables (|skewness| > 1):")
## [1] "Highly skewed variables (|skewness| > 1):"
print(skewed_vars)
## Variable Skewness
## 1 TEAM_PITCHING_SO 22.690450
## 2 WHIP_PROXY 10.330139
## 3 TEAM_PITCHING_H 10.329511
## 4 TEAM_PITCHING_BB 6.743899
## 5 SO_BAT_MISSING 4.397174
## 6 SO_PITCH_MISSING 4.397174
## 7 SB_MISSING 3.796854
## 8 TEAM_FIELDING_E 2.990466
## 9 TEAM_BASERUN_CS 2.602172
## 10 DP_MISSING 2.257219
## 11 TEAM_BASERUN_SB 2.065828
## 12 PITCHING_K_BB_RATIO 2.047632
## 13 SINGLES 2.046819
## 14 ERROR_RATE 1.781655
## 15 TEAM_BATTING_H 1.571333
## 16 TEAM_BATTING_3B 1.109465
## 17 TEAM_BATTING_BB 1.025760
# Add a LOG_<name> column for each highly skewed variable. The logarithm is
# only taken when every non-missing value is strictly positive; other
# variables are skipped.
for (skewed_name in skewed_vars$Variable) {
  column_values <- baseball_data[[skewed_name]]
  if (!(min(column_values, na.rm = TRUE) > 0)) {
    next
  }
  baseball_data[[paste0("LOG_", skewed_name)]] <- log(column_values)
}
Categorical Variables and Bucketing
# Cut points for the tiers: the 25% and 75% quantiles of hits and hits
# allowed, and the 33% and 67% quantiles of fielding errors.
hits_cuts <- quantile(
  baseball_data[["TEAM_BATTING_H"]], c(0.25, 0.75), na.rm = TRUE
)
hits_allowed_cuts <- quantile(
  baseball_data[["TEAM_PITCHING_H"]], c(0.25, 0.75), na.rm = TRUE
)
error_cuts <- quantile(
  baseball_data[["TEAM_FIELDING_E"]], c(0.33, 0.67), na.rm = TRUE
)
# Label every row with its tier. Conditions are tried top to bottom and a row
# matching none of them (including an NA comparison) gets the last label.
baseball_data <- mutate(
  baseball_data,
  # Offensive performance tiers (more hits = better)
  OFFENSIVE_TIER = case_when(
    TEAM_BATTING_H >= hits_cuts[[2]] ~ "High",
    TEAM_BATTING_H >= hits_cuts[[1]] ~ "Medium",
    TRUE ~ "Low"
  ),
  # Pitching performance tiers (fewer hits allowed = better)
  PITCHING_TIER = case_when(
    TEAM_PITCHING_H <= hits_allowed_cuts[[1]] ~ "Elite",
    TEAM_PITCHING_H <= hits_allowed_cuts[[2]] ~ "Average",
    TRUE ~ "Poor"
  ),
  # Error buckets
  ERROR_BUCKET = case_when(
    TEAM_FIELDING_E <= error_cuts[[1]] ~ "Low_Errors",
    TEAM_FIELDING_E <= error_cuts[[2]] ~ "Medium_Errors",
    TRUE ~ "High_Errors"
  )
)
# Store the three label columns as factors.
categorical_vars <- c("OFFENSIVE_TIER", "PITCHING_TIER", "ERROR_BUCKET")
for (tier_col in categorical_vars) {
  baseball_data[[tier_col]] <- as.factor(baseball_data[[tier_col]])
}
Data Quality Check
# Report the shape of the prepared data set and how many NA cells are left.
cat("\nFinal Dataset Summary:\n")
##
## Final Dataset Summary:
cat("Dimensions:", nrow(baseball_data), ncol(baseball_data), "\n")
## Dimensions: 2276 55
total_missing <- sum(is.na(baseball_data))
cat("Missing values remaining:", total_missing, "\n")
## Missing values remaining: 3480
# Print summary statistics for a selection of the engineered features.
engineered_features <- c(
  "SINGLES", "TOTAL_BASES", "SB_SUCCESS_RATE", "WHIP_PROXY",
  "POWER_RATIO", "BB_SO_RATIO", "OFFENSIVE_TIER", "PITCHING_TIER"
)
cat("\nEngineered Features Summary:\n")
##
## Engineered Features Summary:
engineered_summary <- summary(baseball_data[engineered_features])
print(engineered_summary)
## SINGLES TOTAL_BASES SB_SUCCESS_RATE WHIP_PROXY
## Min. : 709.0 Min. :1026 Min. :0.0000 Min. : 9.469
## 1st Qu.: 990.8 1st Qu.:1947 1st Qu.:0.5913 1st Qu.: 11.969
## Median :1050.0 Median :2126 Median :0.6730 Median : 12.802
## Mean :1073.2 Mean :2120 Mean :0.6635 Mean : 14.396
## 3rd Qu.:1129.0 3rd Qu.:2285 3rd Qu.:0.7373 3rd Qu.: 13.995
## Max. :2112.0 Max. :3290 Max. :0.9343 Max. :194.000
##
## POWER_RATIO BB_SO_RATIO OFFENSIVE_TIER PITCHING_TIER
## Min. :0.1134 Min. :0.1180 High : 569 Average:1129
## 1st Qu.:0.2366 1st Qu.:0.5450 Low : 567 Elite : 578
## Median :0.2699 Median :0.6564 Medium:1140 Poor : 569
## Mean :0.2694 Mean : Inf
## 3rd Qu.:0.3029 3rd Qu.:0.9069
## Max. :0.3937 Max. : Inf
## NA's :1
Correlation Analysis
# Keep the numeric columns other than INDEX and the *MISSING* / *OUTLIER*
# flag columns as input to the correlation matrix.
numeric_for_corr <- select(
  select_if(baseball_data, is.numeric),
  -INDEX, -matches("MISSING|OUTLIER")
)
# Pairwise correlations computed on the rows that are complete in every
# selected column.
correlation_matrix <- cor(numeric_for_corr, use = "complete.obs")
# Upper-triangle colour plot, variables kept in their original order
# (no clustering).
corrplot(
  correlation_matrix,
  type = "upper",
  method = "color",
  order = "original",
  tl.col = "black",
  tl.cex = 0.7
)
cat("\nData preparation completed successfully!\n")
##
## Data preparation completed successfully!
n_variables <- ncol(baseball_data)
n_observations <- nrow(baseball_data)
cat("Ready for modeling with", n_variables, "variables and", n_observations, "observations.\n")
## Ready for modeling with 55 variables and 2276 observations.