Preparing a Dataset for Data Modelling

Reading the file

#Read the csv file
baseball_data <- read.csv("baseball_data.csv")

Missing value analysis

# Create missing flags based on original data patterns
baseball_data$CS_MISSING <- ifelse(is.na(baseball_data$TEAM_BASERUN_CS), 1, 0)
baseball_data$DP_MISSING <- ifelse(is.na(baseball_data$TEAM_FIELDING_DP), 1, 0)
baseball_data$SO_BAT_MISSING <- ifelse(is.na(baseball_data$TEAM_BATTING_SO), 1, 0)
baseball_data$SO_PITCH_MISSING <- ifelse(is.na(baseball_data$TEAM_PITCHING_SO), 1, 0)
baseball_data$SB_MISSING <- ifelse(is.na(baseball_data$TEAM_BASERUN_SB), 1, 0)
baseball_data$HBP_MISSING <- ifelse(is.na(baseball_data$TEAM_BATTING_HBP), 1, 0)

# Missing value visualization
VIM::aggr(baseball_data, col = c('navyblue','red'), 
          numbers = TRUE, sortVars = TRUE)

## 
##  Variables sorted by number of missings: 
##          Variable      Count
##  TEAM_BATTING_HBP 0.91608084
##   TEAM_BASERUN_CS 0.33919156
##  TEAM_FIELDING_DP 0.12565905
##   TEAM_BASERUN_SB 0.05755712
##   TEAM_BATTING_SO 0.04481547
##  TEAM_PITCHING_SO 0.04481547
##             INDEX 0.00000000
##       TARGET_WINS 0.00000000
##    TEAM_BATTING_H 0.00000000
##   TEAM_BATTING_2B 0.00000000
##   TEAM_BATTING_3B 0.00000000
##   TEAM_BATTING_HR 0.00000000
##   TEAM_BATTING_BB 0.00000000
##   TEAM_PITCHING_H 0.00000000
##  TEAM_PITCHING_HR 0.00000000
##  TEAM_PITCHING_BB 0.00000000
##   TEAM_FIELDING_E 0.00000000
##        CS_MISSING 0.00000000
##        DP_MISSING 0.00000000
##    SO_BAT_MISSING 0.00000000
##  SO_PITCH_MISSING 0.00000000
##        SB_MISSING 0.00000000
##       HBP_MISSING 0.00000000

Outlier Detection

# Function to identify outliers using IQR method
identify_outliers <- function(x) {
  Q1 <- quantile(x, 0.25, na.rm = TRUE)
  Q3 <- quantile(x, 0.75, na.rm = TRUE)
  IQR <- Q3 - Q1
  lower <- Q1 - 1.5 * IQR
  upper <- Q3 + 1.5 * IQR
  return(x < lower | x > upper)
}

# Apply to numerical columns (excluding INDEX and missing flags)
numeric_cols <- sapply(baseball_data, is.numeric)
numeric_cols <- names(numeric_cols[numeric_cols == TRUE])
numeric_cols <- numeric_cols[!grepl("INDEX|MISSING", numeric_cols)]

# Create outlier flags
for(col in numeric_cols) {
  flag_name <- paste0(col, "_OUTLIER")
  baseball_data[[flag_name]] <- identify_outliers(baseball_data[[col]])
}

Missing Value Imputation

# Missing Value Imputation - CORRECTED VERSION
# Decision: Drop HBP due to 92.7% missing
baseball_data <- baseball_data %>% select(-TEAM_BATTING_HBP, -HBP_MISSING)

# Impute remaining missing values with median (more robust than mean)
numeric_impute_cols <- c("TEAM_BATTING_SO", "TEAM_BASERUN_SB", "TEAM_BASERUN_CS", 
                        "TEAM_PITCHING_SO", "TEAM_FIELDING_DP")

for(col in numeric_impute_cols) {
  # COUNT MISSING VALUES BEFORE IMPUTATION
  missing_count_before <- sum(is.na(baseball_data[[col]]))
  
  if(missing_count_before > 0) {
    median_val <- median(baseball_data[[col]], na.rm = TRUE)
    baseball_data[[col]][is.na(baseball_data[[col]])] <- median_val
    
    # COUNT MISSING VALUES AFTER IMPUTATION (should be 0)
    missing_count_after <- sum(is.na(baseball_data[[col]]))
    
    cat("✓ Imputed", missing_count_before, "missing values in", col, 
        "with median:", median_val, 
        "(", missing_count_after, "remaining )\n")
  } else {
    cat("• No missing values found in", col, "\n")
  }
}

## ✓ Imputed 102 missing values in TEAM_BATTING_SO with median: 750 ( 0 remaining )
## ✓ Imputed 131 missing values in TEAM_BASERUN_SB with median: 101 ( 0 remaining )
## ✓ Imputed 772 missing values in TEAM_BASERUN_CS with median: 49 ( 0 remaining )
## ✓ Imputed 102 missing values in TEAM_PITCHING_SO with median: 813.5 ( 0 remaining )
## ✓ Imputed 286 missing values in TEAM_FIELDING_DP with median: 149 ( 0 remaining )

} }

Feature Engineering

``` r
# Create meaningful baseball metrics
baseball_data <- baseball_data %>%
  mutate(
    # Offensive Metrics
    SINGLES = TEAM_BATTING_H - TEAM_BATTING_2B - TEAM_BATTING_3B - TEAM_BATTING_HR,
    TOTAL_BASES = SINGLES + (2 * TEAM_BATTING_2B) + (3 * TEAM_BATTING_3B) + (4 * TEAM_BATTING_HR),
    
    # Base running efficiency
    SB_SUCCESS_RATE = ifelse(TEAM_BASERUN_SB + TEAM_BASERUN_CS > 0, 
                            TEAM_BASERUN_SB / (TEAM_BASERUN_SB + TEAM_BASERUN_CS), 0),
    
    # Pitching effectiveness (lower is better)
    WHIP_PROXY = (TEAM_PITCHING_H + TEAM_PITCHING_BB) / 162, # Assuming 162 games
    
    # Defensive efficiency
    ERROR_RATE = TEAM_FIELDING_E / (TEAM_PITCHING_H + TEAM_PITCHING_BB), # Rough proxy
    
    # Power metrics
    POWER_RATIO = (TEAM_BATTING_2B + TEAM_BATTING_3B + TEAM_BATTING_HR) / TEAM_BATTING_H,
    HR_RATIO = TEAM_BATTING_HR / TEAM_BATTING_H,
    
    # Discipline metrics
    BB_SO_RATIO = TEAM_BATTING_BB / TEAM_BATTING_SO,
    PITCHING_K_BB_RATIO = TEAM_PITCHING_SO / TEAM_PITCHING_BB
  )

Log Transformation of Skewed Variables

# Skewness checks
skewed_vars <- baseball_data %>%
  select_if(is.numeric) %>%
  select(-INDEX) %>%
  summarise_all(~abs(psych::skew(., na.rm = TRUE))) %>%
  gather(key = "Variable", value = "Skewness") %>%
  filter(Skewness > 1) %>%
  arrange(desc(Skewness))

print("Highly skewed variables (|skewness| > 1):")

## [1] "Highly skewed variables (|skewness| > 1):"

print(skewed_vars)

##               Variable  Skewness
## 1     TEAM_PITCHING_SO 22.690450
## 2           WHIP_PROXY 10.330139
## 3      TEAM_PITCHING_H 10.329511
## 4     TEAM_PITCHING_BB  6.743899
## 5       SO_BAT_MISSING  4.397174
## 6     SO_PITCH_MISSING  4.397174
## 7           SB_MISSING  3.796854
## 8      TEAM_FIELDING_E  2.990466
## 9      TEAM_BASERUN_CS  2.602172
## 10          DP_MISSING  2.257219
## 11     TEAM_BASERUN_SB  2.065828
## 12 PITCHING_K_BB_RATIO  2.047632
## 13             SINGLES  2.046819
## 14          ERROR_RATE  1.781655
## 15      TEAM_BATTING_H  1.571333
## 16     TEAM_BATTING_3B  1.109465
## 17     TEAM_BATTING_BB  1.025760

# Apply log transformation to highly skewed variables
for(var in skewed_vars$Variable) {
  if(min(baseball_data[[var]], na.rm = TRUE) > 0) {
    new_var_name <- paste0("LOG_", var)
    baseball_data[[new_var_name]] <- log(baseball_data[[var]])
  }
}

Categorical Variables and Bucketing

# Create performance tiers based on key metrics
baseball_data <- baseball_data %>%
  mutate(
    # Offensive performance tiers
    OFFENSIVE_TIER = case_when(
      TEAM_BATTING_H >= quantile(TEAM_BATTING_H, 0.75, na.rm = TRUE) ~ "High",
      TEAM_BATTING_H >= quantile(TEAM_BATTING_H, 0.25, na.rm = TRUE) ~ "Medium",
      TRUE ~ "Low"
    ),
    
    # Pitching performance tiers (lower hits allowed = better)
    PITCHING_TIER = case_when(
      TEAM_PITCHING_H <= quantile(TEAM_PITCHING_H, 0.25, na.rm = TRUE) ~ "Elite",
      TEAM_PITCHING_H <= quantile(TEAM_PITCHING_H, 0.75, na.rm = TRUE) ~ "Average",
      TRUE ~ "Poor"
    ),
    
    # Error buckets
    ERROR_BUCKET = case_when(
      TEAM_FIELDING_E <= quantile(TEAM_FIELDING_E, 0.33, na.rm = TRUE) ~ "Low_Errors",
      TEAM_FIELDING_E <= quantile(TEAM_FIELDING_E, 0.67, na.rm = TRUE) ~ "Medium_Errors",
      TRUE ~ "High_Errors"
    )
  )

# Convert categorical variables to factors
categorical_vars <- c("OFFENSIVE_TIER", "PITCHING_TIER", "ERROR_BUCKET")
baseball_data[categorical_vars] <- lapply(baseball_data[categorical_vars], as.factor)

Data Quality Ckeck

# Summary of final dataset
cat("\nFinal Dataset Summary:\n")

## 
## Final Dataset Summary:

cat("Dimensions:", dim(baseball_data), "\n")

## Dimensions: 2276 55

cat("Missing values remaining:", sum(is.na(baseball_data)), "\n")

## Missing values remaining: 3480

# Display structure of key engineered features
engineered_features <- c("SINGLES", "TOTAL_BASES", "SB_SUCCESS_RATE", "WHIP_PROXY", 
                        "POWER_RATIO", "BB_SO_RATIO", "OFFENSIVE_TIER", "PITCHING_TIER")

cat("\nEngineered Features Summary:\n")

## 
## Engineered Features Summary:

print(summary(baseball_data[engineered_features]))

##     SINGLES        TOTAL_BASES   SB_SUCCESS_RATE    WHIP_PROXY     
##  Min.   : 709.0   Min.   :1026   Min.   :0.0000   Min.   :  9.469  
##  1st Qu.: 990.8   1st Qu.:1947   1st Qu.:0.5913   1st Qu.: 11.969  
##  Median :1050.0   Median :2126   Median :0.6730   Median : 12.802  
##  Mean   :1073.2   Mean   :2120   Mean   :0.6635   Mean   : 14.396  
##  3rd Qu.:1129.0   3rd Qu.:2285   3rd Qu.:0.7373   3rd Qu.: 13.995  
##  Max.   :2112.0   Max.   :3290   Max.   :0.9343   Max.   :194.000  
##                                                                    
##   POWER_RATIO      BB_SO_RATIO     OFFENSIVE_TIER PITCHING_TIER 
##  Min.   :0.1134   Min.   :0.1180   High  : 569    Average:1129  
##  1st Qu.:0.2366   1st Qu.:0.5450   Low   : 567    Elite  : 578  
##  Median :0.2699   Median :0.6564   Medium:1140    Poor   : 569  
##  Mean   :0.2694   Mean   :   Inf                                
##  3rd Qu.:0.3029   3rd Qu.:0.9069                                
##  Max.   :0.3937   Max.   :   Inf                                
##                   NA's   :1

Correlation Analysis

# Correlation matrix for key variables
numeric_for_corr <- baseball_data %>%
  select_if(is.numeric) %>%
  select(-INDEX, -contains("MISSING"), -contains("OUTLIER"))

# Simple correlation plot without clustering
correlation_matrix <- cor(numeric_for_corr, use = "complete.obs")
corrplot(correlation_matrix, method = "color", type = "upper", 
         order = "original",  # No clustering
         tl.cex = 0.7, tl.col = "black")

cat("\nData preparation completed successfully!\n")

## 
## Data preparation completed successfully!

cat("Ready for modeling with", ncol(baseball_data), "variables and", nrow(baseball_data), "observations.\n")

## Ready for modeling with 55 variables and 2276 observations.

Preparing a Dataset for Data Modelling

Candace Grant

2025-09-23