Homework 4 – DATA 624

data(Glass)

# Quick check
str(Glass)

## 'data.frame':    214 obs. of  10 variables:
##  $ RI  : num  1.52 1.52 1.52 1.52 1.52 ...
##  $ Na  : num  13.6 13.9 13.5 13.2 13.3 ...
##  $ Mg  : num  4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
##  $ Al  : num  1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
##  $ Si  : num  71.8 72.7 73 72.6 73.1 ...
##  $ K   : num  0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
##  $ Ca  : num  8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
##  $ Ba  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Fe  : num  0 0 0 0 0 0.26 0 0 0 0.11 ...
##  $ Type: Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...

summary(Glass)

##        RI              Na              Mg              Al       
##  Min.   :1.511   Min.   :10.73   Min.   :0.000   Min.   :0.290  
##  1st Qu.:1.517   1st Qu.:12.91   1st Qu.:2.115   1st Qu.:1.190  
##  Median :1.518   Median :13.30   Median :3.480   Median :1.360  
##  Mean   :1.518   Mean   :13.41   Mean   :2.685   Mean   :1.445  
##  3rd Qu.:1.519   3rd Qu.:13.82   3rd Qu.:3.600   3rd Qu.:1.630  
##  Max.   :1.534   Max.   :17.38   Max.   :4.490   Max.   :3.500  
##        Si              K                Ca               Ba       
##  Min.   :69.81   Min.   :0.0000   Min.   : 5.430   Min.   :0.000  
##  1st Qu.:72.28   1st Qu.:0.1225   1st Qu.: 8.240   1st Qu.:0.000  
##  Median :72.79   Median :0.5550   Median : 8.600   Median :0.000  
##  Mean   :72.65   Mean   :0.4971   Mean   : 8.957   Mean   :0.175  
##  3rd Qu.:73.09   3rd Qu.:0.6100   3rd Qu.: 9.172   3rd Qu.:0.000  
##  Max.   :75.41   Max.   :6.2100   Max.   :16.190   Max.   :3.150  
##        Fe          Type  
##  Min.   :0.00000   1:70  
##  1st Qu.:0.00000   2:76  
##  Median :0.00000   3:17  
##  Mean   :0.05701   5:13  
##  3rd Qu.:0.10000   6: 9  
##  Max.   :0.51000   7:29

# Split predictors/response
x <- Glass %>% select(-Type)
y <- Glass$Type

# ---------------------------
# (a) Visualizations: distributions + relationships
# ---------------------------

# 1) Histograms of predictors
Glass %>%
  pivot_longer(cols = -Type, names_to = "Variable", values_to = "Value") %>%
  ggplot(aes(x = Value)) +
  geom_histogram(bins = 20, color = "white") +
  facet_wrap(~Variable, scales = "free") +
  theme_minimal() +
  labs(title = "Glass predictors: distributions")

# 2) Density plots by class (helps compare distributions by Type)
Glass %>%
  pivot_longer(cols = -Type, names_to = "Variable", values_to = "Value") %>%
  ggplot(aes(x = Value, fill = Type)) +
  geom_density(alpha = 0.35) +
  facet_wrap(~Variable, scales = "free") +
  theme_minimal() +
  labs(title = "Predictor densities by Glass Type")

# 3) Pairwise scatterplot matrix (relationships; colored by Type)
GGally::ggpairs(
  Glass,
  columns = 1:9,
  aes(color = Type, alpha = 0.6)
) + theme_minimal()

# 4) Correlation heatmap (numeric predictors only)
cor_mat <- cor(x)
corrplot::corrplot(cor_mat, method = "color", tl.cex = 0.8, number.cex = 0.7)

# Optional: scatterplot + smoother for a couple strongly related pairs
# (Mg vs Al is often strongly negatively related)
ggplot(Glass, aes(x = Mg, y = Al, color = Type)) +
  geom_point(alpha = 0.7) +
  geom_smooth(se = FALSE, method = "loess") +
  theme_minimal() +
  labs(title = "Mg vs Al (colored by Type)")

# ---------------------------
# (b) Outliers + skewness
# ---------------------------

# 1) Boxplots by Type (outliers show as points)
Glass %>%
  pivot_longer(cols = -Type, names_to = "Variable", values_to = "Value") %>%
  ggplot(aes(x = Type, y = Value, fill = Type)) +
  geom_boxplot(outlier.alpha = 0.8) +
  facet_wrap(~Variable, scales = "free") +
  theme_minimal() +
  theme(legend.position = "none") +
  labs(title = "Boxplots by Type (outliers visible)")

# 2) Global outlier counts using 1.5*IQR rule per predictor (overall, not by class)
outlier_summary <- map_dfr(names(x), function(v) {
  vals <- x[[v]]
  q1 <- quantile(vals, 0.25, na.rm = TRUE)
  q3 <- quantile(vals, 0.75, na.rm = TRUE)
  iqr <- q3 - q1
  lo <- q1 - 1.5 * iqr
  hi <- q3 + 1.5 * iqr
  tibble(
    Variable = v,
    LowerBound = lo,
    UpperBound = hi,
    OutlierCount = sum(vals < lo | vals > hi, na.rm = TRUE),
    OutlierPct = mean(vals < lo | vals > hi, na.rm = TRUE)
  )
})

outlier_summary %>% arrange(desc(OutlierCount))

# 3) Skewness per predictor
skew_tbl <- tibble(
  Variable = names(x),
  Skewness = sapply(x, e1071::skewness, na.rm = TRUE, type = 2)
) %>% arrange(desc(abs(Skewness)))

skew_tbl

# Optional: visualize skewness
ggplot(skew_tbl, aes(x = reorder(Variable, Skewness), y = Skewness)) +
  geom_col() +
  coord_flip() +
  theme_minimal() +
  labs(title = "Skewness of predictors", x = "Predictor", y = "Skewness")

# ---------------------------
# (c) Transformations that might improve classification
# ---------------------------

# Common helpful steps for many classifiers:
# - log1p() for heavily right-skewed predictors with zeros (Ba, Fe, K often)
# - centering/scaling for distance-based models (KNN, SVM, etc.)
# - BoxCox / YeoJohnson for normalizing predictors

# 1) Identify skewed predictors (choose a threshold, e.g., |skew| > 1)
skewed_vars <- skew_tbl %>%
  filter(abs(Skewness) > 1) %>%
  pull(Variable)

skewed_vars

## [1] "K"  "Ba" "Ca" "Fe" "RI" "Mg"

# 2) Log1p transform for nonnegative skewed variables (safe with zeros)
# Only apply log1p if the variable is >= 0 (all values nonnegative)
Glass_log <- Glass
for (v in skewed_vars) {
  if (min(Glass_log[[v]], na.rm = TRUE) >= 0) {
    Glass_log[[v]] <- log1p(Glass_log[[v]])
  }
}

# Compare distributions before/after log (for skewed vars)
Glass %>%
  select(all_of(skewed_vars), Type) %>%
  pivot_longer(cols = -Type) %>%
  ggplot(aes(x = value)) +
  geom_histogram(bins = 20, color = "white") +
  facet_wrap(~name, scales = "free") +
  theme_minimal() +
  labs(title = "Before transform: skewed predictors")

Glass_log %>%
  select(all_of(skewed_vars), Type) %>%
  pivot_longer(cols = -Type) %>%
  ggplot(aes(x = value)) +
  geom_histogram(bins = 20, color = "white") +
  facet_wrap(~name, scales = "free") +
  theme_minimal() +
  labs(title = "After log1p transform: skewed predictors")

# 3) Center/scale (recommended for KNN/SVM)
pre_cs <- caret::preProcess(Glass_log %>% select(-Type), method = c("center", "scale"))
Glass_cs <- predict(pre_cs, Glass_log %>% select(-Type)) %>%
  bind_cols(Type = Glass$Type)

# 4) Box-Cox transform (requires strictly positive values)
# Since Glass has zeros (Ba, Fe, etc.), BoxCox may fail.
# Instead, use YeoJohnson which can handle zeros/negatives.
pre_yj <- caret::preProcess(Glass %>% select(-Type), method = c("YeoJohnson", "center", "scale"))
Glass_yj <- predict(pre_yj, Glass %>% select(-Type)) %>%
  bind_cols(Type = Glass$Type)

# Quick check: skewness after transformations
skew_after_log_cs <- tibble(
  Variable = names(Glass_cs %>% select(-Type)),
  Skewness = sapply(Glass_cs %>% select(-Type), e1071::skewness, na.rm = TRUE, type = 2)
) %>% arrange(desc(abs(Skewness)))

skew_after_yj <- tibble(
  Variable = names(Glass_yj %>% select(-Type)),
  Skewness = sapply(Glass_yj %>% select(-Type), e1071::skewness, na.rm = TRUE, type = 2)
) %>% arrange(desc(abs(Skewness)))

skew_after_log_cs

skew_after_yj

# Optional: Side-by-side density comparison for one variable (example: Ba)
# (If Ba is not skewed_vars, replace with another variable)
v <- "Ba"
if (v %in% names(Glass)) {
  bind_rows(
    Glass %>% transmute(Value = .data[[v]], Dataset = "Original"),
    Glass_log %>% transmute(Value = .data[[v]], Dataset = "Log1p"),
    Glass_yj %>% transmute(Value = .data[[v]], Dataset = "YeoJohnson")
  ) %>%
    ggplot(aes(x = Value, fill = Dataset)) +
    geom_density(alpha = 0.35) +
    theme_minimal() +
    labs(title = paste("Distribution comparison for", v))
}

library(mlbench)
library(tidyverse)
library(caret)

data(Soybean)   # see ?Soybean
str(Soybean)

## 'data.frame':    683 obs. of  36 variables:
##  $ Class          : Factor w/ 19 levels "2-4-d-injury",..: 11 11 11 11 11 11 11 11 11 11 ...
##  $ date           : Factor w/ 7 levels "0","1","2","3",..: 7 5 4 4 7 6 6 5 7 5 ...
##  $ plant.stand    : Ord.factor w/ 2 levels "0"<"1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ precip         : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
##  $ temp           : Ord.factor w/ 3 levels "0"<"1"<"2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ hail           : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 1 1 ...
##  $ crop.hist      : Factor w/ 4 levels "0","1","2","3": 2 3 2 2 3 4 3 2 4 3 ...
##  $ area.dam       : Factor w/ 4 levels "0","1","2","3": 2 1 1 1 1 1 1 1 1 1 ...
##  $ sever          : Factor w/ 3 levels "0","1","2": 2 3 3 3 2 2 2 2 2 3 ...
##  $ seed.tmt       : Factor w/ 3 levels "0","1","2": 1 2 2 1 1 1 2 1 2 1 ...
##  $ germ           : Ord.factor w/ 3 levels "0"<"1"<"2": 1 2 3 2 3 2 1 3 2 3 ...
##  $ plant.growth   : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ leaves         : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ leaf.halo      : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ leaf.marg      : Factor w/ 3 levels "0","1","2": 3 3 3 3 3 3 3 3 3 3 ...
##  $ leaf.size      : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
##  $ leaf.shread    : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ leaf.malf      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ leaf.mild      : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ stem           : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ lodging        : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 2 1 1 1 ...
##  $ stem.cankers   : Factor w/ 4 levels "0","1","2","3": 4 4 4 4 4 4 4 4 4 4 ...
##  $ canker.lesion  : Factor w/ 4 levels "0","1","2","3": 2 2 1 1 2 1 2 2 2 2 ...
##  $ fruiting.bodies: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ ext.decay      : Factor w/ 3 levels "0","1","2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ mycelium       : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ int.discolor   : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ sclerotia      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ fruit.pods     : Factor w/ 4 levels "0","1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
##  $ fruit.spots    : Factor w/ 4 levels "0","1","2","4": 4 4 4 4 4 4 4 4 4 4 ...
##  $ seed           : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ mold.growth    : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ seed.discolor  : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ seed.size      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ shriveling     : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ roots          : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...

# Outcome variable name in this dataset is typically "Class"
table(Soybean$Class)

## 
##                2-4-d-injury         alternarialeaf-spot 
##                          16                          91 
##                 anthracnose            bacterial-blight 
##                          44                          20 
##           bacterial-pustule                  brown-spot 
##                          20                          92 
##              brown-stem-rot                charcoal-rot 
##                          44                          20 
##               cyst-nematode diaporthe-pod-&-stem-blight 
##                          14                          15 
##       diaporthe-stem-canker                downy-mildew 
##                          20                          20 
##          frog-eye-leaf-spot            herbicide-injury 
##                          91                           8 
##      phyllosticta-leaf-spot            phytophthora-rot 
##                          20                          88 
##              powdery-mildew           purple-seed-stain 
##                          20                          20 
##        rhizoctonia-root-rot 
##                          20

# Split predictors/response
y <- Soybean$Class
x <- Soybean %>% select(-Class)

# ---------------------------
# (a) Frequency distributions for categorical predictors
#     + Check for degenerate predictors
# ---------------------------

# 1) Frequency tables for every predictor (prints to console)
freq_list <- lapply(names(x), function(v) {
  tab <- sort(table(x[[v]], useNA = "ifany"), decreasing = TRUE)
  tab
})
names(freq_list) <- names(x)

# Print all frequency tables (can be long)
for (v in names(freq_list)) {
  cat("\n====================\n", v, "\n")
  print(freq_list[[v]])
}

## 
## ====================
##  date 
## 
##    5    4    3    2    6    1    0 <NA> 
##  149  131  118   93   90   75   26    1 
## 
## ====================
##  plant.stand 
## 
##    0    1 <NA> 
##  354  293   36 
## 
## ====================
##  precip 
## 
##    2    1    0 <NA> 
##  459  112   74   38 
## 
## ====================
##  temp 
## 
##    1    2    0 <NA> 
##  374  199   80   30 
## 
## ====================
##  hail 
## 
##    0    1 <NA> 
##  435  127  121 
## 
## ====================
##  crop.hist 
## 
##    2    3    1    0 <NA> 
##  219  218  165   65   16 
## 
## ====================
##  area.dam 
## 
##    1    3    2    0 <NA> 
##  227  187  145  123    1 
## 
## ====================
##  sever 
## 
##    1    0 <NA>    2 
##  322  195  121   45 
## 
## ====================
##  seed.tmt 
## 
##    0    1 <NA>    2 
##  305  222  121   35 
## 
## ====================
##  germ 
## 
##    1    2    0 <NA> 
##  213  193  165  112 
## 
## ====================
##  plant.growth 
## 
##    0    1 <NA> 
##  441  226   16 
## 
## ====================
##  leaves 
## 
##   1   0 
## 606  77 
## 
## ====================
##  leaf.halo 
## 
##    2    0 <NA>    1 
##  342  221   84   36 
## 
## ====================
##  leaf.marg 
## 
##    0    2 <NA>    1 
##  357  221   84   21 
## 
## ====================
##  leaf.size 
## 
##    1    2 <NA>    0 
##  327  221   84   51 
## 
## ====================
##  leaf.shread 
## 
##    0 <NA>    1 
##  487  100   96 
## 
## ====================
##  leaf.malf 
## 
##    0 <NA>    1 
##  554   84   45 
## 
## ====================
##  leaf.mild 
## 
##    0 <NA>    1    2 
##  535  108   20   20 
## 
## ====================
##  stem 
## 
##    1    0 <NA> 
##  371  296   16 
## 
## ====================
##  lodging 
## 
##    0 <NA>    1 
##  520  121   42 
## 
## ====================
##  stem.cankers 
## 
##    0    3    1 <NA>    2 
##  379  191   39   38   36 
## 
## ====================
##  canker.lesion 
## 
##    0    2    1    3 <NA> 
##  320  177   83   65   38 
## 
## ====================
##  fruiting.bodies 
## 
##    0 <NA>    1 
##  473  106  104 
## 
## ====================
##  ext.decay 
## 
##    0    1 <NA>    2 
##  497  135   38   13 
## 
## ====================
##  mycelium 
## 
##    0 <NA>    1 
##  639   38    6 
## 
## ====================
##  int.discolor 
## 
##    0    1 <NA>    2 
##  581   44   38   20 
## 
## ====================
##  sclerotia 
## 
##    0 <NA>    1 
##  625   38   20 
## 
## ====================
##  fruit.pods 
## 
##    0    1 <NA>    3    2 
##  407  130   84   48   14 
## 
## ====================
##  fruit.spots 
## 
##    0 <NA>    4    1    2 
##  345  106  100   75   57 
## 
## ====================
##  seed 
## 
##    0    1 <NA> 
##  476  115   92 
## 
## ====================
##  mold.growth 
## 
##    0 <NA>    1 
##  524   92   67 
## 
## ====================
##  seed.discolor 
## 
##    0 <NA>    1 
##  513  106   64 
## 
## ====================
##  seed.size 
## 
##    0 <NA>    1 
##  532   92   59 
## 
## ====================
##  shriveling 
## 
##    0 <NA>    1 
##  539  106   38 
## 
## ====================
##  roots 
## 
##    0    1 <NA>    2 
##  551   86   31   15

# 2) Summarize frequency distributions into a compact table
#    - n_levels: number of distinct values (including NA as a "level" here)
#    - top_prop: proportion of most common (non-NA) level
#    - degenerate flags:
#        * single_level_nonNA: only 1 level among non-missing values
#        * very_dominant_level: top level >= 95% of non-missing values
freq_summary <- map_dfr(names(x), function(v) {
  vec <- x[[v]]
  nonmiss <- vec[!is.na(vec)]
  tab_nonmiss <- sort(table(nonmiss), decreasing = TRUE)
  n_nonmiss <- length(nonmiss)
  n_levels_nonmiss <- length(tab_nonmiss)
  top_prop <- if (n_nonmiss > 0) as.numeric(tab_nonmiss[1]) / n_nonmiss else NA_real_

  tibble(
    predictor = v,
    n_levels_nonmiss = n_levels_nonmiss,
    nonmissing_n = n_nonmiss,
    missing_n = sum(is.na(vec)),
    missing_pct = mean(is.na(vec)),
    top_level = if (n_nonmiss > 0) names(tab_nonmiss)[1] else NA_character_,
    top_prop = top_prop,
    single_level_nonNA = (n_levels_nonmiss <= 1),
    very_dominant_level = (!is.na(top_prop) & top_prop >= 0.95)
  )
}) %>% arrange(desc(single_level_nonNA), desc(very_dominant_level), desc(top_prop))

freq_summary

# 3) Caret-style near-zero-variance check using dummy variables
#    (creates 0/1 indicators for factor levels, then checks NZV)
mm <- model.matrix(~ . - 1, data = x)  # dummy-encode factors
nzv_idx <- caret::nearZeroVar(mm)
nzv_vars <- if (length(nzv_idx) > 0) colnames(mm)[nzv_idx] else character(0)
nzv_vars

##  [1] "date0"         "leaf.marg1"    "leaf.malf1"    "leaf.mild1"   
##  [5] "leaf.mild2"    "stem.cankers2" "ext.decay2"    "mycelium1"    
##  [9] "int.discolor2" "sclerotia1"    "fruit.pods2"   "shriveling1"  
## [13] "roots1"        "roots2"

# ---------------------------
# (b) Missing data analysis:
#     - Which predictors are more missing?
#     - Is missingness related to classes?
# ---------------------------

# 1) Missingness by predictor
missing_by_pred <- tibble(
  predictor = names(x),
  missing_n = sapply(x, function(col) sum(is.na(col))),
  missing_pct = sapply(x, function(col) mean(is.na(col)))
) %>% arrange(desc(missing_pct))

missing_by_pred

# Visual: missing percent by predictor
ggplot(missing_by_pred, aes(x = reorder(predictor, missing_pct), y = missing_pct)) +
  geom_col() +
  coord_flip() +
  theme_minimal() +
  labs(title = "Missing data by predictor", x = "Predictor", y = "Missing proportion")

# 2) Missingness per row (how many missing predictors each soybean has)
Soybean_rowmiss <- Soybean %>%
  mutate(missing_count = rowSums(is.na(across(-Class))),
         missing_prop = missing_count / ncol(x))

summary(Soybean_rowmiss$missing_prop)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.00000 0.00000 0.09776 0.00000 0.85714

ggplot(Soybean_rowmiss, aes(x = missing_prop)) +
  geom_histogram(bins = 25, color = "white") +
  theme_minimal() +
  labs(title = "Distribution of row-wise missingness", x = "Missing proportion per row")

# 3) Is missingness related to class?
#    Compare missing_prop across classes (boxplot)
ggplot(Soybean_rowmiss, aes(x = Class, y = missing_prop, fill = Class)) +
  geom_boxplot(outlier.alpha = 0.6) +
  theme_minimal() +
  theme(legend.position = "none") +
  labs(title = "Row-wise missingness by class", y = "Missing proportion")

# Optional: formal test (ANOVA on missing_prop ~ Class)
anova_fit <- aov(missing_prop ~ Class, data = Soybean_rowmiss)
summary(anova_fit)

##              Df Sum Sq Mean Sq F value Pr(>F)    
## Class        18 28.376  1.5764   240.9 <2e-16 ***
## Residuals   664  4.345  0.0065                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# 4) Predictor-level missingness by class:
#    For each predictor, compute missing rate within each class
missing_by_class_pred <- map_dfr(names(x), function(v) {
  Soybean %>%
    group_by(Class) %>%
    summarize(missing_pct = mean(is.na(.data[[v]])), .groups = "drop") %>%
    mutate(predictor = v)
})

# Find predictors where missingness varies a lot across classes
missing_class_spread <- missing_by_class_pred %>%
  group_by(predictor) %>%
  summarize(
    min_missing = min(missing_pct),
    max_missing = max(missing_pct),
    spread = max_missing - min_missing,
    avg_missing = mean(missing_pct),
    .groups = "drop"
  ) %>%
  arrange(desc(spread))

missing_class_spread

# Visualize top 8 predictors with biggest missingness spread by class
top_preds <- missing_class_spread %>% slice_head(n = 8) %>% pull(predictor)

missing_by_class_pred %>%
  filter(predictor %in% top_preds) %>%
  ggplot(aes(x = Class, y = missing_pct, fill = Class)) +
  geom_col() +
  facet_wrap(~ predictor, scales = "free_y") +
  theme_minimal() +
  theme(legend.position = "none") +
  labs(title = "Missingness by class for predictors with largest spread",
       y = "Missing proportion")

# ---------------------------
# (c) Strategy for handling missing data (remove vs impute)
# ---------------------------

# Strategy (common + reasonable for this dataset):
# 1) Remove predictors that are:
#    - degenerate (single level among non-missing values), and/or
#    - extremely sparse (very high missing rate), and/or
#    - near-zero variance (optional; based on dummy columns)
# 2) Impute remaining categorical predictors with the MODE (most frequent level),
#    optionally adding a "missing" level first.

# 1) Choose predictors to remove:
#    - single level among non-missing
#    - missing_pct > threshold (choose e.g. 0.50)
missing_threshold <- 0.50

drop_preds <- freq_summary %>%
  filter(single_level_nonNA | missing_pct > missing_threshold) %>%
  pull(predictor)

drop_preds

## character(0)

Soybean_reduced <- Soybean %>% select(-all_of(drop_preds))
x_reduced <- Soybean_reduced %>% select(-Class)

# 2) Simple MODE imputation function for factors/characters
mode_impute_factor <- function(vec) {
  # Ensure factor for consistent levels
  if (!is.factor(vec)) vec <- as.factor(vec)

  # If all missing, keep as is (or create "missing" level)
  if (all(is.na(vec))) {
    vec <- fct_explicit_na(vec, na_level = "missing")
    return(vec)
  }

  # Add explicit missing level, then impute NAs with mode of non-missing
  vec2 <- fct_explicit_na(vec, na_level = "missing")
  nonmiss <- vec2[vec2 != "missing"]
  # If nonmiss empty, just return with missing level
  if (length(nonmiss) == 0) return(vec2)

  tab <- table(nonmiss)
  mode_level <- names(tab)[which.max(tab)]
  vec2[vec2 == "missing"] <- mode_level
  droplevels(vec2)
}

# 3) Apply imputation to all predictors in reduced dataset
Soybean_imputed <- Soybean_reduced

Soybean_imputed <- Soybean_imputed %>%
  mutate(across(-Class, mode_impute_factor))

# Verify: no missing left in predictors
sum(is.na(Soybean_imputed %>% select(-Class)))

## [1] 0

# Optional: Confirm levels still look reasonable after imputation
# (example)
# table(Soybean_imputed$precip, useNA = "ifany")

# 4) (Optional but recommended) Encode for modeling
#    Many classifiers need numeric inputs; create dummy variables
dummies <- caret::dummyVars(Class ~ ., data = Soybean_imputed)
X_num <- predict(dummies, newdata = Soybean_imputed) %>% as.data.frame()
Soybean_model_ready <- bind_cols(X_num, Class = Soybean_imputed$Class)

str(Soybean_model_ready)

## 'data.frame':    683 obs. of  95 variables:
##  $ date.0           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ date.1           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ date.2           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ date.3           : num  0 0 1 1 0 0 0 0 0 0 ...
##  $ date.4           : num  0 1 0 0 0 0 0 1 0 1 ...
##  $ date.5           : num  0 0 0 0 0 1 1 0 0 0 ...
##  $ date.6           : num  1 0 0 0 1 0 0 0 1 0 ...
##  $ plant.stand.L    : num  -0.707 -0.707 -0.707 -0.707 -0.707 ...
##  $ precip.L         : num  0.707 0.707 0.707 0.707 0.707 ...
##  $ precip.Q         : num  0.408 0.408 0.408 0.408 0.408 ...
##  $ temp.L           : num  -9.68e-17 -9.68e-17 -9.68e-17 -9.68e-17 -9.68e-17 ...
##  $ temp.Q           : num  -0.816 -0.816 -0.816 -0.816 -0.816 ...
##  $ hail.0           : num  1 1 1 1 1 1 1 0 1 1 ...
##  $ hail.1           : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ crop.hist.0      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ crop.hist.1      : num  1 0 1 1 0 0 0 1 0 0 ...
##  $ crop.hist.2      : num  0 1 0 0 1 0 1 0 0 1 ...
##  $ crop.hist.3      : num  0 0 0 0 0 1 0 0 1 0 ...
##  $ area.dam.0       : num  0 1 1 1 1 1 1 1 1 1 ...
##  $ area.dam.1       : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ area.dam.2       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ area.dam.3       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ sever.0          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ sever.1          : num  1 0 0 0 1 1 1 1 1 0 ...
##  $ sever.2          : num  0 1 1 1 0 0 0 0 0 1 ...
##  $ seed.tmt.0       : num  1 0 0 1 1 1 0 1 0 1 ...
##  $ seed.tmt.1       : num  0 1 1 0 0 0 1 0 1 0 ...
##  $ seed.tmt.2       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ germ.L           : num  -7.07e-01 -9.68e-17 7.07e-01 -9.68e-17 7.07e-01 ...
##  $ germ.Q           : num  0.408 -0.816 0.408 -0.816 0.408 ...
##  $ plant.growth.0   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ plant.growth.1   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ leaves.0         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ leaves.1         : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ leaf.halo.0      : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ leaf.halo.1      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ leaf.halo.2      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ leaf.marg.0      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ leaf.marg.1      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ leaf.marg.2      : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ leaf.size.L      : num  0.707 0.707 0.707 0.707 0.707 ...
##  $ leaf.size.Q      : num  0.408 0.408 0.408 0.408 0.408 ...
##  $ leaf.shread.0    : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ leaf.shread.1    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ leaf.malf.0      : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ leaf.malf.1      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ leaf.mild.0      : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ leaf.mild.1      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ leaf.mild.2      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ stem.0           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ stem.1           : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ lodging.0        : num  0 1 1 1 1 1 0 1 1 1 ...
##  $ lodging.1        : num  1 0 0 0 0 0 1 0 0 0 ...
##  $ stem.cankers.0   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ stem.cankers.1   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ stem.cankers.2   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ stem.cankers.3   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ canker.lesion.0  : num  0 0 1 1 0 1 0 0 0 0 ...
##  $ canker.lesion.1  : num  1 1 0 0 1 0 1 1 1 1 ...
##  $ canker.lesion.2  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ canker.lesion.3  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ fruiting.bodies.0: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ fruiting.bodies.1: num  1 1 1 1 1 1 1 1 1 1 ...
##  $ ext.decay.0      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ext.decay.1      : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ ext.decay.2      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ mycelium.0       : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ mycelium.1       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ int.discolor.0   : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ int.discolor.1   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ int.discolor.2   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ sclerotia.0      : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ sclerotia.1      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ fruit.pods.0     : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ fruit.pods.1     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ fruit.pods.2     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ fruit.pods.3     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ fruit.spots.0    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ fruit.spots.1    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ fruit.spots.2    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ fruit.spots.4    : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ seed.0           : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ seed.1           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ mold.growth.0    : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ mold.growth.1    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ seed.discolor.0  : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ seed.discolor.1  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ seed.size.0      : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ seed.size.1      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ shriveling.0     : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ shriveling.1     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ roots.0          : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ roots.1          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ roots.2          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Class            : Factor w/ 19 levels "2-4-d-injury",..: 11 11 11 11 11 11 11 11 11 11 ...

Homework 4 – DATA 624

Ariba Mandavia

2026-03-02