data(Glass)
# Quick check
str(Glass)
## 'data.frame': 214 obs. of 10 variables:
## $ RI : num 1.52 1.52 1.52 1.52 1.52 ...
## $ Na : num 13.6 13.9 13.5 13.2 13.3 ...
## $ Mg : num 4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
## $ Al : num 1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
## $ Si : num 71.8 72.7 73 72.6 73.1 ...
## $ K : num 0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
## $ Ca : num 8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
## $ Ba : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Fe : num 0 0 0 0 0 0.26 0 0 0 0.11 ...
## $ Type: Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...
summary(Glass)
## RI Na Mg Al
## Min. :1.511 Min. :10.73 Min. :0.000 Min. :0.290
## 1st Qu.:1.517 1st Qu.:12.91 1st Qu.:2.115 1st Qu.:1.190
## Median :1.518 Median :13.30 Median :3.480 Median :1.360
## Mean :1.518 Mean :13.41 Mean :2.685 Mean :1.445
## 3rd Qu.:1.519 3rd Qu.:13.82 3rd Qu.:3.600 3rd Qu.:1.630
## Max. :1.534 Max. :17.38 Max. :4.490 Max. :3.500
## Si K Ca Ba
## Min. :69.81 Min. :0.0000 Min. : 5.430 Min. :0.000
## 1st Qu.:72.28 1st Qu.:0.1225 1st Qu.: 8.240 1st Qu.:0.000
## Median :72.79 Median :0.5550 Median : 8.600 Median :0.000
## Mean :72.65 Mean :0.4971 Mean : 8.957 Mean :0.175
## 3rd Qu.:73.09 3rd Qu.:0.6100 3rd Qu.: 9.172 3rd Qu.:0.000
## Max. :75.41 Max. :6.2100 Max. :16.190 Max. :3.150
## Fe Type
## Min. :0.00000 1:70
## 1st Qu.:0.00000 2:76
## Median :0.00000 3:17
## Mean :0.05701 5:13
## 3rd Qu.:0.10000 6: 9
## Max. :0.51000 7:29
# Split predictors/response
x <- Glass %>% select(-Type)
y <- Glass$Type
# ---------------------------
# (a) Visualizations: distributions + relationships
# ---------------------------
# 1) Histograms of predictors
Glass %>%
pivot_longer(cols = -Type, names_to = "Variable", values_to = "Value") %>%
ggplot(aes(x = Value)) +
geom_histogram(bins = 20, color = "white") +
facet_wrap(~Variable, scales = "free") +
theme_minimal() +
labs(title = "Glass predictors: distributions")

# 2) Density plots by class (helps compare distributions by Type)
Glass %>%
pivot_longer(cols = -Type, names_to = "Variable", values_to = "Value") %>%
ggplot(aes(x = Value, fill = Type)) +
geom_density(alpha = 0.35) +
facet_wrap(~Variable, scales = "free") +
theme_minimal() +
labs(title = "Predictor densities by Glass Type")

# 3) Pairwise scatterplot matrix (relationships; colored by Type)
GGally::ggpairs(
Glass,
columns = 1:9,
aes(color = Type, alpha = 0.6)
) + theme_minimal()

# 4) Correlation heatmap (numeric predictors only)
cor_mat <- cor(x)
corrplot::corrplot(cor_mat, method = "color", tl.cex = 0.8, number.cex = 0.7)

# Optional: scatterplot + smoother for a couple strongly related pairs
# (Mg vs Al is often strongly negatively related)
ggplot(Glass, aes(x = Mg, y = Al, color = Type)) +
geom_point(alpha = 0.7) +
geom_smooth(se = FALSE, method = "loess") +
theme_minimal() +
labs(title = "Mg vs Al (colored by Type)")

# ---------------------------
# (b) Outliers + skewness
# ---------------------------
# 1) Boxplots by Type (outliers show as points)
Glass %>%
pivot_longer(cols = -Type, names_to = "Variable", values_to = "Value") %>%
ggplot(aes(x = Type, y = Value, fill = Type)) +
geom_boxplot(outlier.alpha = 0.8) +
facet_wrap(~Variable, scales = "free") +
theme_minimal() +
theme(legend.position = "none") +
labs(title = "Boxplots by Type (outliers visible)")

# 2) Global outlier counts using 1.5*IQR rule per predictor (overall, not by class)
outlier_summary <- map_dfr(names(x), function(v) {
vals <- x[[v]]
q1 <- quantile(vals, 0.25, na.rm = TRUE)
q3 <- quantile(vals, 0.75, na.rm = TRUE)
iqr <- q3 - q1
lo <- q1 - 1.5 * iqr
hi <- q3 + 1.5 * iqr
tibble(
Variable = v,
LowerBound = lo,
UpperBound = hi,
OutlierCount = sum(vals < lo | vals > hi, na.rm = TRUE),
OutlierPct = mean(vals < lo | vals > hi, na.rm = TRUE)
)
})
outlier_summary %>% arrange(desc(OutlierCount))
# 3) Skewness per predictor
skew_tbl <- tibble(
Variable = names(x),
Skewness = sapply(x, e1071::skewness, na.rm = TRUE, type = 2)
) %>% arrange(desc(abs(Skewness)))
skew_tbl
# Optional: visualize skewness
ggplot(skew_tbl, aes(x = reorder(Variable, Skewness), y = Skewness)) +
geom_col() +
coord_flip() +
theme_minimal() +
labs(title = "Skewness of predictors", x = "Predictor", y = "Skewness")

# ---------------------------
# (c) Transformations that might improve classification
# ---------------------------
# Common helpful steps for many classifiers:
# - log1p() for heavily right-skewed predictors with zeros (Ba, Fe, K often)
# - centering/scaling for distance-based models (KNN, SVM, etc.)
# - BoxCox / YeoJohnson for normalizing predictors
# 1) Identify skewed predictors (choose a threshold, e.g., |skew| > 1)
skewed_vars <- skew_tbl %>%
filter(abs(Skewness) > 1) %>%
pull(Variable)
skewed_vars
## [1] "K" "Ba" "Ca" "Fe" "RI" "Mg"
# 2) Log1p transform for nonnegative skewed variables (safe with zeros)
# Only apply log1p if the variable is >= 0 (all values nonnegative)
Glass_log <- Glass
for (v in skewed_vars) {
if (min(Glass_log[[v]], na.rm = TRUE) >= 0) {
Glass_log[[v]] <- log1p(Glass_log[[v]])
}
}
# Compare distributions before/after log (for skewed vars)
Glass %>%
select(all_of(skewed_vars), Type) %>%
pivot_longer(cols = -Type) %>%
ggplot(aes(x = value)) +
geom_histogram(bins = 20, color = "white") +
facet_wrap(~name, scales = "free") +
theme_minimal() +
labs(title = "Before transform: skewed predictors")

Glass_log %>%
select(all_of(skewed_vars), Type) %>%
pivot_longer(cols = -Type) %>%
ggplot(aes(x = value)) +
geom_histogram(bins = 20, color = "white") +
facet_wrap(~name, scales = "free") +
theme_minimal() +
labs(title = "After log1p transform: skewed predictors")

# 3) Center/scale (recommended for KNN/SVM)
pre_cs <- caret::preProcess(Glass_log %>% select(-Type), method = c("center", "scale"))
Glass_cs <- predict(pre_cs, Glass_log %>% select(-Type)) %>%
bind_cols(Type = Glass$Type)
# 4) Box-Cox transform (requires strictly positive values)
# Since Glass has zeros (Ba, Fe, etc.), BoxCox may fail.
# Instead, use YeoJohnson which can handle zeros/negatives.
pre_yj <- caret::preProcess(Glass %>% select(-Type), method = c("YeoJohnson", "center", "scale"))
Glass_yj <- predict(pre_yj, Glass %>% select(-Type)) %>%
bind_cols(Type = Glass$Type)
# Quick check: skewness after transformations
skew_after_log_cs <- tibble(
Variable = names(Glass_cs %>% select(-Type)),
Skewness = sapply(Glass_cs %>% select(-Type), e1071::skewness, na.rm = TRUE, type = 2)
) %>% arrange(desc(abs(Skewness)))
skew_after_yj <- tibble(
Variable = names(Glass_yj %>% select(-Type)),
Skewness = sapply(Glass_yj %>% select(-Type), e1071::skewness, na.rm = TRUE, type = 2)
) %>% arrange(desc(abs(Skewness)))
skew_after_log_cs
skew_after_yj
# Optional: Side-by-side density comparison for one variable (example: Ba)
# (If Ba is not skewed_vars, replace with another variable)
v <- "Ba"
if (v %in% names(Glass)) {
bind_rows(
Glass %>% transmute(Value = .data[[v]], Dataset = "Original"),
Glass_log %>% transmute(Value = .data[[v]], Dataset = "Log1p"),
Glass_yj %>% transmute(Value = .data[[v]], Dataset = "YeoJohnson")
) %>%
ggplot(aes(x = Value, fill = Dataset)) +
geom_density(alpha = 0.35) +
theme_minimal() +
labs(title = paste("Distribution comparison for", v))
}

library(mlbench)
library(tidyverse)
library(caret)
data(Soybean) # see ?Soybean
str(Soybean)
## 'data.frame': 683 obs. of 36 variables:
## $ Class : Factor w/ 19 levels "2-4-d-injury",..: 11 11 11 11 11 11 11 11 11 11 ...
## $ date : Factor w/ 7 levels "0","1","2","3",..: 7 5 4 4 7 6 6 5 7 5 ...
## $ plant.stand : Ord.factor w/ 2 levels "0"<"1": 1 1 1 1 1 1 1 1 1 1 ...
## $ precip : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
## $ temp : Ord.factor w/ 3 levels "0"<"1"<"2": 2 2 2 2 2 2 2 2 2 2 ...
## $ hail : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 1 1 ...
## $ crop.hist : Factor w/ 4 levels "0","1","2","3": 2 3 2 2 3 4 3 2 4 3 ...
## $ area.dam : Factor w/ 4 levels "0","1","2","3": 2 1 1 1 1 1 1 1 1 1 ...
## $ sever : Factor w/ 3 levels "0","1","2": 2 3 3 3 2 2 2 2 2 3 ...
## $ seed.tmt : Factor w/ 3 levels "0","1","2": 1 2 2 1 1 1 2 1 2 1 ...
## $ germ : Ord.factor w/ 3 levels "0"<"1"<"2": 1 2 3 2 3 2 1 3 2 3 ...
## $ plant.growth : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ leaves : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ leaf.halo : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ leaf.marg : Factor w/ 3 levels "0","1","2": 3 3 3 3 3 3 3 3 3 3 ...
## $ leaf.size : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
## $ leaf.shread : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ leaf.malf : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ leaf.mild : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ stem : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ lodging : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 2 1 1 1 ...
## $ stem.cankers : Factor w/ 4 levels "0","1","2","3": 4 4 4 4 4 4 4 4 4 4 ...
## $ canker.lesion : Factor w/ 4 levels "0","1","2","3": 2 2 1 1 2 1 2 2 2 2 ...
## $ fruiting.bodies: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ ext.decay : Factor w/ 3 levels "0","1","2": 2 2 2 2 2 2 2 2 2 2 ...
## $ mycelium : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ int.discolor : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ sclerotia : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ fruit.pods : Factor w/ 4 levels "0","1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
## $ fruit.spots : Factor w/ 4 levels "0","1","2","4": 4 4 4 4 4 4 4 4 4 4 ...
## $ seed : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ mold.growth : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ seed.discolor : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ seed.size : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ shriveling : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ roots : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
# Outcome variable name in this dataset is typically "Class"
table(Soybean$Class)
##
## 2-4-d-injury alternarialeaf-spot
## 16 91
## anthracnose bacterial-blight
## 44 20
## bacterial-pustule brown-spot
## 20 92
## brown-stem-rot charcoal-rot
## 44 20
## cyst-nematode diaporthe-pod-&-stem-blight
## 14 15
## diaporthe-stem-canker downy-mildew
## 20 20
## frog-eye-leaf-spot herbicide-injury
## 91 8
## phyllosticta-leaf-spot phytophthora-rot
## 20 88
## powdery-mildew purple-seed-stain
## 20 20
## rhizoctonia-root-rot
## 20
# Split predictors/response
y <- Soybean$Class
x <- Soybean %>% select(-Class)
# ---------------------------
# (a) Frequency distributions for categorical predictors
# + Check for degenerate predictors
# ---------------------------
# 1) Frequency tables for every predictor (prints to console)
freq_list <- lapply(names(x), function(v) {
tab <- sort(table(x[[v]], useNA = "ifany"), decreasing = TRUE)
tab
})
names(freq_list) <- names(x)
# Print all frequency tables (can be long)
for (v in names(freq_list)) {
cat("\n====================\n", v, "\n")
print(freq_list[[v]])
}
##
## ====================
## date
##
## 5 4 3 2 6 1 0 <NA>
## 149 131 118 93 90 75 26 1
##
## ====================
## plant.stand
##
## 0 1 <NA>
## 354 293 36
##
## ====================
## precip
##
## 2 1 0 <NA>
## 459 112 74 38
##
## ====================
## temp
##
## 1 2 0 <NA>
## 374 199 80 30
##
## ====================
## hail
##
## 0 1 <NA>
## 435 127 121
##
## ====================
## crop.hist
##
## 2 3 1 0 <NA>
## 219 218 165 65 16
##
## ====================
## area.dam
##
## 1 3 2 0 <NA>
## 227 187 145 123 1
##
## ====================
## sever
##
## 1 0 <NA> 2
## 322 195 121 45
##
## ====================
## seed.tmt
##
## 0 1 <NA> 2
## 305 222 121 35
##
## ====================
## germ
##
## 1 2 0 <NA>
## 213 193 165 112
##
## ====================
## plant.growth
##
## 0 1 <NA>
## 441 226 16
##
## ====================
## leaves
##
## 1 0
## 606 77
##
## ====================
## leaf.halo
##
## 2 0 <NA> 1
## 342 221 84 36
##
## ====================
## leaf.marg
##
## 0 2 <NA> 1
## 357 221 84 21
##
## ====================
## leaf.size
##
## 1 2 <NA> 0
## 327 221 84 51
##
## ====================
## leaf.shread
##
## 0 <NA> 1
## 487 100 96
##
## ====================
## leaf.malf
##
## 0 <NA> 1
## 554 84 45
##
## ====================
## leaf.mild
##
## 0 <NA> 1 2
## 535 108 20 20
##
## ====================
## stem
##
## 1 0 <NA>
## 371 296 16
##
## ====================
## lodging
##
## 0 <NA> 1
## 520 121 42
##
## ====================
## stem.cankers
##
## 0 3 1 <NA> 2
## 379 191 39 38 36
##
## ====================
## canker.lesion
##
## 0 2 1 3 <NA>
## 320 177 83 65 38
##
## ====================
## fruiting.bodies
##
## 0 <NA> 1
## 473 106 104
##
## ====================
## ext.decay
##
## 0 1 <NA> 2
## 497 135 38 13
##
## ====================
## mycelium
##
## 0 <NA> 1
## 639 38 6
##
## ====================
## int.discolor
##
## 0 1 <NA> 2
## 581 44 38 20
##
## ====================
## sclerotia
##
## 0 <NA> 1
## 625 38 20
##
## ====================
## fruit.pods
##
## 0 1 <NA> 3 2
## 407 130 84 48 14
##
## ====================
## fruit.spots
##
## 0 <NA> 4 1 2
## 345 106 100 75 57
##
## ====================
## seed
##
## 0 1 <NA>
## 476 115 92
##
## ====================
## mold.growth
##
## 0 <NA> 1
## 524 92 67
##
## ====================
## seed.discolor
##
## 0 <NA> 1
## 513 106 64
##
## ====================
## seed.size
##
## 0 <NA> 1
## 532 92 59
##
## ====================
## shriveling
##
## 0 <NA> 1
## 539 106 38
##
## ====================
## roots
##
## 0 1 <NA> 2
## 551 86 31 15
# 2) Summarize frequency distributions into a compact table
# - n_levels: number of distinct values (including NA as a "level" here)
# - top_prop: proportion of most common (non-NA) level
# - degenerate flags:
# * single_level_nonNA: only 1 level among non-missing values
# * very_dominant_level: top level >= 95% of non-missing values
freq_summary <- map_dfr(names(x), function(v) {
vec <- x[[v]]
nonmiss <- vec[!is.na(vec)]
tab_nonmiss <- sort(table(nonmiss), decreasing = TRUE)
n_nonmiss <- length(nonmiss)
n_levels_nonmiss <- length(tab_nonmiss)
top_prop <- if (n_nonmiss > 0) as.numeric(tab_nonmiss[1]) / n_nonmiss else NA_real_
tibble(
predictor = v,
n_levels_nonmiss = n_levels_nonmiss,
nonmissing_n = n_nonmiss,
missing_n = sum(is.na(vec)),
missing_pct = mean(is.na(vec)),
top_level = if (n_nonmiss > 0) names(tab_nonmiss)[1] else NA_character_,
top_prop = top_prop,
single_level_nonNA = (n_levels_nonmiss <= 1),
very_dominant_level = (!is.na(top_prop) & top_prop >= 0.95)
)
}) %>% arrange(desc(single_level_nonNA), desc(very_dominant_level), desc(top_prop))
freq_summary
# 3) Caret-style near-zero-variance check using dummy variables
# (creates 0/1 indicators for factor levels, then checks NZV)
mm <- model.matrix(~ . - 1, data = x) # dummy-encode factors
nzv_idx <- caret::nearZeroVar(mm)
nzv_vars <- if (length(nzv_idx) > 0) colnames(mm)[nzv_idx] else character(0)
nzv_vars
## [1] "date0" "leaf.marg1" "leaf.malf1" "leaf.mild1"
## [5] "leaf.mild2" "stem.cankers2" "ext.decay2" "mycelium1"
## [9] "int.discolor2" "sclerotia1" "fruit.pods2" "shriveling1"
## [13] "roots1" "roots2"
# ---------------------------
# (b) Missing data analysis:
# - Which predictors are more missing?
# - Is missingness related to classes?
# ---------------------------
# 1) Missingness by predictor
missing_by_pred <- tibble(
predictor = names(x),
missing_n = sapply(x, function(col) sum(is.na(col))),
missing_pct = sapply(x, function(col) mean(is.na(col)))
) %>% arrange(desc(missing_pct))
missing_by_pred
# Visual: missing percent by predictor
ggplot(missing_by_pred, aes(x = reorder(predictor, missing_pct), y = missing_pct)) +
geom_col() +
coord_flip() +
theme_minimal() +
labs(title = "Missing data by predictor", x = "Predictor", y = "Missing proportion")

# 2) Missingness per row (how many missing predictors each soybean has)
Soybean_rowmiss <- Soybean %>%
mutate(missing_count = rowSums(is.na(across(-Class))),
missing_prop = missing_count / ncol(x))
summary(Soybean_rowmiss$missing_prop)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.09776 0.00000 0.85714
ggplot(Soybean_rowmiss, aes(x = missing_prop)) +
geom_histogram(bins = 25, color = "white") +
theme_minimal() +
labs(title = "Distribution of row-wise missingness", x = "Missing proportion per row")

# 3) Is missingness related to class?
# Compare missing_prop across classes (boxplot)
ggplot(Soybean_rowmiss, aes(x = Class, y = missing_prop, fill = Class)) +
geom_boxplot(outlier.alpha = 0.6) +
theme_minimal() +
theme(legend.position = "none") +
labs(title = "Row-wise missingness by class", y = "Missing proportion")

# Optional: formal test (ANOVA on missing_prop ~ Class)
anova_fit <- aov(missing_prop ~ Class, data = Soybean_rowmiss)
summary(anova_fit)
## Df Sum Sq Mean Sq F value Pr(>F)
## Class 18 28.376 1.5764 240.9 <2e-16 ***
## Residuals 664 4.345 0.0065
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 4) Predictor-level missingness by class:
# For each predictor, compute missing rate within each class
missing_by_class_pred <- map_dfr(names(x), function(v) {
Soybean %>%
group_by(Class) %>%
summarize(missing_pct = mean(is.na(.data[[v]])), .groups = "drop") %>%
mutate(predictor = v)
})
# Find predictors where missingness varies a lot across classes
missing_class_spread <- missing_by_class_pred %>%
group_by(predictor) %>%
summarize(
min_missing = min(missing_pct),
max_missing = max(missing_pct),
spread = max_missing - min_missing,
avg_missing = mean(missing_pct),
.groups = "drop"
) %>%
arrange(desc(spread))
missing_class_spread
# Visualize top 8 predictors with biggest missingness spread by class
top_preds <- missing_class_spread %>% slice_head(n = 8) %>% pull(predictor)
missing_by_class_pred %>%
filter(predictor %in% top_preds) %>%
ggplot(aes(x = Class, y = missing_pct, fill = Class)) +
geom_col() +
facet_wrap(~ predictor, scales = "free_y") +
theme_minimal() +
theme(legend.position = "none") +
labs(title = "Missingness by class for predictors with largest spread",
y = "Missing proportion")

# ---------------------------
# (c) Strategy for handling missing data (remove vs impute)
# ---------------------------
# Strategy (common + reasonable for this dataset):
# 1) Remove predictors that are:
# - degenerate (single level among non-missing values), and/or
# - extremely sparse (very high missing rate), and/or
# - near-zero variance (optional; based on dummy columns)
# 2) Impute remaining categorical predictors with the MODE (most frequent level),
# optionally adding a "missing" level first.
# 1) Choose predictors to remove:
# - single level among non-missing
# - missing_pct > threshold (choose e.g. 0.50)
missing_threshold <- 0.50
drop_preds <- freq_summary %>%
filter(single_level_nonNA | missing_pct > missing_threshold) %>%
pull(predictor)
drop_preds
## character(0)
Soybean_reduced <- Soybean %>% select(-all_of(drop_preds))
x_reduced <- Soybean_reduced %>% select(-Class)
# 2) Simple MODE imputation function for factors/characters
mode_impute_factor <- function(vec) {
# Ensure factor for consistent levels
if (!is.factor(vec)) vec <- as.factor(vec)
# If all missing, keep as is (or create "missing" level)
if (all(is.na(vec))) {
vec <- fct_explicit_na(vec, na_level = "missing")
return(vec)
}
# Add explicit missing level, then impute NAs with mode of non-missing
vec2 <- fct_explicit_na(vec, na_level = "missing")
nonmiss <- vec2[vec2 != "missing"]
# If nonmiss empty, just return with missing level
if (length(nonmiss) == 0) return(vec2)
tab <- table(nonmiss)
mode_level <- names(tab)[which.max(tab)]
vec2[vec2 == "missing"] <- mode_level
droplevels(vec2)
}
# 3) Apply imputation to all predictors in reduced dataset
Soybean_imputed <- Soybean_reduced
Soybean_imputed <- Soybean_imputed %>%
mutate(across(-Class, mode_impute_factor))
# Verify: no missing left in predictors
sum(is.na(Soybean_imputed %>% select(-Class)))
## [1] 0
# Optional: Confirm levels still look reasonable after imputation
# (example)
# table(Soybean_imputed$precip, useNA = "ifany")
# 4) (Optional but recommended) Encode for modeling
# Many classifiers need numeric inputs; create dummy variables
dummies <- caret::dummyVars(Class ~ ., data = Soybean_imputed)
X_num <- predict(dummies, newdata = Soybean_imputed) %>% as.data.frame()
Soybean_model_ready <- bind_cols(X_num, Class = Soybean_imputed$Class)
str(Soybean_model_ready)
## 'data.frame': 683 obs. of 95 variables:
## $ date.0 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ date.1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ date.2 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ date.3 : num 0 0 1 1 0 0 0 0 0 0 ...
## $ date.4 : num 0 1 0 0 0 0 0 1 0 1 ...
## $ date.5 : num 0 0 0 0 0 1 1 0 0 0 ...
## $ date.6 : num 1 0 0 0 1 0 0 0 1 0 ...
## $ plant.stand.L : num -0.707 -0.707 -0.707 -0.707 -0.707 ...
## $ precip.L : num 0.707 0.707 0.707 0.707 0.707 ...
## $ precip.Q : num 0.408 0.408 0.408 0.408 0.408 ...
## $ temp.L : num -9.68e-17 -9.68e-17 -9.68e-17 -9.68e-17 -9.68e-17 ...
## $ temp.Q : num -0.816 -0.816 -0.816 -0.816 -0.816 ...
## $ hail.0 : num 1 1 1 1 1 1 1 0 1 1 ...
## $ hail.1 : num 0 0 0 0 0 0 0 1 0 0 ...
## $ crop.hist.0 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ crop.hist.1 : num 1 0 1 1 0 0 0 1 0 0 ...
## $ crop.hist.2 : num 0 1 0 0 1 0 1 0 0 1 ...
## $ crop.hist.3 : num 0 0 0 0 0 1 0 0 1 0 ...
## $ area.dam.0 : num 0 1 1 1 1 1 1 1 1 1 ...
## $ area.dam.1 : num 1 0 0 0 0 0 0 0 0 0 ...
## $ area.dam.2 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ area.dam.3 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ sever.0 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ sever.1 : num 1 0 0 0 1 1 1 1 1 0 ...
## $ sever.2 : num 0 1 1 1 0 0 0 0 0 1 ...
## $ seed.tmt.0 : num 1 0 0 1 1 1 0 1 0 1 ...
## $ seed.tmt.1 : num 0 1 1 0 0 0 1 0 1 0 ...
## $ seed.tmt.2 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ germ.L : num -7.07e-01 -9.68e-17 7.07e-01 -9.68e-17 7.07e-01 ...
## $ germ.Q : num 0.408 -0.816 0.408 -0.816 0.408 ...
## $ plant.growth.0 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ plant.growth.1 : num 1 1 1 1 1 1 1 1 1 1 ...
## $ leaves.0 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ leaves.1 : num 1 1 1 1 1 1 1 1 1 1 ...
## $ leaf.halo.0 : num 1 1 1 1 1 1 1 1 1 1 ...
## $ leaf.halo.1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ leaf.halo.2 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ leaf.marg.0 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ leaf.marg.1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ leaf.marg.2 : num 1 1 1 1 1 1 1 1 1 1 ...
## $ leaf.size.L : num 0.707 0.707 0.707 0.707 0.707 ...
## $ leaf.size.Q : num 0.408 0.408 0.408 0.408 0.408 ...
## $ leaf.shread.0 : num 1 1 1 1 1 1 1 1 1 1 ...
## $ leaf.shread.1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ leaf.malf.0 : num 1 1 1 1 1 1 1 1 1 1 ...
## $ leaf.malf.1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ leaf.mild.0 : num 1 1 1 1 1 1 1 1 1 1 ...
## $ leaf.mild.1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ leaf.mild.2 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ stem.0 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ stem.1 : num 1 1 1 1 1 1 1 1 1 1 ...
## $ lodging.0 : num 0 1 1 1 1 1 0 1 1 1 ...
## $ lodging.1 : num 1 0 0 0 0 0 1 0 0 0 ...
## $ stem.cankers.0 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ stem.cankers.1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ stem.cankers.2 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ stem.cankers.3 : num 1 1 1 1 1 1 1 1 1 1 ...
## $ canker.lesion.0 : num 0 0 1 1 0 1 0 0 0 0 ...
## $ canker.lesion.1 : num 1 1 0 0 1 0 1 1 1 1 ...
## $ canker.lesion.2 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ canker.lesion.3 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ fruiting.bodies.0: num 0 0 0 0 0 0 0 0 0 0 ...
## $ fruiting.bodies.1: num 1 1 1 1 1 1 1 1 1 1 ...
## $ ext.decay.0 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ext.decay.1 : num 1 1 1 1 1 1 1 1 1 1 ...
## $ ext.decay.2 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ mycelium.0 : num 1 1 1 1 1 1 1 1 1 1 ...
## $ mycelium.1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ int.discolor.0 : num 1 1 1 1 1 1 1 1 1 1 ...
## $ int.discolor.1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ int.discolor.2 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ sclerotia.0 : num 1 1 1 1 1 1 1 1 1 1 ...
## $ sclerotia.1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ fruit.pods.0 : num 1 1 1 1 1 1 1 1 1 1 ...
## $ fruit.pods.1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ fruit.pods.2 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ fruit.pods.3 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ fruit.spots.0 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ fruit.spots.1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ fruit.spots.2 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ fruit.spots.4 : num 1 1 1 1 1 1 1 1 1 1 ...
## $ seed.0 : num 1 1 1 1 1 1 1 1 1 1 ...
## $ seed.1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ mold.growth.0 : num 1 1 1 1 1 1 1 1 1 1 ...
## $ mold.growth.1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ seed.discolor.0 : num 1 1 1 1 1 1 1 1 1 1 ...
## $ seed.discolor.1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ seed.size.0 : num 1 1 1 1 1 1 1 1 1 1 ...
## $ seed.size.1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ shriveling.0 : num 1 1 1 1 1 1 1 1 1 1 ...
## $ shriveling.1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ roots.0 : num 1 1 1 1 1 1 1 1 1 1 ...
## $ roots.1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ roots.2 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Class : Factor w/ 19 levels "2-4-d-injury",..: 11 11 11 11 11 11 11 11 11 11 ...