# Load necessary libraries
library(mlbench)
library(ggplot2)
data(Soybean)
# Plot a frequency bar chart for every categorical (factor) predictor.
# `is.factor` is TRUE for both unordered and ordered factors, so both are kept.
categorical_vars <- names(Filter(is.factor, Soybean))
for (var in categorical_vars) {
  # `var` holds the column name as a string, so look it up through the
  # `.data` pronoun instead of the deprecated `aes_string()`.
  plot <- ggplot(Soybean, aes(x = .data[[var]])) +
    geom_bar(fill = "lightblue", color = "black") +
    labs(
      title = paste("Frequency Distribution of", var),
      x = var,
      y = "Frequency"
    ) +
    # Rotate the category labels so long level names do not overlap.
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  # Explicit print() is required: ggplot objects are not auto-printed in a loop.
  print(plot)
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Summary of missing values per column:
## Class date plant.stand precip temp
## 0 1 36 38 30
## hail crop.hist area.dam sever seed.tmt
## 121 16 1 121 121
## germ plant.growth leaves leaf.halo leaf.marg
## 112 16 0 84 84
## leaf.size leaf.shread leaf.malf leaf.mild stem
## 84 100 84 108 16
## lodging stem.cankers canker.lesion fruiting.bodies ext.decay
## 121 38 38 106 38
## mycelium int.discolor sclerotia fruit.pods fruit.spots
## 38 38 38 84 106
## seed mold.growth seed.discolor seed.size shriveling
## 92 92 106 92 106
## roots
## 31
# Step 2: Remove Columns with Too Many Missing Values (Threshold: 40%)
# A column is kept only when its NA count is strictly below 40% of the rows.
threshold <- 0.4 * nrow(Soybean) # Define threshold (40% missing)
# drop = FALSE keeps a data frame even if only one column passes the filter;
# without it a single surviving column would collapse to a bare vector.
Soybean <- Soybean[, colSums(is.na(Soybean)) < threshold, drop = FALSE]
cat("\nColumns retained after removing high-missing predictors:\n")
# Actually list the retained columns announced by the header above.
print(names(Soybean))
##
## Columns retained after removing high-missing predictors:
## [1] "Class" "date" "plant.stand" "precip"
## [5] "temp" "hail" "crop.hist" "area.dam"
## [9] "sever" "seed.tmt" "germ" "plant.growth"
## [13] "leaves" "leaf.halo" "leaf.marg" "leaf.size"
## [17] "leaf.shread" "leaf.malf" "leaf.mild" "stem"
## [21] "lodging" "stem.cankers" "canker.lesion" "fruiting.bodies"
## [25] "ext.decay" "mycelium" "int.discolor" "sclerotia"
## [29] "fruit.pods" "fruit.spots" "seed" "mold.growth"
## [33] "seed.discolor" "seed.size" "shriveling" "roots"
# Step 3: Handle Missing Data
#' Replace missing values with the most frequent observed value (the mode).
#'
#' @param x A vector or factor, possibly containing `NA`.
#' @return `x` with every `NA` replaced by its mode. The type of `x` is
#'   preserved (factors keep their levels; numeric/logical/character vectors
#'   keep their type). If `x` has no `NA`, or consists only of `NA` (no mode
#'   exists), it is returned unchanged. Ties are resolved in favour of the
#'   value that comes first in `table(x)` order.
mode_impute <- function(x) {
  missing <- is.na(x)
  # Nothing to fill, or nothing to fill with: leave the input untouched.
  if (!any(missing) || all(missing)) {
    return(x)
  }
  counts <- table(x)
  # which.max() returns the first maximum, so ties go to the earliest level.
  mode_label <- names(counts)[which.max(counts)]
  if (is.factor(x)) {
    # Assigning an existing level label keeps the factor's levels and class.
    x[missing] <- mode_label
  } else {
    # table() reports values as character labels; map the label back to an
    # observed element so the vector is not coerced to character.
    observed <- x[!missing]
    x[missing] <- observed[match(mode_label, as.character(observed))]
  }
  x
}
# Impute each column independently; assigning into `Soybean[]` keeps the
# data frame structure (row names, column names) instead of producing a list.
Soybean[] <- lapply(Soybean, mode_impute)
# Step 4: Verify that all missing values have been handled
cat("\nFinal check for missing values (should be 0):\n")
# Total count of remaining NA cells across the whole data frame.
print(sum(is.na(Soybean)))
##
## Final check for missing values (should be 0):
## [1] 0
To address missing data in the Soybean dataset, I first counted the missing values in each column. Any predictor with 40% or more of its values missing was to be removed, to avoid relying on extensive imputation; in this dataset no predictor reached that threshold, so all 36 columns were retained. For the remaining missing values, mode imputation (replacing each missing entry with the most frequent category of that variable) was applied, since the predictors are categorical. Finally, I confirmed that all missing values had been handled, resulting in a dataset with zero missing values.