library(mlbench); library(ggplot2); library(reshape2)
library(dplyr); library(tidyr); library(corrplot)
data(Glass)
str(Glass)
## 'data.frame': 214 obs. of 10 variables:
## $ RI : num 1.52 1.52 1.52 1.52 1.52 ...
## $ Na : num 13.6 13.9 13.5 13.2 13.3 ...
## $ Mg : num 4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
## $ Al : num 1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
## $ Si : num 71.8 72.7 73 72.6 73.1 ...
## $ K : num 0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
## $ Ca : num 8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
## $ Ba : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Fe : num 0 0 0 0 0 0.26 0 0 0 0.11 ...
## $ Type: Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...
# Histograms for each predictor
Glass_long <- reshape2::melt(Glass, id.vars = "Type")
ggplot(Glass_long, aes(x = value)) +
geom_histogram(bins = 30, fill = "green", color = "white") +
facet_wrap(~variable, scales = "free") +
theme_minimal()
# Boxplots by Type
ggplot(Glass_long, aes(x = Type, y = value, fill = Type)) +
geom_boxplot() +
facet_wrap(~variable, scales = "free") +
theme_minimal()
# remove the categorical variable
Glass2 <- dplyr::select(Glass, -Type)
m <- cor(Glass2)
corrplot(m, method = "color", type = "upper", tl.col = "black", order="hclust")
Skewed predictors: the histograms show that K, Al, Mg, and Ca are noticeably skewed, while Ba and Fe are dominated by zeros.
Outliers: the boxplots show clear outliers for many of the predictors.
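To back the visual impression with numbers, here is a minimal sketch (assuming the e1071 package, which provides skewness()) that computes the sample skewness of each numeric predictor; values far from zero flag candidates for transformation.
# Quantify skewness per predictor (sketch; assumes e1071 is installed)
library(e1071)
skew_vals <- sapply(Glass[, setdiff(names(Glass), "Type")], skewness)
sort(round(skew_vals, 2), decreasing = TRUE)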
# Duplicate dataset
Glass_trans <- Glass
# Variables that benefit from transformation
skewed_vars <- c("K", "Al", "Mg", "Ca")
# log transformation to skewed variables only
Glass_trans[skewed_vars] <- log1p(Glass_trans[skewed_vars])
# Keep Na, Si, RI, Ba, Fe as original(to be scaled later but not transformed)
# Scale all numeric predictors
Glass_num <- Glass_trans[, -which(names(Glass_trans) == "Type")]
Glass_scaled <- as.data.frame(scale(Glass_num))
Glass_scaled$Type <- Glass$Type
# Compare distributions before vs after
Glass_long_orig <- melt(Glass[, -which(names(Glass) == "Type")])
## No id variables; using all as measure variables
Glass_long_orig$dataset <- "Original"
Glass_long_trans <- melt(Glass_scaled[, -which(names(Glass_scaled) == "Type")])
## No id variables; using all as measure variables
Glass_long_trans$dataset <- "Transformed"
Glass_long <- rbind(Glass_long_orig, Glass_long_trans)
ggplot(Glass_long, aes(x = value, fill = dataset)) +
geom_histogram(bins = 30, alpha = 0.6, position = "identity") +
facet_wrap(~variable, scales = "free") +
theme_minimal() +
labs(title = "Selective Transformations on Glass Predictors")
Some predictors (K, Al, Mg, Ca) were highly skewed, so I applied a log transformation (log1p, which tolerates zeros) to reduce skewness and improve symmetry. For predictors that were already fairly symmetric (like Na, Si, RI), I only applied centering and scaling. The zero-inflated variables (Ba, Fe) were left untransformed because standard transformations like log or Box-Cox would not resolve the heavy concentration of zeros.
These selective transformations should help classification by reducing skewness and putting all predictors on comparable scales.
An alternative would be Box-Cox transformations, which automatically estimate the best power transformation for each variable; here I chose the targeted log approach to address the skewed predictors directly without over-processing the well-behaved ones.
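As a rough sketch of that alternative (not part of the workflow above, and assuming the caret package is available), preProcess() can estimate Box-Cox transformations and center/scale in one step. Box-Cox requires strictly positive values, so predictors that contain zeros (Ba and Fe at least) would typically be skipped by the estimate.
# Sketch: Box-Cox + center/scale via caret (assumes caret is installed)
library(caret)
Glass_pred <- Glass[, setdiff(names(Glass), "Type")]
pp <- preProcess(Glass_pred, method = c("BoxCox", "center", "scale"))
Glass_bc <- predict(pp, Glass_pred)
pp  # summary of which predictors were Box-Cox transformed, centered, and scaled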
library(mlbench)
data(Soybean)
str(Soybean)
## 'data.frame': 683 obs. of 36 variables:
## $ Class : Factor w/ 19 levels "2-4-d-injury",..: 11 11 11 11 11 11 11 11 11 11 ...
## $ date : Factor w/ 7 levels "0","1","2","3",..: 7 5 4 4 7 6 6 5 7 5 ...
## $ plant.stand : Ord.factor w/ 2 levels "0"<"1": 1 1 1 1 1 1 1 1 1 1 ...
## $ precip : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
## $ temp : Ord.factor w/ 3 levels "0"<"1"<"2": 2 2 2 2 2 2 2 2 2 2 ...
## $ hail : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 1 1 ...
## $ crop.hist : Factor w/ 4 levels "0","1","2","3": 2 3 2 2 3 4 3 2 4 3 ...
## $ area.dam : Factor w/ 4 levels "0","1","2","3": 2 1 1 1 1 1 1 1 1 1 ...
## $ sever : Factor w/ 3 levels "0","1","2": 2 3 3 3 2 2 2 2 2 3 ...
## $ seed.tmt : Factor w/ 3 levels "0","1","2": 1 2 2 1 1 1 2 1 2 1 ...
## $ germ : Ord.factor w/ 3 levels "0"<"1"<"2": 1 2 3 2 3 2 1 3 2 3 ...
## $ plant.growth : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ leaves : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ leaf.halo : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ leaf.marg : Factor w/ 3 levels "0","1","2": 3 3 3 3 3 3 3 3 3 3 ...
## $ leaf.size : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
## $ leaf.shread : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ leaf.malf : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ leaf.mild : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ stem : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ lodging : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 2 1 1 1 ...
## $ stem.cankers : Factor w/ 4 levels "0","1","2","3": 4 4 4 4 4 4 4 4 4 4 ...
## $ canker.lesion : Factor w/ 4 levels "0","1","2","3": 2 2 1 1 2 1 2 2 2 2 ...
## $ fruiting.bodies: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ ext.decay : Factor w/ 3 levels "0","1","2": 2 2 2 2 2 2 2 2 2 2 ...
## $ mycelium : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ int.discolor : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ sclerotia : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ fruit.pods : Factor w/ 4 levels "0","1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
## $ fruit.spots : Factor w/ 4 levels "0","1","2","4": 4 4 4 4 4 4 4 4 4 4 ...
## $ seed : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ mold.growth : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ seed.discolor : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ seed.size : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ shriveling : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ roots : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
?Soybean
majority_props <- Soybean %>%
dplyr::select(-Class) %>%
summarise(across(everything(), ~ max(table(.)) / length(.))) %>%
pivot_longer(cols = everything(), names_to = "Predictor", values_to = "PropMajority")
ggplot(majority_props, aes(x = reorder(Predictor, PropMajority),
y = PropMajority, fill = PropMajority)) +
geom_col() +
coord_flip() +
scale_fill_gradient(low = "blue", high = "red") +
theme_minimal() +
labs(title = "Proportion of Majority Class per Predictor",
x = "Predictor", y = "Proportion Majority Class")
# Get top 5 most degenerate predictors
top_degenerate <- majority_props %>%
arrange(desc(PropMajority)) %>%
head(5)
top_degenerate
## # A tibble: 5 × 2
## Predictor PropMajority
## <chr> <dbl>
## 1 mycelium 0.936
## 2 sclerotia 0.915
## 3 leaves 0.887
## 4 int.discolor 0.851
## 5 leaf.malf 0.811
Yes, several predictors in the Soybean dataset are nearly degenerate: mycelium and sclerotia, for example, take their majority value in more than 90% of the samples. Not all 35 predictors will be equally useful for classification, and the most degenerate ones add little value.
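A common way to formalize this check is nearZeroVar(); the sketch below (assuming the caret package is available) flags near-zero-variance predictors using the frequency ratio and the percentage of unique values.
# Sketch: flag near-zero-variance predictors with caret (assumes caret is installed)
library(caret)
nzv <- nearZeroVar(Soybean[, -1], saveMetrics = TRUE)
nzv[nzv$nzv, ]  # rows flagged as near-zero variance (likely including mycelium and sclerotia)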
# Duplicate dataset
soy <- Soybean
# Check missing data per predictor
missing_summary <- colSums(is.na(soy)) / nrow(soy)
missing_summary <- sort(missing_summary, decreasing = TRUE)
# Show top predictors with missing data
missing_summary[1:15]
## hail sever seed.tmt lodging germ
## 0.1771596 0.1771596 0.1771596 0.1771596 0.1639824
## leaf.mild fruiting.bodies fruit.spots seed.discolor shriveling
## 0.1581259 0.1551977 0.1551977 0.1551977 0.1551977
## leaf.shread seed mold.growth seed.size leaf.halo
## 0.1464129 0.1346999 0.1346999 0.1346999 0.1229868
# Drop predictors with too much missing data (> 15%)
high_missing <- names(missing_summary[missing_summary > 0.15])
soy_reduced <- soy %>%
dplyr::select(-dplyr::all_of(high_missing))
# Define function for mode/median imputation (keeps each column's type intact)
mode_impute <- function(x) {
  if (is.factor(x) || is.character(x)) {
    x[is.na(x)] <- names(which.max(table(x)))  # mode for categorical
  } else {
    x[is.na(x)] <- median(x, na.rm = TRUE)     # median for numeric
  }
  x
}
# Apply imputation across the dataset
# (ifelse() would strip the factor structure, so NAs are replaced in place instead)
soy_imputed <- soy_reduced %>%
  mutate(across(-Class, mode_impute))
# Verify no missing values remain
colSums(is.na(soy_imputed))
## Class date plant.stand precip temp
## 0 0 0 0 0
## crop.hist area.dam plant.growth leaves leaf.halo
## 0 0 0 0 0
## leaf.marg leaf.size leaf.shread leaf.malf stem
## 0 0 0 0 0
## stem.cankers canker.lesion ext.decay mycelium int.discolor
## 0 0 0 0 0
## sclerotia fruit.pods seed mold.growth seed.size
## 0 0 0 0 0
## roots
## 0
# Before: proportion missing
missing_df <- data.frame(
Predictor = names(missing_summary),
MissingRate = missing_summary
)
before <- ggplot(missing_df, aes(x = reorder(Predictor, -MissingRate), y = MissingRate)) +
geom_bar(stat = "identity", fill = "red") +
coord_flip() +
labs(title = "Before Imputation: Missing Data by Predictor",
y = "Proportion Missing", x = "Predictor") +
theme_minimal()
# After: confirm no missing values
missing_after <- colSums(is.na(soy_imputed)) / nrow(soy_imputed)
missing_after_df <- data.frame(
Predictor = names(missing_after),
MissingRate = missing_after
)
after <- ggplot(missing_after_df, aes(x = reorder(Predictor, -MissingRate), y = MissingRate)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() +
labs(title = "After Imputation: Missing Data by Predictor",
y = "Proportion Missing", x = "Predictor") +
theme_minimal()
before
after
To address the missing data in the Soybean dataset (up to roughly 18% per predictor), I applied a two-part strategy: drop the predictors with more than 15% missing values, then impute the remaining missing values with the mode for categorical predictors and the median for numeric ones.
Verification: after this process, every predictor retained in the dataset had 0% missing values, confirming that the approach handled the missing data effectively.
With this strategy, dropping prevents highly incomplete variables from overwhelming the model, while imputation preserves moderately incomplete predictors so that useful information is not discarded entirely.