(Applied Predictive Modeling, Kuhn and Johnson)
The UC Irvine Machine Learning Repository contains a data set related to glass identification. The data consist of 214 glass samples labeled as one of seven class categories. There are nine predictors, including the refractive index and percentages of eight elements: Na, Mg, Al, Si, K, Ca, Ba, and Fe.
library(mlbench)
data(Glass)
# str(Glass)
library(ggplot2)
library(GGally)
library(corrplot)
library(dplyr)
library(tidyr)
library(e1071)
# Ensure 'Type' is a factor (it's the class label)
Glass$Type <- as.factor(Glass$Type)
# 1. Histograms for each predictor
Glass_long <- Glass %>% pivot_longer(cols = -Type, names_to = "Variable", values_to = "Value")
ggplot(Glass_long, aes(x = Value)) +
geom_histogram(bins = 30, fill = "steelblue", color = "black", alpha = 0.7) +
facet_wrap(~ Variable, scales = "free") +
theme_minimal() +
labs(title = "Distribution of Predictor Variables")
# 2. Density plots by Type
ggplot(Glass_long, aes(x = Value, fill = Type)) +
geom_density(alpha = 0.5) +
facet_wrap(~ Variable, scales = "free") +
theme_minimal() +
labs(title = "Density Plots of Predictors by Type")
# 3. Correlation Heatmap
corr_matrix <- cor(Glass[, -ncol(Glass)]) # Exclude 'Type' since it's categorical
corrplot(corr_matrix, method = "color", type = "lower", tl.cex = 0.8, tl.col = "black")
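To put numbers on what the heatmap shows, caret's findCorrelation() can flag predictors whose pairwise correlations exceed a cutoff. A minimal sketch, assuming caret is installed (it is loaded again for the Soybean analysis below); the 0.75 cutoff is my choice, not anything prescribed:
library(caret)
# Flag predictors with pairwise |correlation| above 0.75
high_corr <- findCorrelation(corr_matrix, cutoff = 0.75, names = TRUE)
high_corr # RI and Ca form the strongest pair in this data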
# boxplots for outliers
ggplot(Glass_long, aes(x = Variable, y = Value)) +
geom_boxplot(fill = "orange", alpha = 0.6) +
coord_flip() +
theme_minimal() +
labs(title = "Boxplots of Predictor Variables")
# skewness for each predictor
skew_values <- sapply(Glass[, -ncol(Glass)], skewness)
print(skew_values)
## RI Na Mg Al Si K Ca
## 1.6027151 0.4478343 -1.1364523 0.8946104 -0.7202392 6.4600889 2.0184463
## Ba Fe
## 3.3686800 1.7298107
# highly skewed variables (threshold: abs(skewness) > 1)
skewed_vars <- names(skew_values[abs(skew_values) > 1])
print(skewed_vars)
## [1] "RI" "Mg" "K" "Ca" "Ba" "Fe"
Yes, the boxplots suggest outliers, in the Ca and K variables in particular, and Ba shows some potential outliers as well. The skewness statistics back this up: six predictors exceed the common |skewness| > 1 rule of thumb, with K (6.46), Ba (3.37), and Ca (2.02) the most strongly right-skewed.
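Before hand-picking transformations, an automated alternative is worth noting: caret's preProcess() can estimate a Yeo-Johnson transformation for every predictor, which (unlike plain Box-Cox) tolerates the zeros present in K, Ba, and Fe. A minimal sketch, assuming caret is installed:
library(caret)
# Estimate Yeo-Johnson transformations for all nine predictors (column 10 is Type)
yj <- preProcess(Glass[, -10], method = "YeoJohnson")
Glass_yj <- predict(yj, Glass[, -10])
sapply(Glass_yj, skewness) # skewness should move toward zero
Below I instead apply simple manual transformations so the effect on each variable stays explicit.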
# Copy dataset for transformation
Glass_transformed <- Glass
# Log transforms for the right-skewed predictors (adding 1 inside log() avoids
# log(0) for K, Ca, and Ba); square roots for Mg and Fe
Glass_transformed$RI_log <- log(Glass_transformed$RI)
Glass_transformed$Mg_sqrt <- sqrt(abs(Glass_transformed$Mg))
Glass_transformed$K_log <- log(Glass_transformed$K + 1)
Glass_transformed$Ca_log <- log(Glass_transformed$Ca + 1)
Glass_transformed$Ba_log <- log(Glass_transformed$Ba + 1)
Glass_transformed$Fe_sqrt <- sqrt(abs(Glass_transformed$Fe))
# Check new distributions
Glass_long_trans <- Glass_transformed %>%
pivot_longer(cols = starts_with(c("RI_", "Mg_", "K_", "Ca_", "Ba_", "Fe_")),
names_to = "Variable", values_to = "Value")
ggplot(Glass_long_trans, aes(x = Value)) +
geom_histogram(bins = 30, fill = "blue", color = "black", alpha = 0.7) +
facet_wrap(~ Variable, scales = "free") +
theme_minimal() +
labs(title = "Transformed Predictor Distributions")
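Recomputing skewness on the new columns verifies that the transformations helped (values closer to zero indicate a more symmetric distribution):
# Skewness of the transformed predictors
trans_cols <- c("RI_log", "Mg_sqrt", "K_log", "Ca_log", "Ba_log", "Fe_sqrt")
sapply(Glass_transformed[, trans_cols], skewness)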
The soybean data can also be found at the UC Irvine Machine Learning Repository. Data were collected to predict disease in 683 soybeans. The 35 predictors are mostly categorical and include information on the environmental conditions (e.g., temperature, precipitation) and plant conditions (e.g., leaf spots, mold growth). The outcome labels consist of 19 distinct classes.
data(Soybean)
library(caret)
categorical_vars <- Soybean %>%
select(-Class, -date)
# frequency tables
freq_tables <- lapply(categorical_vars, table)
freq_tables
## $plant.stand
##
## 0 1
## 354 293
##
## $precip
##
## 0 1 2
## 74 112 459
##
## $temp
##
## 0 1 2
## 80 374 199
##
## $hail
##
## 0 1
## 435 127
##
## $crop.hist
##
## 0 1 2 3
## 65 165 219 218
##
## $area.dam
##
## 0 1 2 3
## 123 227 145 187
##
## $sever
##
## 0 1 2
## 195 322 45
##
## $seed.tmt
##
## 0 1 2
## 305 222 35
##
## $germ
##
## 0 1 2
## 165 213 193
##
## $plant.growth
##
## 0 1
## 441 226
##
## $leaves
##
## 0 1
## 77 606
##
## $leaf.halo
##
## 0 1 2
## 221 36 342
##
## $leaf.marg
##
## 0 1 2
## 357 21 221
##
## $leaf.size
##
## 0 1 2
## 51 327 221
##
## $leaf.shread
##
## 0 1
## 487 96
##
## $leaf.malf
##
## 0 1
## 554 45
##
## $leaf.mild
##
## 0 1 2
## 535 20 20
##
## $stem
##
## 0 1
## 296 371
##
## $lodging
##
## 0 1
## 520 42
##
## $stem.cankers
##
## 0 1 2 3
## 379 39 36 191
##
## $canker.lesion
##
## 0 1 2 3
## 320 83 177 65
##
## $fruiting.bodies
##
## 0 1
## 473 104
##
## $ext.decay
##
## 0 1 2
## 497 135 13
##
## $mycelium
##
## 0 1
## 639 6
##
## $int.discolor
##
## 0 1 2
## 581 44 20
##
## $sclerotia
##
## 0 1
## 625 20
##
## $fruit.pods
##
## 0 1 2 3
## 407 130 14 48
##
## $fruit.spots
##
## 0 1 2 4
## 345 75 57 100
##
## $seed
##
## 0 1
## 476 115
##
## $mold.growth
##
## 0 1
## 524 67
##
## $seed.discolor
##
## 0 1
## 513 64
##
## $seed.size
##
## 0 1
## 532 59
##
## $shriveling
##
## 0 1
## 539 38
##
## $roots
##
## 0 1 2
## 551 86 15
# Convert ordered factors to plain factors so bar charts treat all levels uniformly
categorical_vars <- Soybean %>%
select(-Class, -date) %>%
mutate(across(where(is.ordered), ~ factor(as.character(.))))
# Reshape
Soybean_long <- categorical_vars %>%
pivot_longer(cols = everything(), names_to = "Variable", values_to = "Category")
# Plot
plot_categorical_distributions <- function(data, batch_size = 9) {
variable_list <- unique(data$Variable)
num_batches <- ceiling(length(variable_list) / batch_size)
for (i in 1:num_batches) {
batch_vars <- variable_list[((i - 1) * batch_size + 1) : min(i * batch_size, length(variable_list))]
p <- ggplot(data %>% filter(Variable %in% batch_vars), aes(x = Category)) +
geom_bar(fill = "steelblue", color = "black") +
facet_wrap(~ Variable, scales = "free_x", ncol = 3) +
theme_minimal() +
labs(title = paste("Categorical Predictors - Batch", i),
x = "Category", y = "Count") +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
strip.text = element_text(size = 12))
print(p)
}
}
plot_categorical_distributions(Soybean_long, batch_size = 9)
# near-zero-variance predictors
zero_var <- nearZeroVar(Soybean, saveMetrics = TRUE)
zero_var_predictors <- rownames(zero_var[zero_var$nzv, ])
print(zero_var_predictors)
## [1] "leaf.mild" "mycelium" "sclerotia"
zero_var
## freqRatio percentUnique zeroVar nzv
## Class 1.010989 2.7818448 FALSE FALSE
## date 1.137405 1.0248902 FALSE FALSE
## plant.stand 1.208191 0.2928258 FALSE FALSE
## precip 4.098214 0.4392387 FALSE FALSE
## temp 1.879397 0.4392387 FALSE FALSE
## hail 3.425197 0.2928258 FALSE FALSE
## crop.hist 1.004587 0.5856515 FALSE FALSE
## area.dam 1.213904 0.5856515 FALSE FALSE
## sever 1.651282 0.4392387 FALSE FALSE
## seed.tmt 1.373874 0.4392387 FALSE FALSE
## germ 1.103627 0.4392387 FALSE FALSE
## plant.growth 1.951327 0.2928258 FALSE FALSE
## leaves 7.870130 0.2928258 FALSE FALSE
## leaf.halo 1.547511 0.4392387 FALSE FALSE
## leaf.marg 1.615385 0.4392387 FALSE FALSE
## leaf.size 1.479638 0.4392387 FALSE FALSE
## leaf.shread 5.072917 0.2928258 FALSE FALSE
## leaf.malf 12.311111 0.2928258 FALSE FALSE
## leaf.mild 26.750000 0.4392387 FALSE TRUE
## stem 1.253378 0.2928258 FALSE FALSE
## lodging 12.380952 0.2928258 FALSE FALSE
## stem.cankers 1.984293 0.5856515 FALSE FALSE
## canker.lesion 1.807910 0.5856515 FALSE FALSE
## fruiting.bodies 4.548077 0.2928258 FALSE FALSE
## ext.decay 3.681481 0.4392387 FALSE FALSE
## mycelium 106.500000 0.2928258 FALSE TRUE
## int.discolor 13.204545 0.4392387 FALSE FALSE
## sclerotia 31.250000 0.2928258 FALSE TRUE
## fruit.pods 3.130769 0.5856515 FALSE FALSE
## fruit.spots 3.450000 0.5856515 FALSE FALSE
## seed 4.139130 0.2928258 FALSE FALSE
## mold.growth 7.820896 0.2928258 FALSE FALSE
## seed.discolor 8.015625 0.2928258 FALSE FALSE
## seed.size 9.016949 0.2928258 FALSE FALSE
## shriveling 14.184211 0.2928258 FALSE FALSE
## roots 6.406977 0.4392387 FALSE FALSE
Some categorical variables have very few observations in one or more categories. The nearZeroVar() metrics flag three such degenerate predictors: leaf.mild, mycelium, and sclerotia. Each is dominated by a single level (mycelium, for instance, is 0 for 639 of its 645 non-missing values), so these near-zero-variance predictors carry almost no information and may contribute little to classification.
The best strategy will have to be tailored to this dataset. Translating that into a plan, I will: (1) drop the three degenerate predictors above (a sketch follows); (2) check the extent of missingness per variable and remove any predictor missing more than 50% of its values; and (3) impute the remaining missing values rather than discard rows.
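A minimal sketch of step (1), dropping the flagged predictors (zero_var_predictors comes from the nearZeroVar() call above; the later imputation still uses the full data so the before/after plots stay comparable):
# Drop the near-zero-variance predictors flagged above
Soybean_reduced <- Soybean %>% select(-all_of(zero_var_predictors))
dim(Soybean_reduced) # 683 rows, 33 columns if the three flagged predictors are dropped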
missing_summary <- Soybean %>%
summarise(across(everything(), ~ mean(is.na(.)))) %>%
pivot_longer(cols = everything(), names_to = "Variable", values_to = "MissingPercent") %>%
arrange(desc(MissingPercent))
variables_to_remove <- missing_summary %>%
filter(MissingPercent > 0.50) %>%
pull(Variable)
variables_to_remove
## character(0)
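No predictor exceeds the 50% threshold, so nothing is removed for missingness. Before imputing, it is also worth checking whether the NAs cluster in particular classes; a quick sketch (run it to see the per-class counts):
# Count missing cells per class; in this data the NAs concentrate in a
# handful of classes (e.g., phytophthora-rot)
Soybean %>%
mutate(n_missing = rowSums(is.na(select(., -Class)))) %>%
group_by(Class) %>%
summarise(cells_missing = sum(n_missing)) %>%
arrange(desc(cells_missing))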
For imputation, one of the better approaches is k-nearest neighbors (KNN): VIM's kNN() uses a Gower-type distance, so it handles the categorical and ordinal variables that make up this data.
library(VIM)
# Convert ordered factors to plain factors before imputation
Soybean_knn <- Soybean %>%
mutate(across(where(is.ordered), ~ factor(as.character(.))))
Soybean_imputed <- kNN(Soybean_knn, k = 5, imp_var = FALSE)
# check
colSums(is.na(Soybean_imputed))
## Class date plant.stand precip temp
## 0 0 0 0 0
## hail crop.hist area.dam sever seed.tmt
## 0 0 0 0 0
## germ plant.growth leaves leaf.halo leaf.marg
## 0 0 0 0 0
## leaf.size leaf.shread leaf.malf leaf.mild stem
## 0 0 0 0 0
## lodging stem.cankers canker.lesion fruiting.bodies ext.decay
## 0 0 0 0 0
## mycelium int.discolor sclerotia fruit.pods fruit.spots
## 0 0 0 0 0
## seed mold.growth seed.discolor seed.size shriveling
## 0 0 0 0 0
## roots
## 0
Compare before and after imputation:
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
imputed_vars <- names(Soybean)[colSums(is.na(Soybean)) > 0]
# before/after bar plots
plot_imputation_comparison <- function(var_name, original_data, imputed_data) {
# Side-by-side bar charts of one variable before and after imputation
p1 <- ggplot(original_data, aes(x = .data[[var_name]])) +
geom_bar(fill = "red") +
labs(title = paste(var_name, "- Before Imputation"), x = var_name, y = "Count") +
theme_minimal()
p2 <- ggplot(imputed_data, aes(x = .data[[var_name]])) +
geom_bar(fill = "blue") +
labs(title = paste(var_name, "- After Imputation"), x = var_name, y = "Count") +
theme_minimal()
grid.arrange(p1, p2, ncol = 2)
}
# Loop
for (var in imputed_vars) {
plot_imputation_comparison(var, Soybean, Soybean_imputed)
}
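As a quick numeric spot check to accompany the plots, the category proportions of a single imputed variable can be compared directly; hail (one of the variables with missing values) serves as an example here:
# Proportions before (NAs excluded by table()) vs. after imputation
round(rbind(before = prop.table(table(Soybean$hail)),
after = prop.table(table(Soybean_imputed$hail))), 3)
If the imputation is behaving sensibly, the two rows should be close, since kNN fills values from similar observations rather than from a single dominant category.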