library(mlbench)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(caret)
## Loading required package: lattice
library(ggcorrplot)
data(Glass)
glass_x <- Glass %>% select(-Type)
3.1 a) Using visualizations, explore the predictor variables to understand their distributions as well as the relationships between predictors.
glass_long <- glass_x %>%
pivot_longer(cols = everything(), names_to = "predictor", values_to = "value")
# Histograms for each predictor
ggplot(glass_long, aes(x = value)) +
geom_histogram(bins = 30) +
facet_wrap(~ predictor, scales = "free") +
labs(title = "Glass predictors: distributions")
# Boxplots for each predictor
ggplot(glass_long, aes(x = predictor, y = value)) +
geom_boxplot() +
coord_flip() +
labs(title = "Glass predictors: boxplots")
# Pairwise relationships
pairs(glass_x,
pch = 19,
cex = 0.6,
col = as.numeric(Glass$Type),
main = "Glass predictors: pairwise relationships")
ggcorrplot(cor(glass_x),
lab = TRUE,
lab_size = 3.5,
colors = c("darkgreen", "white", "magenta"))
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the ggcorrplot package.
## Please report the issue at <https://github.com/kassambara/ggcorrplot/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
The predictor variables show diverse distributions, with some like Ba
and Fe are heavily concentrated near zero, while others like RI and Si
appear more normally distributed. Several predictors exhibit strong
correlations, like Ca and RI (r = 0.81) showing a strong positive
relationship, and Si and Al (r = -0.54) showing a moderate negative
relationship. The boxplots reveal that most variables have some outliers
like Ba, K, and Ca.
3.1 b) Do there appear to be any outliers in the data? Are any predictors skewed?
There do appear to be outliers in the data. From the boxplots, predictors such as Ba, Fe, K, Ca, and Al show observations that fall well outside the whiskers, indicating potential outliers. Ca has several high values, and Ba and Fe have many values near zero with a few much larger observations, which makes the unusual values stand out even more.
Some predictors also appear to be skewed. Ba, Fe, and K are strongly right-skewed, since most observations are close to zero and only a small number are much larger. Ca also appears somewhat right-skewed. Predictors like RI, Na, and Si look more symmetric, though they may still have a few unusual observations.
So overall, yes, there are visible outliers, and yes, several predictors are skewed, especially Ba, Fe, and K.
3.1 c) Are there any relevant transformations of one or more predictors that might improve the classification model?
skew_fun <- function(x) {
x <- x[!is.na(x)]
m <- mean(x)
s <- sd(x)
if (s == 0) return(0)
mean((x - m)^3) / (s^3)
}
# Skewness before transformation
skew_before <- sapply(glass_x, skew_fun)
pp <- preProcess(glass_x, method = c("YeoJohnson"))
glass_trans <- predict(pp, glass_x)
# Skewness after transformation
skew_after <- sapply(glass_trans, skew_fun)
# Compare before and after
skew_table <- data.frame(
Predictor = names(glass_x),
Skew_Before = round(skew_before, 3),
Skew_After = round(skew_after, 3)
)
print(skew_table)
## Predictor Skew_Before Skew_After
## RI RI 1.603 1.603
## Na Na 0.448 -0.009
## Mg Mg -1.136 -0.877
## Al Al 0.895 0.000
## Si Si -0.720 -0.720
## K K 6.460 -0.071
## Ca Ca 2.018 -0.206
## Ba Ba 3.369 3.369
## Fe Fe 1.730 1.730
# Plot transformed distributions
glass_trans_long <- glass_trans %>%
pivot_longer(cols = everything(),
names_to = "Predictor",
values_to = "Value")
ggplot(glass_trans_long, aes(x = Value)) +
geom_histogram(bins = 30) +
facet_wrap(~ Predictor, scales = "free") +
labs(title = "Glass predictors after Yeo-Johnson transformation")
Yes, Yeo-Johnson transformation successfully reduced skewness for K, Ca, Na, and Al, making these predictors more symmetric and suitable for modeling. However, Ba, Fe, RI, and Si remained highly skewed despite transformation and may require alternative approaches like binning or different transformation methods.
3.2 a) Investigate the frequency distributions for the categorical predictors. Are any of the distributions degenerate in the ways discussed earlier in this chapter?
data(Soybean)
convert_missing <- function(x) {
x <- as.character(x)
x[x %in% c("?", "", "NA")] <- NA
factor(x)
}
soy <- as.data.frame(lapply(Soybean, function(x) {
if (is.factor(x) || is.character(x)) convert_missing(x) else x
}))
soy_x <- soy %>% select(-Class)
soy_long <- soy_x %>%
pivot_longer(cols = everything(), names_to = "predictor", values_to = "value")
# Bar plots for all categorical predictors
ggplot(soy_long, aes(x = value, fill = is.na(value))) +
geom_bar() +
facet_wrap(~ predictor, scales = "free_x") +
scale_fill_manual(values = c("steelblue", "coral"),
labels = c("Observed", "Missing"),
name = NULL) +
labs(title = "Soybean predictors: frequency distributions") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Frequency summary for each predictor
freq_summary <- bind_rows(lapply(names(soy_x), function(v) {
tab <- sort(table(soy_x[[v]], useNA = "ifany"), decreasing = TRUE)
data.frame(
predictor = v,
n_levels = nlevels(soy_x[[v]]),
most_common = names(tab)[1],
most_common_n = as.integer(tab[1]),
most_common_prop = round(as.numeric(tab[1] / sum(tab)), 3)
)
}))
freq_summary %>%
arrange(desc(most_common_prop))
## predictor n_levels most_common most_common_n most_common_prop
## 1 mycelium 2 0 639 0.936
## 2 sclerotia 2 0 625 0.915
## 3 leaves 2 1 606 0.887
## 4 int.discolor 3 0 581 0.851
## 5 leaf.malf 2 0 554 0.811
## 6 roots 3 0 551 0.807
## 7 shriveling 2 0 539 0.789
## 8 leaf.mild 3 0 535 0.783
## 9 seed.size 2 0 532 0.779
## 10 mold.growth 2 0 524 0.767
## 11 lodging 2 0 520 0.761
## 12 seed.discolor 2 0 513 0.751
## 13 ext.decay 3 0 497 0.728
## 14 leaf.shread 2 0 487 0.713
## 15 seed 2 0 476 0.697
## 16 fruiting.bodies 2 0 473 0.693
## 17 precip 3 2 459 0.672
## 18 plant.growth 2 0 441 0.646
## 19 hail 2 0 435 0.637
## 20 fruit.pods 4 0 407 0.596
## 21 stem.cankers 4 0 379 0.555
## 22 temp 3 1 374 0.548
## 23 stem 2 1 371 0.543
## 24 leaf.marg 3 0 357 0.523
## 25 plant.stand 2 0 354 0.518
## 26 fruit.spots 4 0 345 0.505
## 27 leaf.halo 3 2 342 0.501
## 28 leaf.size 3 1 327 0.479
## 29 sever 3 1 322 0.471
## 30 canker.lesion 4 0 320 0.469
## 31 seed.tmt 3 0 305 0.447
## 32 area.dam 4 1 227 0.332
## 33 crop.hist 4 2 219 0.321
## 34 germ 3 1 213 0.312
## 35 date 7 5 149 0.218
degenerate_predictors <- freq_summary %>%
filter(n_levels == 1 | most_common_prop > 0.90)
degenerate_predictors
## predictor n_levels most_common most_common_n most_common_prop
## 1 mycelium 2 0 639 0.936
## 2 sclerotia 2 0 625 0.915
cat("Found", nrow(degenerate_predictors), "degenerate predictors:")
## Found 2 degenerate predictors:
print(degenerate_predictors$predictor)
## [1] "mycelium" "sclerotia"
Two predictors show degenerate distributions which are mycelium and sclerotia. Both have over 90% of observations concentrated in a single category. The visualization also reveals widespread missing data across many predictors, with some variables missing up to 20% of values.
3.2 b) Roughly18% ofthe data aremissing.Are there particularpredictorsthat are more likely to be missing? Is the pattern of missing data related to the classes?
# Percent missing by predictor
missing_percent <- sort(colMeans(is.na(soy_x)) * 100, decreasing = TRUE)
missing_percent
## hail sever seed.tmt lodging germ
## 17.7159590 17.7159590 17.7159590 17.7159590 16.3982430
## leaf.mild fruiting.bodies fruit.spots seed.discolor shriveling
## 15.8125915 15.5197657 15.5197657 15.5197657 15.5197657
## leaf.shread seed mold.growth seed.size leaf.halo
## 14.6412884 13.4699854 13.4699854 13.4699854 12.2986823
## leaf.marg leaf.size leaf.malf fruit.pods precip
## 12.2986823 12.2986823 12.2986823 12.2986823 5.5636896
## stem.cankers canker.lesion ext.decay mycelium int.discolor
## 5.5636896 5.5636896 5.5636896 5.5636896 5.5636896
## sclerotia plant.stand roots temp crop.hist
## 5.5636896 5.2708638 4.5387994 4.3923865 2.3426061
## plant.growth stem date area.dam leaves
## 2.3426061 2.3426061 0.1464129 0.1464129 0.0000000
missing_df <- data.frame(
predictor = names(missing_percent),
missing_percent = as.numeric(missing_percent)
)
ggplot(missing_df, aes(x = reorder(predictor, missing_percent), y = missing_percent)) +
geom_col() +
coord_flip() +
labs(title = "Percent missing by Soybean predictor",
x = "Predictor", y = "Percent missing")
# Missingness by class
missing_by_class <- soy %>%
mutate(across(-Class, ~ as.integer(is.na(.)))) %>%
group_by(Class) %>%
summarise(across(everything(), mean), .groups = "drop")
missing_by_class
## # A tibble: 19 × 36
## Class date plant.stand precip temp hail crop.hist area.dam sever seed.tmt
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2-4-… 0.0625 1 1 1 1 1 0.0625 1 1
## 2 alte… 0 0 0 0 0 0 0 0 0
## 3 anth… 0 0 0 0 0 0 0 0 0
## 4 bact… 0 0 0 0 0 0 0 0 0
## 5 bact… 0 0 0 0 0 0 0 0 0
## 6 brow… 0 0 0 0 0 0 0 0 0
## 7 brow… 0 0 0 0 0 0 0 0 0
## 8 char… 0 0 0 0 0 0 0 0 0
## 9 cyst… 0 1 1 1 1 0 0 1 1
## 10 diap… 0 0.4 0 0 1 0 0 1 1
## 11 diap… 0 0 0 0 0 0 0 0 0
## 12 down… 0 0 0 0 0 0 0 0 0
## 13 frog… 0 0 0 0 0 0 0 0 0
## 14 herb… 0 0 1 0 1 0 0 1 1
## 15 phyl… 0 0 0 0 0 0 0 0 0
## 16 phyt… 0 0 0 0 0.773 0 0 0.773 0.773
## 17 powd… 0 0 0 0 0 0 0 0 0
## 18 purp… 0 0 0 0 0 0 0 0 0
## 19 rhiz… 0 0 0 0 0 0 0 0 0
## # ℹ 26 more variables: germ <dbl>, plant.growth <dbl>, leaves <dbl>,
## # leaf.halo <dbl>, leaf.marg <dbl>, leaf.size <dbl>, leaf.shread <dbl>,
## # leaf.malf <dbl>, leaf.mild <dbl>, stem <dbl>, lodging <dbl>,
## # stem.cankers <dbl>, canker.lesion <dbl>, fruiting.bodies <dbl>,
## # ext.decay <dbl>, mycelium <dbl>, int.discolor <dbl>, sclerotia <dbl>,
## # fruit.pods <dbl>, fruit.spots <dbl>, seed <dbl>, mold.growth <dbl>,
## # seed.discolor <dbl>, seed.size <dbl>, shriveling <dbl>, roots <dbl>
missing_class_tests <- bind_rows(lapply(names(soy_x), function(v) {
tbl <- table(is.na(soy[[v]]), soy$Class)
pval <- tryCatch(chisq.test(tbl)$p.value, error = function(e) NA_real_)
data.frame(
predictor = v,
p_value = pval
)
})) %>%
mutate(adj_p_value = p.adjust(p_value, method = "BH")) %>%
arrange(adj_p_value)
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
missing_class_tests
## predictor p_value adj_p_value
## 1 precip 2.292469e-133 6.686367e-133
## 2 temp 2.292469e-133 6.686367e-133
## 3 crop.hist 2.292469e-133 6.686367e-133
## 4 plant.growth 2.292469e-133 6.686367e-133
## 5 stem 2.292469e-133 6.686367e-133
## 6 stem.cankers 2.292469e-133 6.686367e-133
## 7 canker.lesion 2.292469e-133 6.686367e-133
## 8 ext.decay 2.292469e-133 6.686367e-133
## 9 mycelium 2.292469e-133 6.686367e-133
## 10 int.discolor 2.292469e-133 6.686367e-133
## 11 sclerotia 2.292469e-133 6.686367e-133
## 12 roots 2.292469e-133 6.686367e-133
## 13 plant.stand 4.268778e-118 1.149286e-117
## 14 hail 6.272918e-111 1.291483e-110
## 15 sever 6.272918e-111 1.291483e-110
## 16 seed.tmt 6.272918e-111 1.291483e-110
## 17 lodging 6.272918e-111 1.291483e-110
## 18 fruiting.bodies 1.995997e-108 3.326662e-108
## 19 fruit.spots 1.995997e-108 3.326662e-108
## 20 seed.discolor 1.995997e-108 3.326662e-108
## 21 shriveling 1.995997e-108 3.326662e-108
## 22 seed 2.542290e-105 3.707506e-105
## 23 mold.growth 2.542290e-105 3.707506e-105
## 24 seed.size 2.542290e-105 3.707506e-105
## 25 germ 5.674080e-104 7.943712e-104
## 26 fruit.pods 4.553154e-103 6.129246e-103
## 27 leaf.mild 1.296453e-100 1.680588e-100
## 28 leaf.shread 1.733025e-98 2.166281e-98
## 29 leaf.halo 5.567557e-93 6.089516e-93
## 30 leaf.marg 5.567557e-93 6.089516e-93
## 31 leaf.size 5.567557e-93 6.089516e-93
## 32 leaf.malf 5.567557e-93 6.089516e-93
## 33 leaves 1.064624e-85 1.129147e-85
## 34 date 1.198859e-03 1.198859e-03
## 35 area.dam 1.198859e-03 1.198859e-03
cat("\nPredictors where missingness is significantly related to class (p < 0.05):\n")
##
## Predictors where missingness is significantly related to class (p < 0.05):
significant_missing <- missing_class_tests %>%
filter(adj_p_value < 0.05)
print(significant_missing)
## predictor p_value adj_p_value
## 1 precip 2.292469e-133 6.686367e-133
## 2 temp 2.292469e-133 6.686367e-133
## 3 crop.hist 2.292469e-133 6.686367e-133
## 4 plant.growth 2.292469e-133 6.686367e-133
## 5 stem 2.292469e-133 6.686367e-133
## 6 stem.cankers 2.292469e-133 6.686367e-133
## 7 canker.lesion 2.292469e-133 6.686367e-133
## 8 ext.decay 2.292469e-133 6.686367e-133
## 9 mycelium 2.292469e-133 6.686367e-133
## 10 int.discolor 2.292469e-133 6.686367e-133
## 11 sclerotia 2.292469e-133 6.686367e-133
## 12 roots 2.292469e-133 6.686367e-133
## 13 plant.stand 4.268778e-118 1.149286e-117
## 14 hail 6.272918e-111 1.291483e-110
## 15 sever 6.272918e-111 1.291483e-110
## 16 seed.tmt 6.272918e-111 1.291483e-110
## 17 lodging 6.272918e-111 1.291483e-110
## 18 fruiting.bodies 1.995997e-108 3.326662e-108
## 19 fruit.spots 1.995997e-108 3.326662e-108
## 20 seed.discolor 1.995997e-108 3.326662e-108
## 21 shriveling 1.995997e-108 3.326662e-108
## 22 seed 2.542290e-105 3.707506e-105
## 23 mold.growth 2.542290e-105 3.707506e-105
## 24 seed.size 2.542290e-105 3.707506e-105
## 25 germ 5.674080e-104 7.943712e-104
## 26 fruit.pods 4.553154e-103 6.129246e-103
## 27 leaf.mild 1.296453e-100 1.680588e-100
## 28 leaf.shread 1.733025e-98 2.166281e-98
## 29 leaf.halo 5.567557e-93 6.089516e-93
## 30 leaf.marg 5.567557e-93 6.089516e-93
## 31 leaf.size 5.567557e-93 6.089516e-93
## 32 leaf.malf 5.567557e-93 6.089516e-93
## 33 leaves 1.064624e-85 1.129147e-85
## 34 date 1.198859e-03 1.198859e-03
## 35 area.dam 1.198859e-03 1.198859e-03
Yes, some predictors are more likely to be missing than others. The highest missingness is in sever, seed.tmt, lodging, hail, and germ, each with about 15%–18% missing values, while some predictors have very little or no missing data. The results also suggest that missingness is related to disease class rather than completely random. After adjusting for multiple tests, all predictors had significant p-values, indicating that the pattern of missing data differs across classes. This suggests the missingness is systematic and may contain information about the response. However, the chi-square warnings mean these results should be interpreted with some caution because some expected cell counts were small.
3.2 c) Develop a strategy for handling missing data, either by eliminating predictors or imputation.
drop_vars <- names(missing_percent[missing_percent > 40])
drop_vars
## character(0)
soy_reduced <- soy %>%
select(-all_of(drop_vars))
# mode imputation helper
mode_value <- function(x) {
x <- na.omit(x)
if (length(x) == 0) return(NA)
names(sort(table(x), decreasing = TRUE))[1]
}
# Smode impute within class
impute_mode_by_class <- function(data, class_col = "Class") {
out <- data
predictors <- setdiff(names(out), class_col)
for (v in predictors) {
if (anyNA(out[[v]])) {
out[[v]] <- as.character(out[[v]])
for (cl in levels(out[[class_col]])) {
idx <- out[[class_col]] == cl
fill_val <- mode_value(out[idx, v])
out[idx & is.na(out[[v]]), v] <- fill_val
}
out[[v]] <- factor(out[[v]])
}
}
out
}
soy_imputed <- impute_mode_by_class(soy_reduced, class_col = "Class")
colSums(is.na(soy_imputed))
## Class date plant.stand precip temp
## 0 0 30 38 30
## hail crop.hist area.dam sever seed.tmt
## 53 16 0 53 53
## germ plant.growth leaves leaf.halo leaf.marg
## 38 16 0 29 29
## leaf.size leaf.shread leaf.malf leaf.mild stem
## 29 45 29 53 16
## lodging stem.cankers canker.lesion fruiting.bodies ext.decay
## 53 38 38 38 38
## mycelium int.discolor sclerotia fruit.pods fruit.spots
## 38 38 38 16 38
## seed mold.growth seed.discolor seed.size shriveling
## 24 24 38 24 38
## roots
## 31