(Applied Predictive Modeling, Kuhn and Johnson)

Exercise 3.1.

The UC Irvine Machine Learning Repository contains a data set related to glass identification. The data consist of 214 glass samples labeled as one of seven class categories. There are nine predictors, including the refractive index and percentages of eight elements: Na, Mg, Al, Si, K, Ca, Ba, and Fe.

library(mlbench)
data(Glass)
# str(Glass)

(a) Using visualizations, explore the predictor variables to understand their distributions as well as the relationships between predictors.

library(ggplot2)
library(GGally)
library(corrplot)
library(dplyr)
library(tidyr)
library(e1071)

# Ensure 'Type' is a factor (it is the class label)
Glass$Type <- as.factor(Glass$Type)

# 1. Histograms for each predictor
Glass_long <- Glass %>% pivot_longer(cols = -Type, names_to = "Variable", values_to = "Value")

ggplot(Glass_long, aes(x = Value)) +
  geom_histogram(bins = 30, fill = "steelblue", color = "black", alpha = 0.7) +
  facet_wrap(~ Variable, scales = "free") +
  theme_minimal() +
  labs(title = "Distribution of Predictor Variables")

# 2. Density plots by Type
ggplot(Glass_long, aes(x = Value, fill = Type)) +
  geom_density(alpha = 0.5) +
  facet_wrap(~ Variable, scales = "free") +
  theme_minimal() +
  labs(title = "Density Plots of Predictors by Type")

# 3. Correlation Heatmap
corr_matrix <- cor(Glass[, -ncol(Glass)])  # Exclude 'Type' since it's categorical
corrplot(corr_matrix, method = "color", type = "lower", tl.cex = 0.8, tl.col = "black")
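
GGally is loaded above but not otherwise used; a pairs plot condenses the marginal distributions and pairwise relationships into one panel (a sketch; it takes a moment to render for nine predictors):

# Scatterplots, densities, and correlations in a single grid, coloured by type
ggpairs(Glass, columns = 1:9, mapping = aes(colour = Type, alpha = 0.5))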

(b) Do there appear to be any outliers in the data? Are any predictors skewed?

# boxplots for outliers
ggplot(Glass_long, aes(x = Variable, y = Value)) +
  geom_boxplot(fill = "orange", alpha = 0.6) +
  coord_flip() +
  theme_minimal() +
  labs(title = "Boxplots of Predictor Variables")

# skewness for each predictor
skew_values <- sapply(Glass[, -ncol(Glass)], skewness)
print(skew_values)
##         RI         Na         Mg         Al         Si          K         Ca 
##  1.6027151  0.4478343 -1.1364523  0.8946104 -0.7202392  6.4600889  2.0184463 
##         Ba         Fe 
##  3.3686800  1.7298107
# highly skewed variables (threshold: abs(skewness) > 1)
skewed_vars <- names(skew_values[abs(skew_values) > 1])
print(skewed_vars)
## [1] "RI" "Mg" "K"  "Ca" "Ba" "Fe"

Yes, the boxplots show outliers in Ca and K in particular, with a few candidates in Ba as well. The skewness values agree: six predictors exceed the |skewness| > 1 rule of thumb, and K, Ca, and Ba are the most strongly right-skewed (K most of all, at about 6.46).
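
As a rough numeric cross-check of the boxplots (a sketch; the 1.5 × IQR rule is a convention, not a formal test), we can count the points that fall beyond the whiskers for each predictor:

# Count observations falling outside 1.5 * IQR of the quartiles
iqr_outlier_counts <- sapply(Glass[, -ncol(Glass)], function(x) {
  q <- quantile(x, c(0.25, 0.75))
  sum(x < q[1] - 1.5 * IQR(x) | x > q[2] + 1.5 * IQR(x))
})
sort(iqr_outlier_counts, decreasing = TRUE)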

(c) Are there any relevant transformations of one or more predictors that might improve the classification model?

  • Log transformation for the highly skewed, non-negative variables (K, Ba, Ca); the +1 offset guards against the zeros in K and Ba.
  • Square root transformation for the moderately skewed variables (Fe, RI, Mg). Mg is left-skewed (negative skewness) rather than negative-valued, so sqrt(abs(Mg)) is equivalent to sqrt(Mg) here; abs() is kept only as a guard.
# Copy dataset for transformation
Glass_transformed <- Glass

# transformations
Glass_transformed$RI_log <- log(Glass_transformed$RI)
Glass_transformed$Mg_sqrt <- sqrt(abs(Glass_transformed$Mg))  
Glass_transformed$K_log <- log(Glass_transformed$K + 1) 
Glass_transformed$Ca_log <- log(Glass_transformed$Ca + 1)
Glass_transformed$Ba_log <- log(Glass_transformed$Ba + 1)
Glass_transformed$Fe_sqrt <- sqrt(abs(Glass_transformed$Fe))

# Check new distributions
Glass_long_trans <- Glass_transformed %>%
  pivot_longer(cols = starts_with(c("RI_", "Mg_", "K_", "Ca_", "Ba_", "Fe_")), 
               names_to = "Variable", values_to = "Value")

ggplot(Glass_long_trans, aes(x = Value)) +
  geom_histogram(bins = 30, fill = "blue", color = "black", alpha = 0.7) +
  facet_wrap(~ Variable, scales = "free") +
  theme_minimal() +
  labs(title = "Transformed Predictor Distributions")

Exercise 3.2.

The soybean data can also be found at the UC Irvine Machine Learning Repository. Data were collected to predict disease in 683 soybeans. The 35 predictors are mostly categorical and include information on the environmental conditions (e.g., temperature, precipitation) and plant conditions (e.g., leaf spots, mold growth). The outcome labels consist of 19 distinct classes.

data(Soybean)

(a) Investigate the frequency distributions for the categorical predictors. Are any of the distributions degenerate in the ways discussed earlier in this chapter?

library(caret)

categorical_vars <- Soybean %>%
  select(-Class, -date) 

# frequency tables
freq_tables <- lapply(categorical_vars, table)
freq_tables
## $plant.stand
## 
##   0   1 
## 354 293 
## 
## $precip
## 
##   0   1   2 
##  74 112 459 
## 
## $temp
## 
##   0   1   2 
##  80 374 199 
## 
## $hail
## 
##   0   1 
## 435 127 
## 
## $crop.hist
## 
##   0   1   2   3 
##  65 165 219 218 
## 
## $area.dam
## 
##   0   1   2   3 
## 123 227 145 187 
## 
## $sever
## 
##   0   1   2 
## 195 322  45 
## 
## $seed.tmt
## 
##   0   1   2 
## 305 222  35 
## 
## $germ
## 
##   0   1   2 
## 165 213 193 
## 
## $plant.growth
## 
##   0   1 
## 441 226 
## 
## $leaves
## 
##   0   1 
##  77 606 
## 
## $leaf.halo
## 
##   0   1   2 
## 221  36 342 
## 
## $leaf.marg
## 
##   0   1   2 
## 357  21 221 
## 
## $leaf.size
## 
##   0   1   2 
##  51 327 221 
## 
## $leaf.shread
## 
##   0   1 
## 487  96 
## 
## $leaf.malf
## 
##   0   1 
## 554  45 
## 
## $leaf.mild
## 
##   0   1   2 
## 535  20  20 
## 
## $stem
## 
##   0   1 
## 296 371 
## 
## $lodging
## 
##   0   1 
## 520  42 
## 
## $stem.cankers
## 
##   0   1   2   3 
## 379  39  36 191 
## 
## $canker.lesion
## 
##   0   1   2   3 
## 320  83 177  65 
## 
## $fruiting.bodies
## 
##   0   1 
## 473 104 
## 
## $ext.decay
## 
##   0   1   2 
## 497 135  13 
## 
## $mycelium
## 
##   0   1 
## 639   6 
## 
## $int.discolor
## 
##   0   1   2 
## 581  44  20 
## 
## $sclerotia
## 
##   0   1 
## 625  20 
## 
## $fruit.pods
## 
##   0   1   2   3 
## 407 130  14  48 
## 
## $fruit.spots
## 
##   0   1   2   4 
## 345  75  57 100 
## 
## $seed
## 
##   0   1 
## 476 115 
## 
## $mold.growth
## 
##   0   1 
## 524  67 
## 
## $seed.discolor
## 
##   0   1 
## 513  64 
## 
## $seed.size
## 
##   0   1 
## 532  59 
## 
## $shriveling
## 
##   0   1 
## 539  38 
## 
## $roots
## 
##   0   1   2 
## 551  86  15
# Convert ordered factors to regular factors for plotting
categorical_vars <- Soybean %>%
  select(-Class, -date) %>%
  mutate(across(where(is.ordered), ~ factor(as.character(.))))

# Reshape
Soybean_long <- categorical_vars %>%
  pivot_longer(cols = everything(), names_to = "Variable", values_to = "Category")

# Plot 
plot_categorical_distributions <- function(data, batch_size = 9) {
  variable_list <- unique(data$Variable)
  num_batches <- ceiling(length(variable_list) / batch_size)
  
  for (i in 1:num_batches) {
    batch_vars <- variable_list[((i - 1) * batch_size + 1) : min(i * batch_size, length(variable_list))]
    
    p <- ggplot(data %>% filter(Variable %in% batch_vars), aes(x = Category)) +
      geom_bar(fill = "steelblue", color = "black") +
      facet_wrap(~ Variable, scales = "free_x", ncol = 3) +  
      theme_minimal() +
      labs(title = paste("Categorical Predictors - Batch", i),
           x = "Category", y = "Count") +
      theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
            strip.text = element_text(size = 12))
    
    print(p)
  }
}
plot_categorical_distributions(Soybean_long, batch_size = 9)

# near-zero-variance predictors
zero_var <- nearZeroVar(Soybean, saveMetrics = TRUE)
zero_var_predictors <- rownames(zero_var[zero_var$nzv == TRUE, ])
print(zero_var_predictors)
## [1] "leaf.mild" "mycelium"  "sclerotia"
zero_var
##                  freqRatio percentUnique zeroVar   nzv
## Class             1.010989     2.7818448   FALSE FALSE
## date              1.137405     1.0248902   FALSE FALSE
## plant.stand       1.208191     0.2928258   FALSE FALSE
## precip            4.098214     0.4392387   FALSE FALSE
## temp              1.879397     0.4392387   FALSE FALSE
## hail              3.425197     0.2928258   FALSE FALSE
## crop.hist         1.004587     0.5856515   FALSE FALSE
## area.dam          1.213904     0.5856515   FALSE FALSE
## sever             1.651282     0.4392387   FALSE FALSE
## seed.tmt          1.373874     0.4392387   FALSE FALSE
## germ              1.103627     0.4392387   FALSE FALSE
## plant.growth      1.951327     0.2928258   FALSE FALSE
## leaves            7.870130     0.2928258   FALSE FALSE
## leaf.halo         1.547511     0.4392387   FALSE FALSE
## leaf.marg         1.615385     0.4392387   FALSE FALSE
## leaf.size         1.479638     0.4392387   FALSE FALSE
## leaf.shread       5.072917     0.2928258   FALSE FALSE
## leaf.malf        12.311111     0.2928258   FALSE FALSE
## leaf.mild        26.750000     0.4392387   FALSE  TRUE
## stem              1.253378     0.2928258   FALSE FALSE
## lodging          12.380952     0.2928258   FALSE FALSE
## stem.cankers      1.984293     0.5856515   FALSE FALSE
## canker.lesion     1.807910     0.5856515   FALSE FALSE
## fruiting.bodies   4.548077     0.2928258   FALSE FALSE
## ext.decay         3.681481     0.4392387   FALSE FALSE
## mycelium        106.500000     0.2928258   FALSE  TRUE
## int.discolor     13.204545     0.4392387   FALSE FALSE
## sclerotia        31.250000     0.2928258   FALSE  TRUE
## fruit.pods        3.130769     0.5856515   FALSE FALSE
## fruit.spots       3.450000     0.5856515   FALSE FALSE
## seed              4.139130     0.2928258   FALSE FALSE
## mold.growth       7.820896     0.2928258   FALSE FALSE
## seed.discolor     8.015625     0.2928258   FALSE FALSE
## seed.size         9.016949     0.2928258   FALSE FALSE
## shriveling       14.184211     0.2928258   FALSE FALSE
## roots             6.406977     0.4392387   FALSE FALSE

Some categorical variables have very few observations in one or more categories, such as:

  • mycelium: 639 in category 0 vs. only 6 in 1
  • sclerotia: 625 in category 0 vs. 20 in 1
  • leaf.mild: 535 in 0, only 20 each in 1 and 2

These near-zero-variance predictors are dominated by a single category and are likely to contribute little to classification; they are exactly the three flagged by nearZeroVar above.
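
If we choose to act on this, the flagged columns can be dropped directly (a short sketch reusing zero_var_predictors from above):

# Drop the near-zero-variance predictors before modeling
Soybean_filtered <- Soybean %>% select(-all_of(zero_var_predictors))
dim(Soybean_filtered)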

(c) Develop a strategy for handling missing data, either by eliminating predictors or imputation.

The best strategy here needs to be tailored, combining two approaches:

  • elimination of non-informative or redundant predictors, and
  • imputation methods suited to the specific variable types.

Translated to this dataset, the plan is to:

  1. Remove a variable when:
  • it is missing in >50% of observations, since imputation would then introduce too much uncertainty;
  • almost all of its values are identical (the near-zero-variance cases above);
  • it is redundant, i.e., inferable from other predictors.
missing_summary <- Soybean %>%
  summarise(across(everything(), ~ mean(is.na(.)))) %>%
  pivot_longer(cols = everything(), names_to = "Variable", values_to = "MissingPercent") %>%
  arrange(desc(MissingPercent))

variables_to_remove <- missing_summary %>%
  filter(MissingPercent > 0.50) %>%
  pull(Variable)

variables_to_remove
## character(0)
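
No predictor exceeds the 50% threshold, so nothing is removed on that ground. Before imputing, it is also worth checking whether missingness clusters in particular classes (a quick sketch; if it does, the missingness itself carries information):

# Proportion of incomplete rows per class
Soybean %>%
  mutate(incomplete = !complete.cases(Soybean)) %>%
  group_by(Class) %>%
  summarise(prop_incomplete = mean(incomplete)) %>%
  arrange(desc(prop_incomplete))
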
  2. Impute missing values:

One of the better approaches is kNN imputation, which handles the categorical and ordinal variables that make up this dataset.

library(VIM)

# Convert ordered factors to plain factors so kNN treats them as categorical
Soybean_knn <- Soybean %>%
  mutate(across(where(is.ordered), ~ factor(as.character(.))))

Soybean_imputed <- kNN(Soybean_knn, k = 5, imp_var = FALSE)

# check
colSums(is.na(Soybean_imputed))
##           Class            date     plant.stand          precip            temp 
##               0               0               0               0               0 
##            hail       crop.hist        area.dam           sever        seed.tmt 
##               0               0               0               0               0 
##            germ    plant.growth          leaves       leaf.halo       leaf.marg 
##               0               0               0               0               0 
##       leaf.size     leaf.shread       leaf.malf       leaf.mild            stem 
##               0               0               0               0               0 
##         lodging    stem.cankers   canker.lesion fruiting.bodies       ext.decay 
##               0               0               0               0               0 
##        mycelium    int.discolor       sclerotia      fruit.pods     fruit.spots 
##               0               0               0               0               0 
##            seed     mold.growth   seed.discolor       seed.size      shriveling 
##               0               0               0               0               0 
##           roots 
##               0

Compare before and after imputation:

library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
imputed_vars <- names(Soybean)[colSums(is.na(Soybean)) > 0]

# before/after bar plots
plot_imputation_comparison <- function(var_name, original_data, imputed_data) {
  p1 <- ggplot(original_data, aes(x = .data[[var_name]])) +
    geom_bar(fill = "red") +
    labs(title = paste(var_name, "- Before Imputation"), x = var_name, y = "Count") +
    theme_minimal()

  p2 <- ggplot(imputed_data, aes(x = .data[[var_name]])) +
    geom_bar(fill = "blue") +
    labs(title = paste(var_name, "- After Imputation"), x = var_name, y = "Count") +
    theme_minimal()

  grid.arrange(p1, p2, ncol = 2)
}

# Loop
for (var in imputed_vars) {
  plot_imputation_comparison(var, Soybean, Soybean_imputed)
}
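
A numeric check complements the plots (a sketch; hail is just one illustrative choice among the variables with missing values):

# Category proportions before (NAs shown) and after imputation
prop.table(table(Soybean$hail, useNA = "ifany"))
prop.table(table(Soybean_imputed$hail))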