library(mlbench)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(caret)
## Loading required package: lattice
library(ggcorrplot)

data(Glass)
glass_x <- Glass %>% select(-Type)

3.1 a) Using visualizations, explore the predictor variables to understand their distributions as well as the relationships between predictors.

glass_long <- glass_x %>%
  pivot_longer(cols = everything(), names_to = "predictor", values_to = "value")

# Histograms for each predictor
ggplot(glass_long, aes(x = value)) +
  geom_histogram(bins = 30) +
  facet_wrap(~ predictor, scales = "free") +
  labs(title = "Glass predictors: distributions")

# Boxplots for each predictor
ggplot(glass_long, aes(x = predictor, y = value)) +
  geom_boxplot() +
  coord_flip() +
  labs(title = "Glass predictors: boxplots")

# Pairwise relationships
pairs(glass_x,
      pch = 19,
      cex = 0.6,
      col = as.numeric(Glass$Type),
      main = "Glass predictors: pairwise relationships")

ggcorrplot(cor(glass_x), 
           lab = TRUE,
           lab_size = 3.5,
           colors = c("darkgreen", "white", "magenta"))
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the ggcorrplot package.
##   Please report the issue at <https://github.com/kassambara/ggcorrplot/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

The predictor variables show diverse distributions, with some like Ba and Fe are heavily concentrated near zero, while others like RI and Si appear more normally distributed. Several predictors exhibit strong correlations, like Ca and RI (r = 0.81) showing a strong positive relationship, and Si and Al (r = -0.54) showing a moderate negative relationship. The boxplots reveal that most variables have some outliers like Ba, K, and Ca.

3.1 b) Do there appear to be any outliers in the data? Are any predictors skewed?

There do appear to be outliers in the data. From the boxplots, predictors such as Ba, Fe, K, Ca, and Al show observations that fall well outside the whiskers, indicating potential outliers. Ca has several high values, and Ba and Fe have many values near zero with a few much larger observations, which makes the unusual values stand out even more.

Some predictors also appear to be skewed. Ba, Fe, and K are strongly right-skewed, since most observations are close to zero and only a small number are much larger. Ca also appears somewhat right-skewed. Predictors like RI, Na, and Si look more symmetric, though they may still have a few unusual observations.

So overall, yes, there are visible outliers, and yes, several predictors are skewed, especially Ba, Fe, and K.

3.1 c) Are there any relevant transformations of one or more predictors that might improve the classification model?

skew_fun <- function(x) {
  x <- x[!is.na(x)]
  m <- mean(x)
  s <- sd(x)
  if (s == 0) return(0)
  mean((x - m)^3) / (s^3)
}

# Skewness before transformation
skew_before <- sapply(glass_x, skew_fun)

pp <- preProcess(glass_x, method = c("YeoJohnson"))
glass_trans <- predict(pp, glass_x)

# Skewness after transformation
skew_after <- sapply(glass_trans, skew_fun)

# Compare before and after
skew_table <- data.frame(
  Predictor = names(glass_x),
  Skew_Before = round(skew_before, 3),
  Skew_After = round(skew_after, 3)
)

print(skew_table)
##    Predictor Skew_Before Skew_After
## RI        RI       1.603      1.603
## Na        Na       0.448     -0.009
## Mg        Mg      -1.136     -0.877
## Al        Al       0.895      0.000
## Si        Si      -0.720     -0.720
## K          K       6.460     -0.071
## Ca        Ca       2.018     -0.206
## Ba        Ba       3.369      3.369
## Fe        Fe       1.730      1.730
# Plot transformed distributions
glass_trans_long <- glass_trans %>%
  pivot_longer(cols = everything(),
               names_to = "Predictor",
               values_to = "Value")

ggplot(glass_trans_long, aes(x = Value)) +
  geom_histogram(bins = 30) +
  facet_wrap(~ Predictor, scales = "free") +
  labs(title = "Glass predictors after Yeo-Johnson transformation")

Yes, Yeo-Johnson transformation successfully reduced skewness for K, Ca, Na, and Al, making these predictors more symmetric and suitable for modeling. However, Ba, Fe, RI, and Si remained highly skewed despite transformation and may require alternative approaches like binning or different transformation methods.

3.2 a) Investigate the frequency distributions for the categorical predictors. Are any of the distributions degenerate in the ways discussed earlier in this chapter?

data(Soybean)

convert_missing <- function(x) {
  x <- as.character(x)
  x[x %in% c("?", "", "NA")] <- NA
  factor(x)
}

soy <- as.data.frame(lapply(Soybean, function(x) {
  if (is.factor(x) || is.character(x)) convert_missing(x) else x
}))

soy_x <- soy %>% select(-Class)

soy_long <- soy_x %>%
  pivot_longer(cols = everything(), names_to = "predictor", values_to = "value")

# Bar plots for all categorical predictors
ggplot(soy_long, aes(x = value, fill = is.na(value))) +
  geom_bar() +
  facet_wrap(~ predictor, scales = "free_x") +
  scale_fill_manual(values = c("steelblue", "coral"), 
                    labels = c("Observed", "Missing"),
                    name = NULL) +
  labs(title = "Soybean predictors: frequency distributions") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Frequency summary for each predictor
freq_summary <- bind_rows(lapply(names(soy_x), function(v) {
  tab <- sort(table(soy_x[[v]], useNA = "ifany"), decreasing = TRUE)
  data.frame(
    predictor = v,
    n_levels = nlevels(soy_x[[v]]),
    most_common = names(tab)[1],
    most_common_n = as.integer(tab[1]),
    most_common_prop = round(as.numeric(tab[1] / sum(tab)), 3)
  )
}))

freq_summary %>%
  arrange(desc(most_common_prop))
##          predictor n_levels most_common most_common_n most_common_prop
## 1         mycelium        2           0           639            0.936
## 2        sclerotia        2           0           625            0.915
## 3           leaves        2           1           606            0.887
## 4     int.discolor        3           0           581            0.851
## 5        leaf.malf        2           0           554            0.811
## 6            roots        3           0           551            0.807
## 7       shriveling        2           0           539            0.789
## 8        leaf.mild        3           0           535            0.783
## 9        seed.size        2           0           532            0.779
## 10     mold.growth        2           0           524            0.767
## 11         lodging        2           0           520            0.761
## 12   seed.discolor        2           0           513            0.751
## 13       ext.decay        3           0           497            0.728
## 14     leaf.shread        2           0           487            0.713
## 15            seed        2           0           476            0.697
## 16 fruiting.bodies        2           0           473            0.693
## 17          precip        3           2           459            0.672
## 18    plant.growth        2           0           441            0.646
## 19            hail        2           0           435            0.637
## 20      fruit.pods        4           0           407            0.596
## 21    stem.cankers        4           0           379            0.555
## 22            temp        3           1           374            0.548
## 23            stem        2           1           371            0.543
## 24       leaf.marg        3           0           357            0.523
## 25     plant.stand        2           0           354            0.518
## 26     fruit.spots        4           0           345            0.505
## 27       leaf.halo        3           2           342            0.501
## 28       leaf.size        3           1           327            0.479
## 29           sever        3           1           322            0.471
## 30   canker.lesion        4           0           320            0.469
## 31        seed.tmt        3           0           305            0.447
## 32        area.dam        4           1           227            0.332
## 33       crop.hist        4           2           219            0.321
## 34            germ        3           1           213            0.312
## 35            date        7           5           149            0.218
degenerate_predictors <- freq_summary %>%
  filter(n_levels == 1 | most_common_prop > 0.90)

degenerate_predictors
##   predictor n_levels most_common most_common_n most_common_prop
## 1  mycelium        2           0           639            0.936
## 2 sclerotia        2           0           625            0.915
cat("Found", nrow(degenerate_predictors), "degenerate predictors:")
## Found 2 degenerate predictors:
print(degenerate_predictors$predictor)
## [1] "mycelium"  "sclerotia"

Two predictors show degenerate distributions which are mycelium and sclerotia. Both have over 90% of observations concentrated in a single category. The visualization also reveals widespread missing data across many predictors, with some variables missing up to 20% of values.

3.2 b) Roughly18% ofthe data aremissing.Are there particularpredictorsthat are more likely to be missing? Is the pattern of missing data related to the classes?

# Percent missing by predictor
missing_percent <- sort(colMeans(is.na(soy_x)) * 100, decreasing = TRUE)
missing_percent
##            hail           sever        seed.tmt         lodging            germ 
##      17.7159590      17.7159590      17.7159590      17.7159590      16.3982430 
##       leaf.mild fruiting.bodies     fruit.spots   seed.discolor      shriveling 
##      15.8125915      15.5197657      15.5197657      15.5197657      15.5197657 
##     leaf.shread            seed     mold.growth       seed.size       leaf.halo 
##      14.6412884      13.4699854      13.4699854      13.4699854      12.2986823 
##       leaf.marg       leaf.size       leaf.malf      fruit.pods          precip 
##      12.2986823      12.2986823      12.2986823      12.2986823       5.5636896 
##    stem.cankers   canker.lesion       ext.decay        mycelium    int.discolor 
##       5.5636896       5.5636896       5.5636896       5.5636896       5.5636896 
##       sclerotia     plant.stand           roots            temp       crop.hist 
##       5.5636896       5.2708638       4.5387994       4.3923865       2.3426061 
##    plant.growth            stem            date        area.dam          leaves 
##       2.3426061       2.3426061       0.1464129       0.1464129       0.0000000
missing_df <- data.frame(
  predictor = names(missing_percent),
  missing_percent = as.numeric(missing_percent)
)


ggplot(missing_df, aes(x = reorder(predictor, missing_percent), y = missing_percent)) +
  geom_col() +
  coord_flip() +
  labs(title = "Percent missing by Soybean predictor",
       x = "Predictor", y = "Percent missing")

# Missingness by class
missing_by_class <- soy %>%
  mutate(across(-Class, ~ as.integer(is.na(.)))) %>%
  group_by(Class) %>%
  summarise(across(everything(), mean), .groups = "drop")


missing_by_class
## # A tibble: 19 × 36
##    Class   date plant.stand precip  temp  hail crop.hist area.dam sever seed.tmt
##    <fct>  <dbl>       <dbl>  <dbl> <dbl> <dbl>     <dbl>    <dbl> <dbl>    <dbl>
##  1 2-4-… 0.0625         1        1     1 1             1   0.0625 1        1    
##  2 alte… 0              0        0     0 0             0   0      0        0    
##  3 anth… 0              0        0     0 0             0   0      0        0    
##  4 bact… 0              0        0     0 0             0   0      0        0    
##  5 bact… 0              0        0     0 0             0   0      0        0    
##  6 brow… 0              0        0     0 0             0   0      0        0    
##  7 brow… 0              0        0     0 0             0   0      0        0    
##  8 char… 0              0        0     0 0             0   0      0        0    
##  9 cyst… 0              1        1     1 1             0   0      1        1    
## 10 diap… 0              0.4      0     0 1             0   0      1        1    
## 11 diap… 0              0        0     0 0             0   0      0        0    
## 12 down… 0              0        0     0 0             0   0      0        0    
## 13 frog… 0              0        0     0 0             0   0      0        0    
## 14 herb… 0              0        1     0 1             0   0      1        1    
## 15 phyl… 0              0        0     0 0             0   0      0        0    
## 16 phyt… 0              0        0     0 0.773         0   0      0.773    0.773
## 17 powd… 0              0        0     0 0             0   0      0        0    
## 18 purp… 0              0        0     0 0             0   0      0        0    
## 19 rhiz… 0              0        0     0 0             0   0      0        0    
## # ℹ 26 more variables: germ <dbl>, plant.growth <dbl>, leaves <dbl>,
## #   leaf.halo <dbl>, leaf.marg <dbl>, leaf.size <dbl>, leaf.shread <dbl>,
## #   leaf.malf <dbl>, leaf.mild <dbl>, stem <dbl>, lodging <dbl>,
## #   stem.cankers <dbl>, canker.lesion <dbl>, fruiting.bodies <dbl>,
## #   ext.decay <dbl>, mycelium <dbl>, int.discolor <dbl>, sclerotia <dbl>,
## #   fruit.pods <dbl>, fruit.spots <dbl>, seed <dbl>, mold.growth <dbl>,
## #   seed.discolor <dbl>, seed.size <dbl>, shriveling <dbl>, roots <dbl>
missing_class_tests <- bind_rows(lapply(names(soy_x), function(v) {
  tbl <- table(is.na(soy[[v]]), soy$Class)
  pval <- tryCatch(chisq.test(tbl)$p.value, error = function(e) NA_real_)
  data.frame(
    predictor = v,
    p_value = pval
  )
})) %>%
  mutate(adj_p_value = p.adjust(p_value, method = "BH")) %>%
  arrange(adj_p_value)
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
missing_class_tests
##          predictor       p_value   adj_p_value
## 1           precip 2.292469e-133 6.686367e-133
## 2             temp 2.292469e-133 6.686367e-133
## 3        crop.hist 2.292469e-133 6.686367e-133
## 4     plant.growth 2.292469e-133 6.686367e-133
## 5             stem 2.292469e-133 6.686367e-133
## 6     stem.cankers 2.292469e-133 6.686367e-133
## 7    canker.lesion 2.292469e-133 6.686367e-133
## 8        ext.decay 2.292469e-133 6.686367e-133
## 9         mycelium 2.292469e-133 6.686367e-133
## 10    int.discolor 2.292469e-133 6.686367e-133
## 11       sclerotia 2.292469e-133 6.686367e-133
## 12           roots 2.292469e-133 6.686367e-133
## 13     plant.stand 4.268778e-118 1.149286e-117
## 14            hail 6.272918e-111 1.291483e-110
## 15           sever 6.272918e-111 1.291483e-110
## 16        seed.tmt 6.272918e-111 1.291483e-110
## 17         lodging 6.272918e-111 1.291483e-110
## 18 fruiting.bodies 1.995997e-108 3.326662e-108
## 19     fruit.spots 1.995997e-108 3.326662e-108
## 20   seed.discolor 1.995997e-108 3.326662e-108
## 21      shriveling 1.995997e-108 3.326662e-108
## 22            seed 2.542290e-105 3.707506e-105
## 23     mold.growth 2.542290e-105 3.707506e-105
## 24       seed.size 2.542290e-105 3.707506e-105
## 25            germ 5.674080e-104 7.943712e-104
## 26      fruit.pods 4.553154e-103 6.129246e-103
## 27       leaf.mild 1.296453e-100 1.680588e-100
## 28     leaf.shread  1.733025e-98  2.166281e-98
## 29       leaf.halo  5.567557e-93  6.089516e-93
## 30       leaf.marg  5.567557e-93  6.089516e-93
## 31       leaf.size  5.567557e-93  6.089516e-93
## 32       leaf.malf  5.567557e-93  6.089516e-93
## 33          leaves  1.064624e-85  1.129147e-85
## 34            date  1.198859e-03  1.198859e-03
## 35        area.dam  1.198859e-03  1.198859e-03
cat("\nPredictors where missingness is significantly related to class (p < 0.05):\n")
## 
## Predictors where missingness is significantly related to class (p < 0.05):
significant_missing <- missing_class_tests %>% 
  filter(adj_p_value < 0.05)
print(significant_missing)
##          predictor       p_value   adj_p_value
## 1           precip 2.292469e-133 6.686367e-133
## 2             temp 2.292469e-133 6.686367e-133
## 3        crop.hist 2.292469e-133 6.686367e-133
## 4     plant.growth 2.292469e-133 6.686367e-133
## 5             stem 2.292469e-133 6.686367e-133
## 6     stem.cankers 2.292469e-133 6.686367e-133
## 7    canker.lesion 2.292469e-133 6.686367e-133
## 8        ext.decay 2.292469e-133 6.686367e-133
## 9         mycelium 2.292469e-133 6.686367e-133
## 10    int.discolor 2.292469e-133 6.686367e-133
## 11       sclerotia 2.292469e-133 6.686367e-133
## 12           roots 2.292469e-133 6.686367e-133
## 13     plant.stand 4.268778e-118 1.149286e-117
## 14            hail 6.272918e-111 1.291483e-110
## 15           sever 6.272918e-111 1.291483e-110
## 16        seed.tmt 6.272918e-111 1.291483e-110
## 17         lodging 6.272918e-111 1.291483e-110
## 18 fruiting.bodies 1.995997e-108 3.326662e-108
## 19     fruit.spots 1.995997e-108 3.326662e-108
## 20   seed.discolor 1.995997e-108 3.326662e-108
## 21      shriveling 1.995997e-108 3.326662e-108
## 22            seed 2.542290e-105 3.707506e-105
## 23     mold.growth 2.542290e-105 3.707506e-105
## 24       seed.size 2.542290e-105 3.707506e-105
## 25            germ 5.674080e-104 7.943712e-104
## 26      fruit.pods 4.553154e-103 6.129246e-103
## 27       leaf.mild 1.296453e-100 1.680588e-100
## 28     leaf.shread  1.733025e-98  2.166281e-98
## 29       leaf.halo  5.567557e-93  6.089516e-93
## 30       leaf.marg  5.567557e-93  6.089516e-93
## 31       leaf.size  5.567557e-93  6.089516e-93
## 32       leaf.malf  5.567557e-93  6.089516e-93
## 33          leaves  1.064624e-85  1.129147e-85
## 34            date  1.198859e-03  1.198859e-03
## 35        area.dam  1.198859e-03  1.198859e-03

Yes, some predictors are more likely to be missing than others. The highest missingness is in sever, seed.tmt, lodging, hail, and germ, each with about 15%–18% missing values, while some predictors have very little or no missing data. The results also suggest that missingness is related to disease class rather than completely random. After adjusting for multiple tests, all predictors had significant p-values, indicating that the pattern of missing data differs across classes. This suggests the missingness is systematic and may contain information about the response. However, the chi-square warnings mean these results should be interpreted with some caution because some expected cell counts were small.

3.2 c) Develop a strategy for handling missing data, either by eliminating predictors or imputation.

drop_vars <- names(missing_percent[missing_percent > 40])
drop_vars
## character(0)
soy_reduced <- soy %>%
  select(-all_of(drop_vars))

# mode imputation helper
mode_value <- function(x) {
  x <- na.omit(x)
  if (length(x) == 0) return(NA)
  names(sort(table(x), decreasing = TRUE))[1]
}

# Smode impute within class
impute_mode_by_class <- function(data, class_col = "Class") {
  out <- data
  predictors <- setdiff(names(out), class_col)

  for (v in predictors) {
    if (anyNA(out[[v]])) {
      out[[v]] <- as.character(out[[v]])

      for (cl in levels(out[[class_col]])) {
        idx <- out[[class_col]] == cl
        fill_val <- mode_value(out[idx, v])
        out[idx & is.na(out[[v]]), v] <- fill_val
      }

      out[[v]] <- factor(out[[v]])
    }
  }

  out
}

soy_imputed <- impute_mode_by_class(soy_reduced, class_col = "Class")

colSums(is.na(soy_imputed))
##           Class            date     plant.stand          precip            temp 
##               0               0              30              38              30 
##            hail       crop.hist        area.dam           sever        seed.tmt 
##              53              16               0              53              53 
##            germ    plant.growth          leaves       leaf.halo       leaf.marg 
##              38              16               0              29              29 
##       leaf.size     leaf.shread       leaf.malf       leaf.mild            stem 
##              29              45              29              53              16 
##         lodging    stem.cankers   canker.lesion fruiting.bodies       ext.decay 
##              53              38              38              38              38 
##        mycelium    int.discolor       sclerotia      fruit.pods     fruit.spots 
##              38              38              38              16              38 
##            seed     mold.growth   seed.discolor       seed.size      shriveling 
##              24              24              38              24              38 
##           roots 
##              31