library(tidyverse)
library(mlbench)
data(Glass)
glimpse(Glass)
## Rows: 214
## Columns: 10
## $ RI   <dbl> 1.52101, 1.51761, 1.51618, 1.51766, 1.51742, 1.51596, 1.51743, 1.…
## $ Na   <dbl> 13.64, 13.89, 13.53, 13.21, 13.27, 12.79, 13.30, 13.15, 14.04, 13…
## $ Mg   <dbl> 4.49, 3.60, 3.55, 3.69, 3.62, 3.61, 3.60, 3.61, 3.58, 3.60, 3.46,…
## $ Al   <dbl> 1.10, 1.36, 1.54, 1.29, 1.24, 1.62, 1.14, 1.05, 1.37, 1.36, 1.56,…
## $ Si   <dbl> 71.78, 72.73, 72.99, 72.61, 73.08, 72.97, 73.09, 73.24, 72.08, 72…
## $ K    <dbl> 0.06, 0.48, 0.39, 0.57, 0.55, 0.64, 0.58, 0.57, 0.56, 0.57, 0.67,…
## $ Ca   <dbl> 8.75, 7.83, 7.78, 8.22, 8.07, 8.07, 8.17, 8.24, 8.30, 8.40, 8.09,…
## $ Ba   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Fe   <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.26, 0.00, 0.00, 0.00, 0.11, 0.24,…
## $ Type <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…

3.1

#a
glass_long <- Glass %>%
  mutate(row_id = row_number()) %>%
  pivot_longer(cols = -c(Type, row_id), names_to = "predictor", values_to = "value")
ggplot(glass_long, aes(x = value)) +
  geom_histogram(bins = 30) +
  facet_wrap(~ predictor, scales = "free") +
  labs(title = "Glass predictors: distributions")

Based on the histograms, we can observe that Al, Ca, Na, Rl are right-skewed, while Si is left-skewed.

ggplot(glass_long, aes(x = Type, y = value)) +
  geom_boxplot(outlier.alpha = 0.6) +
  facet_wrap(~ predictor, scales = "free") +
  labs(title = "Glass predictors by class", x = "Type", y = NULL)

The visualizations show that the predictors vary widely in scale and distribution, and no single variable clearly separates the glass types on its own.

#b) Skewness
skewness <- function(x) {
  x <- x[is.finite(x)]
  m <- mean(x); s <- sd(x)
  if (s == 0) return(0)
  mean(((x - m) / s)^3)
}

skew_tbl <- Glass %>%
  select(-Type) %>%
  summarise(across(everything(), skewness)) %>%
  pivot_longer(everything(), names_to = "predictor", values_to = "skewness") %>%
  arrange(desc(abs(skewness)))

skew_tbl
## # A tibble: 9 × 2
##   predictor skewness
##   <chr>        <dbl>
## 1 K            6.46 
## 2 Ba           3.37 
## 3 Ca           2.02 
## 4 Fe           1.73 
## 5 RI           1.60 
## 6 Mg          -1.14 
## 7 Al           0.895
## 8 Si          -0.720
## 9 Na           0.448

K, Ba and Ca are the top 3 predictors with positive skewness.

# outliers table
outlier_tbl <- Glass %>%
  select(-Type) %>%
  summarise(across(everything(), ~{
    q1 <- quantile(.x, 0.25, na.rm = TRUE)
    q3 <- quantile(.x, 0.75, na.rm = TRUE)
    iqr <- q3 - q1
    lo <- q1 - 1.5 * iqr
    hi <- q3 + 1.5 * iqr
    sum(.x < lo | .x > hi, na.rm = TRUE)
  })) %>%
  pivot_longer(everything(), names_to = "predictor", values_to = "n_outliers") %>%
  arrange(desc(n_outliers))

outlier_tbl
## # A tibble: 9 × 2
##   predictor n_outliers
##   <chr>          <int>
## 1 Ba                38
## 2 Ca                26
## 3 Al                18
## 4 RI                17
## 5 Si                12
## 6 Fe                12
## 7 Na                 7
## 8 K                  7
## 9 Mg                 0

Ba, Ca and Al are the top 3 predictors with the most outliers. Such distributional issues could negatively affect models that rely on distance measures or normality assumptions.

#c) Transformation
Glass_trans <- Glass %>%
  mutate(
    K_log1p  = log1p(K),
    Ba_log1p = log1p(Ba),
    Fe_log1p = log1p(Fe)
  ) %>%
  select(-K, -Ba, -Fe)

#compare before and after 
before_after <- Glass %>%
  select(Type, K, Ba, Fe) %>%
  pivot_longer(-Type, names_to = "predictor", values_to = "value") %>%
  mutate(version = "raw") %>%
  bind_rows(
    Glass %>%
      transmute(Type,
                K = log1p(K),
                Ba = log1p(Ba),
                Fe = log1p(Fe)) %>%
      pivot_longer(-Type, names_to = "predictor", values_to = "value") %>%
      mutate(version = "log1p")
  )
ggplot(before_after, aes(x = value)) +
  geom_histogram(bins = 30) +
  facet_grid(version ~ predictor, scales = "free") +
  labs(title = "Raw vs log1p transforms for K, Ba, Fe")

Transformations such as log1p or Yeo–Johnson for skewed predictors like Ba, Fe, and K are likely to improve model stability and performance.

3.2

data(Soybean)
#a. Frequency
soy_long <- Soybean %>%
  mutate(across(-Class, as.character)) %>%   # unify types
  pivot_longer(
    cols = -Class,
    names_to = "predictor",
    values_to = "value"
  )

glimpse(soy_long)
## Rows: 23,905
## Columns: 3
## $ Class     <fct> diaporthe-stem-canker, diaporthe-stem-canker, diaporthe-stem…
## $ predictor <chr> "date", "plant.stand", "precip", "temp", "hail", "crop.hist"…
## $ value     <chr> "6", "0", "2", "1", "0", "1", "1", "1", "0", "0", "1", "1", …
#frequency table
freq_tbl <- soy_long %>%
  count(predictor, value) %>%
  group_by(predictor) %>%
  mutate(prop = n / sum(n)) %>%
  arrange(predictor, desc(prop))

freq_tbl
## # A tibble: 133 × 4
## # Groups:   predictor [35]
##    predictor     value     n    prop
##    <chr>         <chr> <int>   <dbl>
##  1 area.dam      1       227 0.332  
##  2 area.dam      3       187 0.274  
##  3 area.dam      2       145 0.212  
##  4 area.dam      0       123 0.180  
##  5 area.dam      <NA>      1 0.00146
##  6 canker.lesion 0       320 0.469  
##  7 canker.lesion 2       177 0.259  
##  8 canker.lesion 1        83 0.122  
##  9 canker.lesion 3        65 0.0952 
## 10 canker.lesion <NA>     38 0.0556 
## # ℹ 123 more rows

The categorical predictors show clear imbalance in their category frequencies. For example, area.dam is spread across multiple levels, but still unevenly: level 1 accounts for 33.2% (227/683) of observations, followed by level 3 at 27.4%, level 2 at 21.2%, and level 0 at 18.0%, with missing values occurring in only 0.15% of cases.

#b.missing data analysis
missing_pred <- Soybean %>%
  summarise(across(everything(), ~ mean(is.na(.x)))) %>%
  pivot_longer(everything(), names_to = "predictor", values_to = "pct_missing") %>%
  arrange(desc(pct_missing))

missing_pred
## # A tibble: 36 × 2
##    predictor       pct_missing
##    <chr>                 <dbl>
##  1 hail                  0.177
##  2 sever                 0.177
##  3 seed.tmt              0.177
##  4 lodging               0.177
##  5 germ                  0.164
##  6 leaf.mild             0.158
##  7 fruiting.bodies       0.155
##  8 fruit.spots           0.155
##  9 seed.discolor         0.155
## 10 shriveling            0.155
## # ℹ 26 more rows
drop_preds <- missing_pred %>%
  filter(pct_missing > 0.40) %>%
  pull(predictor)

Soybean_reduced <- Soybean %>%
  select(-all_of(drop_preds))
# missing by class
missing_class <- Soybean %>%
  mutate(row_missing = rowSums(is.na(across(-Class)))) %>%
  group_by(Class) %>%
  summarise(
    n = n(),
    avg_missing = mean(row_missing),
    pct_any_missing = mean(row_missing > 0)
  ) %>%
  arrange(desc(avg_missing))

missing_class
## # A tibble: 19 × 4
##    Class                           n avg_missing pct_any_missing
##    <fct>                       <int>       <dbl>           <dbl>
##  1 2-4-d-injury                   16        28.1           1    
##  2 cyst-nematode                  14        24             1    
##  3 herbicide-injury                8        20             1    
##  4 phytophthora-rot               88        13.8           0.773
##  5 diaporthe-pod-&-stem-blight    15        11.8           1    
##  6 alternarialeaf-spot            91         0             0    
##  7 anthracnose                    44         0             0    
##  8 bacterial-blight               20         0             0    
##  9 bacterial-pustule              20         0             0    
## 10 brown-spot                     92         0             0    
## 11 brown-stem-rot                 44         0             0    
## 12 charcoal-rot                   20         0             0    
## 13 diaporthe-stem-canker          20         0             0    
## 14 downy-mildew                   20         0             0    
## 15 frog-eye-leaf-spot             91         0             0    
## 16 phyllosticta-leaf-spot         20         0             0    
## 17 powdery-mildew                 20         0             0    
## 18 purple-seed-stain              20         0             0    
## 19 rhizoctonia-root-rot           20         0             0

2-4-d-injury has the highest average number of missing predictors at 28.1 missing values per observation (n = 16), followed by cyst-nematode with 24.0 missing values (n = 14) and herbicide-injury with 20.0 missing values (n = 8). This strong class-dependent pattern indicates that missing data are not missing completely at random.

#c. Handling missing data
mode_impute <- function(x) {
  ux <- na.omit(x)
  if (length(ux) == 0) return(x)
  m <- names(sort(table(ux), decreasing = TRUE))[1]
  replace(x, is.na(x), m)
}

Soybean_imputed <- Soybean_reduced %>%
  mutate(across(-Class, mode_impute))
colSums(is.na(Soybean_imputed))
##           Class            date     plant.stand          precip            temp 
##               0               0               0               0               0 
##            hail       crop.hist        area.dam           sever        seed.tmt 
##               0               0               0               0               0 
##            germ    plant.growth          leaves       leaf.halo       leaf.marg 
##               0               0               0               0               0 
##       leaf.size     leaf.shread       leaf.malf       leaf.mild            stem 
##               0               0               0               0               0 
##         lodging    stem.cankers   canker.lesion fruiting.bodies       ext.decay 
##               0               0               0               0               0 
##        mycelium    int.discolor       sclerotia      fruit.pods     fruit.spots 
##               0               0               0               0               0 
##            seed     mold.growth   seed.discolor       seed.size      shriveling 
##               0               0               0               0               0 
##           roots 
##               0
glimpse(Soybean_imputed)
## Rows: 683
## Columns: 36
## $ Class           <fct> diaporthe-stem-canker, diaporthe-stem-canker, diaporth…
## $ date            <fct> 6, 4, 3, 3, 6, 5, 5, 4, 6, 4, 6, 4, 3, 6, 6, 5, 6, 4, …
## $ plant.stand     <ord> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ precip          <ord> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ temp            <ord> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 2, 2, 1, …
## $ hail            <fct> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, …
## $ crop.hist       <fct> 1, 2, 1, 1, 2, 3, 2, 1, 3, 2, 1, 1, 1, 3, 1, 3, 0, 2, …
## $ area.dam        <fct> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 2, 3, 3, 3, 2, 2, …
## $ sever           <fct> 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ seed.tmt        <fct> 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, …
## $ germ            <ord> 0, 1, 2, 1, 2, 1, 0, 2, 1, 2, 0, 1, 0, 0, 1, 2, 0, 1, …
## $ plant.growth    <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ leaves          <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ leaf.halo       <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ leaf.marg       <fct> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ leaf.size       <ord> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ leaf.shread     <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ leaf.malf       <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ leaf.mild       <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ stem            <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ lodging         <fct> 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, …
## $ stem.cankers    <fct> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ canker.lesion   <fct> 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, …
## $ fruiting.bodies <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ext.decay       <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ mycelium        <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ int.discolor    <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ sclerotia       <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ fruit.pods      <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fruit.spots     <fct> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, …
## $ seed            <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ mold.growth     <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ seed.discolor   <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ seed.size       <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ shriveling      <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ roots           <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …

Classes such as 2-4-d-injury (n = 16) and cyst-nematode (n = 14) have very high average missingness, with 28.1 and 24.0 missing predictors per observation.

This clear separation indicates that missingness is strongly related to the disease class and is not missing completely at random.