library(tidyverse)
library(mlbench)
data(Glass)
glimpse(Glass)
## Rows: 214
## Columns: 10
## $ RI <dbl> 1.52101, 1.51761, 1.51618, 1.51766, 1.51742, 1.51596, 1.51743, 1.…
## $ Na <dbl> 13.64, 13.89, 13.53, 13.21, 13.27, 12.79, 13.30, 13.15, 14.04, 13…
## $ Mg <dbl> 4.49, 3.60, 3.55, 3.69, 3.62, 3.61, 3.60, 3.61, 3.58, 3.60, 3.46,…
## $ Al <dbl> 1.10, 1.36, 1.54, 1.29, 1.24, 1.62, 1.14, 1.05, 1.37, 1.36, 1.56,…
## $ Si <dbl> 71.78, 72.73, 72.99, 72.61, 73.08, 72.97, 73.09, 73.24, 72.08, 72…
## $ K <dbl> 0.06, 0.48, 0.39, 0.57, 0.55, 0.64, 0.58, 0.57, 0.56, 0.57, 0.67,…
## $ Ca <dbl> 8.75, 7.83, 7.78, 8.22, 8.07, 8.07, 8.17, 8.24, 8.30, 8.40, 8.09,…
## $ Ba <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Fe <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.26, 0.00, 0.00, 0.00, 0.11, 0.24,…
## $ Type <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
#a
glass_long <- Glass %>%
mutate(row_id = row_number()) %>%
pivot_longer(cols = -c(Type, row_id), names_to = "predictor", values_to = "value")
ggplot(glass_long, aes(x = value)) +
geom_histogram(bins = 30) +
facet_wrap(~ predictor, scales = "free") +
labs(title = "Glass predictors: distributions")
Based on the histograms, we can observe that Al, Ca, Na, Rl are right-skewed, while Si is left-skewed.
ggplot(glass_long, aes(x = Type, y = value)) +
geom_boxplot(outlier.alpha = 0.6) +
facet_wrap(~ predictor, scales = "free") +
labs(title = "Glass predictors by class", x = "Type", y = NULL)
The visualizations show that the predictors vary widely in scale and distribution, and no single variable clearly separates the glass types on its own.
#b) Skewness
skewness <- function(x) {
x <- x[is.finite(x)]
m <- mean(x); s <- sd(x)
if (s == 0) return(0)
mean(((x - m) / s)^3)
}
skew_tbl <- Glass %>%
select(-Type) %>%
summarise(across(everything(), skewness)) %>%
pivot_longer(everything(), names_to = "predictor", values_to = "skewness") %>%
arrange(desc(abs(skewness)))
skew_tbl
## # A tibble: 9 × 2
## predictor skewness
## <chr> <dbl>
## 1 K 6.46
## 2 Ba 3.37
## 3 Ca 2.02
## 4 Fe 1.73
## 5 RI 1.60
## 6 Mg -1.14
## 7 Al 0.895
## 8 Si -0.720
## 9 Na 0.448
K, Ba and Ca are the top 3 predictors with positive skewness.
# outliers table
outlier_tbl <- Glass %>%
select(-Type) %>%
summarise(across(everything(), ~{
q1 <- quantile(.x, 0.25, na.rm = TRUE)
q3 <- quantile(.x, 0.75, na.rm = TRUE)
iqr <- q3 - q1
lo <- q1 - 1.5 * iqr
hi <- q3 + 1.5 * iqr
sum(.x < lo | .x > hi, na.rm = TRUE)
})) %>%
pivot_longer(everything(), names_to = "predictor", values_to = "n_outliers") %>%
arrange(desc(n_outliers))
outlier_tbl
## # A tibble: 9 × 2
## predictor n_outliers
## <chr> <int>
## 1 Ba 38
## 2 Ca 26
## 3 Al 18
## 4 RI 17
## 5 Si 12
## 6 Fe 12
## 7 Na 7
## 8 K 7
## 9 Mg 0
Ba, Ca and Al are the top 3 predictors with the most outliers. Such distributional issues could negatively affect models that rely on distance measures or normality assumptions.
#c) Transformation
Glass_trans <- Glass %>%
mutate(
K_log1p = log1p(K),
Ba_log1p = log1p(Ba),
Fe_log1p = log1p(Fe)
) %>%
select(-K, -Ba, -Fe)
#compare before and after
before_after <- Glass %>%
select(Type, K, Ba, Fe) %>%
pivot_longer(-Type, names_to = "predictor", values_to = "value") %>%
mutate(version = "raw") %>%
bind_rows(
Glass %>%
transmute(Type,
K = log1p(K),
Ba = log1p(Ba),
Fe = log1p(Fe)) %>%
pivot_longer(-Type, names_to = "predictor", values_to = "value") %>%
mutate(version = "log1p")
)
ggplot(before_after, aes(x = value)) +
geom_histogram(bins = 30) +
facet_grid(version ~ predictor, scales = "free") +
labs(title = "Raw vs log1p transforms for K, Ba, Fe")
Transformations such as log1p or Yeo–Johnson for skewed predictors like Ba, Fe, and K are likely to improve model stability and performance.
data(Soybean)
#a. Frequency
soy_long <- Soybean %>%
mutate(across(-Class, as.character)) %>% # unify types
pivot_longer(
cols = -Class,
names_to = "predictor",
values_to = "value"
)
glimpse(soy_long)
## Rows: 23,905
## Columns: 3
## $ Class <fct> diaporthe-stem-canker, diaporthe-stem-canker, diaporthe-stem…
## $ predictor <chr> "date", "plant.stand", "precip", "temp", "hail", "crop.hist"…
## $ value <chr> "6", "0", "2", "1", "0", "1", "1", "1", "0", "0", "1", "1", …
#frequency table
freq_tbl <- soy_long %>%
count(predictor, value) %>%
group_by(predictor) %>%
mutate(prop = n / sum(n)) %>%
arrange(predictor, desc(prop))
freq_tbl
## # A tibble: 133 × 4
## # Groups: predictor [35]
## predictor value n prop
## <chr> <chr> <int> <dbl>
## 1 area.dam 1 227 0.332
## 2 area.dam 3 187 0.274
## 3 area.dam 2 145 0.212
## 4 area.dam 0 123 0.180
## 5 area.dam <NA> 1 0.00146
## 6 canker.lesion 0 320 0.469
## 7 canker.lesion 2 177 0.259
## 8 canker.lesion 1 83 0.122
## 9 canker.lesion 3 65 0.0952
## 10 canker.lesion <NA> 38 0.0556
## # ℹ 123 more rows
The categorical predictors show clear imbalance in their category frequencies. For example, area.dam is spread across multiple levels, but still unevenly: level 1 accounts for 33.2% (227/683) of observations, followed by level 3 at 27.4%, level 2 at 21.2%, and level 0 at 18.0%, with missing values occurring in only 0.15% of cases.
#b.missing data analysis
missing_pred <- Soybean %>%
summarise(across(everything(), ~ mean(is.na(.x)))) %>%
pivot_longer(everything(), names_to = "predictor", values_to = "pct_missing") %>%
arrange(desc(pct_missing))
missing_pred
## # A tibble: 36 × 2
## predictor pct_missing
## <chr> <dbl>
## 1 hail 0.177
## 2 sever 0.177
## 3 seed.tmt 0.177
## 4 lodging 0.177
## 5 germ 0.164
## 6 leaf.mild 0.158
## 7 fruiting.bodies 0.155
## 8 fruit.spots 0.155
## 9 seed.discolor 0.155
## 10 shriveling 0.155
## # ℹ 26 more rows
drop_preds <- missing_pred %>%
filter(pct_missing > 0.40) %>%
pull(predictor)
Soybean_reduced <- Soybean %>%
select(-all_of(drop_preds))
# missing by class
missing_class <- Soybean %>%
mutate(row_missing = rowSums(is.na(across(-Class)))) %>%
group_by(Class) %>%
summarise(
n = n(),
avg_missing = mean(row_missing),
pct_any_missing = mean(row_missing > 0)
) %>%
arrange(desc(avg_missing))
missing_class
## # A tibble: 19 × 4
## Class n avg_missing pct_any_missing
## <fct> <int> <dbl> <dbl>
## 1 2-4-d-injury 16 28.1 1
## 2 cyst-nematode 14 24 1
## 3 herbicide-injury 8 20 1
## 4 phytophthora-rot 88 13.8 0.773
## 5 diaporthe-pod-&-stem-blight 15 11.8 1
## 6 alternarialeaf-spot 91 0 0
## 7 anthracnose 44 0 0
## 8 bacterial-blight 20 0 0
## 9 bacterial-pustule 20 0 0
## 10 brown-spot 92 0 0
## 11 brown-stem-rot 44 0 0
## 12 charcoal-rot 20 0 0
## 13 diaporthe-stem-canker 20 0 0
## 14 downy-mildew 20 0 0
## 15 frog-eye-leaf-spot 91 0 0
## 16 phyllosticta-leaf-spot 20 0 0
## 17 powdery-mildew 20 0 0
## 18 purple-seed-stain 20 0 0
## 19 rhizoctonia-root-rot 20 0 0
2-4-d-injury has the highest average number of missing predictors at 28.1 missing values per observation (n = 16), followed by cyst-nematode with 24.0 missing values (n = 14) and herbicide-injury with 20.0 missing values (n = 8). This strong class-dependent pattern indicates that missing data are not missing completely at random.
#c. Handling missing data
mode_impute <- function(x) {
ux <- na.omit(x)
if (length(ux) == 0) return(x)
m <- names(sort(table(ux), decreasing = TRUE))[1]
replace(x, is.na(x), m)
}
Soybean_imputed <- Soybean_reduced %>%
mutate(across(-Class, mode_impute))
colSums(is.na(Soybean_imputed))
## Class date plant.stand precip temp
## 0 0 0 0 0
## hail crop.hist area.dam sever seed.tmt
## 0 0 0 0 0
## germ plant.growth leaves leaf.halo leaf.marg
## 0 0 0 0 0
## leaf.size leaf.shread leaf.malf leaf.mild stem
## 0 0 0 0 0
## lodging stem.cankers canker.lesion fruiting.bodies ext.decay
## 0 0 0 0 0
## mycelium int.discolor sclerotia fruit.pods fruit.spots
## 0 0 0 0 0
## seed mold.growth seed.discolor seed.size shriveling
## 0 0 0 0 0
## roots
## 0
glimpse(Soybean_imputed)
## Rows: 683
## Columns: 36
## $ Class <fct> diaporthe-stem-canker, diaporthe-stem-canker, diaporth…
## $ date <fct> 6, 4, 3, 3, 6, 5, 5, 4, 6, 4, 6, 4, 3, 6, 6, 5, 6, 4, …
## $ plant.stand <ord> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ precip <ord> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ temp <ord> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 2, 2, 1, …
## $ hail <fct> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, …
## $ crop.hist <fct> 1, 2, 1, 1, 2, 3, 2, 1, 3, 2, 1, 1, 1, 3, 1, 3, 0, 2, …
## $ area.dam <fct> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 2, 3, 3, 3, 2, 2, …
## $ sever <fct> 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ seed.tmt <fct> 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, …
## $ germ <ord> 0, 1, 2, 1, 2, 1, 0, 2, 1, 2, 0, 1, 0, 0, 1, 2, 0, 1, …
## $ plant.growth <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ leaves <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ leaf.halo <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ leaf.marg <fct> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ leaf.size <ord> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ leaf.shread <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ leaf.malf <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ leaf.mild <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ stem <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ lodging <fct> 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, …
## $ stem.cankers <fct> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ canker.lesion <fct> 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, …
## $ fruiting.bodies <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ext.decay <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ mycelium <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ int.discolor <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ sclerotia <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ fruit.pods <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ fruit.spots <fct> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, …
## $ seed <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ mold.growth <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ seed.discolor <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ seed.size <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ shriveling <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ roots <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
Classes such as 2-4-d-injury (n = 16) and cyst-nematode (n = 14) have very high average missingness, with 28.1 and 24.0 missing predictors per observation.
This clear separation indicates that missingness is strongly related to the disease class and is not missing completely at random.