library(mlbench)
library(caret)
library(corrplot)
library(moments)
data(Glass)
str(Glass)
## 'data.frame': 214 obs. of 10 variables:
## $ RI : num 1.52 1.52 1.52 1.52 1.52 ...
## $ Na : num 13.6 13.9 13.5 13.2 13.3 ...
## $ Mg : num 4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
## $ Al : num 1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
## $ Si : num 71.8 72.7 73 72.6 73.1 ...
## $ K : num 0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
## $ Ca : num 8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
## $ Ba : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Fe : num 0 0 0 0 0 0.26 0 0 0 0.11 ...
## $ Type: Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...
summary(Glass[,1:9])
## RI Na Mg Al
## Min. :1.511 Min. :10.73 Min. :0.000 Min. :0.290
## 1st Qu.:1.517 1st Qu.:12.91 1st Qu.:2.115 1st Qu.:1.190
## Median :1.518 Median :13.30 Median :3.480 Median :1.360
## Mean :1.518 Mean :13.41 Mean :2.685 Mean :1.445
## 3rd Qu.:1.519 3rd Qu.:13.82 3rd Qu.:3.600 3rd Qu.:1.630
## Max. :1.534 Max. :17.38 Max. :4.490 Max. :3.500
## Si K Ca Ba
## Min. :69.81 Min. :0.0000 Min. : 5.430 Min. :0.000
## 1st Qu.:72.28 1st Qu.:0.1225 1st Qu.: 8.240 1st Qu.:0.000
## Median :72.79 Median :0.5550 Median : 8.600 Median :0.000
## Mean :72.65 Mean :0.4971 Mean : 8.957 Mean :0.175
## 3rd Qu.:73.09 3rd Qu.:0.6100 3rd Qu.: 9.172 3rd Qu.:0.000
## Max. :75.41 Max. :6.2100 Max. :16.190 Max. :3.150
## Fe
## Min. :0.00000
## 1st Qu.:0.00000
## Median :0.00000
## Mean :0.05701
## 3rd Qu.:0.10000
## Max. :0.51000
par(mfrow = c(3, 3))
for(i in 1:9) {
hist(Glass[,i], main = names(Glass)[i],
xlab = names(Glass)[i], col = "lightblue", breaks = 20)
}
par(mfrow = c(3, 3))
for(i in 1:9) {
boxplot(Glass[,i] ~ Glass$Type,
main = names(Glass)[i],
xlab = "Type", ylab = names(Glass)[i])
}
cor_matrix <- cor(Glass[, 1:9])
corrplot(cor_matrix, method = "color", type = "upper",
tl.col = "black", tl.srt = 45)
Ca and RI show moderate positive correlation. Most other variables are weakly correlated.
skew_vals <- apply(Glass[,1:9], 2, skewness)
skew_df <- data.frame(
Variable = names(skew_vals),
Skewness = round(skew_vals, 3)
)
print(skew_df)
## Variable Skewness
## RI RI 1.614
## Na Na 0.451
## Mg Mg -1.144
## Al Al 0.901
## Si Si -0.725
## K K 6.506
## Ca Ca 2.033
## Ba Ba 3.392
## Fe Fe 1.742
high_skew <- names(skew_vals[abs(skew_vals) > 1])
cat("\nVariables with high skewness (|skew| > 1):", paste(high_skew, collapse = ", "))
##
## Variables with high skewness (|skew| > 1): RI, Mg, K, Ca, Ba, Fe
outlier_summary <- data.frame(
Variable = names(Glass)[1:9],
Outliers = sapply(Glass[,1:9], function(x) {
Q1 <- quantile(x, 0.25)
Q3 <- quantile(x, 0.75)
IQR_val <- Q3 - Q1
sum(x < (Q1 - 1.5 * IQR_val) | x > (Q3 + 1.5 * IQR_val))
})
)
print(outlier_summary)
## Variable Outliers
## RI RI 17
## Na Na 7
## Mg Mg 0
## Al Al 18
## Si Si 12
## K K 7
## Ca Ca 26
## Ba Ba 38
## Fe Fe 12
RI, Mg, K, Ca, Ba, and Fe all show high skewness (|skewness| > 1). K and Ba are particularly severe with skewness values over 3, and both have multiple outliers.
for(var in high_skew) {
x <- Glass[,var]
if(min(x) <= 0) {
x <- x + abs(min(x)) + 1
}
bc_trans <- BoxCoxTrans(x)
transformed <- predict(bc_trans, x)
cat(sprintf("\n%s:\n", var))
cat(sprintf(" Original skewness: %.3f\n", skewness(x)))
cat(sprintf(" Box-Cox lambda: %.3f\n", bc_trans$lambda))
cat(sprintf(" Transformed skewness: %.3f\n", skewness(transformed)))
}
##
## RI:
## Original skewness: 1.614
## Box-Cox lambda: -2.000
## Transformed skewness: 1.577
##
## Mg:
## Original skewness: -1.144
## Box-Cox lambda: 2.000
## Transformed skewness: -0.934
##
## K:
## Original skewness: 6.506
## Box-Cox lambda: -1.000
## Transformed skewness: -0.088
##
## Ca:
## Original skewness: 2.033
## Box-Cox lambda: -1.100
## Transformed skewness: -0.195
##
## Ba:
## Original skewness: 3.392
## Box-Cox lambda: -2.000
## Transformed skewness: 2.151
##
## Fe:
## Original skewness: 1.742
## Box-Cox lambda: -2.000
## Transformed skewness: 1.336
par(mfrow = c(2, length(high_skew)))
for(var in high_skew) {
x <- Glass[,var]
if(min(x) <= 0) x <- x + abs(min(x)) + 1
bc_trans <- BoxCoxTrans(x)
hist(Glass[,var], main = paste("Original", var),
breaks = 20, col = "lightblue")
hist(predict(bc_trans, x), main = paste("Transformed", var),
breaks = 20, col = "lightgreen")
}
Several preprocessing steps would help improve model performance with this dataset. First, all predictors should be centered and scaled to account for their different measurement units. The K, Ba, and Fe variables would benefit from Box-Cox transformations given their severe skewness. It’s also worth monitoring the moderate correlation between Ca and RI, as this could lead to multicollinearity issues. Finally, if outliers continue to be problematic after applying these transformations, a spatial sign transformation could be used to further reduce their impact.
preproc <- preProcess(Glass[,1:9], method = c('BoxCox', 'center', 'scale'))
glass_processed <- predict(preproc, Glass[,1:9])
library(VIM)
library(dplyr)
data(Soybean)
str(Soybean)
## 'data.frame': 683 obs. of 36 variables:
## $ Class : Factor w/ 19 levels "2-4-d-injury",..: 11 11 11 11 11 11 11 11 11 11 ...
## $ date : Factor w/ 7 levels "0","1","2","3",..: 7 5 4 4 7 6 6 5 7 5 ...
## $ plant.stand : Ord.factor w/ 2 levels "0"<"1": 1 1 1 1 1 1 1 1 1 1 ...
## $ precip : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
## $ temp : Ord.factor w/ 3 levels "0"<"1"<"2": 2 2 2 2 2 2 2 2 2 2 ...
## $ hail : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 1 1 ...
## $ crop.hist : Factor w/ 4 levels "0","1","2","3": 2 3 2 2 3 4 3 2 4 3 ...
## $ area.dam : Factor w/ 4 levels "0","1","2","3": 2 1 1 1 1 1 1 1 1 1 ...
## $ sever : Factor w/ 3 levels "0","1","2": 2 3 3 3 2 2 2 2 2 3 ...
## $ seed.tmt : Factor w/ 3 levels "0","1","2": 1 2 2 1 1 1 2 1 2 1 ...
## $ germ : Ord.factor w/ 3 levels "0"<"1"<"2": 1 2 3 2 3 2 1 3 2 3 ...
## $ plant.growth : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ leaves : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ leaf.halo : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ leaf.marg : Factor w/ 3 levels "0","1","2": 3 3 3 3 3 3 3 3 3 3 ...
## $ leaf.size : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
## $ leaf.shread : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ leaf.malf : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ leaf.mild : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ stem : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ lodging : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 2 1 1 1 ...
## $ stem.cankers : Factor w/ 4 levels "0","1","2","3": 4 4 4 4 4 4 4 4 4 4 ...
## $ canker.lesion : Factor w/ 4 levels "0","1","2","3": 2 2 1 1 2 1 2 2 2 2 ...
## $ fruiting.bodies: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ ext.decay : Factor w/ 3 levels "0","1","2": 2 2 2 2 2 2 2 2 2 2 ...
## $ mycelium : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ int.discolor : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ sclerotia : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ fruit.pods : Factor w/ 4 levels "0","1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
## $ fruit.spots : Factor w/ 4 levels "0","1","2","4": 4 4 4 4 4 4 4 4 4 4 ...
## $ seed : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ mold.growth : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ seed.discolor : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ seed.size : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ shriveling : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ roots : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
freq_summary <- data.frame(
Variable = character(),
Unique_Levels = integer(),
Dominant_Percent = numeric(),
Status = character(),
stringsAsFactors = FALSE
)
degenerate_vars <- c()
for(i in 1:(ncol(Soybean)-1)) {
var_name <- names(Soybean)[i]
if(is.factor(Soybean[,i])) {
freq_table <- table(Soybean[,i], useNA = "no")
prop_table <- prop.table(freq_table)
unique_levels <- length(freq_table)
dominant_pct <- max(prop_table) * 100
status <- "OK"
if(unique_levels == 1) {
status <- "Zero Variance"
degenerate_vars <- c(degenerate_vars, var_name)
} else if(dominant_pct > 95) {
status <- "Near-Zero Variance"
degenerate_vars <- c(degenerate_vars, var_name)
}
freq_summary <- rbind(freq_summary,
data.frame(
Variable = var_name,
Unique_Levels = unique_levels,
Dominant_Percent = round(dominant_pct, 2),
Status = status
))
}
}
print(freq_summary)
## Variable Unique_Levels Dominant_Percent Status
## 1 Class 19 13.47 OK
## 2 date 7 21.85 OK
## 3 plant.stand 2 54.71 OK
## 4 precip 3 71.16 OK
## 5 temp 3 57.27 OK
## 6 hail 2 77.40 OK
## 7 crop.hist 4 32.83 OK
## 8 area.dam 4 33.28 OK
## 9 sever 3 57.30 OK
## 10 seed.tmt 3 54.27 OK
## 11 germ 3 37.30 OK
## 12 plant.growth 2 66.12 OK
## 13 leaves 2 88.73 OK
## 14 leaf.halo 3 57.10 OK
## 15 leaf.marg 3 59.60 OK
## 16 leaf.size 3 54.59 OK
## 17 leaf.shread 2 83.53 OK
## 18 leaf.malf 2 92.49 OK
## 19 leaf.mild 3 93.04 OK
## 20 stem 2 55.62 OK
## 21 lodging 2 92.53 OK
## 22 stem.cankers 4 58.76 OK
## 23 canker.lesion 4 49.61 OK
## 24 fruiting.bodies 2 81.98 OK
## 25 ext.decay 3 77.05 OK
## 26 mycelium 2 99.07 Near-Zero Variance
## 27 int.discolor 3 90.08 OK
## 28 sclerotia 2 96.90 Near-Zero Variance
## 29 fruit.pods 4 67.95 OK
## 30 fruit.spots 4 59.79 OK
## 31 seed 2 80.54 OK
## 32 mold.growth 2 88.66 OK
## 33 seed.discolor 2 88.91 OK
## 34 seed.size 2 90.02 OK
## 35 shriveling 2 93.41 OK
cat("\nDegenerate variables detected:\n")
##
## Degenerate variables detected:
print(degenerate_vars)
## [1] "mycelium" "sclerotia"
nzv <- nearZeroVar(Soybean[,1:35], saveMetrics = TRUE)
cat("\nNear-zero variance check:\n")
##
## Near-zero variance check:
print(rownames(nzv)[nzv$nzv == TRUE])
## [1] "leaf.mild" "mycelium" "sclerotia"
Several variables have near-zero variance with dominant categories exceeding 95% of observations.
total_missing <- sum(is.na(Soybean[,1:35])) / (nrow(Soybean) * 35) * 100
cat(sprintf("Overall missing: %.2f%%\n", total_missing))
## Overall missing: 9.65%
missing_pct <- colSums(is.na(Soybean[,1:35])) / nrow(Soybean) * 100
missing_sorted <- sort(missing_pct[missing_pct > 0], decreasing = TRUE)
cat("\nVariables with missing data:\n")
##
## Variables with missing data:
print(round(missing_sorted, 2))
## hail sever seed.tmt lodging germ
## 17.72 17.72 17.72 17.72 16.40
## leaf.mild fruiting.bodies fruit.spots seed.discolor shriveling
## 15.81 15.52 15.52 15.52 15.52
## leaf.shread seed mold.growth seed.size leaf.halo
## 14.64 13.47 13.47 13.47 12.30
## leaf.marg leaf.size leaf.malf fruit.pods precip
## 12.30 12.30 12.30 12.30 5.56
## stem.cankers canker.lesion ext.decay mycelium int.discolor
## 5.56 5.56 5.56 5.56 5.56
## sclerotia plant.stand temp crop.hist plant.growth
## 5.56 5.27 4.39 2.34 2.34
## stem date area.dam
## 2.34 0.15 0.15
missing_by_class <- Soybean %>%
group_by(Class) %>%
summarise(
n = n(),
avg_missing = mean(rowSums(is.na(across(1:35))) / 35 * 100)
) %>%
arrange(desc(avg_missing))
print(missing_by_class)
## # A tibble: 19 × 3
## Class n avg_missing
## <fct> <int> <dbl>
## 1 2-4-d-injury 16 80.4
## 2 cyst-nematode 14 68.6
## 3 herbicide-injury 8 57.1
## 4 phytophthora-rot 88 39.4
## 5 diaporthe-pod-&-stem-blight 15 33.7
## 6 alternarialeaf-spot 91 0
## 7 anthracnose 44 0
## 8 bacterial-blight 20 0
## 9 bacterial-pustule 20 0
## 10 brown-spot 92 0
## 11 brown-stem-rot 44 0
## 12 charcoal-rot 20 0
## 13 diaporthe-stem-canker 20 0
## 14 downy-mildew 20 0
## 15 frog-eye-leaf-spot 91 0
## 16 phyllosticta-leaf-spot 20 0
## 17 powdery-mildew 20 0
## 18 purple-seed-stain 20 0
## 19 rhizoctonia-root-rot 20 0
aggr(Soybean, col = c('navyblue','red'),
numbers = TRUE, sortVars = TRUE,
cex.axis = 0.6)
##
## Variables sorted by number of missings:
## Variable Count
## hail 0.177159590
## sever 0.177159590
## seed.tmt 0.177159590
## lodging 0.177159590
## germ 0.163982430
## leaf.mild 0.158125915
## fruiting.bodies 0.155197657
## fruit.spots 0.155197657
## seed.discolor 0.155197657
## shriveling 0.155197657
## leaf.shread 0.146412884
## seed 0.134699854
## mold.growth 0.134699854
## seed.size 0.134699854
## leaf.halo 0.122986823
## leaf.marg 0.122986823
## leaf.size 0.122986823
## leaf.malf 0.122986823
## fruit.pods 0.122986823
## precip 0.055636896
## stem.cankers 0.055636896
## canker.lesion 0.055636896
## ext.decay 0.055636896
## mycelium 0.055636896
## int.discolor 0.055636896
## sclerotia 0.055636896
## plant.stand 0.052708638
## roots 0.045387994
## temp 0.043923865
## crop.hist 0.023426061
## plant.growth 0.023426061
## stem 0.023426061
## date 0.001464129
## area.dam 0.001464129
## Class 0.000000000
## leaves 0.000000000
Missing data is not random. Certain predictors have high missingness (>40%), and specific disease classes show higher missing rates.
high_missing <- names(missing_pct[missing_pct > 40])
moderate_missing <- names(missing_pct[missing_pct > 10 & missing_pct <= 40])
low_missing <- names(missing_pct[missing_pct > 0 & missing_pct <= 10])
cat("Remove (>40% missing):", length(high_missing), "variables\n")
## Remove (>40% missing): 0 variables
cat(paste(high_missing, collapse = ", "), "\n\n")
cat("Impute (10-40% missing):", length(moderate_missing), "variables\n")
## Impute (10-40% missing): 19 variables
cat(paste(moderate_missing, collapse = ", "), "\n\n")
## hail, sever, seed.tmt, germ, leaf.halo, leaf.marg, leaf.size, leaf.shread, leaf.malf, leaf.mild, lodging, fruiting.bodies, fruit.pods, fruit.spots, seed, mold.growth, seed.discolor, seed.size, shriveling
cat("Impute (<10% missing):", length(low_missing), "variables\n")
## Impute (<10% missing): 14 variables
soybean_clean <- Soybean %>%
select(-all_of(c(high_missing, degenerate_vars)))
for(col in names(soybean_clean)[1:(ncol(soybean_clean)-1)]) {
if(is.factor(soybean_clean[[col]]) && any(is.na(soybean_clean[[col]]))) {
mode_val <- names(sort(table(soybean_clean[[col]]), decreasing = TRUE))[1]
soybean_clean[[col]][is.na(soybean_clean[[col]])] <- mode_val
}
}
cat("Original dimensions:", dim(Soybean), "\n")
## Original dimensions: 683 36
cat("Cleaned dimensions:", dim(soybean_clean), "\n")
## Cleaned dimensions: 683 34
cat("Remaining missing:", sum(is.na(soybean_clean)), "\n")
## Remaining missing: 31
The missing data strategy should address several key issues found in this dataset. Variables with more than 40% missing data don’t provide enough information for reliable predictions and should be removed. Predictors with near-zero variance also add little to the model since they don’t vary much across observations, so these can be eliminated as well. For the remaining variables with missing values, mode imputation offers a straightforward solution for categorical data. However, mode imputation can distort distributions by overrepresenting the most common category. KNN imputation would do a better job preserving relationships between variables since it uses similar observations to fill in missing values. Tree-based methods could also work well here because they naturally handle categorical data. Since the missing data isn’t random—with certain disease classes showing much higher rates of missingness—KNN or tree-based imputation would likely capture these patterns better than mode imputation.