Exercise 3.1: Glass Data Analysis

Load Data and Libraries

library(mlbench)
library(caret)
library(corrplot)
library(moments)

data(Glass)
str(Glass)
## 'data.frame':    214 obs. of  10 variables:
##  $ RI  : num  1.52 1.52 1.52 1.52 1.52 ...
##  $ Na  : num  13.6 13.9 13.5 13.2 13.3 ...
##  $ Mg  : num  4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
##  $ Al  : num  1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
##  $ Si  : num  71.8 72.7 73 72.6 73.1 ...
##  $ K   : num  0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
##  $ Ca  : num  8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
##  $ Ba  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Fe  : num  0 0 0 0 0 0.26 0 0 0 0.11 ...
##  $ Type: Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...

(a) Explore Distributions and Relationships

Summary Statistics

summary(Glass[,1:9])
##        RI              Na              Mg              Al       
##  Min.   :1.511   Min.   :10.73   Min.   :0.000   Min.   :0.290  
##  1st Qu.:1.517   1st Qu.:12.91   1st Qu.:2.115   1st Qu.:1.190  
##  Median :1.518   Median :13.30   Median :3.480   Median :1.360  
##  Mean   :1.518   Mean   :13.41   Mean   :2.685   Mean   :1.445  
##  3rd Qu.:1.519   3rd Qu.:13.82   3rd Qu.:3.600   3rd Qu.:1.630  
##  Max.   :1.534   Max.   :17.38   Max.   :4.490   Max.   :3.500  
##        Si              K                Ca               Ba       
##  Min.   :69.81   Min.   :0.0000   Min.   : 5.430   Min.   :0.000  
##  1st Qu.:72.28   1st Qu.:0.1225   1st Qu.: 8.240   1st Qu.:0.000  
##  Median :72.79   Median :0.5550   Median : 8.600   Median :0.000  
##  Mean   :72.65   Mean   :0.4971   Mean   : 8.957   Mean   :0.175  
##  3rd Qu.:73.09   3rd Qu.:0.6100   3rd Qu.: 9.172   3rd Qu.:0.000  
##  Max.   :75.41   Max.   :6.2100   Max.   :16.190   Max.   :3.150  
##        Fe         
##  Min.   :0.00000  
##  1st Qu.:0.00000  
##  Median :0.00000  
##  Mean   :0.05701  
##  3rd Qu.:0.10000  
##  Max.   :0.51000

Distribution Histograms

par(mfrow = c(3, 3))
for(i in 1:9) {
  hist(Glass[,i], main = names(Glass)[i], 
       xlab = names(Glass)[i], col = "lightblue", breaks = 20)
}

Boxplots by Glass Type

par(mfrow = c(3, 3))
for(i in 1:9) {
  boxplot(Glass[,i] ~ Glass$Type, 
          main = names(Glass)[i],
          xlab = "Type", ylab = names(Glass)[i])
}

Correlation Matrix

cor_matrix <- cor(Glass[, 1:9])
corrplot(cor_matrix, method = "color", type = "upper", 
         tl.col = "black", tl.srt = 45)

Ca and RI show moderate positive correlation. Most other variables are weakly correlated.

(b) Outliers and Skewness

Skewness Analysis

skew_vals <- apply(Glass[,1:9], 2, skewness)
skew_df <- data.frame(
  Variable = names(skew_vals),
  Skewness = round(skew_vals, 3)
)
print(skew_df)
##    Variable Skewness
## RI       RI    1.614
## Na       Na    0.451
## Mg       Mg   -1.144
## Al       Al    0.901
## Si       Si   -0.725
## K         K    6.506
## Ca       Ca    2.033
## Ba       Ba    3.392
## Fe       Fe    1.742
high_skew <- names(skew_vals[abs(skew_vals) > 1])
cat("\nVariables with high skewness (|skew| > 1):", paste(high_skew, collapse = ", "))
## 
## Variables with high skewness (|skew| > 1): RI, Mg, K, Ca, Ba, Fe

Outlier Detection

outlier_summary <- data.frame(
  Variable = names(Glass)[1:9],
  Outliers = sapply(Glass[,1:9], function(x) {
    Q1 <- quantile(x, 0.25)
    Q3 <- quantile(x, 0.75)
    IQR_val <- Q3 - Q1
    sum(x < (Q1 - 1.5 * IQR_val) | x > (Q3 + 1.5 * IQR_val))
  })
)
print(outlier_summary)
##    Variable Outliers
## RI       RI       17
## Na       Na        7
## Mg       Mg        0
## Al       Al       18
## Si       Si       12
## K         K        7
## Ca       Ca       26
## Ba       Ba       38
## Fe       Fe       12

RI, Mg, K, Ca, Ba, and Fe all show high skewness (|skewness| > 1). K and Ba are particularly severe with skewness values over 3, and both have multiple outliers.

(c) Transformation Recommendations

Box-Cox Transformations

for(var in high_skew) {
  x <- Glass[,var]
  
  if(min(x) <= 0) {
    x <- x + abs(min(x)) + 1
  }
  
  bc_trans <- BoxCoxTrans(x)
  transformed <- predict(bc_trans, x)
  
  cat(sprintf("\n%s:\n", var))
  cat(sprintf("  Original skewness: %.3f\n", skewness(x)))
  cat(sprintf("  Box-Cox lambda: %.3f\n", bc_trans$lambda))
  cat(sprintf("  Transformed skewness: %.3f\n", skewness(transformed)))
}
## 
## RI:
##   Original skewness: 1.614
##   Box-Cox lambda: -2.000
##   Transformed skewness: 1.577
## 
## Mg:
##   Original skewness: -1.144
##   Box-Cox lambda: 2.000
##   Transformed skewness: -0.934
## 
## K:
##   Original skewness: 6.506
##   Box-Cox lambda: -1.000
##   Transformed skewness: -0.088
## 
## Ca:
##   Original skewness: 2.033
##   Box-Cox lambda: -1.100
##   Transformed skewness: -0.195
## 
## Ba:
##   Original skewness: 3.392
##   Box-Cox lambda: -2.000
##   Transformed skewness: 2.151
## 
## Fe:
##   Original skewness: 1.742
##   Box-Cox lambda: -2.000
##   Transformed skewness: 1.336

Visual Comparison

par(mfrow = c(2, length(high_skew)))
for(var in high_skew) {
  x <- Glass[,var]
  if(min(x) <= 0) x <- x + abs(min(x)) + 1
  bc_trans <- BoxCoxTrans(x)
  
  hist(Glass[,var], main = paste("Original", var), 
       breaks = 20, col = "lightblue")
  hist(predict(bc_trans, x), main = paste("Transformed", var), 
       breaks = 20, col = "lightgreen")
}

Recommendations

Several preprocessing steps would help improve model performance with this dataset. First, all predictors should be centered and scaled to account for their different measurement units. The K, Ba, and Fe variables would benefit from Box-Cox transformations given their severe skewness. It’s also worth monitoring the moderate correlation between Ca and RI, as this could lead to multicollinearity issues. Finally, if outliers continue to be problematic after applying these transformations, a spatial sign transformation could be used to further reduce their impact.

preproc <- preProcess(Glass[,1:9], method = c('BoxCox', 'center', 'scale'))
glass_processed <- predict(preproc, Glass[,1:9])

Exercise 3.2: Soybean Data Analysis

Load Data

library(VIM)
library(dplyr)

data(Soybean)
str(Soybean)
## 'data.frame':    683 obs. of  36 variables:
##  $ Class          : Factor w/ 19 levels "2-4-d-injury",..: 11 11 11 11 11 11 11 11 11 11 ...
##  $ date           : Factor w/ 7 levels "0","1","2","3",..: 7 5 4 4 7 6 6 5 7 5 ...
##  $ plant.stand    : Ord.factor w/ 2 levels "0"<"1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ precip         : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
##  $ temp           : Ord.factor w/ 3 levels "0"<"1"<"2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ hail           : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 1 1 ...
##  $ crop.hist      : Factor w/ 4 levels "0","1","2","3": 2 3 2 2 3 4 3 2 4 3 ...
##  $ area.dam       : Factor w/ 4 levels "0","1","2","3": 2 1 1 1 1 1 1 1 1 1 ...
##  $ sever          : Factor w/ 3 levels "0","1","2": 2 3 3 3 2 2 2 2 2 3 ...
##  $ seed.tmt       : Factor w/ 3 levels "0","1","2": 1 2 2 1 1 1 2 1 2 1 ...
##  $ germ           : Ord.factor w/ 3 levels "0"<"1"<"2": 1 2 3 2 3 2 1 3 2 3 ...
##  $ plant.growth   : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ leaves         : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ leaf.halo      : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ leaf.marg      : Factor w/ 3 levels "0","1","2": 3 3 3 3 3 3 3 3 3 3 ...
##  $ leaf.size      : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
##  $ leaf.shread    : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ leaf.malf      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ leaf.mild      : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ stem           : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ lodging        : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 2 1 1 1 ...
##  $ stem.cankers   : Factor w/ 4 levels "0","1","2","3": 4 4 4 4 4 4 4 4 4 4 ...
##  $ canker.lesion  : Factor w/ 4 levels "0","1","2","3": 2 2 1 1 2 1 2 2 2 2 ...
##  $ fruiting.bodies: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ ext.decay      : Factor w/ 3 levels "0","1","2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ mycelium       : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ int.discolor   : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ sclerotia      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ fruit.pods     : Factor w/ 4 levels "0","1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
##  $ fruit.spots    : Factor w/ 4 levels "0","1","2","4": 4 4 4 4 4 4 4 4 4 4 ...
##  $ seed           : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ mold.growth    : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ seed.discolor  : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ seed.size      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ shriveling     : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ roots          : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...

(a) Frequency Distributions

Analyze Categorical Predictors

freq_summary <- data.frame(
  Variable = character(),
  Unique_Levels = integer(),
  Dominant_Percent = numeric(),
  Status = character(),
  stringsAsFactors = FALSE
)

degenerate_vars <- c()

for(i in 1:(ncol(Soybean)-1)) {
  var_name <- names(Soybean)[i]
  
  if(is.factor(Soybean[,i])) {
    freq_table <- table(Soybean[,i], useNA = "no")
    prop_table <- prop.table(freq_table)
    
    unique_levels <- length(freq_table)
    dominant_pct <- max(prop_table) * 100
    
    status <- "OK"
    if(unique_levels == 1) {
      status <- "Zero Variance"
      degenerate_vars <- c(degenerate_vars, var_name)
    } else if(dominant_pct > 95) {
      status <- "Near-Zero Variance"
      degenerate_vars <- c(degenerate_vars, var_name)
    }
    
    freq_summary <- rbind(freq_summary,
      data.frame(
        Variable = var_name,
        Unique_Levels = unique_levels,
        Dominant_Percent = round(dominant_pct, 2),
        Status = status
      ))
  }
}

print(freq_summary)
##           Variable Unique_Levels Dominant_Percent             Status
## 1            Class            19            13.47                 OK
## 2             date             7            21.85                 OK
## 3      plant.stand             2            54.71                 OK
## 4           precip             3            71.16                 OK
## 5             temp             3            57.27                 OK
## 6             hail             2            77.40                 OK
## 7        crop.hist             4            32.83                 OK
## 8         area.dam             4            33.28                 OK
## 9            sever             3            57.30                 OK
## 10        seed.tmt             3            54.27                 OK
## 11            germ             3            37.30                 OK
## 12    plant.growth             2            66.12                 OK
## 13          leaves             2            88.73                 OK
## 14       leaf.halo             3            57.10                 OK
## 15       leaf.marg             3            59.60                 OK
## 16       leaf.size             3            54.59                 OK
## 17     leaf.shread             2            83.53                 OK
## 18       leaf.malf             2            92.49                 OK
## 19       leaf.mild             3            93.04                 OK
## 20            stem             2            55.62                 OK
## 21         lodging             2            92.53                 OK
## 22    stem.cankers             4            58.76                 OK
## 23   canker.lesion             4            49.61                 OK
## 24 fruiting.bodies             2            81.98                 OK
## 25       ext.decay             3            77.05                 OK
## 26        mycelium             2            99.07 Near-Zero Variance
## 27    int.discolor             3            90.08                 OK
## 28       sclerotia             2            96.90 Near-Zero Variance
## 29      fruit.pods             4            67.95                 OK
## 30     fruit.spots             4            59.79                 OK
## 31            seed             2            80.54                 OK
## 32     mold.growth             2            88.66                 OK
## 33   seed.discolor             2            88.91                 OK
## 34       seed.size             2            90.02                 OK
## 35      shriveling             2            93.41                 OK

Degenerate Variables

cat("\nDegenerate variables detected:\n")
## 
## Degenerate variables detected:
print(degenerate_vars)
## [1] "mycelium"  "sclerotia"
nzv <- nearZeroVar(Soybean[,1:35], saveMetrics = TRUE)
cat("\nNear-zero variance check:\n")
## 
## Near-zero variance check:
print(rownames(nzv)[nzv$nzv == TRUE])
## [1] "leaf.mild" "mycelium"  "sclerotia"

Several variables have near-zero variance with dominant categories exceeding 95% of observations.

(b) Missing Data Patterns

Overall Missing Data

total_missing <- sum(is.na(Soybean[,1:35])) / (nrow(Soybean) * 35) * 100
cat(sprintf("Overall missing: %.2f%%\n", total_missing))
## Overall missing: 9.65%

Missing by Predictor

missing_pct <- colSums(is.na(Soybean[,1:35])) / nrow(Soybean) * 100
missing_sorted <- sort(missing_pct[missing_pct > 0], decreasing = TRUE)

cat("\nVariables with missing data:\n")
## 
## Variables with missing data:
print(round(missing_sorted, 2))
##            hail           sever        seed.tmt         lodging            germ 
##           17.72           17.72           17.72           17.72           16.40 
##       leaf.mild fruiting.bodies     fruit.spots   seed.discolor      shriveling 
##           15.81           15.52           15.52           15.52           15.52 
##     leaf.shread            seed     mold.growth       seed.size       leaf.halo 
##           14.64           13.47           13.47           13.47           12.30 
##       leaf.marg       leaf.size       leaf.malf      fruit.pods          precip 
##           12.30           12.30           12.30           12.30            5.56 
##    stem.cankers   canker.lesion       ext.decay        mycelium    int.discolor 
##            5.56            5.56            5.56            5.56            5.56 
##       sclerotia     plant.stand            temp       crop.hist    plant.growth 
##            5.56            5.27            4.39            2.34            2.34 
##            stem            date        area.dam 
##            2.34            0.15            0.15

Missing by Class

missing_by_class <- Soybean %>%
  group_by(Class) %>%
  summarise(
    n = n(),
    avg_missing = mean(rowSums(is.na(across(1:35))) / 35 * 100)
  ) %>%
  arrange(desc(avg_missing))

print(missing_by_class)
## # A tibble: 19 × 3
##    Class                           n avg_missing
##    <fct>                       <int>       <dbl>
##  1 2-4-d-injury                   16        80.4
##  2 cyst-nematode                  14        68.6
##  3 herbicide-injury                8        57.1
##  4 phytophthora-rot               88        39.4
##  5 diaporthe-pod-&-stem-blight    15        33.7
##  6 alternarialeaf-spot            91         0  
##  7 anthracnose                    44         0  
##  8 bacterial-blight               20         0  
##  9 bacterial-pustule              20         0  
## 10 brown-spot                     92         0  
## 11 brown-stem-rot                 44         0  
## 12 charcoal-rot                   20         0  
## 13 diaporthe-stem-canker          20         0  
## 14 downy-mildew                   20         0  
## 15 frog-eye-leaf-spot             91         0  
## 16 phyllosticta-leaf-spot         20         0  
## 17 powdery-mildew                 20         0  
## 18 purple-seed-stain              20         0  
## 19 rhizoctonia-root-rot           20         0

Visualize Missing Patterns

aggr(Soybean, col = c('navyblue','red'), 
     numbers = TRUE, sortVars = TRUE, 
     cex.axis = 0.6)

## 
##  Variables sorted by number of missings: 
##         Variable       Count
##             hail 0.177159590
##            sever 0.177159590
##         seed.tmt 0.177159590
##          lodging 0.177159590
##             germ 0.163982430
##        leaf.mild 0.158125915
##  fruiting.bodies 0.155197657
##      fruit.spots 0.155197657
##    seed.discolor 0.155197657
##       shriveling 0.155197657
##      leaf.shread 0.146412884
##             seed 0.134699854
##      mold.growth 0.134699854
##        seed.size 0.134699854
##        leaf.halo 0.122986823
##        leaf.marg 0.122986823
##        leaf.size 0.122986823
##        leaf.malf 0.122986823
##       fruit.pods 0.122986823
##           precip 0.055636896
##     stem.cankers 0.055636896
##    canker.lesion 0.055636896
##        ext.decay 0.055636896
##         mycelium 0.055636896
##     int.discolor 0.055636896
##        sclerotia 0.055636896
##      plant.stand 0.052708638
##            roots 0.045387994
##             temp 0.043923865
##        crop.hist 0.023426061
##     plant.growth 0.023426061
##             stem 0.023426061
##             date 0.001464129
##         area.dam 0.001464129
##            Class 0.000000000
##           leaves 0.000000000

Missing data is not random. Certain predictors have high missingness (>40%), and specific disease classes show higher missing rates.

(c) Missing Data Strategy

Categorize Variables

high_missing <- names(missing_pct[missing_pct > 40])
moderate_missing <- names(missing_pct[missing_pct > 10 & missing_pct <= 40])
low_missing <- names(missing_pct[missing_pct > 0 & missing_pct <= 10])

cat("Remove (>40% missing):", length(high_missing), "variables\n")
## Remove (>40% missing): 0 variables
cat(paste(high_missing, collapse = ", "), "\n\n")
cat("Impute (10-40% missing):", length(moderate_missing), "variables\n")
## Impute (10-40% missing): 19 variables
cat(paste(moderate_missing, collapse = ", "), "\n\n")
## hail, sever, seed.tmt, germ, leaf.halo, leaf.marg, leaf.size, leaf.shread, leaf.malf, leaf.mild, lodging, fruiting.bodies, fruit.pods, fruit.spots, seed, mold.growth, seed.discolor, seed.size, shriveling
cat("Impute (<10% missing):", length(low_missing), "variables\n")
## Impute (<10% missing): 14 variables

Clean Dataset

soybean_clean <- Soybean %>%
  select(-all_of(c(high_missing, degenerate_vars)))

for(col in names(soybean_clean)[1:(ncol(soybean_clean)-1)]) {
  if(is.factor(soybean_clean[[col]]) && any(is.na(soybean_clean[[col]]))) {
    mode_val <- names(sort(table(soybean_clean[[col]]), decreasing = TRUE))[1]
    soybean_clean[[col]][is.na(soybean_clean[[col]])] <- mode_val
  }
}

cat("Original dimensions:", dim(Soybean), "\n")
## Original dimensions: 683 36
cat("Cleaned dimensions:", dim(soybean_clean), "\n")
## Cleaned dimensions: 683 34
cat("Remaining missing:", sum(is.na(soybean_clean)), "\n")
## Remaining missing: 31

Recommendations

The missing data strategy should address several key issues found in this dataset. Variables with more than 40% missing data don’t provide enough information for reliable predictions and should be removed. Predictors with near-zero variance also add little to the model since they don’t vary much across observations, so these can be eliminated as well. For the remaining variables with missing values, mode imputation offers a straightforward solution for categorical data. However, mode imputation can distort distributions by overrepresenting the most common category. KNN imputation would do a better job preserving relationships between variables since it uses similar observations to fill in missing values. Tree-based methods could also work well here because they naturally handle categorical data. Since the missing data isn’t random—with certain disease classes showing much higher rates of missingness—KNN or tree-based imputation would likely capture these patterns better than mode imputation.