install.packages(c("mlbench", "GGally", "caret", "corrplot", "e1071", "lattice", "skimr", "BiocManager", "RANN"))

Q1. The UC Irvine Machine Learning Repository contains a data set related to glass identification. The data consist of 214 glass samples labeled as one of seven class categories. There are nine predictors, including the refractive index and percentages of eight elements: Na, Mg, Al, Si, K, Ca, Ba, and Fe.
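
A quick structural check of the data (as shipped in the mlbench package, which is what the code here assumes) confirms these dimensions before plotting:

library(mlbench)
data(Glass)

str(Glass)          # 214 observations: nine numeric predictors plus the Type factor
table(Glass$Type)   # class counts; only six of the seven types occur in this sample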

  1. Using visualizations, explore the predictor variables to understand their distributions as well as the relationships between predictors.
library(mlbench)
library(ggplot2)
library(tidyr)

data(Glass)

ggplot(Glass, aes(x = Type)) +
  geom_bar(fill = "steelblue") +
  labs(title = "Glass Data", x = "Glass Type", y = "Count") +
  theme_minimal()

glass_predictors <- Glass[, 1:9]
glass_predictors_long <- glass_predictors |>
  gather(key = "variable", value = "value")
ggplot(glass_predictors_long, aes(x = value)) +
  geom_histogram(bins = 30, fill = "steelblue") +
  facet_wrap(~ variable, scales = "free") +
  labs(title = "Distribution of Predictor Variables") +
  theme_minimal()

glass_long <- Glass |>
  gather(key = "predictor", value = "value", -Type)

ggplot(glass_long, aes(x = Type, y = value, fill = Type)) +
  geom_boxplot(alpha = 0.7) +
  facet_wrap(~ predictor, scales = "free_y") +
  labs(title = "Predictor Distributions by Glass Type") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

pairs(Glass[, 1:9], 
      main = "Glass Predictors Colored by Type",
      pch = 16, 
      col = rainbow(6)[Glass$Type],  # 6 colors for 6 glass types
      cex = 0.6)
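
To complement the scatterplot matrix, a correlation plot (using the corrplot package installed above) summarizes the pairwise relationships between the predictors; a minimal sketch:

library(mlbench)
library(corrplot)
data(Glass)

# Correlation matrix of the nine numeric predictors
corrplot(cor(Glass[, 1:9]), method = "circle", type = "upper", tl.col = "black")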

  2. Do there appear to be any outliers in the data? Are any predictors skewed?

Following the textbook, I used the skewness function from the e1071 package to calculate the skewness statistic for each predictor. Yes, there do appear to be outliers: several predictors have very uneven distributions, which can indicate that a large number of outliers exist. For example, K has a skewness of 6.46, Ba of 3.37, and Fe of 1.73, and all three have distributions shifted heavily toward one side.
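
As a rough complement to the skewness statistics, counting the points that fall outside the usual 1.5 × IQR boxplot fences gives a sense of how many potential outliers each predictor has; this is only a sketch using the standard boxplot rule:

library(mlbench)
data(Glass)

# Count values outside the 1.5 * IQR fences for each numeric predictor
count_outliers <- function(x) {
  q <- quantile(x, c(0.25, 0.75), na.rm = TRUE)
  fence <- 1.5 * (q[2] - q[1])
  sum(x < q[1] - fence | x > q[2] + fence, na.rm = TRUE)
}
sapply(Glass[, 1:9], count_outliers)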

library(mlbench)
library(caret)
library(e1071)
library(lattice)

data(Glass)


par(mfrow = c(3, 3))
for(i in 1:9) {
  hist(Glass[, i], 
       main = paste("Distribution of", names(Glass)[i]),
       xlab = names(Glass)[i],
       col = "lightblue",
       border = "black",
       breaks = 20, 
       probability = TRUE)  
  
  lines(density(Glass[, i], na.rm = TRUE), col = "red", lwd = 2)
  
  abline(v = mean(Glass[, i]), col = "blue", lwd = 2, lty = 2)
  
  skew_val <- e1071::skewness(Glass[, i])
  legend("topright", 
         legend = paste("Skewness:", round(skew_val, 3)),
         bty = "n")
}

par(mfrow = c(1, 1)) 

  3. Are there any relevant transformations of one or more predictors that might improve the classification model?

I tried log and square-root transformations on K and Ba (and also Fe). They seem to help most with K, where the square root reduces the skewness from 6.46 to 0.86 (the log over-corrects to -1.59); the log transformation also brings Ba down from 3.37 to 1.79 and Fe from 1.73 to 0.79.

library(mlbench)
library(caret)
library(e1071)
library(lattice)
library(ggplot2)
library(tidyr)

create_transformation_plots <- function(data, var_name) {
  original <- data[[var_name]]
  
  # Small offset added before the log because K, Ba, and Fe contain zero values
  plot_data <- data.frame(
    Original = original,
    Log = log(original + 0.001),
    Square_Root = sqrt(original)
  )
  
  plot_data_long <- plot_data |>
    gather(key = "Transformation", value = "Value")
  
  ggplot(plot_data_long, aes(x = Value)) +
    geom_histogram(bins = 30, fill = "steelblue") +
    facet_wrap(~ Transformation, scales = "free") +
    labs(title = paste("Transformations for", var_name)) +
    theme_minimal()
}

create_transformation_plots(Glass, "Ba")

create_transformation_plots(Glass, "K")

create_transformation_plots(Glass,"Fe")

compare_transformations <- function(x) {
  transformations <- list(
    Original = x,
    Log = log(x + 0.001),
    Square_Root = sqrt(x)
  )
  
  skew_comparison <- sapply(transformations, skewness)
  return(round(skew_comparison, 3))
}

transformation_results <- lapply(glass_predictors, compare_transformations)
print(transformation_results)
## $RI
##    Original         Log Square_Root 
##       1.603       1.590       1.597 
## 
## $Na
##    Original         Log Square_Root 
##       0.448       0.034       0.240 
## 
## $Mg
##    Original         Log Square_Root 
##      -1.136      -1.496      -1.346 
## 
## $Al
##    Original         Log Square_Root 
##       0.895      -0.833       0.091 
## 
## $Si
##    Original         Log Square_Root 
##      -0.720      -0.789      -0.755 
## 
## $K
##    Original         Log Square_Root 
##       6.460      -1.590       0.859 
## 
## $Ca
##    Original         Log Square_Root 
##       2.018       1.052       1.550 
## 
## $Ba
##    Original         Log Square_Root 
##       3.369       1.785       2.344 
## 
## $Fe
##    Original         Log Square_Root 
##       1.730       0.794       1.038
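
Because K, Ba, and Fe contain zeros, a Box-Cox transformation cannot be estimated for them directly; caret's preProcess offers the Yeo-Johnson transformation, which handles zero values and chooses the parameter automatically. This is only a sketch of that alternative, not part of the comparison above:

library(mlbench)
library(caret)
library(e1071)
data(Glass)

# Estimate Yeo-Johnson transformations for all nine predictors and compare skewness
yj <- preProcess(Glass[, 1:9], method = "YeoJohnson")
glass_yj <- predict(yj, Glass[, 1:9])
round(rbind(before = sapply(Glass[, 1:9], skewness),
            after  = sapply(glass_yj, skewness)), 3)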

Q2. The soybean data can also be found at the UC Irvine Machine Learning Repository. Data were collected to predict disease in 683 soybeans. The 35 predictors are mostly categorical and include information on environmental conditions (e.g., temperature, precipitation) and plant conditions (e.g., leaf spots, mold growth). The outcome labels consist of 19 distinct classes.

  1. Investigate the frequency distributions for the categorical predictors. Are any of the distributions degenerate in the ways discussed earlier in this chapter?

Frequency distributions for the categorical predictors are listed below, along with the number of categories and the number of missing values for each. Yes, some distributions are degenerate in the sense discussed in the chapter: mycelium and sclerotia have more than 95% of their values in a single category, and several others (leaf.malf, leaf.mild, lodging, int.discolor, seed.size, shriveling) are close to degenerate, with a single category accounting for more than 90% of the observations.

library(mlbench)

data(Soybean)

analyze_categorical <- function(data) {
  cat("CATEGORICAL PREDICTOR ANALYSIS \n\n")
  
  predictors <- data[, -which(names(data) == "Class")]
  
  degenerate_predictors <- c()
  low_variance_predictors <- c()
  
  for(col_name in names(predictors)) {
    freq_table <- table(predictors[[col_name]])
    n_categories <- length(freq_table)
    max_prop <- max(prop.table(freq_table))
    n_missing <- sum(is.na(predictors[[col_name]]))
    
    cat("Predictor:", col_name, "\n")
    cat("  Categories:", n_categories, "\n")
    cat("  Most frequent category proportion:", round(max_prop, 3), "\n")
    cat("  Missing values:", n_missing, "\n")
    
    if(max_prop > 0.95) {
      degenerate_predictors <- c(degenerate_predictors, col_name)
      cat("  ** DEGENERATE **\n")
    } else if(max_prop > 0.90) {
      low_variance_predictors <- c(low_variance_predictors, col_name)
      cat("  ** LOW VARIANCE **\n")
    }
    cat("  Frequency table:\n")
    print(round(prop.table(freq_table), 3))
    cat("---\n")
  }
  
  return(list(degenerate = degenerate_predictors, 
              low_variance = low_variance_predictors))
}

predictor_analysis <- analyze_categorical(Soybean)
## CATEGORICAL PREDICTOR ANALYSIS 
## 
## Predictor: date 
##   Categories: 7 
##   Most frequent category proportion: 0.218 
##   Missing values: 1 
##   Frequency table:
## 
##     0     1     2     3     4     5     6 
## 0.038 0.110 0.136 0.173 0.192 0.218 0.132 
## ---
## Predictor: plant.stand 
##   Categories: 2 
##   Most frequent category proportion: 0.547 
##   Missing values: 36 
##   Frequency table:
## 
##     0     1 
## 0.547 0.453 
## ---
## Predictor: precip 
##   Categories: 3 
##   Most frequent category proportion: 0.712 
##   Missing values: 38 
##   Frequency table:
## 
##     0     1     2 
## 0.115 0.174 0.712 
## ---
## Predictor: temp 
##   Categories: 3 
##   Most frequent category proportion: 0.573 
##   Missing values: 30 
##   Frequency table:
## 
##     0     1     2 
## 0.123 0.573 0.305 
## ---
## Predictor: hail 
##   Categories: 2 
##   Most frequent category proportion: 0.774 
##   Missing values: 121 
##   Frequency table:
## 
##     0     1 
## 0.774 0.226 
## ---
## Predictor: crop.hist 
##   Categories: 4 
##   Most frequent category proportion: 0.328 
##   Missing values: 16 
##   Frequency table:
## 
##     0     1     2     3 
## 0.097 0.247 0.328 0.327 
## ---
## Predictor: area.dam 
##   Categories: 4 
##   Most frequent category proportion: 0.333 
##   Missing values: 1 
##   Frequency table:
## 
##     0     1     2     3 
## 0.180 0.333 0.213 0.274 
## ---
## Predictor: sever 
##   Categories: 3 
##   Most frequent category proportion: 0.573 
##   Missing values: 121 
##   Frequency table:
## 
##     0     1     2 
## 0.347 0.573 0.080 
## ---
## Predictor: seed.tmt 
##   Categories: 3 
##   Most frequent category proportion: 0.543 
##   Missing values: 121 
##   Frequency table:
## 
##     0     1     2 
## 0.543 0.395 0.062 
## ---
## Predictor: germ 
##   Categories: 3 
##   Most frequent category proportion: 0.373 
##   Missing values: 112 
##   Frequency table:
## 
##     0     1     2 
## 0.289 0.373 0.338 
## ---
## Predictor: plant.growth 
##   Categories: 2 
##   Most frequent category proportion: 0.661 
##   Missing values: 16 
##   Frequency table:
## 
##     0     1 
## 0.661 0.339 
## ---
## Predictor: leaves 
##   Categories: 2 
##   Most frequent category proportion: 0.887 
##   Missing values: 0 
##   Frequency table:
## 
##     0     1 
## 0.113 0.887 
## ---
## Predictor: leaf.halo 
##   Categories: 3 
##   Most frequent category proportion: 0.571 
##   Missing values: 84 
##   Frequency table:
## 
##     0     1     2 
## 0.369 0.060 0.571 
## ---
## Predictor: leaf.marg 
##   Categories: 3 
##   Most frequent category proportion: 0.596 
##   Missing values: 84 
##   Frequency table:
## 
##     0     1     2 
## 0.596 0.035 0.369 
## ---
## Predictor: leaf.size 
##   Categories: 3 
##   Most frequent category proportion: 0.546 
##   Missing values: 84 
##   Frequency table:
## 
##     0     1     2 
## 0.085 0.546 0.369 
## ---
## Predictor: leaf.shread 
##   Categories: 2 
##   Most frequent category proportion: 0.835 
##   Missing values: 100 
##   Frequency table:
## 
##     0     1 
## 0.835 0.165 
## ---
## Predictor: leaf.malf 
##   Categories: 2 
##   Most frequent category proportion: 0.925 
##   Missing values: 84 
##   ** LOW VARIANCE **
##   Frequency table:
## 
##     0     1 
## 0.925 0.075 
## ---
## Predictor: leaf.mild 
##   Categories: 3 
##   Most frequent category proportion: 0.93 
##   Missing values: 108 
##   ** LOW VARIANCE **
##   Frequency table:
## 
##     0     1     2 
## 0.930 0.035 0.035 
## ---
## Predictor: stem 
##   Categories: 2 
##   Most frequent category proportion: 0.556 
##   Missing values: 16 
##   Frequency table:
## 
##     0     1 
## 0.444 0.556 
## ---
## Predictor: lodging 
##   Categories: 2 
##   Most frequent category proportion: 0.925 
##   Missing values: 121 
##   ** LOW VARIANCE **
##   Frequency table:
## 
##     0     1 
## 0.925 0.075 
## ---
## Predictor: stem.cankers 
##   Categories: 4 
##   Most frequent category proportion: 0.588 
##   Missing values: 38 
##   Frequency table:
## 
##     0     1     2     3 
## 0.588 0.060 0.056 0.296 
## ---
## Predictor: canker.lesion 
##   Categories: 4 
##   Most frequent category proportion: 0.496 
##   Missing values: 38 
##   Frequency table:
## 
##     0     1     2     3 
## 0.496 0.129 0.274 0.101 
## ---
## Predictor: fruiting.bodies 
##   Categories: 2 
##   Most frequent category proportion: 0.82 
##   Missing values: 106 
##   Frequency table:
## 
##    0    1 
## 0.82 0.18 
## ---
## Predictor: ext.decay 
##   Categories: 3 
##   Most frequent category proportion: 0.771 
##   Missing values: 38 
##   Frequency table:
## 
##     0     1     2 
## 0.771 0.209 0.020 
## ---
## Predictor: mycelium 
##   Categories: 2 
##   Most frequent category proportion: 0.991 
##   Missing values: 38 
##   ** DEGENERATE **
##   Frequency table:
## 
##     0     1 
## 0.991 0.009 
## ---
## Predictor: int.discolor 
##   Categories: 3 
##   Most frequent category proportion: 0.901 
##   Missing values: 38 
##   ** LOW VARIANCE **
##   Frequency table:
## 
##     0     1     2 
## 0.901 0.068 0.031 
## ---
## Predictor: sclerotia 
##   Categories: 2 
##   Most frequent category proportion: 0.969 
##   Missing values: 38 
##   ** DEGENERATE **
##   Frequency table:
## 
##     0     1 
## 0.969 0.031 
## ---
## Predictor: fruit.pods 
##   Categories: 4 
##   Most frequent category proportion: 0.679 
##   Missing values: 84 
##   Frequency table:
## 
##     0     1     2     3 
## 0.679 0.217 0.023 0.080 
## ---
## Predictor: fruit.spots 
##   Categories: 4 
##   Most frequent category proportion: 0.598 
##   Missing values: 106 
##   Frequency table:
## 
##     0     1     2     4 
## 0.598 0.130 0.099 0.173 
## ---
## Predictor: seed 
##   Categories: 2 
##   Most frequent category proportion: 0.805 
##   Missing values: 92 
##   Frequency table:
## 
##     0     1 
## 0.805 0.195 
## ---
## Predictor: mold.growth 
##   Categories: 2 
##   Most frequent category proportion: 0.887 
##   Missing values: 92 
##   Frequency table:
## 
##     0     1 
## 0.887 0.113 
## ---
## Predictor: seed.discolor 
##   Categories: 2 
##   Most frequent category proportion: 0.889 
##   Missing values: 106 
##   Frequency table:
## 
##     0     1 
## 0.889 0.111 
## ---
## Predictor: seed.size 
##   Categories: 2 
##   Most frequent category proportion: 0.9 
##   Missing values: 92 
##   ** LOW VARIANCE **
##   Frequency table:
## 
##   0   1 
## 0.9 0.1 
## ---
## Predictor: shriveling 
##   Categories: 2 
##   Most frequent category proportion: 0.934 
##   Missing values: 106 
##   ** LOW VARIANCE **
##   Frequency table:
## 
##     0     1 
## 0.934 0.066 
## ---
## Predictor: roots 
##   Categories: 3 
##   Most frequent category proportion: 0.845 
##   Missing values: 31 
##   Frequency table:
## 
##     0     1     2 
## 0.845 0.132 0.023 
## ---
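
caret's nearZeroVar function gives a quick cross-check of the degenerate and low-variance predictors flagged above; a minimal sketch:

library(mlbench)
library(caret)
data(Soybean)

# Predictors caret flags as near-zero variance (high frequency ratio, few unique values)
nzv <- nearZeroVar(Soybean, saveMetrics = TRUE)
nzv[nzv$nzv, ]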

  2. Roughly 18% of the data are missing. Are there particular predictors that are more likely to be missing? Is the pattern of missing data related to the classes?

Yes, the pattern of missing data is related to both the predictors and the classes. Predictors such as hail, sever, seed.tmt, and lodging have the most missing values (121 each), and seed.size, stem.cankers, and others are also affected. Classes with missing data include cyst-nematode and herbicide-injury, as shown in the heatmap below.

library(mlbench)

data(Soybean)

total_missing <- sum(is.na(Soybean))
total_cells <- nrow(Soybean) * ncol(Soybean)

missing_by_predictor <- sapply(Soybean, function(x) sum(is.na(x)))
missing_percent <- round(missing_by_predictor / nrow(Soybean) * 100, 1)

missing_df <- data.frame(
  Predictor = names(missing_by_predictor),
  Missing_Count = missing_by_predictor,
  Missing_Percent = missing_percent
)
missing_df <- missing_df[order(-missing_df$Missing_Percent), ]
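
# Illustrative check: peek at the predictors with the highest missingness
head(missing_df, 10)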


# na.action = na.pass keeps rows with NAs; the default na.omit would drop them
# and make every count zero
missing_by_class <- aggregate(. ~ Class,
                              data = Soybean,
                              FUN = function(x) sum(is.na(x)),
                              na.action = na.pass)
library(ggplot2)
library(tidyr)
library(dplyr)

soybean_missing <- as.data.frame(lapply(Soybean, function(x) as.numeric(is.na(x))))
soybean_missing$Class <- Soybean$Class

missing_long <- soybean_missing |>
  group_by(Class) |>
  summarise(across(everything(), mean)) |>
  pivot_longer(-Class, names_to = "Predictor", values_to = "MissingRate")

ggplot(missing_long, aes(x = Predictor, y = Class, fill = MissingRate)) +
  geom_tile() +
  scale_fill_gradient(low = "white", high = "red", name = "Missing Rate") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(title = "Missing Data by Class")
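
As a numeric cross-check of the heatmap (continuing with the Soybean data loaded above), tabulating the classes of the rows that contain at least one missing value shows where the missingness is concentrated; a minimal sketch:

# Classes of the rows with at least one missing value
table(droplevels(Soybean$Class[!complete.cases(Soybean)]))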

  3. Develop a strategy for handling missing data, either by eliminating predictors or imputation.

The textbook discusses imputing missing values, for example with K-nearest-neighbors imputation. Here I impute with caret's preProcess using method = "knnImpute" (the RANN package installed above provides the neighbor search): the factor predictors are temporarily converted to numeric codes, imputed, mapped back to the original scale, and rounded to the nearest valid level. I then verified that the number of missing values after imputation is 0.

library(mlbench)
library(caret)
data(Soybean)

impute_soybean <- function(data, k = 5) {
  data_imputed <- data
  numeric_data <- data_imputed
  factor_levels <- list()
  
  # Temporarily code the factor columns as integers so knnImpute can be applied
  for(col in names(numeric_data)) {
    if(is.factor(numeric_data[[col]])) {
      factor_levels[[col]] <- levels(numeric_data[[col]])
      numeric_data[[col]] <- as.numeric(numeric_data[[col]])
    }
  }
  
  set.seed(123)
  preproc <- preProcess(numeric_data, method = "knnImpute", k = k)
  imputed_numeric <- predict(preproc, numeric_data)
  
  # Undo the centering/scaling, round to the nearest code, and map back to factor levels
  for(col in names(data_imputed)) {
    if(is.factor(data_imputed[[col]])) {
      imputed_values <- imputed_numeric[[col]]
      center <- preproc$mean[col]
      scale_val <- preproc$std[col]
      original_scale <- (imputed_values * scale_val) + center
      rounded <- round(original_scale)
      n_levels <- length(factor_levels[[col]])
      rounded <- pmin(pmax(rounded, 1), n_levels)
      data_imputed[[col]] <- factor(rounded, 
                                   levels = 1:n_levels,
                                   labels = factor_levels[[col]])
    }
  }
  return(data_imputed)
}

soybean_imputed <- impute_soybean(Soybean, k = 5)

cat("Missing values after imputation:", sum(is.na(soybean_imputed)))
## Missing values after imputation: 0
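
The exercise also allows eliminating predictors rather than imputing them. A hedged alternative, assuming we are willing to discard some information, is to drop the near-zero-variance predictors flagged earlier together with any predictor whose missing rate exceeds an (arbitrary) threshold; a sketch:

library(mlbench)
library(caret)
data(Soybean)

# Drop near-zero-variance predictors flagged by caret
nzv_cols <- nearZeroVar(Soybean)
soybean_reduced <- if (length(nzv_cols) > 0) Soybean[, -nzv_cols] else Soybean

# Drop predictors with more than 15% missing values (threshold is arbitrary)
missing_rate <- colMeans(is.na(soybean_reduced))
soybean_reduced <- soybean_reduced[, missing_rate <= 0.15]

dim(soybean_reduced)
sum(is.na(soybean_reduced)) / (nrow(soybean_reduced) * ncol(soybean_reduced))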