{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE)

Load Required Libraries

options(repos = c(CRAN = "https://cloud.r-project.org"))

# Function to install a package if not already installed
install_if_needed <- function(pkg) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)

  }
}

# List of packages to check and install if necessary
required_packages <- c("fpp3", "dplyr", "ggplot2", "lubridate", "tsibble", 
                       "tsibbledata", "feasts", "fable", "fabletools", 
                       "curl", "USgas", "readxl", "readr", "tidyr", "forecast",
                       "seasonal", "patchwork", "LaTeX", "tinytex", "mlbench", 
                       "VIM", "mice", "missForest", "caret")

# Loop through the list and install packages only if needed
for (pkg in required_packages) {
  install_if_needed(pkg)
}
## Registered S3 method overwritten by 'tsibble':
##   method               from 
##   as_tibble.grouped_df dplyr
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## Installing package into 'C:/Users/RemoteUser/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## Warning: package 'LaTeX' is not available for this version of R
## 
## A version of this package for your version of R might be available elsewhere,
## see the ideas at
## https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages
# Function to suppress package startup messages
suppressPackageStartupMessages({
  library(fpp3)
  library(dplyr)
  library(ggplot2)
  library(lubridate)
  library(tsibble)
  library(tsibbledata)
  library(feasts)
  library(fable)
  library(fabletools)
  library(readr)
  library(readxl)
  library(tidyr)
  library(forecast)
  library(seasonal)
  library(patchwork)
  library(tinytex)
  library(mlbench)
  library(caret)
})
# Load necessary libraries 
library(mlbench) 
library(ggplot2) 
library(GGally) 
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

Exercise 3.1.

# Preview the data
data(Glass)
glass_data <- Glass
str(glass_data)
## 'data.frame':    214 obs. of  10 variables:
##  $ RI  : num  1.52 1.52 1.52 1.52 1.52 ...
##  $ Na  : num  13.6 13.9 13.5 13.2 13.3 ...
##  $ Mg  : num  4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
##  $ Al  : num  1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
##  $ Si  : num  71.8 72.7 73 72.6 73.1 ...
##  $ K   : num  0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
##  $ Ca  : num  8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
##  $ Ba  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Fe  : num  0 0 0 0 0 0.26 0 0 0 0.11 ...
##  $ Type: Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...
# Check for missing values in each column
missing_data <- colSums(is.na(glass_data))

# Print out columns with missing data
if (any(missing_data > 0)) {
  cat("Columns with missing data:\n")
  print(missing_data[missing_data > 0])
} else {
  cat("There are no missing values in the dataset.\n")
}
## There are no missing values in the dataset.

a. Explore the predictor variables

Using visualizations, explore the predictor variables to understand their distributions as well as the relationships between predictors.

1. Plot individual histograms for each predictor to explore distributions

# Load necessary library
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
# Melt the glass data to long format
glass_melted <- melt(glass_data, id.vars = "Type")

# Plot histograms for each predictor
ggplot(glass_melted, aes(x = value)) +
  geom_histogram(bins = 30, fill = "skyblue", color = "black") +
  facet_wrap(~ variable, scales = "free") +
  theme_minimal() +
  labs(title = "Distribution of Predictor Variables")

2. Draw box plot to display the distribution, central tendency, and variability of a dataset, highlighting the median, quartiles, and potential outliers.

# Individual box plot for each predictor to explore distributions
glass_melted <- melt(glass_data, id.vars = "Type")
ggplot(glass_melted, aes(x = value)) +
  geom_boxplot(bins = 30, fill = "skyblue", color = "black") +
  facet_wrap(~ variable, scales = "free") +
  theme_minimal() +
  labs(title = "Distribution of Predictor Variables")
## Warning in geom_boxplot(bins = 30, fill = "skyblue", color = "black"): Ignoring
## unknown parameters: `bins`

#### 3. Plot scatter plot to visualize any linear relationshp

# Individual scatter plot with lm line for each predictor to explore relationships
glass_melted <- melt(glass_data, id.vars = "Type")
ggplot(glass_melted, aes(x = as.numeric(value), y = as.numeric(Type))) +
  geom_point(color = "black") +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  facet_wrap(~ variable, scales = "free_x") +  # Facet by variable with independent x-axis scaling
  theme_minimal() +
  labs(title = "Scatter Plots of Predictor Variables with Linear Model Line", x = "Value", y = "Type")
## `geom_smooth()` using formula = 'y ~ x'

4. Plot Pair plot to explore relationships and distributions between predictors

# And excluding the Type variable as it's the target variable 
ggpairs(Glass[, -10]) + theme_bw() 

## b. Do there appear to be any outliers in the data? Are any predictors skewed?

From the visualizations, we can make some observations about potential outliers and skewness in the predictor variables:

1. Outliers

  • Boxplots: Several variables show potential outliers, as evidenced by the dots outside the whiskers of the boxplots. Notable outliers are visible in:
    • Mg: There are points far outside the main box, indicating extreme values.
    • K: There are a few outliers on both the low and high ends.
    • Ba: Contains some extreme outliers above the typical range.
    • Fe: A few data points exist far outside the whiskers, indicating potential outliers.

2. Skewness:

  • Histograms: Several predictors show significant skewness. Specifically:
    • Ca and Ba: These are highly right-skewed, with many values clustered towards the left and a long tail to the right.
    • Fe: Highly right-skewed, with most values near zero and a few higher values extending the tail.
    • K: This predictor also appears to be right-skewed.
    • Mg: Shows slight right-skewness.
  • Normality: Some variables such as Si and Na show more symmetric distributions, closer to normality.

c. Transformations

Are there any relevant transformations of one or more predictors that might improve the classification model?

Here are some transformations that might be relevant for improving the classification model:

1. Centering and Scaling

  • Centering and scaling the data ensures that all predictor variables are on a common scale, which is especially important for models sensitive to predictor magnitudes like linear models and SVMs.
  • This is recommended for variables like RI, Na, Si, etc., especially if they vary across different scales.
# Exclude the 'Type' column (non-numeric) and scale the numeric columns
glass_data_numeric <- glass_data[, -ncol(glass_data)]  # Exclude the last column ('Type')
glass_data_scaled <- scale(glass_data_numeric)  # Scale the numeric columns

# Combine the scaled numeric data with the 'Type' column
glass_data_scaled <- data.frame(glass_data_scaled, Type = glass_data$Type)

# Check the first few rows of the scaled data
head(glass_data_scaled)
##           RI         Na        Mg         Al          Si           K         Ca
## 1  0.8708258  0.2842867 1.2517037 -0.6908222 -1.12444556 -0.67013422 -0.1454254
## 2 -0.2487502  0.5904328 0.6346799 -0.1700615  0.10207972 -0.02615193 -0.7918771
## 3 -0.7196308  0.1495824 0.6000157  0.1904651  0.43776033 -0.16414813 -0.8270103
## 4 -0.2322859 -0.2422846 0.6970756 -0.3102663 -0.05284979  0.11184428 -0.5178378
## 5 -0.3113148 -0.1688095 0.6485456 -0.4104126  0.55395746  0.08117845 -0.6232375
## 6 -0.7920739 -0.7566101 0.6416128  0.3506992  0.41193874  0.21917466 -0.6232375
##           Ba         Fe Type
## 1 -0.3520514 -0.5850791    1
## 2 -0.3520514 -0.5850791    1
## 3 -0.3520514 -0.5850791    1
## 4 -0.3520514 -0.5850791    1
## 5 -0.3520514 -0.5850791    1
## 6 -0.3520514  2.0832652    1

2. Skewness Transformation

  • Skewed distributions, particularly right-skewed ones like Ca, Ba, and Fe, can benefit from transformations to make the data more symmetric.
  • Transformations like log, square root, or Box-Cox can be used to address skewness, as this can improve model performance, especially for models that assume normally distributed inputs.
## Example in Box-Cox transformation:
library(caret)

# Apply Box-Cox transformation to skewed variables
trans <- preProcess(glass_data, method = c("BoxCox"))
glass_data_transformed <- predict(trans, glass_data)

head(glass_data_transformed)
##          RI       Na   Mg        Al       Si    K        Ca Ba   Fe Type
## 1 0.2838746 2.613007 4.49 0.0976177 2575.684 0.06 0.8254539  0 0.00    1
## 2 0.2829051 2.631169 3.60 0.3323808 2644.326 0.48 0.8145827  0 0.00    1
## 3 0.2824954 2.604909 3.55 0.4819347 2663.270 0.39 0.8139144  0 0.00    1
## 4 0.2829194 2.580974 3.69 0.2715633 2635.606 0.57 0.8195032  0 0.00    1
## 5 0.2828507 2.585506 3.62 0.2271057 2669.843 0.55 0.8176698  0 0.00    1
## 6 0.2824323 2.548664 3.61 0.5455844 2661.810 0.64 0.8176698  0 0.26    1

3. Dealing with Outliers

  • Outliers can disproportionately influence model training. Methods like the spatial sign transformation help mitigate the effects of outliers by projecting data onto a common distance from the center.
  • This can be useful for variables like Mg, K, and Ba, where outliers are present.
# Example in R (Spatial Sign Transformation):

library(caret)
glass_data_spatial <- preProcess(glass_data, method = c("spatialSign"))
head(glass_data_spatial)
## $dim
## [1] 214  10
## 
## $bc
## NULL
## 
## $yj
## NULL
## 
## $et
## NULL
## 
## $invHyperbolicSine
## NULL
## 
## $mean
##          RI          Na          Mg          Al          Si           K 
##  1.51836542 13.40785047  2.68453271  1.44490654 72.65093458  0.49705607 
##          Ca          Ba          Fe 
##  8.95696262  0.17504673  0.05700935

Exercise 3.2

# load the data
data(Soybean)
soybean_data <- Soybean
str(Soybean)
## 'data.frame':    683 obs. of  36 variables:
##  $ Class          : Factor w/ 19 levels "2-4-d-injury",..: 11 11 11 11 11 11 11 11 11 11 ...
##  $ date           : Factor w/ 7 levels "0","1","2","3",..: 7 5 4 4 7 6 6 5 7 5 ...
##  $ plant.stand    : Ord.factor w/ 2 levels "0"<"1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ precip         : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
##  $ temp           : Ord.factor w/ 3 levels "0"<"1"<"2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ hail           : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 1 1 ...
##  $ crop.hist      : Factor w/ 4 levels "0","1","2","3": 2 3 2 2 3 4 3 2 4 3 ...
##  $ area.dam       : Factor w/ 4 levels "0","1","2","3": 2 1 1 1 1 1 1 1 1 1 ...
##  $ sever          : Factor w/ 3 levels "0","1","2": 2 3 3 3 2 2 2 2 2 3 ...
##  $ seed.tmt       : Factor w/ 3 levels "0","1","2": 1 2 2 1 1 1 2 1 2 1 ...
##  $ germ           : Ord.factor w/ 3 levels "0"<"1"<"2": 1 2 3 2 3 2 1 3 2 3 ...
##  $ plant.growth   : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ leaves         : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ leaf.halo      : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ leaf.marg      : Factor w/ 3 levels "0","1","2": 3 3 3 3 3 3 3 3 3 3 ...
##  $ leaf.size      : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
##  $ leaf.shread    : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ leaf.malf      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ leaf.mild      : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ stem           : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ lodging        : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 2 1 1 1 ...
##  $ stem.cankers   : Factor w/ 4 levels "0","1","2","3": 4 4 4 4 4 4 4 4 4 4 ...
##  $ canker.lesion  : Factor w/ 4 levels "0","1","2","3": 2 2 1 1 2 1 2 2 2 2 ...
##  $ fruiting.bodies: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ ext.decay      : Factor w/ 3 levels "0","1","2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ mycelium       : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ int.discolor   : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ sclerotia      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ fruit.pods     : Factor w/ 4 levels "0","1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
##  $ fruit.spots    : Factor w/ 4 levels "0","1","2","4": 4 4 4 4 4 4 4 4 4 4 ...
##  $ seed           : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ mold.growth    : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ seed.discolor  : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ seed.size      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ shriveling     : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ roots          : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...

a. Investigate the frequency distributions for the categorical predictors. Are any of the distributions degenerate in the ways discussed earlier in this chapter?

# Identify the frequency distribution of each factor predictor
freq_tables_list <- list()

# Loop through all columns to calculate the frequency distribution
for (col in colnames(soybean_data)) {
  freq_table <- table(soybean_data[[col]])  # Create a frequency table for each column
  freq_df <- as.data.frame(freq_table)      # Convert the table to a data frame
  colnames(freq_df) <- c("Category", "Frequency")  # Rename columns for clarity
  freq_df$Predictor <- col                   # Add the predictor name
  freq_tables_list[[col]] <- freq_df         # Store the frequency table in a list
}

# Combine all frequency tables into one data frame
all_freq_tables <- do.call(rbind, freq_tables_list)

# View the combined frequency table
head(all_freq_tables)
##                    Category Frequency Predictor
## Class.1        2-4-d-injury        16     Class
## Class.2 alternarialeaf-spot        91     Class
## Class.3         anthracnose        44     Class
## Class.4    bacterial-blight        20     Class
## Class.5   bacterial-pustule        20     Class
## Class.6          brown-spot        92     Class

Looking at the frequency distribution table, we can evaluate whether any of the distributions are degenerate, meaning they provide little variability or are heavily skewed to one category, as discussed in the chapter on near-zero variance predictors in the book.

Identifying Degenerate Distributions: A degenerate distribution would have the majority of observations concentrated in one or two categories, making it potentially less useful for modeling. Based on the table above:

  • “alternarialeaf-spot” (91) and “brown-spot” (92) have high frequencies compared to other classes, while “2-4-d injury” (16) is quite rare.
  • Imbalance: There is some imbalance in the distribution, with a majority of data points falling into a few categories (i.e., “alternarialeaf-spot” and “brown-spot”). If most observations fall into only one or two categories, the model might have difficulty distinguishing between the other classes, potentially leading to poor generalization.

However, the distributions are not fully degenerate in the strictest sense but there is class imbalance, which could affect model performance.

c. Develop a strategy for handling missing data, either by eliminating predictors or imputation.

To handle missing data, follow these strategies based on the book:

  1. Remove Predictors: Eliminate predictors with extensive missing data if they don’t contribute much to the model.
# Load necessary library
library(dplyr)

# Define a threshold for missing data (e.g., 50% missing data)
missing_threshold <- 0.5  # 50%

# Calculate the percentage of missing values for each predictor
missing_data_percentage <- colSums(is.na(soybean_data)) / nrow(soybean_data)

# Remove predictors with missing data above the threshold
soybean_data_cleaned <- soybean_data %>%
  select(which(missing_data_percentage <= missing_threshold))

# Check the resulting dataset to ensure columns are removed
colSums(is.na(soybean_data_cleaned))
##           Class            date     plant.stand          precip            temp 
##               0               1              36              38              30 
##            hail       crop.hist        area.dam           sever        seed.tmt 
##             121              16               1             121             121 
##            germ    plant.growth          leaves       leaf.halo       leaf.marg 
##             112              16               0              84              84 
##       leaf.size     leaf.shread       leaf.malf       leaf.mild            stem 
##              84             100              84             108              16 
##         lodging    stem.cankers   canker.lesion fruiting.bodies       ext.decay 
##             121              38              38             106              38 
##        mycelium    int.discolor       sclerotia      fruit.pods     fruit.spots 
##              38              38              38              84             106 
##            seed     mold.growth   seed.discolor       seed.size      shriveling 
##              92              92             106              92             106 
##           roots 
##              31
# View the first few rows of the cleaned dataset
head(soybean_data_cleaned)
##                   Class date plant.stand precip temp hail crop.hist area.dam
## 1 diaporthe-stem-canker    6           0      2    1    0         1        1
## 2 diaporthe-stem-canker    4           0      2    1    0         2        0
## 3 diaporthe-stem-canker    3           0      2    1    0         1        0
## 4 diaporthe-stem-canker    3           0      2    1    0         1        0
## 5 diaporthe-stem-canker    6           0      2    1    0         2        0
## 6 diaporthe-stem-canker    5           0      2    1    0         3        0
##   sever seed.tmt germ plant.growth leaves leaf.halo leaf.marg leaf.size
## 1     1        0    0            1      1         0         2         2
## 2     2        1    1            1      1         0         2         2
## 3     2        1    2            1      1         0         2         2
## 4     2        0    1            1      1         0         2         2
## 5     1        0    2            1      1         0         2         2
## 6     1        0    1            1      1         0         2         2
##   leaf.shread leaf.malf leaf.mild stem lodging stem.cankers canker.lesion
## 1           0         0         0    1       1            3             1
## 2           0         0         0    1       0            3             1
## 3           0         0         0    1       0            3             0
## 4           0         0         0    1       0            3             0
## 5           0         0         0    1       0            3             1
## 6           0         0         0    1       0            3             0
##   fruiting.bodies ext.decay mycelium int.discolor sclerotia fruit.pods
## 1               1         1        0            0         0          0
## 2               1         1        0            0         0          0
## 3               1         1        0            0         0          0
## 4               1         1        0            0         0          0
## 5               1         1        0            0         0          0
## 6               1         1        0            0         0          0
##   fruit.spots seed mold.growth seed.discolor seed.size shriveling roots
## 1           4    0           0             0         0          0     0
## 2           4    0           0             0         0          0     0
## 3           4    0           0             0         0          0     0
## 4           4    0           0             0         0          0     0
## 5           4    0           0             0         0          0     0
## 6           4    0           0             0         0          0     0
  1. Imputation:
# Load necessary library
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
# Apply KNN imputation to the dataset
soybean_data_knn <- kNN(soybean_data, k = 5)

# Check for missing values after imputation
# colSums(is.na()) checks for missing values column-wise
colSums(is.na(soybean_data_knn))
##               Class                date         plant.stand              precip 
##                   0                   0                   0                   0 
##                temp                hail           crop.hist            area.dam 
##                   0                   0                   0                   0 
##               sever            seed.tmt                germ        plant.growth 
##                   0                   0                   0                   0 
##              leaves           leaf.halo           leaf.marg           leaf.size 
##                   0                   0                   0                   0 
##         leaf.shread           leaf.malf           leaf.mild                stem 
##                   0                   0                   0                   0 
##             lodging        stem.cankers       canker.lesion     fruiting.bodies 
##                   0                   0                   0                   0 
##           ext.decay            mycelium        int.discolor           sclerotia 
##                   0                   0                   0                   0 
##          fruit.pods         fruit.spots                seed         mold.growth 
##                   0                   0                   0                   0 
##       seed.discolor           seed.size          shriveling               roots 
##                   0                   0                   0                   0 
##           Class_imp            date_imp     plant.stand_imp          precip_imp 
##                   0                   0                   0                   0 
##            temp_imp            hail_imp       crop.hist_imp        area.dam_imp 
##                   0                   0                   0                   0 
##           sever_imp        seed.tmt_imp            germ_imp    plant.growth_imp 
##                   0                   0                   0                   0 
##          leaves_imp       leaf.halo_imp       leaf.marg_imp       leaf.size_imp 
##                   0                   0                   0                   0 
##     leaf.shread_imp       leaf.malf_imp       leaf.mild_imp            stem_imp 
##                   0                   0                   0                   0 
##         lodging_imp    stem.cankers_imp   canker.lesion_imp fruiting.bodies_imp 
##                   0                   0                   0                   0 
##       ext.decay_imp        mycelium_imp    int.discolor_imp       sclerotia_imp 
##                   0                   0                   0                   0 
##      fruit.pods_imp     fruit.spots_imp            seed_imp     mold.growth_imp 
##                   0                   0                   0                   0 
##   seed.discolor_imp       seed.size_imp      shriveling_imp           roots_imp 
##                   0                   0                   0                   0
# View the first few rows of the imputed dataset
head(soybean_data_knn)
##                   Class date plant.stand precip temp hail crop.hist area.dam
## 1 diaporthe-stem-canker    6           0      2    1    0         1        1
## 2 diaporthe-stem-canker    4           0      2    1    0         2        0
## 3 diaporthe-stem-canker    3           0      2    1    0         1        0
## 4 diaporthe-stem-canker    3           0      2    1    0         1        0
## 5 diaporthe-stem-canker    6           0      2    1    0         2        0
## 6 diaporthe-stem-canker    5           0      2    1    0         3        0
##   sever seed.tmt germ plant.growth leaves leaf.halo leaf.marg leaf.size
## 1     1        0    0            1      1         0         2         2
## 2     2        1    1            1      1         0         2         2
## 3     2        1    2            1      1         0         2         2
## 4     2        0    1            1      1         0         2         2
## 5     1        0    2            1      1         0         2         2
## 6     1        0    1            1      1         0         2         2
##   leaf.shread leaf.malf leaf.mild stem lodging stem.cankers canker.lesion
## 1           0         0         0    1       1            3             1
## 2           0         0         0    1       0            3             1
## 3           0         0         0    1       0            3             0
## 4           0         0         0    1       0            3             0
## 5           0         0         0    1       0            3             1
## 6           0         0         0    1       0            3             0
##   fruiting.bodies ext.decay mycelium int.discolor sclerotia fruit.pods
## 1               1         1        0            0         0          0
## 2               1         1        0            0         0          0
## 3               1         1        0            0         0          0
## 4               1         1        0            0         0          0
## 5               1         1        0            0         0          0
## 6               1         1        0            0         0          0
##   fruit.spots seed mold.growth seed.discolor seed.size shriveling roots
## 1           4    0           0             0         0          0     0
## 2           4    0           0             0         0          0     0
## 3           4    0           0             0         0          0     0
## 4           4    0           0             0         0          0     0
## 5           4    0           0             0         0          0     0
## 6           4    0           0             0         0          0     0
##   Class_imp date_imp plant.stand_imp precip_imp temp_imp hail_imp crop.hist_imp
## 1     FALSE    FALSE           FALSE      FALSE    FALSE    FALSE         FALSE
## 2     FALSE    FALSE           FALSE      FALSE    FALSE    FALSE         FALSE
## 3     FALSE    FALSE           FALSE      FALSE    FALSE    FALSE         FALSE
## 4     FALSE    FALSE           FALSE      FALSE    FALSE    FALSE         FALSE
## 5     FALSE    FALSE           FALSE      FALSE    FALSE    FALSE         FALSE
## 6     FALSE    FALSE           FALSE      FALSE    FALSE    FALSE         FALSE
##   area.dam_imp sever_imp seed.tmt_imp germ_imp plant.growth_imp leaves_imp
## 1        FALSE     FALSE        FALSE    FALSE            FALSE      FALSE
## 2        FALSE     FALSE        FALSE    FALSE            FALSE      FALSE
## 3        FALSE     FALSE        FALSE    FALSE            FALSE      FALSE
## 4        FALSE     FALSE        FALSE    FALSE            FALSE      FALSE
## 5        FALSE     FALSE        FALSE    FALSE            FALSE      FALSE
## 6        FALSE     FALSE        FALSE    FALSE            FALSE      FALSE
##   leaf.halo_imp leaf.marg_imp leaf.size_imp leaf.shread_imp leaf.malf_imp
## 1         FALSE         FALSE         FALSE           FALSE         FALSE
## 2         FALSE         FALSE         FALSE           FALSE         FALSE
## 3         FALSE         FALSE         FALSE           FALSE         FALSE
## 4         FALSE         FALSE         FALSE           FALSE         FALSE
## 5         FALSE         FALSE         FALSE           FALSE         FALSE
## 6         FALSE         FALSE         FALSE           FALSE         FALSE
##   leaf.mild_imp stem_imp lodging_imp stem.cankers_imp canker.lesion_imp
## 1         FALSE    FALSE       FALSE            FALSE             FALSE
## 2         FALSE    FALSE       FALSE            FALSE             FALSE
## 3         FALSE    FALSE       FALSE            FALSE             FALSE
## 4         FALSE    FALSE       FALSE            FALSE             FALSE
## 5         FALSE    FALSE       FALSE            FALSE             FALSE
## 6         FALSE    FALSE       FALSE            FALSE             FALSE
##   fruiting.bodies_imp ext.decay_imp mycelium_imp int.discolor_imp sclerotia_imp
## 1               FALSE         FALSE        FALSE            FALSE         FALSE
## 2               FALSE         FALSE        FALSE            FALSE         FALSE
## 3               FALSE         FALSE        FALSE            FALSE         FALSE
## 4               FALSE         FALSE        FALSE            FALSE         FALSE
## 5               FALSE         FALSE        FALSE            FALSE         FALSE
## 6               FALSE         FALSE        FALSE            FALSE         FALSE
##   fruit.pods_imp fruit.spots_imp seed_imp mold.growth_imp seed.discolor_imp
## 1          FALSE           FALSE    FALSE           FALSE             FALSE
## 2          FALSE           FALSE    FALSE           FALSE             FALSE
## 3          FALSE           FALSE    FALSE           FALSE             FALSE
## 4          FALSE           FALSE    FALSE           FALSE             FALSE
## 5          FALSE           FALSE    FALSE           FALSE             FALSE
## 6          FALSE           FALSE    FALSE           FALSE             FALSE
##   seed.size_imp shriveling_imp roots_imp
## 1         FALSE          FALSE     FALSE
## 2         FALSE          FALSE     FALSE
## 3         FALSE          FALSE     FALSE
## 4         FALSE          FALSE     FALSE
## 5         FALSE          FALSE     FALSE
## 6         FALSE          FALSE     FALSE

The missForest package is ideal for tree-based methods like Random Forest, which can handle missing data natively. It predicts missing values by training a random forest model on observed data.

# Load necessary library
library(missForest)
## 
## Attaching package: 'missForest'
## The following object is masked from 'package:VIM':
## 
##     nrmse
# Apply random forest imputation
soybean_data_forest <- missForest(soybean_data)

# Extract the imputed data
soybean_data_forest_imputed <- soybean_data_forest$ximp

# Check for missing values after imputation
colSums(is.na(soybean_data_forest_imputed))
##           Class            date     plant.stand          precip            temp 
##               0               0               0               0               0 
##            hail       crop.hist        area.dam           sever        seed.tmt 
##               0               0               0               0               0 
##            germ    plant.growth          leaves       leaf.halo       leaf.marg 
##               0               0               0               0               0 
##       leaf.size     leaf.shread       leaf.malf       leaf.mild            stem 
##               0               0               0               0               0 
##         lodging    stem.cankers   canker.lesion fruiting.bodies       ext.decay 
##               0               0               0               0               0 
##        mycelium    int.discolor       sclerotia      fruit.pods     fruit.spots 
##               0               0               0               0               0 
##            seed     mold.growth   seed.discolor       seed.size      shriveling 
##               0               0               0               0               0 
##           roots 
##               0
# Summarize if there are any columns with missing data
if (any(is.na(soybean_data_forest_imputed))) {
  print("There are still missing values in the dataset.")
} else {
  print("No missing values in the dataset.")
}
## [1] "No missing values in the dataset."
# View the first few rows of the imputed dataset to inspect
head(soybean_data_forest_imputed)
##                   Class date plant.stand precip temp hail crop.hist area.dam
## 1 diaporthe-stem-canker    6           0      2    1    0         1        1
## 2 diaporthe-stem-canker    4           0      2    1    0         2        0
## 3 diaporthe-stem-canker    3           0      2    1    0         1        0
## 4 diaporthe-stem-canker    3           0      2    1    0         1        0
## 5 diaporthe-stem-canker    6           0      2    1    0         2        0
## 6 diaporthe-stem-canker    5           0      2    1    0         3        0
##   sever seed.tmt germ plant.growth leaves leaf.halo leaf.marg leaf.size
## 1     1        0    0            1      1         0         2         2
## 2     2        1    1            1      1         0         2         2
## 3     2        1    2            1      1         0         2         2
## 4     2        0    1            1      1         0         2         2
## 5     1        0    2            1      1         0         2         2
## 6     1        0    1            1      1         0         2         2
##   leaf.shread leaf.malf leaf.mild stem lodging stem.cankers canker.lesion
## 1           0         0         0    1       1            3             1
## 2           0         0         0    1       0            3             1
## 3           0         0         0    1       0            3             0
## 4           0         0         0    1       0            3             0
## 5           0         0         0    1       0            3             1
## 6           0         0         0    1       0            3             0
##   fruiting.bodies ext.decay mycelium int.discolor sclerotia fruit.pods
## 1               1         1        0            0         0          0
## 2               1         1        0            0         0          0
## 3               1         1        0            0         0          0
## 4               1         1        0            0         0          0
## 5               1         1        0            0         0          0
## 6               1         1        0            0         0          0
##   fruit.spots seed mold.growth seed.discolor seed.size shriveling roots
## 1           4    0           0             0         0          0     0
## 2           4    0           0             0         0          0     0
## 3           4    0           0             0         0          0     0
## 4           4    0           0             0         0          0     0
## 5           4    0           0             0         0          0     0
## 6           4    0           0             0         0          0     0
  1. Incorporate Imputation in Cross-Validation: Ensure imputation is done within each fold of cross-validation to avoid bias. Set up Cross-Validation with Imputation
# Load necessary libraries
library(caret)
library(VIM)          # For KNN imputation
library(mice)         # For Regression imputation
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(missForest)   # For Random Forest imputation
library(randomForest) # For Random Forest model
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:seasonal':
## 
##     outlier
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
# Define the control for cross-validation
folds <- createFolds(soybean_data$Class, k = 5, list = TRUE)

# Initialize an empty list to store performance metrics
results <- list()

# Custom function to train and evaluate models using KNN imputation
knn_imputation_cv <- function(train_data, test_data) {
  # Impute missing values in the training data using KNN
  train_imputed <- kNN(train_data, k = 5)
  
  # Fit the model (Random Forest in this case)
  model <- randomForest(Class ~ ., data = train_imputed)
  
  # Impute the missing values in the test data
  test_imputed <- kNN(test_data, k = 5)
  
  # Make predictions on the test data
  predictions <- predict(model, test_imputed)
  
  # Evaluate accuracy
  accuracy <- mean(predictions == test_imputed$Class)
  return(accuracy)
}

# Perform Cross-Validation with Imputation
for (i in 1:length(folds)) {
  # Split the data into training and validation sets
  train_idx <- folds[[i]]
  train_data <- soybean_data[train_idx, ]
  test_data <- soybean_data[-train_idx, ]
  
  # Apply KNN Imputation and train the model
  fold_result <- knn_imputation_cv(train_data, test_data)
  
  # Store the result
  results[[i]] <- fold_result
}

# Print the average accuracy across folds
mean(unlist(results))
## [1] 0.9022625
  1. Handle Informative Missingness: If missing data is related to the outcome, more advanced techniques may be required.
# Adding missingness indicators to the dataset
add_missing_indicators <- function(data) {
  for (col in colnames(data)) {
    # Check for missing values
    if (any(is.na(data[[col]]))) {
      # Create a new binary column to indicate missingness
      data[[paste0(col, "_missing")]] <- ifelse(is.na(data[[col]]), 1, 0)
    }
  }
  return(data)
}

# Apply the function to the dataset to add missingness indicators
soybean_data_with_indicators <- add_missing_indicators(soybean_data)
head(soybean_data_with_indicators)
##                   Class date plant.stand precip temp hail crop.hist area.dam
## 1 diaporthe-stem-canker    6           0      2    1    0         1        1
## 2 diaporthe-stem-canker    4           0      2    1    0         2        0
## 3 diaporthe-stem-canker    3           0      2    1    0         1        0
## 4 diaporthe-stem-canker    3           0      2    1    0         1        0
## 5 diaporthe-stem-canker    6           0      2    1    0         2        0
## 6 diaporthe-stem-canker    5           0      2    1    0         3        0
##   sever seed.tmt germ plant.growth leaves leaf.halo leaf.marg leaf.size
## 1     1        0    0            1      1         0         2         2
## 2     2        1    1            1      1         0         2         2
## 3     2        1    2            1      1         0         2         2
## 4     2        0    1            1      1         0         2         2
## 5     1        0    2            1      1         0         2         2
## 6     1        0    1            1      1         0         2         2
##   leaf.shread leaf.malf leaf.mild stem lodging stem.cankers canker.lesion
## 1           0         0         0    1       1            3             1
## 2           0         0         0    1       0            3             1
## 3           0         0         0    1       0            3             0
## 4           0         0         0    1       0            3             0
## 5           0         0         0    1       0            3             1
## 6           0         0         0    1       0            3             0
##   fruiting.bodies ext.decay mycelium int.discolor sclerotia fruit.pods
## 1               1         1        0            0         0          0
## 2               1         1        0            0         0          0
## 3               1         1        0            0         0          0
## 4               1         1        0            0         0          0
## 5               1         1        0            0         0          0
## 6               1         1        0            0         0          0
##   fruit.spots seed mold.growth seed.discolor seed.size shriveling roots
## 1           4    0           0             0         0          0     0
## 2           4    0           0             0         0          0     0
## 3           4    0           0             0         0          0     0
## 4           4    0           0             0         0          0     0
## 5           4    0           0             0         0          0     0
## 6           4    0           0             0         0          0     0
##   date_missing plant.stand_missing precip_missing temp_missing hail_missing
## 1            0                   0              0            0            0
## 2            0                   0              0            0            0
## 3            0                   0              0            0            0
## 4            0                   0              0            0            0
## 5            0                   0              0            0            0
## 6            0                   0              0            0            0
##   crop.hist_missing area.dam_missing sever_missing seed.tmt_missing
## 1                 0                0             0                0
## 2                 0                0             0                0
## 3                 0                0             0                0
## 4                 0                0             0                0
## 5                 0                0             0                0
## 6                 0                0             0                0
##   germ_missing plant.growth_missing leaf.halo_missing leaf.marg_missing
## 1            0                    0                 0                 0
## 2            0                    0                 0                 0
## 3            0                    0                 0                 0
## 4            0                    0                 0                 0
## 5            0                    0                 0                 0
## 6            0                    0                 0                 0
##   leaf.size_missing leaf.shread_missing leaf.malf_missing leaf.mild_missing
## 1                 0                   0                 0                 0
## 2                 0                   0                 0                 0
## 3                 0                   0                 0                 0
## 4                 0                   0                 0                 0
## 5                 0                   0                 0                 0
## 6                 0                   0                 0                 0
##   stem_missing lodging_missing stem.cankers_missing canker.lesion_missing
## 1            0               0                    0                     0
## 2            0               0                    0                     0
## 3            0               0                    0                     0
## 4            0               0                    0                     0
## 5            0               0                    0                     0
## 6            0               0                    0                     0
##   fruiting.bodies_missing ext.decay_missing mycelium_missing
## 1                       0                 0                0
## 2                       0                 0                0
## 3                       0                 0                0
## 4                       0                 0                0
## 5                       0                 0                0
## 6                       0                 0                0
##   int.discolor_missing sclerotia_missing fruit.pods_missing fruit.spots_missing
## 1                    0                 0                  0                   0
## 2                    0                 0                  0                   0
## 3                    0                 0                  0                   0
## 4                    0                 0                  0                   0
## 5                    0                 0                  0                   0
## 6                    0                 0                  0                   0
##   seed_missing mold.growth_missing seed.discolor_missing seed.size_missing
## 1            0                   0                     0                 0
## 2            0                   0                     0                 0
## 3            0                   0                     0                 0
## 4            0                   0                     0                 0
## 5            0                   0                     0                 0
## 6            0                   0                     0                 0
##   shriveling_missing roots_missing
## 1                  0             0
## 2                  0             0
## 3                  0             0
## 4                  0             0
## 5                  0             0
## 6                  0             0