{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE)

Load Required Libraries

options(repos = c(CRAN = "https://cloud.r-project.org"))

# Function to install a package if not already installed
install_if_needed <- function(pkg) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)

  }
}

# List of packages to check and install if necessary
required_packages <- c("fpp3", "dplyr", "ggplot2", "lubridate", "tsibble", 
                       "tsibbledata", "feasts", "fable", "fabletools", 
                       "curl", "USgas", "readxl", "readr", "tidyr", "forecast",
                       "seasonal", "patchwork", "LaTeX", "tinytex", "mlbench")

# Loop through the list and install packages only if needed
for (pkg in required_packages) {
  install_if_needed(pkg)
}
## Registered S3 method overwritten by 'tsibble':
##   method               from 
##   as_tibble.grouped_df dplyr
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## Installing package into 'C:/Users/Sahr Kassoh/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## Warning: package 'LaTeX' is not available for this version of R
## 
## A version of this package for your version of R might be available elsewhere,
## see the ideas at
## https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages
# Function to suppress package startup messages
suppressPackageStartupMessages({
  library(fpp3)
  library(dplyr)
  library(ggplot2)
  library(lubridate)
  library(tsibble)
  library(tsibbledata)
  library(feasts)
  library(fable)
  library(fabletools)
  library(readr)
  library(readxl)
  library(tidyr)
  library(forecast)
  library(seasonal)
  library(patchwork)
  library(tinytex)
  library(mlbench)
})
## Warning: package 'fpp3' was built under R version 4.4.1
## Warning: package 'tibble' was built under R version 4.4.1
## Warning: package 'dplyr' was built under R version 4.4.1
## Warning: package 'tidyr' was built under R version 4.4.1
## Warning: package 'lubridate' was built under R version 4.4.1
## Warning: package 'ggplot2' was built under R version 4.4.1
## Warning: package 'tsibble' was built under R version 4.4.1
## Warning: package 'tsibbledata' was built under R version 4.4.1
## Warning: package 'feasts' was built under R version 4.4.1
## Warning: package 'fabletools' was built under R version 4.4.1
## Warning: package 'fable' was built under R version 4.4.1
## Warning: package 'readr' was built under R version 4.4.1
## Warning: package 'readxl' was built under R version 4.4.1
## Warning: package 'forecast' was built under R version 4.4.1
## Warning: package 'seasonal' was built under R version 4.4.1
## Warning: package 'patchwork' was built under R version 4.4.1
## Warning: package 'tinytex' was built under R version 4.4.1
## Warning: package 'mlbench' was built under R version 4.4.1
# Load necessary libraries 
library(mlbench) 
library(ggplot2) 
library(GGally) 
## Warning: package 'GGally' was built under R version 4.4.1
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

Exercise 3.1.

# Preview the data
data(Glass)
glass_data <- Glass
str(glass_data)
## 'data.frame':    214 obs. of  10 variables:
##  $ RI  : num  1.52 1.52 1.52 1.52 1.52 ...
##  $ Na  : num  13.6 13.9 13.5 13.2 13.3 ...
##  $ Mg  : num  4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
##  $ Al  : num  1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
##  $ Si  : num  71.8 72.7 73 72.6 73.1 ...
##  $ K   : num  0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
##  $ Ca  : num  8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
##  $ Ba  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Fe  : num  0 0 0 0 0 0.26 0 0 0 0.11 ...
##  $ Type: Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...
# Check for missing values in each column
missing_data <- colSums(is.na(glass_data))

# Print out columns with missing data
if (any(missing_data > 0)) {
  cat("Columns with missing data:\n")
  print(missing_data[missing_data > 0])
} else {
  cat("There are no missing values in the dataset.\n")
}
## There are no missing values in the dataset.

a. Explore the predictor variables

Using visualizations, explore the predictor variables to understand their distributions as well as the relationships between predictors.

1. Plot individual histograms for each predictor to explore distributions

# Load necessary library
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.4.1
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
# Melt the glass data to long format
glass_melted <- melt(glass_data, id.vars = "Type")

# Plot histograms for each predictor
ggplot(glass_melted, aes(x = value)) +
  geom_histogram(bins = 30, fill = "skyblue", color = "black") +
  facet_wrap(~ variable, scales = "free") +
  theme_minimal() +
  labs(title = "Distribution of Predictor Variables")

2. Draw box plot to display the distribution, central tendency, and variability of a dataset, highlighting the median, quartiles, and potential outliers.

# Individual box plot for each predictor to explore distributions
glass_melted <- melt(glass_data, id.vars = "Type")
ggplot(glass_melted, aes(x = value)) +
  geom_boxplot(bins = 30, fill = "skyblue", color = "black") +
  facet_wrap(~ variable, scales = "free") +
  theme_minimal() +
  labs(title = "Distribution of Predictor Variables")
## Warning in geom_boxplot(bins = 30, fill = "skyblue", color = "black"): Ignoring
## unknown parameters: `bins`

#### 3. Plot scatter plot to visualize any linear relationshp

# Individual scatter plot with lm line for each predictor to explore relationships
glass_melted <- melt(glass_data, id.vars = "Type")
ggplot(glass_melted, aes(x = as.numeric(value), y = as.numeric(Type))) +
  geom_point(color = "black") +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  facet_wrap(~ variable, scales = "free_x") +  # Facet by variable with independent x-axis scaling
  theme_minimal() +
  labs(title = "Scatter Plots of Predictor Variables with Linear Model Line", x = "Value", y = "Type")
## `geom_smooth()` using formula = 'y ~ x'

4. Plot Pair plot to explore relationships and distributions between predictors

# And excluding the Type variable as it's the target variable 
ggpairs(Glass[, -10]) + theme_bw() 

## b. Do there appear to be any outliers in the data? Are any predictors skewed?

From the visualizations, we can make some observations about potential outliers and skewness in the predictor variables:

1. Outliers

  • Boxplots: Several variables show potential outliers, as evidenced by the dots outside the whiskers of the boxplots. Notable outliers are visible in:
    • Mg: There are points far outside the main box, indicating extreme values.
    • K: There are a few outliers on both the low and high ends.
    • Ba: Contains some extreme outliers above the typical range.
    • Fe: A few data points exist far outside the whiskers, indicating potential outliers.

2. Skewness:

  • Histograms: Several predictors show significant skewness. Specifically:
    • Ca and Ba: These are highly right-skewed, with many values clustered towards the left and a long tail to the right.
    • Fe: Highly right-skewed, with most values near zero and a few higher values extending the tail.
    • K: This predictor also appears to be right-skewed.
    • Mg: Shows slight right-skewness.
  • Normality: Some variables such as Si and Na show more symmetric distributions, closer to normality.

c. Transformations

Are there any relevant transformations of one or more predictors that might improve the classification model?

Here are some transformations that might be relevant for improving the classification model:

1. Centering and Scaling

  • Centering and scaling the data ensures that all predictor variables are on a common scale, which is especially important for models sensitive to predictor magnitudes like linear models and SVMs.
  • This is recommended for variables like RI, Na, Si, etc., especially if they vary across different scales.
# Exclude the 'Type' column (non-numeric) and scale the numeric columns
glass_data_numeric <- glass_data[, -ncol(glass_data)]  # Exclude the last column ('Type')
glass_data_scaled <- scale(glass_data_numeric)  # Scale the numeric columns

# Combine the scaled numeric data with the 'Type' column
glass_data_scaled <- data.frame(glass_data_scaled, Type = glass_data$Type)

# Check the first few rows of the scaled data
head(glass_data_scaled)
##           RI         Na        Mg         Al          Si           K         Ca
## 1  0.8708258  0.2842867 1.2517037 -0.6908222 -1.12444556 -0.67013422 -0.1454254
## 2 -0.2487502  0.5904328 0.6346799 -0.1700615  0.10207972 -0.02615193 -0.7918771
## 3 -0.7196308  0.1495824 0.6000157  0.1904651  0.43776033 -0.16414813 -0.8270103
## 4 -0.2322859 -0.2422846 0.6970756 -0.3102663 -0.05284979  0.11184428 -0.5178378
## 5 -0.3113148 -0.1688095 0.6485456 -0.4104126  0.55395746  0.08117845 -0.6232375
## 6 -0.7920739 -0.7566101 0.6416128  0.3506992  0.41193874  0.21917466 -0.6232375
##           Ba         Fe Type
## 1 -0.3520514 -0.5850791    1
## 2 -0.3520514 -0.5850791    1
## 3 -0.3520514 -0.5850791    1
## 4 -0.3520514 -0.5850791    1
## 5 -0.3520514 -0.5850791    1
## 6 -0.3520514  2.0832652    1

2. Skewness Transformation

  • Skewed distributions, particularly right-skewed ones like Ca, Ba, and Fe, can benefit from transformations to make the data more symmetric.
  • Transformations like log, square root, or Box-Cox can be used to address skewness, as this can improve model performance, especially for models that assume normally distributed inputs.
## Example in Box-Cox transformation:
library(caret)
## Warning: package 'caret' was built under R version 4.4.1
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following objects are masked from 'package:fabletools':
## 
##     MAE, RMSE
# Apply Box-Cox transformation to skewed variables
trans <- preProcess(glass_data, method = c("BoxCox"))
glass_data_transformed <- predict(trans, glass_data)

head(glass_data_transformed)
##          RI       Na   Mg        Al       Si    K        Ca Ba   Fe Type
## 1 0.2838746 2.613007 4.49 0.0976177 2575.684 0.06 0.8254539  0 0.00    1
## 2 0.2829051 2.631169 3.60 0.3323808 2644.326 0.48 0.8145827  0 0.00    1
## 3 0.2824954 2.604909 3.55 0.4819347 2663.270 0.39 0.8139144  0 0.00    1
## 4 0.2829194 2.580974 3.69 0.2715633 2635.606 0.57 0.8195032  0 0.00    1
## 5 0.2828507 2.585506 3.62 0.2271057 2669.843 0.55 0.8176698  0 0.00    1
## 6 0.2824323 2.548664 3.61 0.5455844 2661.810 0.64 0.8176698  0 0.26    1

3. Dealing with Outliers

  • Outliers can disproportionately influence model training. Methods like the spatial sign transformation help mitigate the effects of outliers by projecting data onto a common distance from the center.
  • This can be useful for variables like Mg, K, and Ba, where outliers are present.
# Example in R (Spatial Sign Transformation):

library(caret)
glass_data_spatial <- preProcess(glass_data, method = c("spatialSign"))
head(glass_data_spatial)
## $dim
## [1] 214  10
## 
## $bc
## NULL
## 
## $yj
## NULL
## 
## $et
## NULL
## 
## $invHyperbolicSine
## NULL
## 
## $mean
##          RI          Na          Mg          Al          Si           K 
##  1.51836542 13.40785047  2.68453271  1.44490654 72.65093458  0.49705607 
##          Ca          Ba          Fe 
##  8.95696262  0.17504673  0.05700935

Exercise 3.2

# load the data
data(Soybean)
soybean_data <- Soybean
str(Soybean)
## 'data.frame':    683 obs. of  36 variables:
##  $ Class          : Factor w/ 19 levels "2-4-d-injury",..: 11 11 11 11 11 11 11 11 11 11 ...
##  $ date           : Factor w/ 7 levels "0","1","2","3",..: 7 5 4 4 7 6 6 5 7 5 ...
##  $ plant.stand    : Ord.factor w/ 2 levels "0"<"1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ precip         : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
##  $ temp           : Ord.factor w/ 3 levels "0"<"1"<"2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ hail           : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 1 1 ...
##  $ crop.hist      : Factor w/ 4 levels "0","1","2","3": 2 3 2 2 3 4 3 2 4 3 ...
##  $ area.dam       : Factor w/ 4 levels "0","1","2","3": 2 1 1 1 1 1 1 1 1 1 ...
##  $ sever          : Factor w/ 3 levels "0","1","2": 2 3 3 3 2 2 2 2 2 3 ...
##  $ seed.tmt       : Factor w/ 3 levels "0","1","2": 1 2 2 1 1 1 2 1 2 1 ...
##  $ germ           : Ord.factor w/ 3 levels "0"<"1"<"2": 1 2 3 2 3 2 1 3 2 3 ...
##  $ plant.growth   : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ leaves         : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ leaf.halo      : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ leaf.marg      : Factor w/ 3 levels "0","1","2": 3 3 3 3 3 3 3 3 3 3 ...
##  $ leaf.size      : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
##  $ leaf.shread    : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ leaf.malf      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ leaf.mild      : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ stem           : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ lodging        : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 2 1 1 1 ...
##  $ stem.cankers   : Factor w/ 4 levels "0","1","2","3": 4 4 4 4 4 4 4 4 4 4 ...
##  $ canker.lesion  : Factor w/ 4 levels "0","1","2","3": 2 2 1 1 2 1 2 2 2 2 ...
##  $ fruiting.bodies: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ ext.decay      : Factor w/ 3 levels "0","1","2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ mycelium       : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ int.discolor   : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ sclerotia      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ fruit.pods     : Factor w/ 4 levels "0","1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
##  $ fruit.spots    : Factor w/ 4 levels "0","1","2","4": 4 4 4 4 4 4 4 4 4 4 ...
##  $ seed           : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ mold.growth    : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ seed.discolor  : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ seed.size      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ shriveling     : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ roots          : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...

a. Investigate the frequency distributions for the categorical predictors. Are any of the distributions degenerate in the ways discussed earlier in this chapter?

# Identify the frequency distribution of each factor predictor
freq_tables_list <- list()

# Loop through all columns to calculate the frequency distribution
for (col in colnames(soybean_data)) {
  freq_table <- table(soybean_data[[col]])  # Create a frequency table for each column
  freq_df <- as.data.frame(freq_table)      # Convert the table to a data frame
  colnames(freq_df) <- c("Category", "Frequency")  # Rename columns for clarity
  freq_df$Predictor <- col                   # Add the predictor name
  freq_tables_list[[col]] <- freq_df         # Store the frequency table in a list
}

# Combine all frequency tables into one data frame
all_freq_tables <- do.call(rbind, freq_tables_list)

# View the combined frequency table
head(all_freq_tables)
##                    Category Frequency Predictor
## Class.1        2-4-d-injury        16     Class
## Class.2 alternarialeaf-spot        91     Class
## Class.3         anthracnose        44     Class
## Class.4    bacterial-blight        20     Class
## Class.5   bacterial-pustule        20     Class
## Class.6          brown-spot        92     Class

Looking at the frequency distribution table, we can evaluate whether any of the distributions are degenerate, meaning they provide little variability or are heavily skewed to one category, as discussed in the chapter on near-zero variance predictors in the book.

Identifying Degenerate Distributions: A degenerate distribution would have the majority of observations concentrated in one or two categories, making it potentially less useful for modeling. Based on the table above:

  • “alternarialeaf-spot” (91) and “brown-spot” (92) have high frequencies compared to other classes, while “2-4-d injury” (16) is quite rare.
  • Imbalance: There is some imbalance in the distribution, with a majority of data points falling into a few categories (i.e., “alternarialeaf-spot” and “brown-spot”). If most observations fall into only one or two categories, the model might have difficulty distinguishing between the other classes, potentially leading to poor generalization.

However, the distributions are not fully degenerate in the strictest sense but there is class imbalance, which could affect model performance.

c. Develop a strategy for handling missing data, either by eliminating predictors or imputation.

To handle missing data, follow these strategies based on the book:

  1. Remove Predictors: Eliminate predictors with extensive missing data if they don’t contribute much to the model.

  2. Imputation:

    • K-Nearest Neighbors (KNN): Impute missing values using the average of similar data points.
    • Regression Imputation: Use correlated predictors to estimate missing values.
    • Tree-Based Methods: Consider models like Random Forest, which handle missing data natively.
  3. Incorporate Imputation in Cross-Validation: Ensure imputation is done within each fold of cross-validation to avoid bias.

  4. Handle Informative Missingness: If missing data is related to the outcome, more advanced techniques may be required.