{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE)
options(repos = c(CRAN = "https://cloud.r-project.org"))
# Function to install a package if not already installed
install_if_needed <- function(pkg) {
if (!requireNamespace(pkg, quietly = TRUE)) {
install.packages(pkg)
}
}
# List of packages to check and install if necessary
required_packages <- c("fpp3", "dplyr", "ggplot2", "lubridate", "tsibble",
"tsibbledata", "feasts", "fable", "fabletools",
"curl", "USgas", "readxl", "readr", "tidyr", "forecast",
"seasonal", "patchwork", "LaTeX", "tinytex", "mlbench")
# Loop through the list and install packages only if needed
for (pkg in required_packages) {
install_if_needed(pkg)
}
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Installing package into 'C:/Users/Sahr Kassoh/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## Warning: package 'LaTeX' is not available for this version of R
##
## A version of this package for your version of R might be available elsewhere,
## see the ideas at
## https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages
# Function to suppress package startup messages
suppressPackageStartupMessages({
library(fpp3)
library(dplyr)
library(ggplot2)
library(lubridate)
library(tsibble)
library(tsibbledata)
library(feasts)
library(fable)
library(fabletools)
library(readr)
library(readxl)
library(tidyr)
library(forecast)
library(seasonal)
library(patchwork)
library(tinytex)
library(mlbench)
})
## Warning: package 'fpp3' was built under R version 4.4.1
## Warning: package 'tibble' was built under R version 4.4.1
## Warning: package 'dplyr' was built under R version 4.4.1
## Warning: package 'tidyr' was built under R version 4.4.1
## Warning: package 'lubridate' was built under R version 4.4.1
## Warning: package 'ggplot2' was built under R version 4.4.1
## Warning: package 'tsibble' was built under R version 4.4.1
## Warning: package 'tsibbledata' was built under R version 4.4.1
## Warning: package 'feasts' was built under R version 4.4.1
## Warning: package 'fabletools' was built under R version 4.4.1
## Warning: package 'fable' was built under R version 4.4.1
## Warning: package 'readr' was built under R version 4.4.1
## Warning: package 'readxl' was built under R version 4.4.1
## Warning: package 'forecast' was built under R version 4.4.1
## Warning: package 'seasonal' was built under R version 4.4.1
## Warning: package 'patchwork' was built under R version 4.4.1
## Warning: package 'tinytex' was built under R version 4.4.1
## Warning: package 'mlbench' was built under R version 4.4.1
# Load necessary libraries
library(mlbench)
library(ggplot2)
library(GGally)
## Warning: package 'GGally' was built under R version 4.4.1
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Preview the data
data(Glass)
glass_data <- Glass
str(glass_data)
## 'data.frame': 214 obs. of 10 variables:
## $ RI : num 1.52 1.52 1.52 1.52 1.52 ...
## $ Na : num 13.6 13.9 13.5 13.2 13.3 ...
## $ Mg : num 4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
## $ Al : num 1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
## $ Si : num 71.8 72.7 73 72.6 73.1 ...
## $ K : num 0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
## $ Ca : num 8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
## $ Ba : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Fe : num 0 0 0 0 0 0.26 0 0 0 0.11 ...
## $ Type: Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...
# Check for missing values in each column
missing_data <- colSums(is.na(glass_data))
# Print out columns with missing data
if (any(missing_data > 0)) {
cat("Columns with missing data:\n")
print(missing_data[missing_data > 0])
} else {
cat("There are no missing values in the dataset.\n")
}
## There are no missing values in the dataset.
Using visualizations, explore the predictor variables to understand their distributions as well as the relationships between predictors.
# Load necessary library
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.4.1
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
# Melt the glass data to long format
glass_melted <- melt(glass_data, id.vars = "Type")
# Plot histograms for each predictor
ggplot(glass_melted, aes(x = value)) +
geom_histogram(bins = 30, fill = "skyblue", color = "black") +
facet_wrap(~ variable, scales = "free") +
theme_minimal() +
labs(title = "Distribution of Predictor Variables")
# Individual box plot for each predictor to explore distributions
glass_melted <- melt(glass_data, id.vars = "Type")
ggplot(glass_melted, aes(x = value)) +
geom_boxplot(bins = 30, fill = "skyblue", color = "black") +
facet_wrap(~ variable, scales = "free") +
theme_minimal() +
labs(title = "Distribution of Predictor Variables")
## Warning in geom_boxplot(bins = 30, fill = "skyblue", color = "black"): Ignoring
## unknown parameters: `bins`
#### 3. Plot scatter plot to visualize any linear relationshp
# Individual scatter plot with lm line for each predictor to explore relationships
glass_melted <- melt(glass_data, id.vars = "Type")
ggplot(glass_melted, aes(x = as.numeric(value), y = as.numeric(Type))) +
geom_point(color = "black") +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
facet_wrap(~ variable, scales = "free_x") + # Facet by variable with independent x-axis scaling
theme_minimal() +
labs(title = "Scatter Plots of Predictor Variables with Linear Model Line", x = "Value", y = "Type")
## `geom_smooth()` using formula = 'y ~ x'
# And excluding the Type variable as it's the target variable
ggpairs(Glass[, -10]) + theme_bw()
## b. Do there appear to be any outliers in the data? Are any predictors
skewed?
From the visualizations, we can make some observations about potential outliers and skewness in the predictor variables:
Are there any relevant transformations of one or more predictors that might improve the classification model?
Here are some transformations that might be relevant for improving the classification model:
# Exclude the 'Type' column (non-numeric) and scale the numeric columns
glass_data_numeric <- glass_data[, -ncol(glass_data)] # Exclude the last column ('Type')
glass_data_scaled <- scale(glass_data_numeric) # Scale the numeric columns
# Combine the scaled numeric data with the 'Type' column
glass_data_scaled <- data.frame(glass_data_scaled, Type = glass_data$Type)
# Check the first few rows of the scaled data
head(glass_data_scaled)
## RI Na Mg Al Si K Ca
## 1 0.8708258 0.2842867 1.2517037 -0.6908222 -1.12444556 -0.67013422 -0.1454254
## 2 -0.2487502 0.5904328 0.6346799 -0.1700615 0.10207972 -0.02615193 -0.7918771
## 3 -0.7196308 0.1495824 0.6000157 0.1904651 0.43776033 -0.16414813 -0.8270103
## 4 -0.2322859 -0.2422846 0.6970756 -0.3102663 -0.05284979 0.11184428 -0.5178378
## 5 -0.3113148 -0.1688095 0.6485456 -0.4104126 0.55395746 0.08117845 -0.6232375
## 6 -0.7920739 -0.7566101 0.6416128 0.3506992 0.41193874 0.21917466 -0.6232375
## Ba Fe Type
## 1 -0.3520514 -0.5850791 1
## 2 -0.3520514 -0.5850791 1
## 3 -0.3520514 -0.5850791 1
## 4 -0.3520514 -0.5850791 1
## 5 -0.3520514 -0.5850791 1
## 6 -0.3520514 2.0832652 1
## Example in Box-Cox transformation:
library(caret)
## Warning: package 'caret' was built under R version 4.4.1
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following objects are masked from 'package:fabletools':
##
## MAE, RMSE
# Apply Box-Cox transformation to skewed variables
trans <- preProcess(glass_data, method = c("BoxCox"))
glass_data_transformed <- predict(trans, glass_data)
head(glass_data_transformed)
## RI Na Mg Al Si K Ca Ba Fe Type
## 1 0.2838746 2.613007 4.49 0.0976177 2575.684 0.06 0.8254539 0 0.00 1
## 2 0.2829051 2.631169 3.60 0.3323808 2644.326 0.48 0.8145827 0 0.00 1
## 3 0.2824954 2.604909 3.55 0.4819347 2663.270 0.39 0.8139144 0 0.00 1
## 4 0.2829194 2.580974 3.69 0.2715633 2635.606 0.57 0.8195032 0 0.00 1
## 5 0.2828507 2.585506 3.62 0.2271057 2669.843 0.55 0.8176698 0 0.00 1
## 6 0.2824323 2.548664 3.61 0.5455844 2661.810 0.64 0.8176698 0 0.26 1
# Example in R (Spatial Sign Transformation):
library(caret)
glass_data_spatial <- preProcess(glass_data, method = c("spatialSign"))
head(glass_data_spatial)
## $dim
## [1] 214 10
##
## $bc
## NULL
##
## $yj
## NULL
##
## $et
## NULL
##
## $invHyperbolicSine
## NULL
##
## $mean
## RI Na Mg Al Si K
## 1.51836542 13.40785047 2.68453271 1.44490654 72.65093458 0.49705607
## Ca Ba Fe
## 8.95696262 0.17504673 0.05700935
# load the data
data(Soybean)
soybean_data <- Soybean
str(Soybean)
## 'data.frame': 683 obs. of 36 variables:
## $ Class : Factor w/ 19 levels "2-4-d-injury",..: 11 11 11 11 11 11 11 11 11 11 ...
## $ date : Factor w/ 7 levels "0","1","2","3",..: 7 5 4 4 7 6 6 5 7 5 ...
## $ plant.stand : Ord.factor w/ 2 levels "0"<"1": 1 1 1 1 1 1 1 1 1 1 ...
## $ precip : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
## $ temp : Ord.factor w/ 3 levels "0"<"1"<"2": 2 2 2 2 2 2 2 2 2 2 ...
## $ hail : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 1 1 ...
## $ crop.hist : Factor w/ 4 levels "0","1","2","3": 2 3 2 2 3 4 3 2 4 3 ...
## $ area.dam : Factor w/ 4 levels "0","1","2","3": 2 1 1 1 1 1 1 1 1 1 ...
## $ sever : Factor w/ 3 levels "0","1","2": 2 3 3 3 2 2 2 2 2 3 ...
## $ seed.tmt : Factor w/ 3 levels "0","1","2": 1 2 2 1 1 1 2 1 2 1 ...
## $ germ : Ord.factor w/ 3 levels "0"<"1"<"2": 1 2 3 2 3 2 1 3 2 3 ...
## $ plant.growth : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ leaves : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ leaf.halo : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ leaf.marg : Factor w/ 3 levels "0","1","2": 3 3 3 3 3 3 3 3 3 3 ...
## $ leaf.size : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
## $ leaf.shread : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ leaf.malf : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ leaf.mild : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ stem : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ lodging : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 2 1 1 1 ...
## $ stem.cankers : Factor w/ 4 levels "0","1","2","3": 4 4 4 4 4 4 4 4 4 4 ...
## $ canker.lesion : Factor w/ 4 levels "0","1","2","3": 2 2 1 1 2 1 2 2 2 2 ...
## $ fruiting.bodies: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ ext.decay : Factor w/ 3 levels "0","1","2": 2 2 2 2 2 2 2 2 2 2 ...
## $ mycelium : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ int.discolor : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ sclerotia : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ fruit.pods : Factor w/ 4 levels "0","1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
## $ fruit.spots : Factor w/ 4 levels "0","1","2","4": 4 4 4 4 4 4 4 4 4 4 ...
## $ seed : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ mold.growth : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ seed.discolor : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ seed.size : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ shriveling : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ roots : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
# Identify the frequency distribution of each factor predictor
freq_tables_list <- list()
# Loop through all columns to calculate the frequency distribution
for (col in colnames(soybean_data)) {
freq_table <- table(soybean_data[[col]]) # Create a frequency table for each column
freq_df <- as.data.frame(freq_table) # Convert the table to a data frame
colnames(freq_df) <- c("Category", "Frequency") # Rename columns for clarity
freq_df$Predictor <- col # Add the predictor name
freq_tables_list[[col]] <- freq_df # Store the frequency table in a list
}
# Combine all frequency tables into one data frame
all_freq_tables <- do.call(rbind, freq_tables_list)
# View the combined frequency table
head(all_freq_tables)
## Category Frequency Predictor
## Class.1 2-4-d-injury 16 Class
## Class.2 alternarialeaf-spot 91 Class
## Class.3 anthracnose 44 Class
## Class.4 bacterial-blight 20 Class
## Class.5 bacterial-pustule 20 Class
## Class.6 brown-spot 92 Class
Looking at the frequency distribution table, we can evaluate whether any of the distributions are degenerate, meaning they provide little variability or are heavily skewed to one category, as discussed in the chapter on near-zero variance predictors in the book.
Identifying Degenerate Distributions: A degenerate distribution would have the majority of observations concentrated in one or two categories, making it potentially less useful for modeling. Based on the table above:
However, the distributions are not fully degenerate in the strictest sense but there is class imbalance, which could affect model performance.
To handle missing data, follow these strategies based on the book:
Remove Predictors: Eliminate predictors with extensive missing data if they don’t contribute much to the model.
Imputation:
Incorporate Imputation in Cross-Validation: Ensure imputation is done within each fold of cross-validation to avoid bias.
Handle Informative Missingness: If missing data is related to the outcome, more advanced techniques may be required.