Load packages

library(mlbench)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(caret)
## Loading required package: lattice
library(tidyr)
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ lubridate 1.9.4     ✔ stringr   1.5.1
## ✔ purrr     1.0.4     ✔ tibble    3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ purrr::lift()   masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(e1071)

Accessing the Glass Data

data("Glass")
str(Glass)
## 'data.frame':    214 obs. of  10 variables:
##  $ RI  : num  1.52 1.52 1.52 1.52 1.52 ...
##  $ Na  : num  13.6 13.9 13.5 13.2 13.3 ...
##  $ Mg  : num  4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
##  $ Al  : num  1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
##  $ Si  : num  71.8 72.7 73 72.6 73.1 ...
##  $ K   : num  0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
##  $ Ca  : num  8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
##  $ Ba  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Fe  : num  0 0 0 0 0 0.26 0 0 0 0.11 ...
##  $ Type: Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...
head(Glass)
##        RI    Na   Mg   Al    Si    K   Ca Ba   Fe Type
## 1 1.52101 13.64 4.49 1.10 71.78 0.06 8.75  0 0.00    1
## 2 1.51761 13.89 3.60 1.36 72.73 0.48 7.83  0 0.00    1
## 3 1.51618 13.53 3.55 1.54 72.99 0.39 7.78  0 0.00    1
## 4 1.51766 13.21 3.69 1.29 72.61 0.57 8.22  0 0.00    1
## 5 1.51742 13.27 3.62 1.24 73.08 0.55 8.07  0 0.00    1
## 6 1.51596 12.79 3.61 1.62 72.97 0.64 8.07  0 0.26    1
Glass$Type <- as.factor(Glass$Type)

Exploring the predictor variables

to understands their distributions and relationship between predictors. Displaying histogram for each following to analyze.

Glass_long <- Glass %>%
  pivot_longer(cols = -Type, names_to = "Predictor", values_to = "Value")
glimpse(pivot_longer)
## function (data, cols, ..., cols_vary = "fastest", names_to = "name", names_prefix = NULL, 
##     names_sep = NULL, names_pattern = NULL, names_ptypes = NULL, names_transform = NULL, 
##     names_repair = "check_unique", values_to = "value", values_drop_na = FALSE, 
##     values_ptypes = NULL, values_transform = NULL)
predictors <- Glass[, -ncol(Glass)] 
predictor_names <- colnames(predictors)

Predictors histogram

ggplot(Glass_long, aes(x = Value, fill = as.factor(Type))) +
  geom_histogram(bins = 30, alpha = 0.7, position = "identity") +
  facet_wrap(~Predictor, scales = "free") +
  theme_minimal() +
  labs(title = "Distribution of Predictors in Glass Dataset", x = "Value", y = "Count", fill = "Glass Type")

for (var in predictor_names) {
  p <- ggplot(Glass, aes_string(x = var)) +
    geom_histogram(bins = 20, fill = 'green', color = 'purple') +
    theme_minimal() +
    ggtitle(paste('Histogram of', var))
  print(p)
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Detecting the corelation and relationship between preditors using paiwise scatter plots.

subset_predictors <- predictors[, 1:4] 


ggpairs(subset_predictors, 
        upper = list(continuous = "smooth"),
        lower = list(continuous = "points"),
        diag = list(continuous = "densityDiag"),
        mapping = aes(color = Glass$Type)) +
  theme_minimal() + 
  theme(legend.position = "bottom") + 
  ggtitle("Pairwise Plot for Selected Predictors") +
  theme(plot.title = element_text(hjust = 0.5))

Detecting outlier with boxplots.

Glass_long %>%
  ggplot(aes(x = Predictor, y = Value, fill = Predictor)) +
  geom_boxplot(outlier.colour = "orange", outlier.shape = 8) +
  theme_minimal() +
  coord_flip() +
  labs(title = "Boxplots for Outlier Detection in Glass Dataset")

# Utilizzing boxblot for each preditors to detect outlier.

for (var in predictor_names) {
  p <- ggplot(Glass, aes_string(y = var)) +
    geom_boxplot(fill = 'gold', color = 'black', outlier.color = 'red', outlier.size = 2) +
    theme_minimal() +
    ggtitle(paste('Boxplot of', var)) +
    theme(plot.title = element_text(hjust = 0.5))
  print(p)  # Print each boxplot separately
}

# logging transformation for skewness variables.

Glass_transformed <- Glass %>%
  mutate(across(where(is.numeric), ~ifelse(sd(.) > 0, log1p(.), .)))

displaying ouliers and skewess

for Relevance transformation that could improve the classification model

apply(Glass[, sapply(Glass, is.numeric)], 2, function(x) if(sd(x) > 0) skewness(x) else NA)
##         RI         Na         Mg         Al         Si          K         Ca 
##  1.6027151  0.4478343 -1.1364523  0.8946104 -0.7202392  6.4600889  2.0184463 
##         Ba         Fe 
##  3.3686800  1.7298107
library(e1071)  # Ensure e1071 is loaded
numeric_cols <- Glass %>% select(where(is.numeric))

skew_vals <- apply(numeric_cols, 2, function(x) if(sd(x) > 0) skewness(x) else NA)
print(skew_vals)  
##         RI         Na         Mg         Al         Si          K         Ca 
##  1.6027151  0.4478343 -1.1364523  0.8946104 -0.7202392  6.4600889  2.0184463 
##         Ba         Fe 
##  3.3686800  1.7298107
pillar::glimpse(skew_vals)
##  Named num [1:9] 1.603 0.448 -1.136 0.895 -0.72 ...
##  - attr(*, "names")= chr [1:9] "RI" "Na" "Mg" "Al" ...
summary(skew_vals)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -1.1365  0.4478  1.6027  1.6295  2.0184  6.4601
transformed_data <- Glass  
for (var in predictor_names) {
  if (abs(skew_vals[var]) > 1) {  
    transformed_data[[var]] <- log1p(Glass[[var]])  
    print(paste("Applied log transformation to", var))}} 
## [1] "Applied log transformation to RI"
## [1] "Applied log transformation to Mg"
## [1] "Applied log transformation to K"
## [1] "Applied log transformation to Ca"
## [1] "Applied log transformation to Ba"
## [1] "Applied log transformation to Fe"

Checking the Outcomes of the tramsormed data

transformed_skewness <- apply(Glass_transformed[, -ncol(Glass_transformed)], 2, function(x) if(sd(x) > 0) skewness(x) else NA)
transformed_skewness
## RI Na Mg Al Si  K Ca Ba Fe 
## NA NA NA NA NA NA NA NA NA

The conclusion of the findings are as follow.

  1. The predictor variables have varying distributions, with some appearing skewed.

  2. Outlines are present in various predictors, as seen in the box plot.

  3. Skewed variables were transformed using logarithms to improve the classification model.

Analyzing the Soybean Dataset

Accessing the dataset The string help observing the structure of the data.

data("Soybean")
str(Soybean)
## 'data.frame':    683 obs. of  36 variables:
##  $ Class          : Factor w/ 19 levels "2-4-d-injury",..: 11 11 11 11 11 11 11 11 11 11 ...
##  $ date           : Factor w/ 7 levels "0","1","2","3",..: 7 5 4 4 7 6 6 5 7 5 ...
##  $ plant.stand    : Ord.factor w/ 2 levels "0"<"1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ precip         : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
##  $ temp           : Ord.factor w/ 3 levels "0"<"1"<"2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ hail           : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 1 1 ...
##  $ crop.hist      : Factor w/ 4 levels "0","1","2","3": 2 3 2 2 3 4 3 2 4 3 ...
##  $ area.dam       : Factor w/ 4 levels "0","1","2","3": 2 1 1 1 1 1 1 1 1 1 ...
##  $ sever          : Factor w/ 3 levels "0","1","2": 2 3 3 3 2 2 2 2 2 3 ...
##  $ seed.tmt       : Factor w/ 3 levels "0","1","2": 1 2 2 1 1 1 2 1 2 1 ...
##  $ germ           : Ord.factor w/ 3 levels "0"<"1"<"2": 1 2 3 2 3 2 1 3 2 3 ...
##  $ plant.growth   : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ leaves         : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ leaf.halo      : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ leaf.marg      : Factor w/ 3 levels "0","1","2": 3 3 3 3 3 3 3 3 3 3 ...
##  $ leaf.size      : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
##  $ leaf.shread    : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ leaf.malf      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ leaf.mild      : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ stem           : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ lodging        : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 2 1 1 1 ...
##  $ stem.cankers   : Factor w/ 4 levels "0","1","2","3": 4 4 4 4 4 4 4 4 4 4 ...
##  $ canker.lesion  : Factor w/ 4 levels "0","1","2","3": 2 2 1 1 2 1 2 2 2 2 ...
##  $ fruiting.bodies: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ ext.decay      : Factor w/ 3 levels "0","1","2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ mycelium       : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ int.discolor   : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ sclerotia      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ fruit.pods     : Factor w/ 4 levels "0","1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
##  $ fruit.spots    : Factor w/ 4 levels "0","1","2","4": 4 4 4 4 4 4 4 4 4 4 ...
##  $ seed           : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ mold.growth    : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ seed.discolor  : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ seed.size      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ shriveling     : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ roots          : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...

visualizing Soybean categorical distributions using a ggplot

num_categories <- Soybean %>%
  summarise(across(where(is.factor), ~n_distinct(.)))
num_categories
##   Class date plant.stand precip temp hail crop.hist area.dam sever seed.tmt
## 1    19    8           3      4    4    3         5        5     4        4
##   germ plant.growth leaves leaf.halo leaf.marg leaf.size leaf.shread leaf.malf
## 1    4            3      2         4         4         4           3         3
##   leaf.mild stem lodging stem.cankers canker.lesion fruiting.bodies ext.decay
## 1         4    3       3            5             5               3         4
##   mycelium int.discolor sclerotia fruit.pods fruit.spots seed mold.growth
## 1        3            4         3          5           5    3           3
##   seed.discolor seed.size shriveling roots
## 1             3         3          3     4
Soybean %>%
  count(Class) %>%
  ggplot(aes(x = reorder(Class, -n), y = n, fill = Class)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  coord_flip() +
  labs(title = "Class Distribution in Soybean Dataset", x = "Class", y = "Count")

# In this step will be handing the missing values with data processing

Soybean_clean <- Soybean |>
  mutate(across(where(is.factor), ~fct_explicit_na(.)))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(where(is.factor), ~fct_explicit_na(.))`.
## Caused by warning:
## ! `fct_explicit_na()` was deprecated in forcats 1.0.0.
## ℹ Please use `fct_na_value_to_level()` instead.

categorical variables

In this step,categorical variables will be encoded to turn them into numerical format. Using one-hot encoding via ‘model.matrix’ will aid in the transformation of categorical predictors into binary columns.

Soybean_final <- model.matrix(Class ~ . - 1, data = Soybean_clean)
Soybean_final <- as.data.frame(Soybean_final)

summarise(Soybean_final)
## data frame with 0 columns and 1 row

Analysis of the Glass and Soybean Datasets.

The Glass and Soybean datasets were used in this project’s exploratory data analysis, outlier detection, and data preprocessing for predictive modeling. Model selection may benefit from the links between variables that were revealed by correlation analysis.

There are several different kinds of glass in the sample, and their possible overlapping distributions could affect classification results.

To improve their distribution and make them more appropriate for machine learning models, severely skewed data were subjected to a log transformation.

An imbalance in the class distribution analysis may necessitate the use of handling strategies like weighting or re sampling in classification models.

Categorical variables were transformed into a machine-learning-friendly format using one-hot encoding.

Machine learning models require categorical variables to be encoded; model.matrix.

Next Actions To find the best method for differentiating between glass kinds and soybean illnesses, train and assess classification models.