library(mlbench)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Loading required package: lattice
library(tidyr)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ lubridate 1.9.4 ✔ stringr 1.5.1
## ✔ purrr 1.0.4 ✔ tibble 3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(e1071)
data("Glass")
str(Glass)
## 'data.frame': 214 obs. of 10 variables:
## $ RI : num 1.52 1.52 1.52 1.52 1.52 ...
## $ Na : num 13.6 13.9 13.5 13.2 13.3 ...
## $ Mg : num 4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
## $ Al : num 1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
## $ Si : num 71.8 72.7 73 72.6 73.1 ...
## $ K : num 0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
## $ Ca : num 8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
## $ Ba : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Fe : num 0 0 0 0 0 0.26 0 0 0 0.11 ...
## $ Type: Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...
head(Glass)
## RI Na Mg Al Si K Ca Ba Fe Type
## 1 1.52101 13.64 4.49 1.10 71.78 0.06 8.75 0 0.00 1
## 2 1.51761 13.89 3.60 1.36 72.73 0.48 7.83 0 0.00 1
## 3 1.51618 13.53 3.55 1.54 72.99 0.39 7.78 0 0.00 1
## 4 1.51766 13.21 3.69 1.29 72.61 0.57 8.22 0 0.00 1
## 5 1.51742 13.27 3.62 1.24 73.08 0.55 8.07 0 0.00 1
## 6 1.51596 12.79 3.61 1.62 72.97 0.64 8.07 0 0.26 1
Glass$Type <- as.factor(Glass$Type)
to understands their distributions and relationship between predictors. Displaying histogram for each following to analyze.
Glass_long <- Glass %>%
pivot_longer(cols = -Type, names_to = "Predictor", values_to = "Value")
glimpse(pivot_longer)
## function (data, cols, ..., cols_vary = "fastest", names_to = "name", names_prefix = NULL,
## names_sep = NULL, names_pattern = NULL, names_ptypes = NULL, names_transform = NULL,
## names_repair = "check_unique", values_to = "value", values_drop_na = FALSE,
## values_ptypes = NULL, values_transform = NULL)
predictors <- Glass[, -ncol(Glass)]
predictor_names <- colnames(predictors)
ggplot(Glass_long, aes(x = Value, fill = as.factor(Type))) +
geom_histogram(bins = 30, alpha = 0.7, position = "identity") +
facet_wrap(~Predictor, scales = "free") +
theme_minimal() +
labs(title = "Distribution of Predictors in Glass Dataset", x = "Value", y = "Count", fill = "Glass Type")
for (var in predictor_names) {
p <- ggplot(Glass, aes_string(x = var)) +
geom_histogram(bins = 20, fill = 'green', color = 'purple') +
theme_minimal() +
ggtitle(paste('Histogram of', var))
print(p)
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Detecting the corelation and relationship between preditors using
paiwise scatter plots.
subset_predictors <- predictors[, 1:4]
ggpairs(subset_predictors,
upper = list(continuous = "smooth"),
lower = list(continuous = "points"),
diag = list(continuous = "densityDiag"),
mapping = aes(color = Glass$Type)) +
theme_minimal() +
theme(legend.position = "bottom") +
ggtitle("Pairwise Plot for Selected Predictors") +
theme(plot.title = element_text(hjust = 0.5))
Glass_long %>%
ggplot(aes(x = Predictor, y = Value, fill = Predictor)) +
geom_boxplot(outlier.colour = "orange", outlier.shape = 8) +
theme_minimal() +
coord_flip() +
labs(title = "Boxplots for Outlier Detection in Glass Dataset")
# Utilizzing boxblot for each preditors to detect outlier.
for (var in predictor_names) {
p <- ggplot(Glass, aes_string(y = var)) +
geom_boxplot(fill = 'gold', color = 'black', outlier.color = 'red', outlier.size = 2) +
theme_minimal() +
ggtitle(paste('Boxplot of', var)) +
theme(plot.title = element_text(hjust = 0.5))
print(p) # Print each boxplot separately
}
# logging transformation for skewness variables.
Glass_transformed <- Glass %>%
mutate(across(where(is.numeric), ~ifelse(sd(.) > 0, log1p(.), .)))
for Relevance transformation that could improve the classification model
apply(Glass[, sapply(Glass, is.numeric)], 2, function(x) if(sd(x) > 0) skewness(x) else NA)
## RI Na Mg Al Si K Ca
## 1.6027151 0.4478343 -1.1364523 0.8946104 -0.7202392 6.4600889 2.0184463
## Ba Fe
## 3.3686800 1.7298107
library(e1071) # Ensure e1071 is loaded
numeric_cols <- Glass %>% select(where(is.numeric))
skew_vals <- apply(numeric_cols, 2, function(x) if(sd(x) > 0) skewness(x) else NA)
print(skew_vals)
## RI Na Mg Al Si K Ca
## 1.6027151 0.4478343 -1.1364523 0.8946104 -0.7202392 6.4600889 2.0184463
## Ba Fe
## 3.3686800 1.7298107
pillar::glimpse(skew_vals)
## Named num [1:9] 1.603 0.448 -1.136 0.895 -0.72 ...
## - attr(*, "names")= chr [1:9] "RI" "Na" "Mg" "Al" ...
summary(skew_vals)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.1365 0.4478 1.6027 1.6295 2.0184 6.4601
transformed_data <- Glass
for (var in predictor_names) {
if (abs(skew_vals[var]) > 1) {
transformed_data[[var]] <- log1p(Glass[[var]])
print(paste("Applied log transformation to", var))}}
## [1] "Applied log transformation to RI"
## [1] "Applied log transformation to Mg"
## [1] "Applied log transformation to K"
## [1] "Applied log transformation to Ca"
## [1] "Applied log transformation to Ba"
## [1] "Applied log transformation to Fe"
transformed_skewness <- apply(Glass_transformed[, -ncol(Glass_transformed)], 2, function(x) if(sd(x) > 0) skewness(x) else NA)
transformed_skewness
## RI Na Mg Al Si K Ca Ba Fe
## NA NA NA NA NA NA NA NA NA
The predictor variables have varying distributions, with some appearing skewed.
Outlines are present in various predictors, as seen in the box plot.
Skewed variables were transformed using logarithms to improve the classification model.
Accessing the dataset The string help observing the structure of the data.
data("Soybean")
str(Soybean)
## 'data.frame': 683 obs. of 36 variables:
## $ Class : Factor w/ 19 levels "2-4-d-injury",..: 11 11 11 11 11 11 11 11 11 11 ...
## $ date : Factor w/ 7 levels "0","1","2","3",..: 7 5 4 4 7 6 6 5 7 5 ...
## $ plant.stand : Ord.factor w/ 2 levels "0"<"1": 1 1 1 1 1 1 1 1 1 1 ...
## $ precip : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
## $ temp : Ord.factor w/ 3 levels "0"<"1"<"2": 2 2 2 2 2 2 2 2 2 2 ...
## $ hail : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 1 1 ...
## $ crop.hist : Factor w/ 4 levels "0","1","2","3": 2 3 2 2 3 4 3 2 4 3 ...
## $ area.dam : Factor w/ 4 levels "0","1","2","3": 2 1 1 1 1 1 1 1 1 1 ...
## $ sever : Factor w/ 3 levels "0","1","2": 2 3 3 3 2 2 2 2 2 3 ...
## $ seed.tmt : Factor w/ 3 levels "0","1","2": 1 2 2 1 1 1 2 1 2 1 ...
## $ germ : Ord.factor w/ 3 levels "0"<"1"<"2": 1 2 3 2 3 2 1 3 2 3 ...
## $ plant.growth : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ leaves : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ leaf.halo : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ leaf.marg : Factor w/ 3 levels "0","1","2": 3 3 3 3 3 3 3 3 3 3 ...
## $ leaf.size : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
## $ leaf.shread : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ leaf.malf : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ leaf.mild : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ stem : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ lodging : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 2 1 1 1 ...
## $ stem.cankers : Factor w/ 4 levels "0","1","2","3": 4 4 4 4 4 4 4 4 4 4 ...
## $ canker.lesion : Factor w/ 4 levels "0","1","2","3": 2 2 1 1 2 1 2 2 2 2 ...
## $ fruiting.bodies: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ ext.decay : Factor w/ 3 levels "0","1","2": 2 2 2 2 2 2 2 2 2 2 ...
## $ mycelium : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ int.discolor : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ sclerotia : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ fruit.pods : Factor w/ 4 levels "0","1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
## $ fruit.spots : Factor w/ 4 levels "0","1","2","4": 4 4 4 4 4 4 4 4 4 4 ...
## $ seed : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ mold.growth : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ seed.discolor : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ seed.size : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ shriveling : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ roots : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
num_categories <- Soybean %>%
summarise(across(where(is.factor), ~n_distinct(.)))
num_categories
## Class date plant.stand precip temp hail crop.hist area.dam sever seed.tmt
## 1 19 8 3 4 4 3 5 5 4 4
## germ plant.growth leaves leaf.halo leaf.marg leaf.size leaf.shread leaf.malf
## 1 4 3 2 4 4 4 3 3
## leaf.mild stem lodging stem.cankers canker.lesion fruiting.bodies ext.decay
## 1 4 3 3 5 5 3 4
## mycelium int.discolor sclerotia fruit.pods fruit.spots seed mold.growth
## 1 3 4 3 5 5 3 3
## seed.discolor seed.size shriveling roots
## 1 3 3 3 4
Soybean %>%
count(Class) %>%
ggplot(aes(x = reorder(Class, -n), y = n, fill = Class)) +
geom_bar(stat = "identity") +
theme_minimal() +
coord_flip() +
labs(title = "Class Distribution in Soybean Dataset", x = "Class", y = "Count")
# In this step will be handing the missing values with data
processing
Soybean_clean <- Soybean |>
mutate(across(where(is.factor), ~fct_explicit_na(.)))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(where(is.factor), ~fct_explicit_na(.))`.
## Caused by warning:
## ! `fct_explicit_na()` was deprecated in forcats 1.0.0.
## ℹ Please use `fct_na_value_to_level()` instead.
In this step,categorical variables will be encoded to turn them into numerical format. Using one-hot encoding via ‘model.matrix’ will aid in the transformation of categorical predictors into binary columns.
Soybean_final <- model.matrix(Class ~ . - 1, data = Soybean_clean)
Soybean_final <- as.data.frame(Soybean_final)
summarise(Soybean_final)
## data frame with 0 columns and 1 row
The Glass and Soybean datasets were used in this project’s exploratory data analysis, outlier detection, and data preprocessing for predictive modeling. Model selection may benefit from the links between variables that were revealed by correlation analysis.
There are several different kinds of glass in the sample, and their possible overlapping distributions could affect classification results.
To improve their distribution and make them more appropriate for machine learning models, severely skewed data were subjected to a log transformation.
An imbalance in the class distribution analysis may necessitate the use of handling strategies like weighting or re sampling in classification models.
Categorical variables were transformed into a machine-learning-friendly format using one-hot encoding.
Machine learning models require categorical variables to be encoded; model.matrix.
Next Actions To find the best method for differentiating between glass kinds and soybean illnesses, train and assess classification models.