library(mlbench)
## Warning: package 'mlbench' was built under R version 4.3.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.2
library(e1071)
## Warning: package 'e1071' was built under R version 4.3.3
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.3.2
data(Glass)
str(Glass)
## 'data.frame': 214 obs. of 10 variables:
## $ RI : num 1.52 1.52 1.52 1.52 1.52 ...
## $ Na : num 13.6 13.9 13.5 13.2 13.3 ...
## $ Mg : num 4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
## $ Al : num 1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
## $ Si : num 71.8 72.7 73 72.6 73.1 ...
## $ K : num 0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
## $ Ca : num 8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
## $ Ba : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Fe : num 0 0 0 0 0 0.26 0 0 0 0.11 ...
## $ Type: Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...
Glass |>
select_if(is.numeric) |>
pivot_longer(cols = everything()) |>
ggplot(aes(value)) +
geom_histogram(bins = 15, fill = "blue", color = "black") +
facet_wrap(~name, scales = 'free') +
ggtitle("Numerical Predictors Histograms")
pairs(Glass[,1:9], col = Glass$Type)
Glass|>
select(where(is.numeric)) |>
pivot_longer(cols = everything()) |>
ggplot(aes(y = value)) + geom_boxplot(fill = "red") +
facet_wrap(~name, scales = 'free') +
ggtitle("Numerical Predictors Boxplots")
Glass |>
ggplot() +
geom_bar(aes(x = Type)) +
ggtitle("Types of Glass Distribution")
Glass |>
select(where(is.numeric)) |>
summarise(across(everything(), ~ skewness(.x, na.rm = TRUE))) |>
mutate(across(everything(), round, 4))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(everything(), round, 4)`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
##
## # Previously
## across(a:b, mean, na.rm = TRUE)
##
## # Now
## across(a:b, \(x) mean(x, na.rm = TRUE))
## RI Na Mg Al Si K Ca Ba Fe
## 1 1.6027 0.4478 -1.1365 0.8946 -0.7202 6.4601 2.0184 3.3687 1.7298
Glass_transformed <- Glass |>
mutate(across(where(is.numeric), ~ log(. + 1)))
summary(Glass_transformed)
## RI Na Mg Al
## Min. :0.9207 Min. :2.462 Min. :0.000 Min. :0.2546
## 1st Qu.:0.9229 1st Qu.:2.632 1st Qu.:1.136 1st Qu.:0.7839
## Median :0.9233 Median :2.660 Median :1.500 Median :0.8587
## Mean :0.9236 Mean :2.666 Mean :1.168 Mean :0.8741
## 3rd Qu.:0.9239 3rd Qu.:2.696 3rd Qu.:1.526 3rd Qu.:0.9670
## Max. :0.9298 Max. :2.911 Max. :1.703 Max. :1.5041
## Si K Ca Ba
## Min. :4.260 Min. :0.0000 Min. :1.861 Min. :0.0000
## 1st Qu.:4.294 1st Qu.:0.1156 1st Qu.:2.224 1st Qu.:0.0000
## Median :4.301 Median :0.4415 Median :2.262 Median :0.0000
## Mean :4.299 Mean :0.3567 Mean :2.289 Mean :0.1094
## 3rd Qu.:4.305 3rd Qu.:0.4762 3rd Qu.:2.320 3rd Qu.:0.0000
## Max. :4.336 Max. :1.9755 Max. :2.844 Max. :1.4231
## Fe Type
## Min. :0.00000 1:70
## 1st Qu.:0.00000 2:76
## Median :0.00000 3:17
## Mean :0.05159 5:13
## 3rd Qu.:0.09531 6: 9
## Max. :0.41211 7:29
The presence of outliers is shown by the skewness values of some predictors. (K),(Ba),(Ca), and (Fe) have high positive skewness, showing significant outliers on the upper end of their distributions. while, (Mg) and (Si) have negative skewness, which means there are outliers on the lower end.
Logarithmic Transformation can be used for Potassium (K), Barium (Ba), Calcium (Ca), and Iron (Fe) Square Root Transformation can be used for Refractive Index (RI) and Iron (Fe). Inverse Transformation can be used for Magnesium (Mg) and Silicon (Si).
library(ggplot2)
library(mlbench)
data(Soybean)
for (col in names(Soybean)) {
if (is.factor(Soybean[[col]])) {
cat("\nColumn: ", col)
print(table(Soybean[[col]]))
}
}
##
## Column: Class
## 2-4-d-injury alternarialeaf-spot
## 16 91
## anthracnose bacterial-blight
## 44 20
## bacterial-pustule brown-spot
## 20 92
## brown-stem-rot charcoal-rot
## 44 20
## cyst-nematode diaporthe-pod-&-stem-blight
## 14 15
## diaporthe-stem-canker downy-mildew
## 20 20
## frog-eye-leaf-spot herbicide-injury
## 91 8
## phyllosticta-leaf-spot phytophthora-rot
## 20 88
## powdery-mildew purple-seed-stain
## 20 20
## rhizoctonia-root-rot
## 20
##
## Column: date
## 0 1 2 3 4 5 6
## 26 75 93 118 131 149 90
##
## Column: plant.stand
## 0 1
## 354 293
##
## Column: precip
## 0 1 2
## 74 112 459
##
## Column: temp
## 0 1 2
## 80 374 199
##
## Column: hail
## 0 1
## 435 127
##
## Column: crop.hist
## 0 1 2 3
## 65 165 219 218
##
## Column: area.dam
## 0 1 2 3
## 123 227 145 187
##
## Column: sever
## 0 1 2
## 195 322 45
##
## Column: seed.tmt
## 0 1 2
## 305 222 35
##
## Column: germ
## 0 1 2
## 165 213 193
##
## Column: plant.growth
## 0 1
## 441 226
##
## Column: leaves
## 0 1
## 77 606
##
## Column: leaf.halo
## 0 1 2
## 221 36 342
##
## Column: leaf.marg
## 0 1 2
## 357 21 221
##
## Column: leaf.size
## 0 1 2
## 51 327 221
##
## Column: leaf.shread
## 0 1
## 487 96
##
## Column: leaf.malf
## 0 1
## 554 45
##
## Column: leaf.mild
## 0 1 2
## 535 20 20
##
## Column: stem
## 0 1
## 296 371
##
## Column: lodging
## 0 1
## 520 42
##
## Column: stem.cankers
## 0 1 2 3
## 379 39 36 191
##
## Column: canker.lesion
## 0 1 2 3
## 320 83 177 65
##
## Column: fruiting.bodies
## 0 1
## 473 104
##
## Column: ext.decay
## 0 1 2
## 497 135 13
##
## Column: mycelium
## 0 1
## 639 6
##
## Column: int.discolor
## 0 1 2
## 581 44 20
##
## Column: sclerotia
## 0 1
## 625 20
##
## Column: fruit.pods
## 0 1 2 3
## 407 130 14 48
##
## Column: fruit.spots
## 0 1 2 4
## 345 75 57 100
##
## Column: seed
## 0 1
## 476 115
##
## Column: mold.growth
## 0 1
## 524 67
##
## Column: seed.discolor
## 0 1
## 513 64
##
## Column: seed.size
## 0 1
## 532 59
##
## Column: shriveling
## 0 1
## 539 38
##
## Column: roots
## 0 1 2
## 551 86 15
columns <- colnames(Soybean)
plots <- lapply(columns, function(col) {
ggplot(Soybean, aes(x = .data[[col]])) +
geom_bar() +
coord_flip() +
ggtitle(col)
})
plots
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
##
## [[12]]
##
## [[13]]
##
## [[14]]
##
## [[15]]
##
## [[16]]
##
## [[17]]
##
## [[18]]
##
## [[19]]
##
## [[20]]
##
## [[21]]
##
## [[22]]
##
## [[23]]
##
## [[24]]
##
## [[25]]
##
## [[26]]
##
## [[27]]
##
## [[28]]
##
## [[29]]
##
## [[30]]
##
## [[31]]
##
## [[32]]
##
## [[33]]
##
## [[34]]
##
## [[35]]
##
## [[36]]
The frequency distributions of some categorical predictors in the soybean dataset shows some degeneration, which makes them possibly less effective for predictive modeling. the “herbicide-injury” predictor has only 8 occurrences, which can be ineffective for reliable analysis or prediction. also, the predictors mycelium, sclerotia, and shriveling shows a highly skewed distributions.
library(dplyr)
library(ggplot2)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
missing_data <- Soybean |>
group_by(Class) |>
summarise(across(everything(), ~ sum(is.na(.)) / n() * 100, .names = "missing_{.col}"))
missing_data
## # A tibble: 19 × 36
## Class missing_date missing_plant.stand missing_precip missing_temp
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 2-4-d-injury 6.25 100 100 100
## 2 alternarialeaf-… 0 0 0 0
## 3 anthracnose 0 0 0 0
## 4 bacterial-blight 0 0 0 0
## 5 bacterial-pustu… 0 0 0 0
## 6 brown-spot 0 0 0 0
## 7 brown-stem-rot 0 0 0 0
## 8 charcoal-rot 0 0 0 0
## 9 cyst-nematode 0 100 100 100
## 10 diaporthe-pod-&… 0 40 0 0
## 11 diaporthe-stem-… 0 0 0 0
## 12 downy-mildew 0 0 0 0
## 13 frog-eye-leaf-s… 0 0 0 0
## 14 herbicide-injury 0 0 100 0
## 15 phyllosticta-le… 0 0 0 0
## 16 phytophthora-rot 0 0 0 0
## 17 powdery-mildew 0 0 0 0
## 18 purple-seed-sta… 0 0 0 0
## 19 rhizoctonia-roo… 0 0 0 0
## # ℹ 31 more variables: missing_hail <dbl>, missing_crop.hist <dbl>,
## # missing_area.dam <dbl>, missing_sever <dbl>, missing_seed.tmt <dbl>,
## # missing_germ <dbl>, missing_plant.growth <dbl>, missing_leaves <dbl>,
## # missing_leaf.halo <dbl>, missing_leaf.marg <dbl>, missing_leaf.size <dbl>,
## # missing_leaf.shread <dbl>, missing_leaf.malf <dbl>,
## # missing_leaf.mild <dbl>, missing_stem <dbl>, missing_lodging <dbl>,
## # missing_stem.cankers <dbl>, missing_canker.lesion <dbl>, …
melted_data <- melt(missing_data, id.vars = "Class")
ggplot(melted_data, aes(x = variable, y = Class, fill = value)) +
geom_tile() +
scale_fill_gradient(low = "white", high = "red") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 10),
axis.title.x = element_text(size = 12), # Adjust x-axis title size
axis.title.y = element_text(size = 12) # Adjust y-axis title size
) +
labs(
title = "Missing Data Percentage by Class",
x = "Predictor",
y = "Class",
fill = "Missing pct."
)
Missing data should quantifible, identify if missing data is complete, at random, or not random, to help with the right imputation method. For low levels of missing data, simple techniques like mean or median imputation are used, methods like Multiple Imputation by Chained Equations (MICE) would be one strategery