3.1-(A) Using visualizations, explore the predictor variables to understand their distributions as well as the relationships between predictors.

library(mlbench)
## Warning: package 'mlbench' was built under R version 4.3.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.2
library(e1071)
## Warning: package 'e1071' was built under R version 4.3.3
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.3.2
data(Glass)
str(Glass)
## 'data.frame':    214 obs. of  10 variables:
##  $ RI  : num  1.52 1.52 1.52 1.52 1.52 ...
##  $ Na  : num  13.6 13.9 13.5 13.2 13.3 ...
##  $ Mg  : num  4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
##  $ Al  : num  1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
##  $ Si  : num  71.8 72.7 73 72.6 73.1 ...
##  $ K   : num  0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
##  $ Ca  : num  8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
##  $ Ba  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Fe  : num  0 0 0 0 0 0.26 0 0 0 0.11 ...
##  $ Type: Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...
Glass |>
  select_if(is.numeric) |>
  pivot_longer(cols = everything()) |>
  ggplot(aes(value)) +
    geom_histogram(bins = 15, fill = "blue", color = "black") +
    facet_wrap(~name, scales = 'free') +
    ggtitle("Numerical Predictors Histograms")

pairs(Glass[,1:9], col = Glass$Type) 

Glass|>
  select(where(is.numeric)) |> 
  pivot_longer(cols = everything()) |>
  ggplot(aes(y = value)) + geom_boxplot(fill = "red") +
  facet_wrap(~name, scales = 'free') + 
  ggtitle("Numerical Predictors Boxplots") 

Glass |>
  ggplot() +
  geom_bar(aes(x = Type)) +
  ggtitle("Types of Glass Distribution")

Glass |>
  select(where(is.numeric)) |> 
  summarise(across(everything(), ~ skewness(.x, na.rm = TRUE))) |>  
  mutate(across(everything(), round, 4))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(everything(), round, 4)`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
## 
##   # Previously
##   across(a:b, mean, na.rm = TRUE)
## 
##   # Now
##   across(a:b, \(x) mean(x, na.rm = TRUE))
##       RI     Na      Mg     Al      Si      K     Ca     Ba     Fe
## 1 1.6027 0.4478 -1.1365 0.8946 -0.7202 6.4601 2.0184 3.3687 1.7298
Glass_transformed <- Glass |>
  mutate(across(where(is.numeric), ~ log(. + 1)))  

summary(Glass_transformed)
##        RI               Na              Mg              Al        
##  Min.   :0.9207   Min.   :2.462   Min.   :0.000   Min.   :0.2546  
##  1st Qu.:0.9229   1st Qu.:2.632   1st Qu.:1.136   1st Qu.:0.7839  
##  Median :0.9233   Median :2.660   Median :1.500   Median :0.8587  
##  Mean   :0.9236   Mean   :2.666   Mean   :1.168   Mean   :0.8741  
##  3rd Qu.:0.9239   3rd Qu.:2.696   3rd Qu.:1.526   3rd Qu.:0.9670  
##  Max.   :0.9298   Max.   :2.911   Max.   :1.703   Max.   :1.5041  
##        Si              K                Ca              Ba        
##  Min.   :4.260   Min.   :0.0000   Min.   :1.861   Min.   :0.0000  
##  1st Qu.:4.294   1st Qu.:0.1156   1st Qu.:2.224   1st Qu.:0.0000  
##  Median :4.301   Median :0.4415   Median :2.262   Median :0.0000  
##  Mean   :4.299   Mean   :0.3567   Mean   :2.289   Mean   :0.1094  
##  3rd Qu.:4.305   3rd Qu.:0.4762   3rd Qu.:2.320   3rd Qu.:0.0000  
##  Max.   :4.336   Max.   :1.9755   Max.   :2.844   Max.   :1.4231  
##        Fe          Type  
##  Min.   :0.00000   1:70  
##  1st Qu.:0.00000   2:76  
##  Median :0.00000   3:17  
##  Mean   :0.05159   5:13  
##  3rd Qu.:0.09531   6: 9  
##  Max.   :0.41211   7:29

3.1-(B) Do there appear to be any outliers in the data? Are any predictors skewed?

The presence of outliers is shown by the skewness values of some predictors. (K),(Ba),(Ca), and (Fe) have high positive skewness, showing significant outliers on the upper end of their distributions. while, (Mg) and (Si) have negative skewness, which means there are outliers on the lower end.

3.1-(C) Are there any relevant transformations of one or more predictors that might improve the classification model?

Logarithmic Transformation can be used for Potassium (K), Barium (Ba), Calcium (Ca), and Iron (Fe) Square Root Transformation can be used for Refractive Index (RI) and Iron (Fe). Inverse Transformation can be used for Magnesium (Mg) and Silicon (Si).

3.2 (a) Investigate the frequency distributions for the categorical predictors. Areany of the distributions degenerate in the ways discussed earlier in this chapter?

library(ggplot2)
library(mlbench)

data(Soybean)

for (col in names(Soybean)) {
  if (is.factor(Soybean[[col]])) {
    cat("\nColumn: ", col)
    print(table(Soybean[[col]]))
  }
}
## 
## Column:  Class
##                2-4-d-injury         alternarialeaf-spot 
##                          16                          91 
##                 anthracnose            bacterial-blight 
##                          44                          20 
##           bacterial-pustule                  brown-spot 
##                          20                          92 
##              brown-stem-rot                charcoal-rot 
##                          44                          20 
##               cyst-nematode diaporthe-pod-&-stem-blight 
##                          14                          15 
##       diaporthe-stem-canker                downy-mildew 
##                          20                          20 
##          frog-eye-leaf-spot            herbicide-injury 
##                          91                           8 
##      phyllosticta-leaf-spot            phytophthora-rot 
##                          20                          88 
##              powdery-mildew           purple-seed-stain 
##                          20                          20 
##        rhizoctonia-root-rot 
##                          20 
## 
## Column:  date
##   0   1   2   3   4   5   6 
##  26  75  93 118 131 149  90 
## 
## Column:  plant.stand
##   0   1 
## 354 293 
## 
## Column:  precip
##   0   1   2 
##  74 112 459 
## 
## Column:  temp
##   0   1   2 
##  80 374 199 
## 
## Column:  hail
##   0   1 
## 435 127 
## 
## Column:  crop.hist
##   0   1   2   3 
##  65 165 219 218 
## 
## Column:  area.dam
##   0   1   2   3 
## 123 227 145 187 
## 
## Column:  sever
##   0   1   2 
## 195 322  45 
## 
## Column:  seed.tmt
##   0   1   2 
## 305 222  35 
## 
## Column:  germ
##   0   1   2 
## 165 213 193 
## 
## Column:  plant.growth
##   0   1 
## 441 226 
## 
## Column:  leaves
##   0   1 
##  77 606 
## 
## Column:  leaf.halo
##   0   1   2 
## 221  36 342 
## 
## Column:  leaf.marg
##   0   1   2 
## 357  21 221 
## 
## Column:  leaf.size
##   0   1   2 
##  51 327 221 
## 
## Column:  leaf.shread
##   0   1 
## 487  96 
## 
## Column:  leaf.malf
##   0   1 
## 554  45 
## 
## Column:  leaf.mild
##   0   1   2 
## 535  20  20 
## 
## Column:  stem
##   0   1 
## 296 371 
## 
## Column:  lodging
##   0   1 
## 520  42 
## 
## Column:  stem.cankers
##   0   1   2   3 
## 379  39  36 191 
## 
## Column:  canker.lesion
##   0   1   2   3 
## 320  83 177  65 
## 
## Column:  fruiting.bodies
##   0   1 
## 473 104 
## 
## Column:  ext.decay
##   0   1   2 
## 497 135  13 
## 
## Column:  mycelium
##   0   1 
## 639   6 
## 
## Column:  int.discolor
##   0   1   2 
## 581  44  20 
## 
## Column:  sclerotia
##   0   1 
## 625  20 
## 
## Column:  fruit.pods
##   0   1   2   3 
## 407 130  14  48 
## 
## Column:  fruit.spots
##   0   1   2   4 
## 345  75  57 100 
## 
## Column:  seed
##   0   1 
## 476 115 
## 
## Column:  mold.growth
##   0   1 
## 524  67 
## 
## Column:  seed.discolor
##   0   1 
## 513  64 
## 
## Column:  seed.size
##   0   1 
## 532  59 
## 
## Column:  shriveling
##   0   1 
## 539  38 
## 
## Column:  roots
##   0   1   2 
## 551  86  15
columns <- colnames(Soybean)

plots <- lapply(columns, function(col) {
  ggplot(Soybean, aes(x = .data[[col]])) +
    geom_bar() +
    coord_flip() +
    ggtitle(col)
})

plots
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

## 
## [[14]]

## 
## [[15]]

## 
## [[16]]

## 
## [[17]]

## 
## [[18]]

## 
## [[19]]

## 
## [[20]]

## 
## [[21]]

## 
## [[22]]

## 
## [[23]]

## 
## [[24]]

## 
## [[25]]

## 
## [[26]]

## 
## [[27]]

## 
## [[28]]

## 
## [[29]]

## 
## [[30]]

## 
## [[31]]

## 
## [[32]]

## 
## [[33]]

## 
## [[34]]

## 
## [[35]]

## 
## [[36]]

The frequency distributions of some categorical predictors in the soybean dataset shows some degeneration, which makes them possibly less effective for predictive modeling. the “herbicide-injury” predictor has only 8 occurrences, which can be ineffective for reliable analysis or prediction. also, the predictors mycelium, sclerotia, and shriveling shows a highly skewed distributions.

library(dplyr)
library(ggplot2)
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
missing_data <- Soybean |>
  group_by(Class) |>
  summarise(across(everything(), ~ sum(is.na(.)) / n() * 100, .names = "missing_{.col}"))

missing_data
## # A tibble: 19 × 36
##    Class            missing_date missing_plant.stand missing_precip missing_temp
##    <fct>                   <dbl>               <dbl>          <dbl>        <dbl>
##  1 2-4-d-injury             6.25                 100            100          100
##  2 alternarialeaf-…         0                      0              0            0
##  3 anthracnose              0                      0              0            0
##  4 bacterial-blight         0                      0              0            0
##  5 bacterial-pustu…         0                      0              0            0
##  6 brown-spot               0                      0              0            0
##  7 brown-stem-rot           0                      0              0            0
##  8 charcoal-rot             0                      0              0            0
##  9 cyst-nematode            0                    100            100          100
## 10 diaporthe-pod-&…         0                     40              0            0
## 11 diaporthe-stem-…         0                      0              0            0
## 12 downy-mildew             0                      0              0            0
## 13 frog-eye-leaf-s…         0                      0              0            0
## 14 herbicide-injury         0                      0            100            0
## 15 phyllosticta-le…         0                      0              0            0
## 16 phytophthora-rot         0                      0              0            0
## 17 powdery-mildew           0                      0              0            0
## 18 purple-seed-sta…         0                      0              0            0
## 19 rhizoctonia-roo…         0                      0              0            0
## # ℹ 31 more variables: missing_hail <dbl>, missing_crop.hist <dbl>,
## #   missing_area.dam <dbl>, missing_sever <dbl>, missing_seed.tmt <dbl>,
## #   missing_germ <dbl>, missing_plant.growth <dbl>, missing_leaves <dbl>,
## #   missing_leaf.halo <dbl>, missing_leaf.marg <dbl>, missing_leaf.size <dbl>,
## #   missing_leaf.shread <dbl>, missing_leaf.malf <dbl>,
## #   missing_leaf.mild <dbl>, missing_stem <dbl>, missing_lodging <dbl>,
## #   missing_stem.cankers <dbl>, missing_canker.lesion <dbl>, …
melted_data <- melt(missing_data, id.vars = "Class")

ggplot(melted_data, aes(x = variable, y = Class, fill = value)) +
  geom_tile() +
  scale_fill_gradient(low = "white", high = "red") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 10), 
    axis.title.x = element_text(size = 12), # Adjust x-axis title size
    axis.title.y = element_text(size = 12)  # Adjust y-axis title size
  ) +
  labs(
    title = "Missing Data Percentage by Class",
    x = "Predictor",
    y = "Class",
    fill = "Missing pct."
  )

(c) Develop a strategy for handling missing data, either by eliminating predictors or imputation.

Missing data should quantifible, identify if missing data is complete, at random, or not random, to help with the right imputation method. For low levels of missing data, simple techniques like mean or median imputation are used, methods like Multiple Imputation by Chained Equations (MICE) would be one strategery