#install.packages('corrplot')
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(tidyr)
library(ggplot2)

3.1. The UC Irvine Machine Learning Repository6 contains a data set related to glass identification. The data consist of 214 glass samples labeled as one of seven class categories. There are nine predictors, including the refractive index and percentages of eight elements: Na, Mg, Al, Si, K, Ca, Ba, and Fe. The data can be accessed via: > library(mlbench) > data(Glass) > str(Glass) ‘data.frame’: 214 obs. of 10 variables: $ RI : num 1.52 1.52 1.52 1.52 1.52 … $ Na : num 13.6 13.9 13.5 13.2 13.3 … $ Mg : num 4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 … $ Al : num 1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 … $ Si : num 71.8 72.7 73 72.6 73.1 … $ K : num 0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 … $ Ca : num 8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 … $ Ba : num 0 0 0 0 0 0 0 0 0 0 … $ Fe : num 0 0 0 0 0 0.26 0 0 0 0.11 … $ Type: Factor w/ 6 levels “1”,“2”,“3”,“5”,..: 1 1 1 1 1 1 1 1 1 1 … (a) Using visualizations, explore the predictor variables to understand their distributions as well as the relationships between predictors. (b) Do there appear to be any outliers in the data? Are any predictors skewed? (c) Are there any relevant transformations of one or more predictors that might improve the classification model?

library(mlbench)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.3
## corrplot 0.92 loaded
data(Glass)
str(Glass)
## 'data.frame':    214 obs. of  10 variables:
##  $ RI  : num  1.52 1.52 1.52 1.52 1.52 ...
##  $ Na  : num  13.6 13.9 13.5 13.2 13.3 ...
##  $ Mg  : num  4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
##  $ Al  : num  1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
##  $ Si  : num  71.8 72.7 73 72.6 73.1 ...
##  $ K   : num  0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
##  $ Ca  : num  8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
##  $ Ba  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Fe  : num  0 0 0 0 0 0.26 0 0 0 0.11 ...
##  $ Type: Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...
#(a) Using visualizations, explore the predictor variables to understand their distributions as well as the relationships between predictors.
# Remove categorical variables
glass_numeric <- Glass[, sapply(Glass, is.numeric)]

# Create correlation matrix
correlation_matrix <- cor(glass_numeric)

# Create correlation plot
corrplot(correlation_matrix, method = "circle", type = "upper", tl.cex = 0.7)

par(mfrow = c(3, 3))  
for (i in 1:ncol(glass_numeric)) {
  hist(glass_numeric[, i], main = names(glass_numeric)[i], xlab = "", col = "skyblue", border = "white")
}

# we see normal distribtions for RI, Na, AL, and Si
cor((glass_numeric[,1:9]))
##               RI          Na           Mg          Al          Si            K
## RI  1.0000000000 -0.19188538 -0.122274039 -0.40732603 -0.54205220 -0.289832711
## Na -0.1918853790  1.00000000 -0.273731961  0.15679367 -0.06980881 -0.266086504
## Mg -0.1222740393 -0.27373196  1.000000000 -0.48179851 -0.16592672  0.005395667
## Al -0.4073260341  0.15679367 -0.481798509  1.00000000 -0.00552372  0.325958446
## Si -0.5420521997 -0.06980881 -0.165926723 -0.00552372  1.00000000 -0.193330854
## K  -0.2898327111 -0.26608650  0.005395667  0.32595845 -0.19333085  1.000000000
## Ca  0.8104026963 -0.27544249 -0.443750026 -0.25959201 -0.20873215 -0.317836155
## Ba -0.0003860189  0.32660288 -0.492262118  0.47940390 -0.10215131 -0.042618059
## Fe  0.1430096093 -0.24134641  0.083059529 -0.07440215 -0.09420073 -0.007719049
##            Ca            Ba           Fe
## RI  0.8104027 -0.0003860189  0.143009609
## Na -0.2754425  0.3266028795 -0.241346411
## Mg -0.4437500 -0.4922621178  0.083059529
## Al -0.2595920  0.4794039017 -0.074402151
## Si -0.2087322 -0.1021513105 -0.094200731
## K  -0.3178362 -0.0426180594 -0.007719049
## Ca  1.0000000 -0.1128409671  0.124968219
## Ba -0.1128410  1.0000000000 -0.058691755
## Fe  0.1249682 -0.0586917554  1.000000000
# Select columns 1 to 9 from the Glass dataset
df <- Glass[, 1:9]

# Set up the layout
par(mfrow = c(3, 3))


lapply(1:ncol(df), function(i) {
  boxplot(df[, i], ylab = names(df)[i], horizontal = TRUE,
           col = "green")
})

## [[1]]
## [[1]]$stats
##         [,1]
## [1,] 1.51299
## [2,] 1.51652
## [3,] 1.51768
## [4,] 1.51916
## [5,] 1.52300
## 
## [[1]]$n
## [1] 214
## 
## [[1]]$conf
##          [,1]
## [1,] 1.517395
## [2,] 1.517965
## 
## [[1]]$out
##  [1] 1.52667 1.52320 1.51215 1.52725 1.52410 1.52475 1.53125 1.53393 1.52664
## [10] 1.52739 1.52777 1.52614 1.52369 1.51115 1.51131 1.52315 1.52365
## 
## [[1]]$group
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 
## [[1]]$names
## [1] ""
## 
## 
## [[2]]
## [[2]]$stats
##       [,1]
## [1,] 11.56
## [2,] 12.90
## [3,] 13.30
## [4,] 13.83
## [5,] 15.15
## 
## [[2]]$n
## [1] 214
## 
## [[2]]$conf
##          [,1]
## [1,] 13.19955
## [2,] 13.40045
## 
## [[2]]$out
## [1] 11.45 10.73 11.23 11.02 11.03 17.38 15.79
## 
## [[2]]$group
## [1] 1 1 1 1 1 1 1
## 
## [[2]]$names
## [1] ""
## 
## 
## [[3]]
## [[3]]$stats
##      [,1]
## [1,] 0.00
## [2,] 2.09
## [3,] 3.48
## [4,] 3.60
## [5,] 4.49
## 
## [[3]]$n
## [1] 214
## 
## [[3]]$conf
##         [,1]
## [1,] 3.31691
## [2,] 3.64309
## 
## [[3]]$out
## numeric(0)
## 
## [[3]]$group
## numeric(0)
## 
## [[3]]$names
## [1] ""
## 
## 
## [[4]]
## [[4]]$stats
##      [,1]
## [1,] 0.56
## [2,] 1.19
## [3,] 1.36
## [4,] 1.63
## [5,] 2.27
## 
## [[4]]$n
## [1] 214
## 
## [[4]]$conf
##          [,1]
## [1,] 1.312477
## [2,] 1.407523
## 
## [[4]]$out
##  [1] 0.29 0.47 0.47 0.51 3.50 3.04 3.02 0.34 2.38 2.79 2.68 2.54 2.34 2.66 2.51
## [16] 2.42 2.74 2.88
## 
## [[4]]$group
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 
## [[4]]$names
## [1] ""
## 
## 
## [[5]]
## [[5]]$stats
##       [,1]
## [1,] 71.15
## [2,] 72.28
## [3,] 72.79
## [4,] 73.09
## [5,] 73.88
## 
## [[5]]$n
## [1] 214
## 
## [[5]]$conf
##          [,1]
## [1,] 72.70251
## [2,] 72.87749
## 
## [[5]]$out
##  [1] 70.57 69.81 70.16 74.45 69.89 70.48 70.70 74.55 75.41 70.26 70.43 75.18
## 
## [[5]]$group
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1
## 
## [[5]]$names
## [1] ""
## 
## 
## [[6]]
## [[6]]$stats
##       [,1]
## [1,] 0.000
## [2,] 0.120
## [3,] 0.555
## [4,] 0.610
## [5,] 1.100
## 
## [[6]]$n
## [1] 214
## 
## [[6]]$conf
##           [,1]
## [1,] 0.5020768
## [2,] 0.6079232
## 
## [[6]]$out
## [1] 1.68 6.21 6.21 1.76 1.46 2.70 1.41
## 
## [[6]]$group
## [1] 1 1 1 1 1 1 1
## 
## [[6]]$names
## [1] ""
## 
## 
## [[7]]
## [[7]]$stats
##       [,1]
## [1,]  6.93
## [2,]  8.24
## [3,]  8.60
## [4,]  9.18
## [5,] 10.56
## 
## [[7]]$n
## [1] 214
## 
## [[7]]$conf
##          [,1]
## [1,] 8.498474
## [2,] 8.701526
## 
## [[7]]$out
##  [1] 11.64 10.79 13.24 13.30 16.19 11.52 10.99 14.68 14.96 14.40 11.14 13.44
## [13]  5.87 11.41 11.62 11.53 11.32 12.24 12.50 11.27 10.88 11.22  6.65  5.43
## [25]  5.79  6.47
## 
## [[7]]$group
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 
## [[7]]$names
## [1] ""
## 
## 
## [[8]]
## [[8]]$stats
##      [,1]
## [1,]    0
## [2,]    0
## [3,]    0
## [4,]    0
## [5,]    0
## 
## [[8]]$n
## [1] 214
## 
## [[8]]$conf
##      [,1]
## [1,]    0
## [2,]    0
## 
## [[8]]$out
##  [1] 0.09 0.11 0.69 0.14 0.11 3.15 0.27 0.09 0.06 0.15 2.20 0.24 1.19 1.63 1.68
## [16] 0.76 0.64 0.40 1.59 1.57 0.61 0.81 0.66 0.64 0.53 0.63 0.56 1.71 0.67 1.55
## [31] 1.38 2.88 0.54 1.06 1.59 1.64 1.57 1.67
## 
## [[8]]$group
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 
## [[8]]$names
## [1] ""
## 
## 
## [[9]]
## [[9]]$stats
##      [,1]
## [1,] 0.00
## [2,] 0.00
## [3,] 0.00
## [4,] 0.10
## [5,] 0.25
## 
## [[9]]$n
## [1] 214
## 
## [[9]]$conf
##             [,1]
## [1,] -0.01080066
## [2,]  0.01080066
## 
## [[9]]$out
##  [1] 0.26 0.30 0.31 0.32 0.34 0.28 0.29 0.28 0.35 0.37 0.51 0.28
## 
## [[9]]$group
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1
## 
## [[9]]$names
## [1] ""
#(b) Do there appear to be any outliers in the data? Are any predictors skewed? From the distributions we see that RI, Na, Ca, Ba, and Fe are skewed to the right. For the outliers it looks like Ca and Ba have outliers. 
  1. Are there any relevant transformations of one or more predictors that might improve the classification model? we can use box-cox transforamtions

3.2. The soybean data can also be found at the UC Irvine Machine Learning Repository. Data were collected to predict disease in 683 soybeans. The 35 predictors are mostly categorical and include information on the environmental conditions (e.g., temperature, precipitation) and plant conditions (e.g., left spots, mold growth). The outcome labels consist of 19 distinct

The data can be loaded via: > library(mlbench) > data(Soybean) > ## See ?Soybean for details (a) Investigate the frequency distributions for the categorical predictors. Are any of the distributions degenerate in the ways discussed earlier in this chapter? (b) Roughly 18 % of the data are missing. Are there particular predictors that are more likely to be missing? Is the pattern of missing data related to the classes? (c) Develop a strategy for handling missing data, either by eliminating predictors or imputation

library(mlbench)
data(Soybean)
str(Soybean)
## 'data.frame':    683 obs. of  36 variables:
##  $ Class          : Factor w/ 19 levels "2-4-d-injury",..: 11 11 11 11 11 11 11 11 11 11 ...
##  $ date           : Factor w/ 7 levels "0","1","2","3",..: 7 5 4 4 7 6 6 5 7 5 ...
##  $ plant.stand    : Ord.factor w/ 2 levels "0"<"1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ precip         : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
##  $ temp           : Ord.factor w/ 3 levels "0"<"1"<"2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ hail           : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 1 1 ...
##  $ crop.hist      : Factor w/ 4 levels "0","1","2","3": 2 3 2 2 3 4 3 2 4 3 ...
##  $ area.dam       : Factor w/ 4 levels "0","1","2","3": 2 1 1 1 1 1 1 1 1 1 ...
##  $ sever          : Factor w/ 3 levels "0","1","2": 2 3 3 3 2 2 2 2 2 3 ...
##  $ seed.tmt       : Factor w/ 3 levels "0","1","2": 1 2 2 1 1 1 2 1 2 1 ...
##  $ germ           : Ord.factor w/ 3 levels "0"<"1"<"2": 1 2 3 2 3 2 1 3 2 3 ...
##  $ plant.growth   : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ leaves         : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ leaf.halo      : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ leaf.marg      : Factor w/ 3 levels "0","1","2": 3 3 3 3 3 3 3 3 3 3 ...
##  $ leaf.size      : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
##  $ leaf.shread    : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ leaf.malf      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ leaf.mild      : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ stem           : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ lodging        : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 2 1 1 1 ...
##  $ stem.cankers   : Factor w/ 4 levels "0","1","2","3": 4 4 4 4 4 4 4 4 4 4 ...
##  $ canker.lesion  : Factor w/ 4 levels "0","1","2","3": 2 2 1 1 2 1 2 2 2 2 ...
##  $ fruiting.bodies: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ ext.decay      : Factor w/ 3 levels "0","1","2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ mycelium       : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ int.discolor   : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ sclerotia      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ fruit.pods     : Factor w/ 4 levels "0","1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
##  $ fruit.spots    : Factor w/ 4 levels "0","1","2","4": 4 4 4 4 4 4 4 4 4 4 ...
##  $ seed           : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ mold.growth    : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ seed.discolor  : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ seed.size      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ shriveling     : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ roots          : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
soybean <- data(Soybean)
Soybean %>%
  mutate(
    date = as.character(date),
    plant.stand = as.factor(plant.stand)
  ) %>%
  select(-Class) %>%
  drop_na() %>%
  pivot_longer(cols = everything(), names_to = "key", values_to = "value") %>%
  ggplot(aes(x = value)) +
  geom_bar() +
  facet_wrap(~ key)

(a) Investigate the frequency distributions for the categorical predictors. Are any of the distributions degenerate in the ways discussed earlier in this chapter? We see several degenerate distributions. Particulary in mycellium,sclerotia,roots, and leaf.mild somewhat in others as well.

Soybean %>%
  summarise_all(~is.na(.)) %>%
  pivot_longer(cols = everything(), names_to = "variables", values_to = "missing") %>%
  count(variables, missing) %>%
  ggplot(aes(y = variables, x = n, fill = missing)) +
  geom_col() +
  scale_fill_manual(values = c("FALSE" = "blue", "TRUE" = "red"))
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## ℹ The deprecated feature was likely used in the dplyr package.
##   Please report the issue at <https://github.com/tidyverse/dplyr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Calculate missing percentage for each predictor
missing_percentage <- colMeans(is.na(Soybean)) * 100

missing_data <- data.frame(Predictor = names(missing_percentage), MissingPercentage = missing_percentage)

missing_data_sorted <- missing_data[order(-missing_data$MissingPercentage), ]

# Print the sorted data frame with column names
print(missing_data_sorted)
##                       Predictor MissingPercentage
## hail                       hail        17.7159590
## sever                     sever        17.7159590
## seed.tmt               seed.tmt        17.7159590
## lodging                 lodging        17.7159590
## germ                       germ        16.3982430
## leaf.mild             leaf.mild        15.8125915
## fruiting.bodies fruiting.bodies        15.5197657
## fruit.spots         fruit.spots        15.5197657
## seed.discolor     seed.discolor        15.5197657
## shriveling           shriveling        15.5197657
## leaf.shread         leaf.shread        14.6412884
## seed                       seed        13.4699854
## mold.growth         mold.growth        13.4699854
## seed.size             seed.size        13.4699854
## leaf.halo             leaf.halo        12.2986823
## leaf.marg             leaf.marg        12.2986823
## leaf.size             leaf.size        12.2986823
## leaf.malf             leaf.malf        12.2986823
## fruit.pods           fruit.pods        12.2986823
## precip                   precip         5.5636896
## stem.cankers       stem.cankers         5.5636896
## canker.lesion     canker.lesion         5.5636896
## ext.decay             ext.decay         5.5636896
## mycelium               mycelium         5.5636896
## int.discolor       int.discolor         5.5636896
## sclerotia             sclerotia         5.5636896
## plant.stand         plant.stand         5.2708638
## roots                     roots         4.5387994
## temp                       temp         4.3923865
## crop.hist             crop.hist         2.3426061
## plant.growth       plant.growth         2.3426061
## stem                       stem         2.3426061
## date                       date         0.1464129
## area.dam               area.dam         0.1464129
## Class                     Class         0.0000000
## leaves                   leaves         0.0000000
missing_indicator <- data.frame(HasMissing = apply(is.na(Soybean), 1, any), Class = Soybean$Class)

# Calculate proportions of missing values by class
missing_proportions_by_class <- tapply(missing_indicator$HasMissing, missing_indicator$Class, mean)

print(missing_proportions_by_class)
##                2-4-d-injury         alternarialeaf-spot 
##                   1.0000000                   0.0000000 
##                 anthracnose            bacterial-blight 
##                   0.0000000                   0.0000000 
##           bacterial-pustule                  brown-spot 
##                   0.0000000                   0.0000000 
##              brown-stem-rot                charcoal-rot 
##                   0.0000000                   0.0000000 
##               cyst-nematode diaporthe-pod-&-stem-blight 
##                   1.0000000                   1.0000000 
##       diaporthe-stem-canker                downy-mildew 
##                   0.0000000                   0.0000000 
##          frog-eye-leaf-spot            herbicide-injury 
##                   0.0000000                   1.0000000 
##      phyllosticta-leaf-spot            phytophthora-rot 
##                   0.0000000                   0.7727273 
##              powdery-mildew           purple-seed-stain 
##                   0.0000000                   0.0000000 
##        rhizoctonia-root-rot 
##                   0.0000000
imputed_soybean <- Soybean
for (col in names(Soybean)) {
  if (is.numeric(Soybean[[col]])) {
    imputed_soybean[[col]][is.na(Soybean[[col]])] <- mean(Soybean[[col]], na.rm = TRUE)
  } else if (is.factor(Soybean[[col]])) {
# For categorical variables, replace missing values with the mode
    levels <- levels(Soybean[[col]])
    mode_value <- levels[which.max(table(Soybean[[col]]))]
    imputed_soybean[[col]][is.na(Soybean[[col]])] <- mode_value
  }
}
  1. Develop a strategy for handling missing data, either by eliminating predictors or imputation

We can handle missing data byreplacing missing values with mean, median, or mode for each predictor