#install.packages('corrplot')
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(tidyr)
library(ggplot2)
3.1. The UC Irvine Machine Learning Repository contains a data set related to glass identification. The data consist of 214 glass samples labeled as one of seven class categories. There are nine predictors, including the refractive index and percentages of eight elements: Na, Mg, Al, Si, K, Ca, Ba, and Fe.
The data can be accessed via:
> library(mlbench)
> data(Glass)
> str(Glass)
(the str() output is reproduced below)
(a) Using visualizations, explore the predictor variables to understand their distributions as well as the relationships between predictors.
(b) Do there appear to be any outliers in the data? Are any predictors skewed?
(c) Are there any relevant transformations of one or more predictors that might improve the classification model?
library(mlbench)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.3
## corrplot 0.92 loaded
data(Glass)
str(Glass)
## 'data.frame': 214 obs. of 10 variables:
## $ RI : num 1.52 1.52 1.52 1.52 1.52 ...
## $ Na : num 13.6 13.9 13.5 13.2 13.3 ...
## $ Mg : num 4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
## $ Al : num 1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
## $ Si : num 71.8 72.7 73 72.6 73.1 ...
## $ K : num 0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
## $ Ca : num 8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
## $ Ba : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Fe : num 0 0 0 0 0 0.26 0 0 0 0.11 ...
## $ Type: Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...
#(a) Using visualizations, explore the predictor variables to understand their distributions as well as the relationships between predictors.
# Remove categorical variables
glass_numeric <- Glass[, sapply(Glass, is.numeric)]
# Create correlation matrix
correlation_matrix <- cor(glass_numeric)
# Create correlation plot
corrplot(correlation_matrix, method = "circle", type = "upper", tl.cex = 0.7)
par(mfrow = c(3, 3))
for (i in 1:ncol(glass_numeric)) {
hist(glass_numeric[, i], main = names(glass_numeric)[i], xlab = "", col = "skyblue", border = "white")
}
# Na, Al, and Si look roughly symmetric; RI, K, Ca, Ba, and Fe are right-skewed and Mg is left-skewed
# Print the full correlation matrix computed above
correlation_matrix
## RI Na Mg Al Si K
## RI 1.0000000000 -0.19188538 -0.122274039 -0.40732603 -0.54205220 -0.289832711
## Na -0.1918853790 1.00000000 -0.273731961 0.15679367 -0.06980881 -0.266086504
## Mg -0.1222740393 -0.27373196 1.000000000 -0.48179851 -0.16592672 0.005395667
## Al -0.4073260341 0.15679367 -0.481798509 1.00000000 -0.00552372 0.325958446
## Si -0.5420521997 -0.06980881 -0.165926723 -0.00552372 1.00000000 -0.193330854
## K -0.2898327111 -0.26608650 0.005395667 0.32595845 -0.19333085 1.000000000
## Ca 0.8104026963 -0.27544249 -0.443750026 -0.25959201 -0.20873215 -0.317836155
## Ba -0.0003860189 0.32660288 -0.492262118 0.47940390 -0.10215131 -0.042618059
## Fe 0.1430096093 -0.24134641 0.083059529 -0.07440215 -0.09420073 -0.007719049
## Ca Ba Fe
## RI 0.8104027 -0.0003860189 0.143009609
## Na -0.2754425 0.3266028795 -0.241346411
## Mg -0.4437500 -0.4922621178 0.083059529
## Al -0.2595920 0.4794039017 -0.074402151
## Si -0.2087322 -0.1021513105 -0.094200731
## K -0.3178362 -0.0426180594 -0.007719049
## Ca 1.0000000 -0.1128409671 0.124968219
## Ba -0.1128410 1.0000000000 -0.058691755
## Fe 0.1249682 -0.0586917554 1.000000000
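The printed matrix confirms the corrplot picture: Ca and RI are strongly correlated (0.81), while most other pairs are only weakly related. As a quick programmatic check, here is a minimal sketch assuming the caret package is installed (caret is not loaded elsewhere in this document):
library(caret)
# Flag predictors whose pairwise correlation exceeds 0.75; findCorrelation()
# returns column indices of candidates for removal
high_corr <- findCorrelation(correlation_matrix, cutoff = 0.75)
names(glass_numeric)[high_corr]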
# Select columns 1 to 9 from the Glass dataset
df <- Glass[, 1:9]
# Set up the layout
par(mfrow = c(3, 3))
# Boxplots for each predictor; a for loop avoids printing the list of
# boxplot statistics that lapply() would otherwise return
for (i in 1:ncol(df)) {
  boxplot(df[, i], ylab = names(df)[i], horizontal = TRUE, col = "green")
}
The boxplots flag values beyond the whiskers for every predictor except Mg, most often for Ba (38 flagged values), Ca (26), Al (18), and RI (17); K contains two extreme values of 6.21.
#(b) Do there appear to be any outliers in the data? Are any predictors skewed?
From the histograms, K, Ca, Ba, and Fe are strongly right-skewed (Ba and Fe are mostly zero), RI and Al are moderately right-skewed, and Mg is left-skewed. The boxplots above confirm that every predictor except Mg has outliers.
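To quantify the skewness seen in the histograms, here is a minimal sketch assuming the e1071 package is installed (e1071 is not used elsewhere in this document):
library(e1071)
# Sample skewness for each predictor; values far from zero indicate asymmetry
sort(sapply(glass_numeric, skewness), decreasing = TRUE)
(c) Are there any relevant transformations of one or more predictors that might improve the classification model? Because several predictors (K, Mg, Ba, and Fe) contain zeros, a Box-Cox transformation cannot be applied to them directly; a Yeo-Johnson transformation (or log(x + 1) on the element percentages), combined with centering and scaling, is one reasonable option, and a spatial sign transformation could reduce the influence of the outliers. A hedged sketch using caret::preProcess, again assuming caret is installed:
library(caret)
# Yeo-Johnson handles the zero-valued predictors that Box-Cox cannot;
# centering and scaling put the predictors on a common scale
pp <- preProcess(glass_numeric, method = c("YeoJohnson", "center", "scale"))
glass_transformed <- predict(pp, glass_numeric)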
3.2. The soybean data can also be found at the UC Irvine Machine Learning Repository. Data were collected to predict disease in 683 soybeans. The 35 predictors are mostly categorical and include information on the environmental conditions (e.g., temperature, precipitation) and plant conditions (e.g., leaf spots, mold growth). The outcome labels consist of 19 distinct classes.
The data can be loaded via:
> library(mlbench)
> data(Soybean)
> ## See ?Soybean for details
(a) Investigate the frequency distributions for the categorical predictors. Are any of the distributions degenerate in the ways discussed earlier in this chapter?
(b) Roughly 18% of the data are missing. Are there particular predictors that are more likely to be missing? Is the pattern of missing data related to the classes?
(c) Develop a strategy for handling missing data, either by eliminating predictors or imputation.
library(mlbench)
data(Soybean)
str(Soybean)
## 'data.frame': 683 obs. of 36 variables:
## $ Class : Factor w/ 19 levels "2-4-d-injury",..: 11 11 11 11 11 11 11 11 11 11 ...
## $ date : Factor w/ 7 levels "0","1","2","3",..: 7 5 4 4 7 6 6 5 7 5 ...
## $ plant.stand : Ord.factor w/ 2 levels "0"<"1": 1 1 1 1 1 1 1 1 1 1 ...
## $ precip : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
## $ temp : Ord.factor w/ 3 levels "0"<"1"<"2": 2 2 2 2 2 2 2 2 2 2 ...
## $ hail : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 1 1 ...
## $ crop.hist : Factor w/ 4 levels "0","1","2","3": 2 3 2 2 3 4 3 2 4 3 ...
## $ area.dam : Factor w/ 4 levels "0","1","2","3": 2 1 1 1 1 1 1 1 1 1 ...
## $ sever : Factor w/ 3 levels "0","1","2": 2 3 3 3 2 2 2 2 2 3 ...
## $ seed.tmt : Factor w/ 3 levels "0","1","2": 1 2 2 1 1 1 2 1 2 1 ...
## $ germ : Ord.factor w/ 3 levels "0"<"1"<"2": 1 2 3 2 3 2 1 3 2 3 ...
## $ plant.growth : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ leaves : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ leaf.halo : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ leaf.marg : Factor w/ 3 levels "0","1","2": 3 3 3 3 3 3 3 3 3 3 ...
## $ leaf.size : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
## $ leaf.shread : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ leaf.malf : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ leaf.mild : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ stem : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ lodging : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 2 1 1 1 ...
## $ stem.cankers : Factor w/ 4 levels "0","1","2","3": 4 4 4 4 4 4 4 4 4 4 ...
## $ canker.lesion : Factor w/ 4 levels "0","1","2","3": 2 2 1 1 2 1 2 2 2 2 ...
## $ fruiting.bodies: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ ext.decay : Factor w/ 3 levels "0","1","2": 2 2 2 2 2 2 2 2 2 2 ...
## $ mycelium : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ int.discolor : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ sclerotia : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ fruit.pods : Factor w/ 4 levels "0","1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
## $ fruit.spots : Factor w/ 4 levels "0","1","2","4": 4 4 4 4 4 4 4 4 4 4 ...
## $ seed : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ mold.growth : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ seed.discolor : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ seed.size : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ shriveling : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ roots : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
# data(Soybean) already loaded the data above, so no reassignment is needed.
# Convert every predictor to character so columns with different factor levels
# can be stacked into one value column, then plot each predictor's frequencies.
Soybean %>%
  select(-Class) %>%
  drop_na() %>%
  mutate(across(everything(), as.character)) %>%
  pivot_longer(cols = everything(), names_to = "key", values_to = "value") %>%
  ggplot(aes(x = value)) +
  geom_bar() +
  facet_wrap(~ key)
(a) Investigate the frequency distributions for the categorical
predictors. Are any of the distributions degenerate in the ways
discussed earlier in this chapter? Several predictors show nearly
degenerate distributions, particularly mycelium, sclerotia, roots, and
leaf.mild, where almost all observations fall in a single level; a few
other predictors are somewhat unbalanced as well.
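The chapter's "degenerate" distributions correspond to near-zero-variance predictors, which can be flagged programmatically; a minimal sketch assuming the caret package is installed:
library(caret)
# saveMetrics = TRUE returns frequency-ratio and unique-value metrics;
# rows with nzv == TRUE are the near-zero-variance (degenerate) predictors
nzv <- nearZeroVar(Soybean, saveMetrics = TRUE)
nzv[nzv$nzv, ]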
# Count missing vs. non-missing values per predictor and plot the result;
# mutate(across(...)) avoids the deprecated multi-row summarise_all() pattern
Soybean %>%
  mutate(across(everything(), is.na)) %>%
  pivot_longer(cols = everything(), names_to = "variables", values_to = "missing") %>%
  count(variables, missing) %>%
  ggplot(aes(y = variables, x = n, fill = missing)) +
  geom_col() +
  scale_fill_manual(values = c("FALSE" = "blue", "TRUE" = "red"))
# Calculate missing percentage for each predictor
missing_percentage <- colMeans(is.na(Soybean)) * 100
missing_data <- data.frame(Predictor = names(missing_percentage), MissingPercentage = missing_percentage)
missing_data_sorted <- missing_data[order(-missing_data$MissingPercentage), ]
# Print the sorted data frame with column names
print(missing_data_sorted)
## Predictor MissingPercentage
## hail hail 17.7159590
## sever sever 17.7159590
## seed.tmt seed.tmt 17.7159590
## lodging lodging 17.7159590
## germ germ 16.3982430
## leaf.mild leaf.mild 15.8125915
## fruiting.bodies fruiting.bodies 15.5197657
## fruit.spots fruit.spots 15.5197657
## seed.discolor seed.discolor 15.5197657
## shriveling shriveling 15.5197657
## leaf.shread leaf.shread 14.6412884
## seed seed 13.4699854
## mold.growth mold.growth 13.4699854
## seed.size seed.size 13.4699854
## leaf.halo leaf.halo 12.2986823
## leaf.marg leaf.marg 12.2986823
## leaf.size leaf.size 12.2986823
## leaf.malf leaf.malf 12.2986823
## fruit.pods fruit.pods 12.2986823
## precip precip 5.5636896
## stem.cankers stem.cankers 5.5636896
## canker.lesion canker.lesion 5.5636896
## ext.decay ext.decay 5.5636896
## mycelium mycelium 5.5636896
## int.discolor int.discolor 5.5636896
## sclerotia sclerotia 5.5636896
## plant.stand plant.stand 5.2708638
## roots roots 4.5387994
## temp temp 4.3923865
## crop.hist crop.hist 2.3426061
## plant.growth plant.growth 2.3426061
## stem stem 2.3426061
## date date 0.1464129
## area.dam area.dam 0.1464129
## Class Class 0.0000000
## leaves leaves 0.0000000
missing_indicator <- data.frame(HasMissing = apply(is.na(Soybean), 1, any), Class = Soybean$Class)
# Calculate proportions of missing values by class
missing_proportions_by_class <- tapply(missing_indicator$HasMissing, missing_indicator$Class, mean)
print(missing_proportions_by_class)
## 2-4-d-injury alternarialeaf-spot
## 1.0000000 0.0000000
## anthracnose bacterial-blight
## 0.0000000 0.0000000
## bacterial-pustule brown-spot
## 0.0000000 0.0000000
## brown-stem-rot charcoal-rot
## 0.0000000 0.0000000
## cyst-nematode diaporthe-pod-&-stem-blight
## 1.0000000 1.0000000
## diaporthe-stem-canker downy-mildew
## 0.0000000 0.0000000
## frog-eye-leaf-spot herbicide-injury
## 0.0000000 1.0000000
## phyllosticta-leaf-spot phytophthora-rot
## 0.0000000 0.7727273
## powdery-mildew purple-seed-stain
## 0.0000000 0.0000000
## rhizoctonia-root-rot
## 0.0000000
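To see how much of the overall missingness each class contributes (only five classes have any incomplete rows, per the output above), a small base-R check:
# Total number of missing cells contributed by each class
sort(tapply(rowSums(is.na(Soybean)), Soybean$Class, sum), decreasing = TRUE)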
imputed_soybean <- Soybean
for (col in names(Soybean)) {
  if (is.numeric(Soybean[[col]])) {
    # Numeric columns (none in Soybean; kept for generality): impute with the mean
    imputed_soybean[[col]][is.na(Soybean[[col]])] <- mean(Soybean[[col]], na.rm = TRUE)
  } else if (is.factor(Soybean[[col]])) {
    # Factor columns: impute with the most frequent level (the mode)
    levels <- levels(Soybean[[col]])
    mode_value <- levels[which.max(table(Soybean[[col]]))]
    imputed_soybean[[col]][is.na(Soybean[[col]])] <- mode_value
  }
}
We can handle missing data by replacing missing values with the mean, median, or mode of each predictor, as in the loop above. Since the missingness is concentrated in a few classes (2-4-d-injury, cyst-nematode, diaporthe-pod-&-stem-blight, and herbicide-injury have missing values in every row), another option is to drop the predictors with the highest missing rates instead of imputing them.
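A minimal sketch of the eliminate-predictors alternative, using an arbitrary 15% missing-rate cutoff (the ten predictors at the top of the table above would be dropped):
# Drop predictors whose missing rate exceeds the (arbitrary) 15% cutoff
high_missing <- names(which(colMeans(is.na(Soybean)) > 0.15))
soybean_reduced <- Soybean[, !(names(Soybean) %in% high_missing)]
dim(soybean_reduced)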