#Import needed libraries
library(tsibbledata) #to use the time series data in it for the exercises.
library(tsibble) # to use datasets and function as_tsibble
library(tibble) # to use view function
library(ggplot2)
library(feasts) # to use the functions for graphics like autoplot()
library(readr) # to uses read_csv function
library(dplyr) # to use Filter, mutate, arrange function etc
library(tidyr) # to use pivot_longer function
library(USgas) # to use us_total data
library(fpp3) # to use us_gasoline dataset
library(seasonal) # X-13ARIMA-SEATS decomposition
library(feasts)
library(mlbench) # to use glass data
library(corrplot)
When looking for “degenerate” distributions in categorical predictors, following are considered:
Near-zero variance predictors - variables where almost all observations have the same value (one dominant level) Zero variance predictors - variables with only one unique value (completely constant) Highly imbalanced categories - where one level appears much more frequently than others Missing values - categories with many NA values
Based on the frequency distribution plot, i can observe clearly the following: Near-zero variance predictors - mycelium Zero variance predictors - none Highly imbalanced - int_discolor, leaf_malf, sclerotia Missing values - many categorical predictors are missng values.
data(Soybean)
str(Soybean)
## 'data.frame': 683 obs. of 36 variables:
## $ Class : Factor w/ 19 levels "2-4-d-injury",..: 11 11 11 11 11 11 11 11 11 11 ...
## $ date : Factor w/ 7 levels "0","1","2","3",..: 7 5 4 4 7 6 6 5 7 5 ...
## $ plant.stand : Ord.factor w/ 2 levels "0"<"1": 1 1 1 1 1 1 1 1 1 1 ...
## $ precip : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
## $ temp : Ord.factor w/ 3 levels "0"<"1"<"2": 2 2 2 2 2 2 2 2 2 2 ...
## $ hail : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 1 1 ...
## $ crop.hist : Factor w/ 4 levels "0","1","2","3": 2 3 2 2 3 4 3 2 4 3 ...
## $ area.dam : Factor w/ 4 levels "0","1","2","3": 2 1 1 1 1 1 1 1 1 1 ...
## $ sever : Factor w/ 3 levels "0","1","2": 2 3 3 3 2 2 2 2 2 3 ...
## $ seed.tmt : Factor w/ 3 levels "0","1","2": 1 2 2 1 1 1 2 1 2 1 ...
## $ germ : Ord.factor w/ 3 levels "0"<"1"<"2": 1 2 3 2 3 2 1 3 2 3 ...
## $ plant.growth : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ leaves : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ leaf.halo : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ leaf.marg : Factor w/ 3 levels "0","1","2": 3 3 3 3 3 3 3 3 3 3 ...
## $ leaf.size : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
## $ leaf.shread : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ leaf.malf : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ leaf.mild : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ stem : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ lodging : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 2 1 1 1 ...
## $ stem.cankers : Factor w/ 4 levels "0","1","2","3": 4 4 4 4 4 4 4 4 4 4 ...
## $ canker.lesion : Factor w/ 4 levels "0","1","2","3": 2 2 1 1 2 1 2 2 2 2 ...
## $ fruiting.bodies: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ ext.decay : Factor w/ 3 levels "0","1","2": 2 2 2 2 2 2 2 2 2 2 ...
## $ mycelium : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ int.discolor : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ sclerotia : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ fruit.pods : Factor w/ 4 levels "0","1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
## $ fruit.spots : Factor w/ 4 levels "0","1","2","4": 4 4 4 4 4 4 4 4 4 4 ...
## $ seed : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ mold.growth : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ seed.discolor : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ seed.size : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ shriveling : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ roots : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
Soybean %>%
select(-Class)%>%
gather() %>%
ggplot(aes(value)) +
geom_bar()+
facet_wrap(~ key) +
labs(title = "Distribution of Soybean Categorical Predictor Variables")
Hail,sever,seed.tmt ,lodging have the highest missing % of 18% followed by germ which has 16%. Only Date, area.dam, Class, leaves doe not have missing values. Rest all have missing values. Yes the pattern of missing data is related to the classes as shown in the chart below. 5 classes only have missing data.
# Overall missing data by predictor
missing_by_predictor <- Soybean %>%
summarise(across(everything(), ~sum(is.na(.))/n()*100)) %>%
pivot_longer(cols = everything(),
names_to = "predictor",
values_to = "percent_missing") %>%
arrange(desc(percent_missing))
# Print predictors with highest missing rates
print(missing_by_predictor %>% filter(percent_missing > 0))
## # A tibble: 34 × 2
## predictor percent_missing
## <chr> <dbl>
## 1 hail 17.7
## 2 sever 17.7
## 3 seed.tmt 17.7
## 4 lodging 17.7
## 5 germ 16.4
## 6 leaf.mild 15.8
## 7 fruiting.bodies 15.5
## 8 fruit.spots 15.5
## 9 seed.discolor 15.5
## 10 shriveling 15.5
## # ℹ 24 more rows
# Missing data by class
missing_by_class <- Soybean %>%
group_by(Class) %>%
summarise(n_samples = n(),
across(everything(), ~sum(is.na(.))/n()*100)) %>%
pivot_longer(cols = -c(Class, n_samples),
names_to = "predictor",
values_to = "percent_missing") %>%
filter(percent_missing > 0)
# Find predictors with class-dependent missingness
class_dependent_missing <- missing_by_class %>%
group_by(predictor) %>%
summarise(min_missing = min(percent_missing),
max_missing = max(percent_missing),
range = max_missing - min_missing) %>%
filter(range > 10) %>% # Arbitrary threshold - variables with >10% difference between classes
arrange(desc(range))
# Visualize missing data by class for top variables with class-dependent missingness
if(nrow(class_dependent_missing) > 0) {
top_vars <- head(class_dependent_missing$predictor, 5)
missing_by_class %>%
filter(predictor %in% top_vars) %>%
ggplot(aes(x = Class, y = percent_missing, fill = predictor)) +
geom_bar(stat = "identity", position = "dodge") +
coord_flip() +
labs(title = "Missing Data by Class for Key Predictors",
y = "Percent Missing",
x = "Class") +
theme_minimal()
}
# Heatmap of missing data patterns by class
missing_heatmap <- Soybean %>%
group_by(Class) %>%
summarise(across(everything(), ~sum(is.na(.))/n()*100)) %>%
pivot_longer(cols = -Class,
names_to = "predictor",
values_to = "percent_missing") %>%
ggplot(aes(x = predictor, y = Class, fill = percent_missing)) +
geom_tile() +
scale_fill_gradient(low = "white", high = "red") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
labs(title = "Heatmap of Missing Data by Class and Predictor",
fill = "% Missing")
As each predictor has less than 18% of data missing, it is difficult to eliminate them as we have limited data. We need to understand the data more as to why only 5 classes are only missing data. One approach is to exclude these 5 classess and only use rest of the classes for classification.
If we have to handle imputation for important predictors,we can use missForest for imputation as it handles categorical data well and preserves relationships. I may consider handling class-specific missing data to ensure accurate classification.