library(fpp3)
library(knitr)
library(mlbench)
library(ggcorrplot)
library(RColorBrewer)
library(MASS)
select <- dplyr::select
library(hablar)
library(VIM)

Exercise 3.1:

The UC Irvine Machine Learning Repository contains a data set related to glass identification. The data consist of 214 glass samples labeled as one of seven class categories. There are nine predictors, including the refractive index and percentages of eight elements: Na, Mg, Al, Si, K, Ca, Ba, and Fe. The data can be accessed via:

# Load the glass data and reshape the nine predictors into long format
# (one row per sample/predictor pair) so each predictor gets its own facet.
data(Glass)
cols <- c("RI", "Na", "Mg", "Al", "Si", "K", "Ca", "Ba", "Fe")
glass_pivot <- pivot_longer(
    rownames_to_column(Glass),
    cols = all_of(cols),
    names_to = "Variable",
    values_to = "Value"
)

# Histogram of every predictor; each facet gets its own x axis.
ggplot(glass_pivot, aes(x = Value)) +
    geom_histogram(bins = 10, color = "#276419", fill = "#4D9221") +
    facet_wrap(~ Variable, scales = "free_x")

# Boxplot of every predictor (one facet per variable, independent x axes);
# outliers are drawn in a contrasting colour.
ggplot(data = glass_pivot, mapping = aes(x = Value)) +
    geom_boxplot(
        color = "#276419",
        fill = "#4D9221",
        outlier.color = "#C51B7D"
    ) +
    facet_wrap(~ Variable, scales = "free_x")

# Pairwise correlations between the predictors, rounded to one decimal place,
# drawn as a labelled heat map with hierarchically ordered rows/columns.
corr <- Glass[cols] |>
    cor() |>
    round(digits = 1)
ggcorrplot(
    corr,
    hc.order = TRUE,
    outline.color = "#F7F7F7",
    lab = TRUE,
    colors = c("#4D9221", "#F7F7F7", "#C51B7D")
)

Yes, there are outliers in the data for every predictor other than Mg. Some of the predictors appear normally distributed: Al, Na, and Si. Others are right-skewed: Ba, Ca, Fe, K, and RI. One predictor, Mg, has a bimodal distribution.

Yes, there are transformations for the right-skewed variables that might improve the model:

skewed <- c("Ba", "Ca", "Fe", "K", "RI")

# Box-Cox needs strictly positive responses, so shift any column that contains
# zeros by a small constant. Glass is deliberately modified in place: the
# transformation code further down reads the shifted columns.
for (i in seq_along(skewed)) {
    if (any(Glass[[skewed[i]]] == 0)) {
        Glass[[skewed[i]]] <- Glass[[skewed[i]]] + 0.01
    }
}

# For each skewed predictor, evaluate the Box-Cox profile log-likelihood of an
# intercept-only model on a grid of 81 lambdas in [-2, 2] (step 0.05) and keep
# the lambda with the highest likelihood.
# NOTE(review): this is kept as a top-level loop on purpose. boxcox() on an lm
# fit appears to refit the model by re-evaluating its call, which would not
# see variables local to a helper function - confirm before refactoring into
# lapply()/vapply().
lambda_grid <- seq(-2, 2, length.out = 81)
lambdas <- numeric(length(skewed))
for (i in seq_along(skewed)) {
    bc <- boxcox(lm(Glass[[skewed[i]]] ~ 1),
                 lambda = lambda_grid,
                 plotit = FALSE)
    lambdas[i] <- bc$x[which.max(bc$y)]
}

# The lambdas are stored as text so the table prints each value as-is
# (e.g. "-2", "0.35") rather than padded to a common number of decimals.
lambdas <- data.frame(skewed = skewed, lambdas = as.character(lambdas))
kable(lambdas, format = "simple")
skewed lambdas
Ba -1.3
Ca -1.1
Fe -0.8
K 0.35
RI -2

We can look at the distributions of the transformed variables to see if they’ve been improved:

not_skewed <- c("Na", "Mg", "Al", "Si")

# Power applied to each skewed predictor (the lambdas from the table above).
powers <- c(Ba = -1.3, Ca = -1.1, Fe = -0.8, K = 0.35, RI = -2)

# Drop the predictors that are left untransformed, raise each remaining
# predictor to its power, and reshape to long format for faceting.
glass_skewed_transformed <- Glass |>
    select(!all_of(not_skewed)) |>
    rownames_to_column() |>
    mutate(across(all_of(names(powers)), \(x) x^powers[[cur_column()]])) |>
    pivot_longer(
        cols = all_of(skewed),
        names_to = "Variable",
        values_to = "Value"
    )

# Histograms of the transformed predictors, each facet with its own x axis.
ggplot(glass_skewed_transformed, aes(x = Value)) +
    geom_histogram(bins = 10, color = "#276419", fill = "#4D9221") +
    facet_wrap(~ Variable, scales = "free_x")

The results are mixed, with the transformations working best on Ca, K, and RI.

Exercise 3.2:

The soybean data can also be found at the UC Irvine Machine Learning Repository. Data were collected to predict disease in 683 soybeans. The 35 predictors are mostly categorical and include information on the environmental conditions (e.g., temperature, precipitation) and plant conditions (e.g., leaf spots, mold growth). The outcome labels consist of 19 distinct classes. The data can be loaded via:

data(Soybean)

# The 35 predictor columns (everything except the Class outcome).
cols <- c(
    "date", "plant.stand", "precip", "temp", "hail",
    "crop.hist", "area.dam", "sever", "seed.tmt", "germ",
    "plant.growth", "leaves", "leaf.halo", "leaf.marg", "leaf.size",
    "leaf.shread", "leaf.malf", "leaf.mild", "stem", "lodging",
    "stem.cankers", "canker.lesion", "fruiting.bodies", "ext.decay",
    "mycelium", "int.discolor", "sclerotia", "fruit.pods", "fruit.spots",
    "seed", "mold.growth", "seed.discolor", "seed.size", "shriveling",
    "roots"
)

# Convert every predictor to character so they can all be stacked into a
# single Value column, then reshape to one row per sample/predictor pair.
soybean_pivot <- pivot_longer(
    rownames_to_column(convert(Soybean, chr(all_of(cols)))),
    cols = all_of(cols),
    names_to = "Variable",
    values_to = "Value"
)
# Bar charts of the level counts for each categorical predictor, split over
# two figures (predictors 1-17, then 18-35) to keep the facets readable.
# geom_bar() counts rows per level by default, which is what
# geom_histogram(stat = "count") was emulating.
soybean_pivot |>
    filter(Variable %in% cols[1:17]) |>
    ggplot(aes(x = Value)) +
    geom_bar(color = "#276419", fill = "#4D9221") +
    facet_wrap(vars(Variable), scales = "free_x")

soybean_pivot |>
    filter(Variable %in% cols[18:35]) |>
    ggplot(aes(x = Value)) +
    geom_bar(color = "#276419", fill = "#4D9221") +
    facet_wrap(vars(Variable), scales = "free_x")

Taking degenerate distributions to mean variables for which almost all observations have the same value, these variables appear to have degenerate distributions: leaves, int.discolor, leaf.mild, lodging, mold.growth, mycelium, roots, sclerotia, seed.discolor, seed.size, and shriveling.

The variables that are most likely to have NA values are: germ, hail, seed.tmt, sever, and lodging. We can isolate the five classes that have NA values for any variables:

# Count the missing fields in every row and keep only the rows that have at
# least one NA.
soybean_na <- Soybean |>
    mutate(na_count = rowSums(is.na(Soybean))) |>
    filter(na_count > 0)

# Distinct classes among the rows with missing data, printed as a table
# without a header row.
classes <- data.frame(unique(soybean_na$Class))
names(classes) <- NULL
kable(classes, format = "simple")
phytophthora-rot
diaporthe-pod-&-stem-blight
cyst-nematode
2-4-d-injury
herbicide-injury

The number of variables for which the values are NA ranges from 11 to 30 for these five classes.

We can use both predictor elimination and imputation to deal with the missing data. Let’s eliminate the predictor variables that have degenerate distributions, then fill in missing values for the rest of the predictor variables using kNN imputation.

# Predictors judged to have degenerate (near-constant) distributions; these
# are dropped rather than imputed.
degenerate <- c("leaves", "int.discolor", "leaf.mild", "lodging",
                "mold.growth", "mycelium", "roots", "sclerotia",
                "seed.discolor", "seed.size", "shriveling")

# All predictors, selected by name rather than by column position so the code
# does not depend on Class being the first column or on the column count.
x <- setdiff(colnames(Soybean), "Class")
not_degenerate <- setdiff(x, degenerate)

# Impute the remaining predictors from the 15 nearest neighbours, weighting
# neighbours by distance; imp_var = FALSE suppresses the extra "_imp"
# indicator columns.
# NOTE(review): Class is still present in the data passed to kNN(), so with
# the default distance variables the outcome presumably contributes to the
# neighbour search - confirm this is intended.
soybean_imputed <- Soybean |>
    select(-all_of(degenerate)) |>
    VIM::kNN(variable = not_degenerate, k = 15, numFun = weighted.mean,
             weightDist = TRUE, imp_var = FALSE)
summary(soybean_imputed)
##                  Class     date    plant.stand precip  temp    hail   
##  brown-spot         : 92   0: 27   0:360       0: 74   0: 92   0:556  
##  alternarialeaf-spot: 91   1: 75   1:323       1:130   1:388   1:127  
##  frog-eye-leaf-spot : 91   2: 93               2:479   2:203          
##  phytophthora-rot   : 88   3:118                                      
##  anthracnose        : 44   4:131                                      
##  brown-stem-rot     : 44   5:149                                      
##  (Other)            :233   6: 90                                      
##  crop.hist area.dam sever   seed.tmt germ    plant.growth leaf.halo leaf.marg
##  0: 81     0:124    0:198   0:384    0:171   0:441        0:303     0:359    
##  1:165     1:227    1:370   1:264    1:313   1:242        1: 36     1: 21    
##  2:219     2:145    2:115   2: 35    2:199                2:344     2:303    
##  3:218     3:187                                                             
##                                                                              
##                                                                              
##                                                                              
##  leaf.size leaf.shread leaf.malf stem    stem.cankers canker.lesion
##  0: 51     0:587       0:624     0:309   0:379        0:320        
##  1:329     1: 96       1: 59     1:374   1: 44        1: 83        
##  2:303                                   2: 39        2:215        
##                                          3:221        3: 65        
##                                                                    
##                                                                    
##                                                                    
##  fruiting.bodies ext.decay fruit.pods fruit.spots seed   
##  0:572           0:534     0:407      0:345       0:556  
##  1:111           1:135     1:130      1: 75       1:127  
##                  2: 14     2: 27      2: 71              
##                            3:119      4:192              
##                                                          
##                                                          
##