library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(corrplot)
## corrplot 0.92 loaded
library(mlbench)
## Warning: package 'mlbench' was built under R version 4.3.3
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

Applied Predictive Modeling - Chapter 3: Data Preprocessing

Exercise 3.1

The UC Irvine Machine Learning Repository contains a data set related to glass identification. The data consist of 214 glass samples labeled as one of seven class categories. There are nine predictors, including the refractive index and percentages of eight elements: Na, Mg, Al, Si, K, Ca, Ba, and Fe.

data(Glass)
# Load data set Glass from mlbench library
?Glass

# View first 5 rows 
head(Glass)
# View summary stats
summary(Glass)
##        RI              Na              Mg              Al       
##  Min.   :1.511   Min.   :10.73   Min.   :0.000   Min.   :0.290  
##  1st Qu.:1.517   1st Qu.:12.91   1st Qu.:2.115   1st Qu.:1.190  
##  Median :1.518   Median :13.30   Median :3.480   Median :1.360  
##  Mean   :1.518   Mean   :13.41   Mean   :2.685   Mean   :1.445  
##  3rd Qu.:1.519   3rd Qu.:13.82   3rd Qu.:3.600   3rd Qu.:1.630  
##  Max.   :1.534   Max.   :17.38   Max.   :4.490   Max.   :3.500  
##        Si              K                Ca               Ba       
##  Min.   :69.81   Min.   :0.0000   Min.   : 5.430   Min.   :0.000  
##  1st Qu.:72.28   1st Qu.:0.1225   1st Qu.: 8.240   1st Qu.:0.000  
##  Median :72.79   Median :0.5550   Median : 8.600   Median :0.000  
##  Mean   :72.65   Mean   :0.4971   Mean   : 8.957   Mean   :0.175  
##  3rd Qu.:73.09   3rd Qu.:0.6100   3rd Qu.: 9.172   3rd Qu.:0.000  
##  Max.   :75.41   Max.   :6.2100   Max.   :16.190   Max.   :3.150  
##        Fe          Type  
##  Min.   :0.00000   1:70  
##  1st Qu.:0.00000   2:76  
##  Median :0.00000   3:17  
##  Mean   :0.05701   5:13  
##  3rd Qu.:0.10000   6: 9  
##  Max.   :0.51000   7:29

a. Using visualizations, explore the predictor variables to understand their distributions as well as the relationships between predictors.

A grid with scatter plots to visualize the relationships between all predictor variables is helpful to get a quick glance of which two predictors to inspect closer.

# Scatter plot between all continuous variables 
pairs(Glass[, 1:9], main = "Relationships Between Predictor Variables", 
      pch = 19,                                                
      cex = 0.5, 
      gap = 0.3, 
      upper.panel = NULL)

Two relationships that appear to be the strongest and might be interesting to zoom in on are those between the reative index vs. calcium and sodium vs. aluminum.

Glass %>% ggplot(aes(x = Ca, y = RI)) + 
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "orange") +
  labs(title = " Relationship Between Reactive Index (RI) and Calcium (Ca)")
## `geom_smooth()` using formula = 'y ~ x'

Glass %>% ggplot(aes(x = Na, y = Al)) + 
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "orange") +
  labs(title = " Relationship Between Aluminum (Al) and Sodium (Na)")
## `geom_smooth()` using formula = 'y ~ x'

Correlation Plot of Numeric Predictors

# Only visualize the lower quadrant 

cor(Glass[,1:9]) %>% corrplot(method = "color", 
                              type = "lower", 
                              addCoef.col = 'black',                      
                              col = COL2('PuOr'))

par(mfrow=c(1,3))
attach(Glass)
hist(RI)
hist(Na)
hist(Mg)

par(mfrow=c(1,3))
attach(Glass)
## The following objects are masked from Glass (pos = 3):
## 
##     Al, Ba, Ca, Fe, K, Mg, Na, RI, Si, Type
hist(Al)
hist(Si)
hist(K)

par(mfrow=c(1,3))
attach(Glass)
## The following objects are masked from Glass (pos = 3):
## 
##     Al, Ba, Ca, Fe, K, Mg, Na, RI, Si, Type
## The following objects are masked from Glass (pos = 4):
## 
##     Al, Ba, Ca, Fe, K, Mg, Na, RI, Si, Type
hist(Ca)
hist(Ba)
hist(Fe)

Glass %>% ggplot(aes(x = Type)) + geom_bar(fill = "#590097") + labs(title = "Frequency of Types")

# Box plots to show distributions of the continuous variables within each class
Glass %>% ggplot(aes(x = Type, y = RI)) + geom_boxplot(aes(fill = Type)) + labs(title = "Distribution of RI Between Glass Types") + theme(legend.position = "none")

Glass %>% ggplot(aes(x = Type, y = Na)) + geom_boxplot(aes(fill = Type)) + labs(title = "Distribution of Na Between Glass Types")+ theme(legend.position = "none")

Glass %>% ggplot(aes(x = Type, y = Mg)) + geom_boxplot(aes(fill = Type)) + labs(title = "Distribution of Mg Between Glass Types")+ theme(legend.position = "none")

Glass %>% ggplot(aes(x = Type, y = Al)) + geom_boxplot(aes(fill = Type)) + labs(title = "Distribution of Al Between Glass Types")+ theme(legend.position = "none")

Glass %>% ggplot(aes(x = Type, y = Si)) + geom_boxplot(aes(fill = Type)) + labs(title = "Distribution of Si Between Glass Types")+ theme(legend.position = "none")

Glass %>% ggplot(aes(x = Type, y = K)) + geom_boxplot(aes(fill = Type)) + labs(title = "Distribution of K Between Glass Types")+ theme(legend.position = "none")

Glass %>% ggplot(aes(x = Type, y = Ca)) + geom_boxplot(aes(fill = Type)) + labs(title = "Distribution of Ca Between Glass Types")+ theme(legend.position = "none")

Glass %>% ggplot(aes(x = Type, y = Ba)) + geom_boxplot(aes(fill = Type)) + labs(title = "Distribution of Ba Between Glass Types")+ theme(legend.position = "none")

Glass %>% ggplot(aes(x = Type, y = Fe)) + geom_boxplot(aes(fill = Type)) + labs(title = "Distribution of Fe Between Glass Types")+ theme(legend.position = "none")

b. Do there appear to be any outliers in the data? Are any predictors skewed?

Based on looking at the distribution plots of the elements, there does appear to be outliers. Potassium (K), Barium (Ba), Calcium (Ca), Iron (Fe) are all right skewed, indicating few values that are large compared to the rest of the data. Magnesium (Mg) is left skewed with about 40 observations with very low levels. This might reflect a specific type of glass that has very low levels of this element in it.

c. Are there any relevant transformations of one or more predictors that might improve the classification model?

Depending on the classification model transforming the predictors mentioned above (K, Ba, Ca, Fe and Mg) would help to improve the model. Centering the data is important for many classification models like K-nearest neighbors, support vector machines and logistic regression. Another important transformation will be to scale the data. The predictors are all on different scales and bringing them into a standard format ensures that all features lie within the same range, making them comparable.

Exercise 3.2

The soybean data can also be found at the UC Irvine Machine Learning Repository. Data were collected to predict disease in 683 soybeans. The 35 predictors are mostly categorical and include information on the environmental conditions (e.g., temperature, precipitation) and plant conditions (e.g., left spots, mold growth). The outcome labels consist of 19 distinct classes.

data("Soybean")

head(Soybean)
str(Soybean)
## 'data.frame':    683 obs. of  36 variables:
##  $ Class          : Factor w/ 19 levels "2-4-d-injury",..: 11 11 11 11 11 11 11 11 11 11 ...
##  $ date           : Factor w/ 7 levels "0","1","2","3",..: 7 5 4 4 7 6 6 5 7 5 ...
##  $ plant.stand    : Ord.factor w/ 2 levels "0"<"1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ precip         : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
##  $ temp           : Ord.factor w/ 3 levels "0"<"1"<"2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ hail           : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 1 1 ...
##  $ crop.hist      : Factor w/ 4 levels "0","1","2","3": 2 3 2 2 3 4 3 2 4 3 ...
##  $ area.dam       : Factor w/ 4 levels "0","1","2","3": 2 1 1 1 1 1 1 1 1 1 ...
##  $ sever          : Factor w/ 3 levels "0","1","2": 2 3 3 3 2 2 2 2 2 3 ...
##  $ seed.tmt       : Factor w/ 3 levels "0","1","2": 1 2 2 1 1 1 2 1 2 1 ...
##  $ germ           : Ord.factor w/ 3 levels "0"<"1"<"2": 1 2 3 2 3 2 1 3 2 3 ...
##  $ plant.growth   : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ leaves         : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ leaf.halo      : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ leaf.marg      : Factor w/ 3 levels "0","1","2": 3 3 3 3 3 3 3 3 3 3 ...
##  $ leaf.size      : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
##  $ leaf.shread    : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ leaf.malf      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ leaf.mild      : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ stem           : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ lodging        : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 2 1 1 1 ...
##  $ stem.cankers   : Factor w/ 4 levels "0","1","2","3": 4 4 4 4 4 4 4 4 4 4 ...
##  $ canker.lesion  : Factor w/ 4 levels "0","1","2","3": 2 2 1 1 2 1 2 2 2 2 ...
##  $ fruiting.bodies: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ ext.decay      : Factor w/ 3 levels "0","1","2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ mycelium       : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ int.discolor   : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ sclerotia      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ fruit.pods     : Factor w/ 4 levels "0","1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
##  $ fruit.spots    : Factor w/ 4 levels "0","1","2","4": 4 4 4 4 4 4 4 4 4 4 ...
##  $ seed           : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ mold.growth    : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ seed.discolor  : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ seed.size      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ shriveling     : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ roots          : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...

a. Investigate the frequency distributions for the categorical predictors. Are any of the distributions degenerate in the ways discussed earlier in this chapter?

# Use summary to generate the distribution of values in each predictor 

summary(Soybean)
##                  Class          date     plant.stand  precip      temp    
##  brown-spot         : 92   5      :149   0   :354    0   : 74   0   : 80  
##  alternarialeaf-spot: 91   4      :131   1   :293    1   :112   1   :374  
##  frog-eye-leaf-spot : 91   3      :118   NA's: 36    2   :459   2   :199  
##  phytophthora-rot   : 88   2      : 93               NA's: 38   NA's: 30  
##  anthracnose        : 44   6      : 90                                    
##  brown-stem-rot     : 44   (Other):101                                    
##  (Other)            :233   NA's   :  1                                    
##    hail     crop.hist  area.dam    sever     seed.tmt     germ     plant.growth
##  0   :435   0   : 65   0   :123   0   :195   0   :305   0   :165   0   :441    
##  1   :127   1   :165   1   :227   1   :322   1   :222   1   :213   1   :226    
##  NA's:121   2   :219   2   :145   2   : 45   2   : 35   2   :193   NA's: 16    
##             3   :218   3   :187   NA's:121   NA's:121   NA's:112               
##             NA's: 16   NA's:  1                                                
##                                                                                
##                                                                                
##  leaves  leaf.halo  leaf.marg  leaf.size  leaf.shread leaf.malf  leaf.mild 
##  0: 77   0   :221   0   :357   0   : 51   0   :487    0   :554   0   :535  
##  1:606   1   : 36   1   : 21   1   :327   1   : 96    1   : 45   1   : 20  
##          2   :342   2   :221   2   :221   NA's:100    NA's: 84   2   : 20  
##          NA's: 84   NA's: 84   NA's: 84                          NA's:108  
##                                                                            
##                                                                            
##                                                                            
##    stem     lodging    stem.cankers canker.lesion fruiting.bodies ext.decay 
##  0   :296   0   :520   0   :379     0   :320      0   :473        0   :497  
##  1   :371   1   : 42   1   : 39     1   : 83      1   :104        1   :135  
##  NA's: 16   NA's:121   2   : 36     2   :177      NA's:106        2   : 13  
##                        3   :191     3   : 65                      NA's: 38  
##                        NA's: 38     NA's: 38                                
##                                                                             
##                                                                             
##  mycelium   int.discolor sclerotia  fruit.pods fruit.spots   seed    
##  0   :639   0   :581     0   :625   0   :407   0   :345    0   :476  
##  1   :  6   1   : 44     1   : 20   1   :130   1   : 75    1   :115  
##  NA's: 38   2   : 20     NA's: 38   2   : 14   2   : 57    NA's: 92  
##             NA's: 38                3   : 48   4   :100              
##                                     NA's: 84   NA's:106              
##                                                                      
##                                                                      
##  mold.growth seed.discolor seed.size  shriveling  roots    
##  0   :524    0   :513      0   :532   0   :539   0   :551  
##  1   : 67    1   : 64      1   : 59   1   : 38   1   : 86  
##  NA's: 92    NA's:106      NA's: 92   NA's:106   2   : 15  
##                                                  NA's: 31  
##                                                            
##                                                            
## 

Visualize the number of unique values in each predictor and its missing values

# Create an empty tibble 

df <- tibble()
col_num = 2

# Calculate the number of occurrences of each value and add it to the empty tibble
for (c in colnames(Soybean[2:36])){
  
  a <- Soybean[2:36] %>% count(Soybean[col_num]) %>% 
    pivot_wider(names_from = c, values_from = n) %>% mutate(Variable = c)
  df <- bind_rows(df,a)
  
  col_num <- col_num + 1
}
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(c)
## 
##   # Now:
##   data %>% select(all_of(c))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Change the column names 
colnames(df) <- c("0", "1", "2", "3", "4", "5", "6", "NA", "Variable")

# Re order columns 
df <- df %>% select(Variable, `0`, `1`, `2`, `3`, `4`, `5`, `6`, `NA`)

# Fill_na values with 0 in order to plot without missing values for categories that are part of the variable's measurement.
df[is.na(df)] = 0

# pivot the table longer to fill in the bars with the value category
df <- df %>% pivot_longer(cols = colnames(df[2:9]), names_to = "category", values_to = "category_count")
df %>% ggplot(aes(y = Variable, x = category_count, fill = category)) + 
  geom_bar(position="stack", stat="identity") + 
  labs(title = "Stacked bar chart of the distribution of values (with NA's) in each predictor")

Based on the plot above we can look at the proportion of values among the predictors that looked ot have a majority of one value.

#colnames(Soybean)

Soybean %>% group_by(mycelium) %>% summarise(prop = n()/nrow(Soybean))
Soybean %>% group_by(sclerotia) %>% summarise(prop = n()/nrow(Soybean))
Soybean %>% group_by(leaves) %>% summarise(prop = n()/nrow(Soybean))

Using the summary function we can see a count of the unique values in each predictor as well as the number of missing values. In the dat set there are no predictors with only one unique value or what the text book refers to as degenerative distributions. However, there seems to be a few predictors where the majority of observations fall into one category. These include mycelium, sclerotia where over 90% of the observations are in the same category.

c. Develop a strategy for handling missing data, either by eliminating predictors or imputation.

In order to understand which strategy for handling missing values would be most appropriate further investigating the distribution of missing values would be helpful. As a review all observations (rows) with missing values are distributed across 5 classes:

  1. 2-4-d-injury
  2. cyst-nematode
  3. diaporthe-pod-&-stem-blight
  4. herbicide-injury
  5. phytophthora-rot

Among the classes above we can look at how many missing values there are in each predictor.

# Filter the original data by the 5 classes
class_na <- Soybean %>% filter(Class %in% c("phytophthora-rot", "cyst-nematode", "diaporthe-pod-&-stem-blight", "herbicide-injury", "2-4-d-injury")) %>% 
  # Group by the class 
  group_by(Class) %>%
  # Count the missing values in each predictor
  summarise_all(~ sum(is.na(.)))

class_na
# Find how many predictors contain all missing values for 2-4-d-injury

class_na %>% 
  filter(Class == "2-4-d-injury") %>% 
  # Select all variables except the class
  select(-Class) %>% 
  # Pivot df to longer format and filter out the predictors with missing values = to the total number of observations
  # for 2-4-d-injury
  pivot_longer(cols = colnames(class_na)[-1], names_to = "Variable", values_to = "Counts") %>% 
  filter(Counts >= 16)

When exploring which predictors have missing values among each classes of disease, it seems that several predictors do not have any information for a particular class. For example, the disease class ‘2-4-d-injury’ accounts of 16 observations in the data set. Several predictors do not have any recorded value, in fact 28 predictors do not have any value recorded. Imputing a the most frequent value within each class will not work for these predictors. An alternative approach is to impute a values using the K-nearest neighbor classification to find samples that are closest to the observation with missing data. Before imputing, I would first eliminate the predictors that were identified as having near-zero variance.

It is good to sort the variables in ascending order by the number of missing values before running kNN. This way, all distance calculations will use as little imputed data as possible

library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
# Sort predictors by the number of missing values in ascending order

na_cols_ordered <- Soybean %>% 
  # drop the two predictor variables 
  select(-c("mycelium", "sclerotia")) %>% 
  is.na() %>% 
  colSums() %>% 
  sort(decreasing = FALSE) %>%
  names()

# impute the missing values 

soybean_imputed <- Soybean %>% 
  select(all_of(na_cols_ordered)) %>% 
  kNN(k = 5)

Recheck that the imputation replaced the missing values.

soybean_imputed[1:34] %>% 
  is.na() %>% 
  colSums()
##           Class          leaves            date        area.dam       crop.hist 
##               0               0               0               0               0 
##    plant.growth            stem            temp           roots     plant.stand 
##               0               0               0               0               0 
##          precip    stem.cankers   canker.lesion       ext.decay    int.discolor 
##               0               0               0               0               0 
##       leaf.halo       leaf.marg       leaf.size       leaf.malf      fruit.pods 
##               0               0               0               0               0 
##            seed     mold.growth       seed.size     leaf.shread fruiting.bodies 
##               0               0               0               0               0 
##     fruit.spots   seed.discolor      shriveling       leaf.mild            germ 
##               0               0               0               0               0 
##            hail           sever        seed.tmt         lodging 
##               0               0               0               0