DATA 624 Homework 4

library(knitr)
library(rmdformats)

## Global options
options(max.print = 31)
# opts_chunk$set(echo=FALSE,
#                cache=TRUE,
#                prompt=FALSE,
#                tidy=TRUE,
#                comment=NA,
#                message=FALSE,
#                warning=FALSE)
opts_knit$set(width=31)
require(mlbench)
## Loading required package: mlbench
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0     ✓ purrr   0.3.3
## ✓ tibble  3.0.3     ✓ dplyr   0.8.5
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# install.packages("devtools")
# devtools::install_github("MathiasHarrer/dmetar")
# library(dmetar)
# install.packages("rstatix")
# install.packages("tibble")
library(rstatix)
## 
## Attaching package: 'rstatix'
## The following object is masked from 'package:stats':
## 
##     filter
# install.packages("ggpubr")

library(ggpubr)
library(SLIDE)

library(corrplot)
## corrplot 0.84 loaded
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(moments)

# 3.2 
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(Hmisc)
## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
## 
##     cluster
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following object is masked from 'package:psych':
## 
##     describe
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(tidyr)
library(naniar)
data(Glass)
str(Glass)
## 'data.frame':    214 obs. of  10 variables:
##  $ RI  : num  1.52 1.52 1.52 1.52 1.52 ...
##  $ Na  : num  13.6 13.9 13.5 13.2 13.3 ...
##  $ Mg  : num  4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
##  $ Al  : num  1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
##  $ Si  : num  71.8 72.7 73 72.6 73.1 ...
##  $ K   : num  0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
##  $ Ca  : num  8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
##  $ Ba  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Fe  : num  0 0 0 0 0 0.26 0 0 0 0.11 ...
##  $ Type: Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...

3.1

(a)

Using visualizations, explore the predictor variables to understand their distributions as well as the relationships between predictors.

Glass_pivoted <- Glass %>%
  tidyr::pivot_longer(-Type, names_to = "Predictor", values_to = "Value")

head(Glass_pivoted)
## # A tibble: 6 x 3
##   Type  Predictor Value
##   <fct> <chr>     <dbl>
## 1 1     RI         1.52
## 2 1     Na        13.6 
## 3 1     Mg         4.49
## 4 1     Al         1.1 
## 5 1     Si        71.8 
## 6 1     K          0.06
# Compute summary stats 

summary.stats <- Glass_pivoted %>%
  group_by(Type) %>%
  get_summary_stats()

ggsummarytable(
  summary.stats, x = "Type", y= c("n", "median", "iqr")
)

# plot type against value using boxplot
ggsummarystats(
  Glass_pivoted, x = "Type", y = "Value",
  ggfunc = ggboxplot, color = "Type"
)

list_unique <- Glass_pivoted %>% pull(Predictor) %>% unique()


lapply(list_unique, function(i){
  ggsummarystats(
    Glass_pivoted %>% filter(Predictor == i),
    x = "Type", y = "Value", ggfunc = ggboxplot, color = "Type",
    title = paste0("Element ", i, " by Glass Type")
  )
})
## (One boxplot panel per element, showing that element's distribution by
## glass type alongside a summary table.)

# RI
# ggsummarystats(
# Glass_pivoted %>% filter(Predictor == list_unique[1]), x = "Type", y = "Value", ggfunc = ggboxplot,  color = "Type", title = paste0("Element ",list_unique[1]," by Glass Type"))

Key Observations

  • Elements Mg and Fe show a considerably bigger spread in values for types 1 - 3 than for types 5 - 7.
  • Element Ba has a much higher median in type 7 than in types 1 - 3, 5, and 6.
  • Element Ca has a clearly higher median in glass type 5 than in the rest.
  • Element Al has a bigger spread and higher median values in types 5 - 7 than in types 1 - 3.

Next, we delve into the correlations among the predictor variables.

corrplot(cor(Glass[,1:9]), method="color",  type="lower", order="hclust", addCoef.col = "black", tl.col="grey", tl.srt=0, diag=FALSE)

The strongest correlation that stands out is between RI, the refractive index, and Ca, calcium. No other pair exceeds 0.55, which tells me there are no other strong correlations among the predictors.
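As a quick check of that reading, caret's findCorrelation() flags predictors involved in pairwise correlations above a cutoff; a minimal sketch (the 0.75 cutoff is my own choice):

# Flag predictors with pairwise correlation above the cutoff;
# with the Glass data only the RI/Ca pair should be strong enough
high_cor <- findCorrelation(cor(Glass[, 1:9]), cutoff = 0.75, names = TRUE)
high_cor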

(b)

Do there appear to be any outliers in the data? Are any predictors skewed?

summary.stats <- Glass_pivoted %>%
  group_by(Predictor) %>%
  get_summary_stats()

ggsummarytable(
  summary.stats, x = "Predictor", y= c("n", "median", "iqr")
)

# Create a boxplot to illustrate outliers

ggsummarystats(
  Glass_pivoted, x = "Predictor", y = "Value",
  ggfunc = ggboxplot, color = "Predictor"
)

# Checking for skewness in all the series of elements

list_unique
## [1] "RI" "Na" "Mg" "Al" "Si" "K"  "Ca" "Ba" "Fe"
lapply(list_unique, function(i){
  moments::skewness(Glass_pivoted %>% filter(Predictor == i) %>% select(Value))
})
## [[1]]
##    Value 
## 1.614015 
## 
## [[2]]
##     Value 
## 0.4509917 
## 
## [[3]]
##     Value 
## -1.144465 
## 
## [[4]]
##     Value 
## 0.9009179 
## 
## [[5]]
##      Value 
## -0.7253173 
## 
## [[6]]
##    Value 
## 6.505636 
## 
## [[7]]
##    Value 
## 2.032677 
## 
## [[8]]
##    Value 
## 3.392431

The boxplot above makes it very clear that K and Ca are quite skewed, and K, Ca, Ba, Si, Al, and Na all show outliers. Turning to the numeric measure of skewness, K (6.51), Ba (3.39), and Ca (2.03) are strongly right-skewed, RI (1.61) is moderately right-skewed, and Mg (-1.14) is left-skewed; Fe also looks right-skewed in its boxplot.
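As an aside, the same check can be written as a single call over the wide data frame, which covers all nine predictors at once; a minimal sketch:

# Skewness of every predictor column in one call
sapply(Glass[, 1:9], moments::skewness)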

(c)

Are there any relevant transformations of one or more predictors that might improve the classification model?

Yes. A Box-Cox or log transformation of the skewed predictors might improve the classification model. On top of that, removing outliers (or damping their influence with a spatial sign transformation) might be the best bet, and centering and scaling is another option that might improve the model's performance.
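A minimal sketch of how these steps could be chained with caret's preProcess(); this particular method list is one reasonable choice, not the only one:

# Estimate Box-Cox lambdas (caret skips predictors that are not strictly
# positive), then center, scale, and apply the spatial sign transformation,
# which damps the influence of outliers
pp <- preProcess(Glass[, 1:9],
                 method = c("BoxCox", "center", "scale", "spatialSign"))
Glass_trans <- predict(pp, Glass[, 1:9])
head(Glass_trans)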

3.2

data(Soybean)

(a)

Investigate the frequency distributions for the categorical predictors. Are any of the distributions degenerate in the ways discussed earlier in this chapter?

str(Soybean)
## 'data.frame':    683 obs. of  36 variables:
##  $ Class          : Factor w/ 19 levels "2-4-d-injury",..: 11 11 11 11 11 11 11 11 11 11 ...
##  $ date           : Factor w/ 7 levels "0","1","2","3",..: 7 5 4 4 7 6 6 5 7 5 ...
##  $ plant.stand    : Ord.factor w/ 2 levels "0"<"1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ precip         : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
##  $ temp           : Ord.factor w/ 3 levels "0"<"1"<"2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ hail           : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 1 1 ...
##  $ crop.hist      : Factor w/ 4 levels "0","1","2","3": 2 3 2 2 3 4 3 2 4 3 ...
##  $ area.dam       : Factor w/ 4 levels "0","1","2","3": 2 1 1 1 1 1 1 1 1 1 ...
##  $ sever          : Factor w/ 3 levels "0","1","2": 2 3 3 3 2 2 2 2 2 3 ...
##  $ seed.tmt       : Factor w/ 3 levels "0","1","2": 1 2 2 1 1 1 2 1 2 1 ...
##  $ germ           : Ord.factor w/ 3 levels "0"<"1"<"2": 1 2 3 2 3 2 1 3 2 3 ...
##  $ plant.growth   : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ leaves         : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ leaf.halo      : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ leaf.marg      : Factor w/ 3 levels "0","1","2": 3 3 3 3 3 3 3 3 3 3 ...
##  $ leaf.size      : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
##  $ leaf.shread    : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ leaf.malf      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ leaf.mild      : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ stem           : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ lodging        : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 2 1 1 1 ...
##  $ stem.cankers   : Factor w/ 4 levels "0","1","2","3": 4 4 4 4 4 4 4 4 4 4 ...
##  $ canker.lesion  : Factor w/ 4 levels "0","1","2","3": 2 2 1 1 2 1 2 2 2 2 ...
##  $ fruiting.bodies: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ ext.decay      : Factor w/ 3 levels "0","1","2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ mycelium       : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ int.discolor   : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ sclerotia      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ fruit.pods     : Factor w/ 4 levels "0","1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
##  $ fruit.spots    : Factor w/ 4 levels "0","1","2","4": 4 4 4 4 4 4 4 4 4 4 ...
##  $ seed           : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ mold.growth    : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ seed.discolor  : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ seed.size      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ shriveling     : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ roots          : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
head(Soybean)
##      Class date plant.stand precip temp hail crop.hist area.dam sever seed.tmt
##      germ plant.growth leaves leaf.halo leaf.marg leaf.size leaf.shread
##      leaf.malf leaf.mild stem lodging stem.cankers canker.lesion
##      fruiting.bodies ext.decay mycelium int.discolor sclerotia fruit.pods
##      fruit.spots seed mold.growth seed.discolor seed.size shriveling roots
##  [ reached 'max' / getOption("max.print") -- omitted 6 rows ]
summary(Soybean)
##                  Class          date     plant.stand  precip      temp    
##    hail     crop.hist  area.dam    sever     seed.tmt     germ     plant.growth
##  leaves  leaf.halo  leaf.marg  leaf.size  leaf.shread leaf.malf  leaf.mild 
##    stem     lodging    stem.cankers canker.lesion fruiting.bodies ext.decay 
##  mycelium   int.discolor sclerotia  fruit.pods fruit.spots   seed    
##  mold.growth seed.discolor seed.size  shriveling  roots    
##  [ reached getOption("max.print") -- omitted 7 rows ]
nearZeroVar(Soybean)
## [1] 19 26 28
names(Soybean)[19]
## [1] "leaf.mild"
names(Soybean)[26]
## [1] "mycelium"
names(Soybean)[28]
## [1] "sclerotia"

Ans: After reviewing the output of the summary function, I do not see any degenerate distributions (i.e., predictors that take essentially a single value) in the Soybean dataset. As a side note, I did investigate predictors with near-zero variance, which contribute little to no information to the model. They are leaf.mild, mycelium, and sclerotia.
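For more detail than the bare column indices, nearZeroVar() can also return the diagnostics behind each flag; a minimal sketch:

# saveMetrics = TRUE returns the frequency-ratio and percent-unique
# statistics used to flag each near-zero-variance predictor
nzv <- nearZeroVar(Soybean, saveMetrics = TRUE)
nzv[nzv$nzv, ]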

(b)

Roughly 18% of the data are missing. Are there particular predictors that are more likely to be missing? Is the pattern of missing data related to the classes?

# Percentage of missing by class ordered by pct_miss in descending order
Soybean %>%
  group_by(Class) %>%
  miss_var_summary() %>%
  arrange(desc(pct_miss)) %>% 
  as.data.frame()
##                         Class    variable n_miss pct_miss
## 1 diaporthe-pod-&-stem-blight        hail     15      100
## 2 diaporthe-pod-&-stem-blight       sever     15      100
## 3 diaporthe-pod-&-stem-blight    seed.tmt     15      100
## 4 diaporthe-pod-&-stem-blight   leaf.halo     15      100
## 5 diaporthe-pod-&-stem-blight   leaf.marg     15      100
## 6 diaporthe-pod-&-stem-blight   leaf.size     15      100
## 7 diaporthe-pod-&-stem-blight leaf.shread     15      100
##  [ reached 'max' / getOption("max.print") -- omitted 658 rows ]
gg_miss_fct(Soybean, Class) + labs(title="% of Missing in Predictor variable by Class",  x ="Class", y="Predictor Variable")

Ans: The table above shows that no single predictor is missing dramatically more often than the others. It also shows that five classes are missing predictor values systematically, pointing at the dataset or the data collection process: 2-4-d-injury, cyst-nematode, diaporthe-pod-&-stem-blight, herbicide-injury, and phytophthora-rot.
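To check the per-predictor side of that claim directly, naniar can also summarize missingness without the class grouping; a minimal sketch:

# Overall missingness per predictor, highest first
Soybean %>%
  miss_var_summary() %>%
  head(10)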

(c)

Develop a strategy for handling missing data, either by eliminating predictors or imputation.

Ans: Imputation seems to be the best strategy for taking care of the missing values for some of these classes. But before we proceed, do we know whether it’s a natural phenomenon of having missing predictors to afore-mentioned classes. Are they just not relevant to be begin with so that the data collection process was skipped intentionally? If the answer to that is a no, we can proceed with imputation. Besides a tree-based approach, I’d use K-nearest neigbor model to impute the missing values for the specific classes seen above because it does make sense that the values in these predictors in the training set confine the range of imputed data and since it’s likely that the classes that we’re dealing with are likely some classes that are highly correlated to some of the other classes that has predictor populated. I would not go the route of using nearZeroVar function to eliminate predictors as I don’t see the results (from part a) applied to our chart above.