library(mlbench)
library(tidyverse)
library(corrplot)
library(reshape2) # for melt
library(fpp3)
data(Glass)
str(Glass)
## 'data.frame': 214 obs. of 10 variables:
## $ RI : num 1.52 1.52 1.52 1.52 1.52 ...
## $ Na : num 13.6 13.9 13.5 13.2 13.3 ...
## $ Mg : num 4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
## $ Al : num 1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
## $ Si : num 71.8 72.7 73 72.6 73.1 ...
## $ K : num 0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
## $ Ca : num 8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
## $ Ba : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Fe : num 0 0 0 0 0 0.26 0 0 0 0.11 ...
## $ Type: Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...
glimpse(Glass)
## Rows: 214
## Columns: 10
## $ RI <dbl> 1.52101, 1.51761, 1.51618, 1.51766, 1.51742, 1.51596, 1.51743, 1.~
## $ Na <dbl> 13.64, 13.89, 13.53, 13.21, 13.27, 12.79, 13.30, 13.15, 14.04, 13~
## $ Mg <dbl> 4.49, 3.60, 3.55, 3.69, 3.62, 3.61, 3.60, 3.61, 3.58, 3.60, 3.46,~
## $ Al <dbl> 1.10, 1.36, 1.54, 1.29, 1.24, 1.62, 1.14, 1.05, 1.37, 1.36, 1.56,~
## $ Si <dbl> 71.78, 72.73, 72.99, 72.61, 73.08, 72.97, 73.09, 73.24, 72.08, 72~
## $ K <dbl> 0.06, 0.48, 0.39, 0.57, 0.55, 0.64, 0.58, 0.57, 0.56, 0.57, 0.67,~
## $ Ca <dbl> 8.75, 7.83, 7.78, 8.22, 8.07, 8.07, 8.17, 8.24, 8.30, 8.40, 8.09,~
## $ Ba <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ Fe <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.26, 0.00, 0.00, 0.00, 0.11, 0.24,~
## $ Type <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,~
The glass Types are 1, 2, 3, 5, 6, and 7; there is no Type 4. Relabel the levels as 1 through 6 so they are consecutive (the original Types 5, 6, and 7 become 4, 5, and 6).
Glass$Type <- factor(Glass$Type, labels = c(1,2,3,4,5,6))
Add unique ID numbers for each observation.
Glass <- cbind(data.frame(Id = as.integer(rownames(Glass))), Glass)
Glass_melt = melt(subset(Glass, select=-c(Type)), id.vars = "Id")
ggplot(Glass_melt, aes(value)) +
  geom_histogram(fill = "navyblue") +
  facet_wrap(~variable, scales = "free") +
  labs(title = "Distributions of Continuous Variables", x = "Value", y = "Count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
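As the message notes, the default of 30 bins is arbitrary; the bin count can be set explicitly. A quick variant (40 bins is just an illustrative choice, not part of the original analysis):
ggplot(Glass_melt, aes(value)) +
  geom_histogram(bins = 40, fill = "navyblue") +  # explicit bin count silences the stat_bin() message
  facet_wrap(~variable, scales = "free")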
There are many zeroes in Mg, K, Ba, and Fe. The RI, Na, Al, Si, and Ca predictors look at best only roughly normal. This raises a question: is there a relationship between glass Type and the presence of zeroes?
unique(Glass$Type)
## [1] 1 2 3 4 5 6
## Levels: 1 2 3 4 5 6
Which glass Types account for these zeroes? Count the zero entries of each predictor within each Type; Types with no zero rows are padded in by hand, and predictors with no zeroes at all get all-zero columns, so every Type and predictor appears in the final table.
zeroes <- data.frame()
ba_zero <- Glass %>%
  group_by(Type) %>%
  filter(Ba == 0) %>%
  summarize(Ba_Zero = n())
mg_zero <- Glass %>%
  group_by(Type) %>%
  filter(Mg == 0) %>%
  summarize(Mg_Zero = n())
mg_zero <- rbind(mg_zero, data.frame(Type = c(1, 3), Mg_Zero = c(0, 0))) %>% arrange(Type)
fe_zero <- Glass %>%
  group_by(Type) %>%
  filter(Fe == 0) %>%
  summarize(Fe_Zero = n())
ri_zero <- data.frame(Type = c(1, 2, 3, 4, 5, 6), RI_Zero = c(0, 0, 0, 0, 0, 0))
na_zero <- data.frame(Type = c(1, 2, 3, 4, 5, 6), Na_Zero = c(0, 0, 0, 0, 0, 0))
al_zero <- data.frame(Type = c(1, 2, 3, 4, 5, 6), Al_Zero = c(0, 0, 0, 0, 0, 0))
si_zero <- data.frame(Type = c(1, 2, 3, 4, 5, 6), Si_Zero = c(0, 0, 0, 0, 0, 0))
k_zero <- Glass %>%
  group_by(Type) %>%
  filter(K == 0) %>%
  summarize(K_Zero = n())
k_zero <- rbind(k_zero, c(Type = 5, K_Zero = 0)) %>% arrange(Type)
ca_zero <- data.frame(Type = c(1, 2, 3, 4, 5, 6), Ca_Zero = c(0, 0, 0, 0, 0, 0))
zeroes <- cbind(ri_zero, na_zero, mg_zero, al_zero, si_zero, k_zero, ca_zero, ba_zero, fe_zero)
zeroes <- zeroes[, c(1, 2, 4, 6, 8, 10, 12, 14, 16, 18)]
head(zeroes)
## Type RI_Zero Na_Zero Mg_Zero Al_Zero Si_Zero K_Zero Ca_Zero Ba_Zero Fe_Zero
## 1 1 0 0 0 0 0 1 0 67 45
## 2 2 0 0 9 0 0 3 0 70 44
## 3 3 0 0 0 0 0 1 0 16 12
## 4 4 0 0 7 0 0 9 0 11 11
## 5 5 0 0 3 0 0 0 0 9 9
## 6 6 0 0 23 0 0 16 0 3 23
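For reference, the same per-Type zero counts can be built more compactly; a sketch (not the original code), assuming dplyr >= 1.0 for across():
Glass %>%
  group_by(Type) %>%
  summarize(across(RI:Fe, ~ sum(.x == 0), .names = "{.col}_Zero"))  # count zeroes per predictor and Type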
Ba and Fe contain the most zeroes.
Many machine learning models assume that the predictors are normally distributed, which is not the case for the Glass data. One model, Naive Bayes, works with discrete predictors, so these continuous values can be binned to discretize them. Naive Bayes also assumes that the predictors are class-conditionally independent, meaning the value of a particular feature is independent of the value of any other feature given the class variable (sciencedirect.com). Rather than using log transformations to force each predictor toward normality, we could simply bin the continuous values.
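As a sketch of that binning idea (not code from the original analysis), each continuous predictor could be cut into up to four quantile-based bins; the number of bins and the quantile breaks are illustrative choices:
Glass_binned <- Glass %>%
  mutate(across(RI:Fe, ~ cut(.x,
                             # unique() guards against duplicate break points in zero-heavy columns
                             breaks = unique(quantile(.x, probs = seq(0, 1, 0.25))),
                             include.lowest = TRUE)))
For heavily zero-inflated predictors such as Ba and Fe this collapses to only one or two bins, so a separate zero/non-zero indicator may be more informative there.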
data(Soybean)
Frequency distributions for the categorical predictors.
Soybean <- cbind(data.frame(Id = as.integer(rownames(Soybean))), Soybean)
soy_melt1 = melt(subset(Soybean, select=c(Id, date:leaves)), id.vars = "Id")
## Warning: attributes are not identical across measure variables; they will be
## dropped
ggplot(soy_melt1, aes(value)) +
  geom_bar(fill = "navyblue") +
  facet_wrap(~variable, scales = "free") +
  labs(title = "Distributions of Categorical Predictors", x = "Level", y = "Count")
soy_melt2 = melt(subset(Soybean, select=c(Id, leaf.halo:ext.decay)), id.vars = "Id")
## Warning: attributes are not identical across measure variables; they will be
## dropped
ggplot(soy_melt2, aes(value)) +
  geom_bar(fill = "navyblue") +
  facet_wrap(~variable, scales = "free") +
  labs(title = "Distributions of Categorical Predictors, cont'd", x = "Level", y = "Count")
soy_melt3 = melt(subset(Soybean, select=c(Id, mycelium:roots)), id.vars = "Id")
## Warning: attributes are not identical across measure variables; they will be
## dropped
ggplot(soy_melt3, aes(value)) +
  geom_bar(fill = "navyblue") +
  facet_wrap(~variable, scales = "free") +
  labs(title = "Distributions of Categorical Predictors, cont'd 2", x = "Level", y = "Count")
There is a lot of imbalance in the category frequencies: the bar heights within each subplot differ widely. If the levels of each predictor were evenly represented, the bars within a subplot would be roughly the same height.
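One way to quantify that imbalance (a sketch, not part of the original analysis) is the share of the most frequent non-missing level within each predictor; values near 1 flag near-zero-variance predictors:
Soybean %>%
  select(-Id, -Class) %>%
  # proportion of non-missing observations that fall in each predictor's modal level
  summarize(across(everything(), ~ max(table(.x)) / sum(table(.x)))) %>%
  pivot_longer(everything(), names_to = "predictor", values_to = "top_level_share") %>%
  arrange(desc(top_level_share))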
aggregate(. ~ Class, data=select(Soybean, -Id), function(x) {sum(is.na(x))}, na.action = NULL)
## Class date plant.stand precip temp hail crop.hist
## 1 2-4-d-injury 1 16 16 16 16 16
## 2 alternarialeaf-spot 0 0 0 0 0 0
## 3 anthracnose 0 0 0 0 0 0
## 4 bacterial-blight 0 0 0 0 0 0
## 5 bacterial-pustule 0 0 0 0 0 0
## 6 brown-spot 0 0 0 0 0 0
## 7 brown-stem-rot 0 0 0 0 0 0
## 8 charcoal-rot 0 0 0 0 0 0
## 9 cyst-nematode 0 14 14 14 14 0
## 10 diaporthe-pod-&-stem-blight 0 6 0 0 15 0
## 11 diaporthe-stem-canker 0 0 0 0 0 0
## 12 downy-mildew 0 0 0 0 0 0
## 13 frog-eye-leaf-spot 0 0 0 0 0 0
## 14 herbicide-injury 0 0 8 0 8 0
## 15 phyllosticta-leaf-spot 0 0 0 0 0 0
## 16 phytophthora-rot 0 0 0 0 68 0
## 17 powdery-mildew 0 0 0 0 0 0
## 18 purple-seed-stain 0 0 0 0 0 0
## 19 rhizoctonia-root-rot 0 0 0 0 0 0
## area.dam sever seed.tmt germ plant.growth leaves leaf.halo leaf.marg
## 1 1 16 16 16 16 0 0 0
## 2 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0
## 7 0 0 0 0 0 0 0 0
## 8 0 0 0 0 0 0 0 0
## 9 0 14 14 14 0 0 14 14
## 10 0 15 15 6 0 0 15 15
## 11 0 0 0 0 0 0 0 0
## 12 0 0 0 0 0 0 0 0
## 13 0 0 0 0 0 0 0 0
## 14 0 8 8 8 0 0 0 0
## 15 0 0 0 0 0 0 0 0
## 16 0 68 68 68 0 0 55 55
## 17 0 0 0 0 0 0 0 0
## 18 0 0 0 0 0 0 0 0
## 19 0 0 0 0 0 0 0 0
## leaf.size leaf.shread leaf.malf leaf.mild stem lodging stem.cankers
## 1 0 16 0 16 16 16 16
## 2 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0
## 7 0 0 0 0 0 0 0
## 8 0 0 0 0 0 0 0
## 9 14 14 14 14 0 14 14
## 10 15 15 15 15 0 15 0
## 11 0 0 0 0 0 0 0
## 12 0 0 0 0 0 0 0
## 13 0 0 0 0 0 0 0
## 14 0 0 0 8 0 8 8
## 15 0 0 0 0 0 0 0
## 16 55 55 55 55 0 68 0
## 17 0 0 0 0 0 0 0
## 18 0 0 0 0 0 0 0
## 19 0 0 0 0 0 0 0
## canker.lesion fruiting.bodies ext.decay mycelium int.discolor sclerotia
## 1 16 16 16 16 16 16
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 0 0 0 0 0 0
## 6 0 0 0 0 0 0
## 7 0 0 0 0 0 0
## 8 0 0 0 0 0 0
## 9 14 14 14 14 14 14
## 10 0 0 0 0 0 0
## 11 0 0 0 0 0 0
## 12 0 0 0 0 0 0
## 13 0 0 0 0 0 0
## 14 8 8 8 8 8 8
## 15 0 0 0 0 0 0
## 16 0 68 0 0 0 0
## 17 0 0 0 0 0 0
## 18 0 0 0 0 0 0
## 19 0 0 0 0 0 0
## fruit.pods fruit.spots seed mold.growth seed.discolor seed.size shriveling
## 1 16 16 16 16 16 16 16
## 2 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0
## 7 0 0 0 0 0 0 0
## 8 0 0 0 0 0 0 0
## 9 0 14 0 0 14 0 14
## 10 0 0 0 0 0 0 0
## 11 0 0 0 0 0 0 0
## 12 0 0 0 0 0 0 0
## 13 0 0 0 0 0 0 0
## 14 0 8 8 8 8 8 8
## 15 0 0 0 0 0 0 0
## 16 68 68 68 68 68 68 68
## 17 0 0 0 0 0 0 0
## 18 0 0 0 0 0 0 0
## 19 0 0 0 0 0 0 0
## roots
## 1 16
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## 7 0
## 8 0
## 9 0
## 10 15
## 11 0
## 12 0
## 13 0
## 14 0
## 15 0
## 16 0
## 17 0
## 18 0
## 19 0
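The wide aggregate() printout is hard to scan; a compact alternative (a sketch, not part of the original analysis) totals the missing cells per class:
# total number of NA cells per class, largest first
sort(tapply(rowSums(is.na(select(Soybean, -Id, -Class))), Soybean$Class, sum),
     decreasing = TRUE)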
phytophthora-rot has far more missing values than any other class. Some classes, such as anthracnose, have no missing values at all.
Some classes are missing so many values across the predictors that they could be dropped altogether; 2-4-d-injury, for example, is missing values for nearly every predictor.
The missing values in the cyst-nematode class, by contrast, could be imputed, since most of the remaining classes have few or no missing values. Because the predictors are categorical, impute with each predictor's most frequent level (the mode) rather than a mean or median.
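A minimal sketch of that imputation (not from the original analysis): fill the NAs in the cyst-nematode rows with each predictor's most frequent level computed over the full data set; the object name Soybean_imp and the whole-data mode are illustrative choices.
mode_level <- function(x) names(which.max(table(x)))  # table() ignores NAs by default
Soybean_imp <- Soybean
cyst <- !is.na(Soybean_imp$Class) & Soybean_imp$Class == "cyst-nematode"
for (col in setdiff(names(Soybean_imp), c("Id", "Class"))) {
  fill <- mode_level(Soybean_imp[[col]])          # most common level over all classes
  miss <- cyst & is.na(Soybean_imp[[col]])        # missing cells in cyst-nematode rows only
  Soybean_imp[[col]][miss] <- fill
}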