<- c("caret", "corrplot", "AppliedPredictiveModeling", "e1071",
libraries "patchwork", "MASS", "ggplot2", "mlbench", "dplyr", "tidyr")
# Loop through the list and load each library
for(lib in libraries) {
library(lib, character.only = TRUE)
}
data(Soybean)
data(Glass)
Kuhn & Johnson Chapter 3 Homework
3.1 Glass
The UC Irvine Machine Learning Repository contains a data set related to glass identification. The data consist of 214 glass samples labeled as one of seven class categories. There are nine predictors, including the refractive index and percentages of eight elements: Na, Mg, Al, Si, K, Ca, Ba, and Fe.
3.1.A Predictor distributions and relationships
Using visualizations, explore the predictor variables to understand their distributions as well as the relationships between predictors.
There is a lot to see in these distribution charts. First, there are some strong correlations between predictors. Second, the histograms show more than half of the variables with skew or outliers, which might require transformation before modeling.
plot_Si <- ggplot(Glass, aes(x=Si)) + geom_histogram(bins=30) + ggtitle("Si")
plot_Na <- ggplot(Glass, aes(x=Na)) + geom_histogram(bins=30) + ggtitle("Na")
plot_Al <- ggplot(Glass, aes(x=Al)) + geom_histogram(bins=30) + ggtitle("Al")
plot_Ba <- ggplot(Glass, aes(x=Ba)) + geom_histogram(bins=30) + ggtitle("Ba")
plot_RI <- ggplot(Glass, aes(x=RI)) + geom_histogram(bins=30) + ggtitle("RI")
plot_Fe <- ggplot(Glass, aes(x=Fe)) + geom_histogram(bins=30) + ggtitle("Fe")
plot_Mg <- ggplot(Glass, aes(x=Mg)) + geom_histogram(bins=30) + ggtitle("Mg")
plot_K <- ggplot(Glass, aes(x=K)) + geom_histogram(bins=30) + ggtitle("K")
plot_Ca <- ggplot(Glass, aes(x=Ca)) + geom_histogram(bins=30) + ggtitle("Ca")
## Ca is removed once the highly correlated predictors are filtered out.
correlations <- cor(Glass[, sapply(Glass, is.numeric)])
highCorr <- findCorrelation(correlations, cutoff = .75)
filteredData <- Glass[, -highCorr]
filteredCorrelations <- cor(filteredData[, sapply(filteredData, is.numeric)])
corrplot(filteredCorrelations, order = "hclust")
mtext("Glass Variable Correlation Chart", side = 2, line = 2)

(plot_Si | plot_Na | plot_Al) /
  (plot_Ba | plot_RI | plot_Fe) /
  (plot_Mg | plot_K | plot_Ca)
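To see numerically which pairs drive the findCorrelation() filtering above, a quick supplementary sketch (not part of the original write-up) lists the pairs whose absolute correlation exceeds the 0.75 cutoff:

# Predictor pairs with |correlation| above the 0.75 cutoff used by findCorrelation()
cor_mat <- cor(Glass[, sapply(Glass, is.numeric)])
cor_pairs <- as.data.frame(as.table(cor_mat))   # long format: Var1, Var2, Freq
cor_pairs <- subset(cor_pairs, as.character(Var1) < as.character(Var2) & abs(Freq) > 0.75)
cor_pairs[order(-abs(cor_pairs$Freq)), ]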
3.1.B Outliers and Skew
Do there appear to be any outliers in the data? Are any predictors skewed?
Yes to both. The box plots and distribution plots both show outliers and skew. Ba and Fe are positively skewed, suggesting a transformation might normalize the data and improve model performance. Si and Na seem fine.
plot_Si <- ggplot(Glass, aes(x=Si)) + geom_histogram(bins=30) + ggtitle("Si")
plot_Na <- ggplot(Glass, aes(x=Na)) + geom_histogram(bins=30) + ggtitle("Na")
plot_Al <- ggplot(Glass, aes(x=Al)) + geom_histogram(bins=30) + ggtitle("Al")
plot_Ba <- ggplot(Glass, aes(x=Ba)) + geom_histogram(bins=30) + ggtitle("Ba")
plot_RI <- ggplot(Glass, aes(x=RI)) + geom_histogram(bins=30) + ggtitle("RI")
plot_Fe <- ggplot(Glass, aes(x=Fe)) + geom_histogram(bins=30) + ggtitle("Fe")
plot_Mg <- ggplot(Glass, aes(x=Mg)) + geom_histogram(bins=30) + ggtitle("Mg")
plot_K <- ggplot(Glass, aes(x=K)) + geom_histogram(bins=30) + ggtitle("K")
boxplot(Glass[, sapply(Glass, is.numeric)],
main = "Boxplot for All Predictor Variables",
las = 2,
col = rainbow(ncol(Glass[, sapply(Glass, is.numeric)]))
)
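To put numbers behind the visual impression, e1071's skewness() (already loaded above) can score each predictor; a quick sketch, where large positive values confirm the right skew in Ba and Fe:

# Skewness of each numeric predictor (values near 0 indicate roughly symmetric distributions)
sapply(Glass[, sapply(Glass, is.numeric)], skewness)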
3.1.C Transformations
Are there any relevant transformations of one or more predictors that might improve the classification model?
Yes, a Box-Cox transformation would work well here. It might look something like this.
head(Glass)
RI Na Mg Al Si K Ca Ba Fe Type
1 1.52101 13.64 4.49 1.10 71.78 0.06 8.75 0 0.00 1
2 1.51761 13.89 3.60 1.36 72.73 0.48 7.83 0 0.00 1
3 1.51618 13.53 3.55 1.54 72.99 0.39 7.78 0 0.00 1
4 1.51766 13.21 3.69 1.29 72.61 0.57 8.22 0 0.00 1
5 1.51742 13.27 3.62 1.24 73.08 0.55 8.07 0 0.00 1
6 1.51596 12.79 3.61 1.62 72.97 0.64 8.07 0 0.26 1
numeric_predictors <- sapply(Glass[, -ncol(Glass)], is.numeric)
for(i in which(numeric_predictors)) {
  skewness_value <- skewness(Glass[[i]])
  skewness_threshold <- 1
  if(abs(skewness_value) > skewness_threshold) {
    # Find the best lambda for Box-Cox transformation
    bc_transform <- BoxCoxTrans(Glass[[i]])
    # Apply the transformation
    Glass[[i]] <- predict(bc_transform, Glass[[i]])
  }
}
head(Glass)
RI Na Mg Al Si K Ca Ba Fe Type
1 0.2838746 13.64 4.49 1.10 71.78 0.06 0.8254539 0 0.00 1
2 0.2829051 13.89 3.60 1.36 72.73 0.48 0.8145827 0 0.00 1
3 0.2824954 13.53 3.55 1.54 72.99 0.39 0.8139144 0 0.00 1
4 0.2829194 13.21 3.69 1.29 72.61 0.57 0.8195032 0 0.00 1
5 0.2828507 13.27 3.62 1.24 73.08 0.55 0.8176698 0 0.00 1
6 0.2824323 12.79 3.61 1.62 72.97 0.64 0.8176698 0 0.26 1
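An equivalent route is to let caret estimate the transformations for all predictors at once with preProcess(); a sketch, assuming you reload the untransformed Glass data first:

data(Glass)                                           # reload the original, untransformed data
bc_pp <- preProcess(Glass[, -10], method = "BoxCox")  # column 10 is the Type outcome
Glass_bc <- predict(bc_pp, Glass[, -10])              # predictors containing zeros are left unchanged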
3.2 Soybeans
The soybean data can also be found at the UC Irvine Machine Learning Repository. Data were collected to predict disease in 683 soybeans. The 35 predictors are mostly categorical and include information on the environmental conditions (e.g., temperature, precipitation) and plant conditions (e.g., leaf spots, mold growth). The outcome labels consist of 19 distinct classes.
3.2.A Degenerates
Investigate the frequency distributions for the categorical predictors. Are any of the distributions degenerate in the ways discussed earlier in this chapter?
Yes, there are degenerate distributions. To be considered degenerate, either (1) a single category dominates, (2) most categories occur with very low frequency, or (3) there is perfect separation. In this case, most of the variables are spiked at zero, falling into scenario 2.
Let's look at them all so you can see for yourself.
plot_list <- list()

for (var in names(Soybean)) {
  if (is.factor(Soybean[[var]])) {
    p <- ggplot(Soybean, aes_string(x = var)) +
      geom_bar() +
      theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
      labs(x = var, y = 'Count') + # Set axis labels
      ggtitle(var)
    plot_list[[var]] <- p
  }
}

plot_grid <- wrap_plots(plot_list, ncol = 4)
plot_grid
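If you would rather not eyeball three dozen bar charts, caret's nearZeroVar() flags the near-degenerate candidates directly; a quick supplementary check, not part of the original workflow:

# Predictors dominated by a single level and with few distinct values
nzv <- nearZeroVar(Soybean, saveMetrics = TRUE)
nzv[nzv$nzv, ]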
3.2.B Review Missing Data
Roughly 18% of the data are missing. Are there particular predictors that are more likely to be missing? Is the pattern of missing data related to the classes?
There is a definite pattern to the missing data. Some classes are nearly complete, while others have almost no data at all. Let's place the two views side by side to see.
# first let's consider the whole dataset.
missing_info <- Soybean %>%
  summarise(across(everything(), ~sum(is.na(.)))) %>%
  pivot_longer(cols = everything(), names_to = "column_name", values_to = "missing") %>%
  mutate(missing_percentage = (missing / nrow(Soybean)) * 100) %>%
  arrange(desc(missing_percentage))

# now let's look by class.
missing_by_class <- Soybean %>%
  group_by(Class) %>%
  summarise_all(~ mean(is.na(.))) %>%
  pivot_longer(cols = -Class, names_to = "column_name", values_to = "Missing_Percentage") %>%
  group_by(Class) %>%
  summarise(Total_Missing_Percentage = mean(Missing_Percentage))

by_class_missing <- ggplot(missing_by_class, aes(x = reorder(Class, -Total_Missing_Percentage), y = Total_Missing_Percentage)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(x = "Class", y = "Missing Data Percentage", title = "Missing Data Percentage by Class")

total_missing <- ggplot(missing_info, aes(x = reorder(column_name, -missing_percentage), y = missing_percentage)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(x = "Column Name", y = "Missing Percentage", title = "Missing Data Percentage by Column")

(total_missing | by_class_missing)
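As a sanity check, the "roughly 18%" figure in the prompt lines up with the proportion of incomplete samples (rows with at least one NA) rather than the proportion of missing cells; one line confirms it:

# Proportion of rows with at least one missing value
mean(!complete.cases(Soybean))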
3.2.C Imputation
Develop a strategy for handling missing data, either by eliminating predictors or imputation.
Perhaps the best approach is imputation. For somebody new to this, a simple strategy such as the per-class mean is a good start, especially since the missingness is so clearly tied to class. Let's try that out with code.
Please note it took about three hours to write this code. Simple, yes, but not easy.
Soybean_imputed <- Soybean %>%
  mutate(across(-Class, ~as.numeric(as.character(.)))) %>%
  group_by(Class) %>%
  mutate(across(everything(), ~if_else(is.na(.), mean(., na.rm = TRUE), .))) %>%
  ungroup() %>%
  mutate(across(-Class, ~factor(., ordered = TRUE))) # Convert the numeric columns back to ordinal factors
print("Before imputation")
[1] "Before imputation"
print(sapply(Soybean, function(x) sum(is.na(x))))
Class date plant.stand precip temp
0 1 36 38 30
hail crop.hist area.dam sever seed.tmt
121 16 1 121 121
germ plant.growth leaves leaf.halo leaf.marg
112 16 0 84 84
leaf.size leaf.shread leaf.malf leaf.mild stem
84 100 84 108 16
lodging stem.cankers canker.lesion fruiting.bodies ext.decay
121 38 38 106 38
mycelium int.discolor sclerotia fruit.pods fruit.spots
38 38 38 84 106
seed mold.growth seed.discolor seed.size shriveling
92 92 106 92 106
roots
31
print("After imputation")
[1] "After imputation"
print(sapply(Soybean_imputed, function(x) sum(is.na(x))))
Class date plant.stand precip temp
0 0 0 0 0
hail crop.hist area.dam sever seed.tmt
0 0 0 0 0
germ plant.growth leaves leaf.halo leaf.marg
0 0 0 0 0
leaf.size leaf.shread leaf.malf leaf.mild stem
0 0 0 0 0
lodging stem.cankers canker.lesion fruiting.bodies ext.decay
0 0 0 0 0
mycelium int.discolor sclerotia fruit.pods fruit.spots
0 0 0 0 0
seed mold.growth seed.discolor seed.size shriveling
0 0 0 0 0
roots
0
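Since every predictor here is categorical, a natural refinement (sketched below; impute_mode is a hypothetical helper, not part of the workflow above) is to fill in the most frequent level within each class rather than a numeric mean, which avoids manufacturing fractional factor levels:

# Hypothetical helper: replace NAs in a factor with its most frequent non-NA level
impute_mode <- function(x) {
  if (all(is.na(x))) return(x)               # class/predictor combinations that are entirely NA are left as-is
  x[is.na(x)] <- names(which.max(table(x)))  # table() ignores NAs by default
  x
}

Soybean_mode_imputed <- Soybean %>%
  group_by(Class) %>%
  mutate(across(everything(), impute_mode)) %>%  # grouping column Class is excluded automatically
  ungroup()

Any class/predictor combination that is entirely missing would stay NA under this scheme and would still need to be handled, for example by dropping the predictor or falling back to the overall mode.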