The goal of this analysis is to understand the characteristics of different glass types and build a classification model that can accurately predict the type of glass based on its chemical composition. We will:

- Explore the data through visualizations to detect patterns.
- Identify outliers and skewness that may affect our model.
- Apply transformations and feature selection to improve accuracy.
- Train a classification model to predict glass types.
library(AppliedPredictiveModeling)
library(mlbench)
library(ggplot2)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(corrplot)
## corrplot 0.95 loaded
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(e1071)
library(caret)
## Loading required package: lattice
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
We use the `mlbench` package to access the `Glass` dataset, which contains 9 numeric predictors (the refractive index `RI` plus eight chemical-composition measurements) and a glass `Type` classification with six classes.
# Load the Glass data set that ships with the mlbench package
data(Glass, package = "mlbench")
# Compact overview of the data frame: dimensions, column types, leading values
str(Glass)
## 'data.frame': 214 obs. of 10 variables:
## $ RI : num 1.52 1.52 1.52 1.52 1.52 ...
## $ Na : num 13.6 13.9 13.5 13.2 13.3 ...
## $ Mg : num 4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
## $ Al : num 1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
## $ Si : num 71.8 72.7 73 72.6 73.1 ...
## $ K : num 0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
## $ Ca : num 8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
## $ Ba : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Fe : num 0 0 0 0 0 0.26 0 0 0 0.11 ...
## $ Type: Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...
# Per-column summaries: quartiles and mean for the numeric columns,
# level counts for the Type factor
summary(Glass) # Get summary statistics
## RI Na Mg Al
## Min. :1.511 Min. :10.73 Min. :0.000 Min. :0.290
## 1st Qu.:1.517 1st Qu.:12.91 1st Qu.:2.115 1st Qu.:1.190
## Median :1.518 Median :13.30 Median :3.480 Median :1.360
## Mean :1.518 Mean :13.41 Mean :2.685 Mean :1.445
## 3rd Qu.:1.519 3rd Qu.:13.82 3rd Qu.:3.600 3rd Qu.:1.630
## Max. :1.534 Max. :17.38 Max. :4.490 Max. :3.500
## Si K Ca Ba
## Min. :69.81 Min. :0.0000 Min. : 5.430 Min. :0.000
## 1st Qu.:72.28 1st Qu.:0.1225 1st Qu.: 8.240 1st Qu.:0.000
## Median :72.79 Median :0.5550 Median : 8.600 Median :0.000
## Mean :72.65 Mean :0.4971 Mean : 8.957 Mean :0.175
## 3rd Qu.:73.09 3rd Qu.:0.6100 3rd Qu.: 9.172 3rd Qu.:0.000
## Max. :75.41 Max. :6.2100 Max. :16.190 Max. :3.150
## Fe Type
## Min. :0.00000 1:70
## 1st Qu.:0.00000 2:76
## Median :0.00000 3:17
## Mean :0.05701 5:13
## 3rd Qu.:0.10000 6: 9
## Max. :0.51000 7:29
# Preview the first six rows to sanity-check the raw values
head(Glass) # View first few rows
## RI Na Mg Al Si K Ca Ba Fe Type
## 1 1.52101 13.64 4.49 1.10 71.78 0.06 8.75 0 0.00 1
## 2 1.51761 13.89 3.60 1.36 72.73 0.48 7.83 0 0.00 1
## 3 1.51618 13.53 3.55 1.54 72.99 0.39 7.78 0 0.00 1
## 4 1.51766 13.21 3.69 1.29 72.61 0.57 8.22 0 0.00 1
## 5 1.51742 13.27 3.62 1.24 73.08 0.55 8.07 0 0.00 1
## 6 1.51596 12.79 3.61 1.62 72.97 0.64 8.07 0 0.26 1
To build a good model, we must understand the distribution of each feature. If a feature is highly skewed or contains extreme values, it may require transformation.
# Histogram of RI drawn on the density scale (after_stat(density)) so that the
# kernel density curve layered on top shares the same y-axis.
ggplot(data = Glass, mapping = aes(x = RI)) +
  geom_histogram(
    mapping = aes(y = after_stat(density)),
    bins = 30,
    fill = "blue",
    alpha = 0.5
  ) +
  geom_density(colour = "red", linewidth = 1) +
  ggtitle("Distribution of Refractive Index (RI)") +
  theme_minimal()
# Pairwise correlations between the predictors. The Type factor is dropped by
# name because cor() only accepts numeric columns.
cor_matrix <- cor(Glass[, names(Glass) != "Type"])

# Upper-triangle heatmap with the coefficient printed in every cell
corrplot(
  cor_matrix,
  method = "color",
  type = "upper",
  tl.col = "black",
  tl.srt = 45,
  addCoef.col = "black"
)
# Pairwise plot matrix of the nine predictors; column 10 (Type) is left out
ggpairs(Glass[, -10]) # Excluding Type column
# One box plot of Na over all samples. The constant "" on the x-axis is only a
# placeholder so that a single box is drawn.
ggplot(data = Glass, mapping = aes(x = "", y = Na)) +
  geom_boxplot(fill = "lightblue", colour = "black") +
  ggtitle("Boxplot of Sodium (Na)") +
  theme_minimal()
# Skewness of every predictor (column 10, Type, is excluded).
# vapply() pins the result to exactly one double per column, so an unexpected
# return shape fails loudly instead of silently changing the result type the
# way sapply() can.
skewness_values <- vapply(Glass[, -10], skewness, numeric(1))
skewness_values
## RI Na Mg Al Si K Ca
## 1.6027151 0.4478343 -1.1364523 0.8946104 -0.7202392 6.4600889 2.0184463
## Ba Fe
## 3.3686800 1.7298107
If a feature is highly skewed, it may need transformation to make it more useful for classification.
# Append log- and square-root-transformed copies of Na, Mg and K as new
# columns; the original columns are left unchanged.
Glass <- transform(
  Glass,
  Na_log = log(Na),
  Mg_log = log(Mg + 1), # +1 avoids log(0): Mg contains zeros
  K_log = log(K + 1), # same guard: K contains zeros
  Na_sqrt = sqrt(Na),
  Mg_sqrt = sqrt(Mg),
  K_sqrt = sqrt(K)
)
# Fix the RNG seed so the split (and any later step that draws random numbers)
# is reproducible.
set.seed(52086)

# NOTE(review): Glass_selected is not defined anywhere in the visible part of
# this document -- confirm that the feature-selection step creating it runs
# before this point.
# 80/20 split on the outcome; list = FALSE returns the training row indices
# as a matrix rather than a list.
index <- createDataPartition(y = Glass_selected$Type, p = 0.8, list = FALSE)
train_data <- Glass_selected[index, ]
test_data <- Glass_selected[-index, ]
# Random forest predicting Type from every other column of train_data;
# importance = TRUE additionally stores variable-importance measures.
rf_model <- randomForest(
  Type ~ .,
  data = train_data,
  importance = TRUE
)
# Show the call, the OOB error estimate and the OOB confusion matrix
print(rf_model)
##
## Call:
## randomForest(formula = Type ~ ., data = train_data, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 21.84%
## Confusion matrix:
## 1 2 3 5 6 7 class.error
## 1 49 6 1 0 0 0 0.1250000
## 2 7 47 2 3 2 0 0.2295082
## 3 5 3 6 0 0 0 0.5714286
## 5 0 3 0 7 0 1 0.3636364
## 6 0 1 0 0 7 0 0.1250000
## 7 0 3 0 1 0 20 0.1666667
# Predict the class of every held-out sample
predictions <- predict(rf_model, newdata = test_data)
# Cross-tabulate predicted against actual classes and derive the statistics
conf_matrix <- confusionMatrix(
  data = predictions,
  reference = test_data$Type
)
print(conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 5 6 7
## 1 13 3 2 0 0 1
## 2 1 11 1 2 0 1
## 3 0 0 0 0 0 0
## 5 0 0 0 0 0 0
## 6 0 0 0 0 1 0
## 7 0 1 0 0 0 3
##
## Overall Statistics
##
## Accuracy : 0.7
## 95% CI : (0.5347, 0.8344)
## No Information Rate : 0.375
## P-Value [Acc > NIR] : 3.088e-05
##
## Kappa : 0.5527
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 5 Class: 6 Class: 7
## Sensitivity 0.9286 0.7333 0.000 0.00 1.000 0.6000
## Specificity 0.7692 0.8000 1.000 1.00 1.000 0.9714
## Pos Pred Value 0.6842 0.6875 NaN NaN 1.000 0.7500
## Neg Pred Value 0.9524 0.8333 0.925 0.95 1.000 0.9444
## Prevalence 0.3500 0.3750 0.075 0.05 0.025 0.1250
## Detection Rate 0.3250 0.2750 0.000 0.00 0.025 0.0750
## Detection Prevalence 0.4750 0.4000 0.000 0.00 0.025 0.1000
## Balanced Accuracy 0.8489 0.7667 0.500 0.50 1.000 0.7857
library(mlbench)
# Load the Soybean data set that ships with the mlbench package
data(Soybean, package = "mlbench")
# Inspect column types and factor levels
str(Soybean)
## 'data.frame': 683 obs. of 36 variables:
## $ Class : Factor w/ 19 levels "2-4-d-injury",..: 11 11 11 11 11 11 11 11 11 11 ...
## $ date : Factor w/ 7 levels "0","1","2","3",..: 7 5 4 4 7 6 6 5 7 5 ...
## $ plant.stand : Ord.factor w/ 2 levels "0"<"1": 1 1 1 1 1 1 1 1 1 1 ...
## $ precip : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
## $ temp : Ord.factor w/ 3 levels "0"<"1"<"2": 2 2 2 2 2 2 2 2 2 2 ...
## $ hail : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 1 1 ...
## $ crop.hist : Factor w/ 4 levels "0","1","2","3": 2 3 2 2 3 4 3 2 4 3 ...
## $ area.dam : Factor w/ 4 levels "0","1","2","3": 2 1 1 1 1 1 1 1 1 1 ...
## $ sever : Factor w/ 3 levels "0","1","2": 2 3 3 3 2 2 2 2 2 3 ...
## $ seed.tmt : Factor w/ 3 levels "0","1","2": 1 2 2 1 1 1 2 1 2 1 ...
## $ germ : Ord.factor w/ 3 levels "0"<"1"<"2": 1 2 3 2 3 2 1 3 2 3 ...
## $ plant.growth : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ leaves : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ leaf.halo : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ leaf.marg : Factor w/ 3 levels "0","1","2": 3 3 3 3 3 3 3 3 3 3 ...
## $ leaf.size : Ord.factor w/ 3 levels "0"<"1"<"2": 3 3 3 3 3 3 3 3 3 3 ...
## $ leaf.shread : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ leaf.malf : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ leaf.mild : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ stem : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ lodging : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 2 1 1 1 ...
## $ stem.cankers : Factor w/ 4 levels "0","1","2","3": 4 4 4 4 4 4 4 4 4 4 ...
## $ canker.lesion : Factor w/ 4 levels "0","1","2","3": 2 2 1 1 2 1 2 2 2 2 ...
## $ fruiting.bodies: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ ext.decay : Factor w/ 3 levels "0","1","2": 2 2 2 2 2 2 2 2 2 2 ...
## $ mycelium : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ int.discolor : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ sclerotia : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ fruit.pods : Factor w/ 4 levels "0","1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
## $ fruit.spots : Factor w/ 4 levels "0","1","2","4": 4 4 4 4 4 4 4 4 4 4 ...
## $ seed : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ mold.growth : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ seed.discolor : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ seed.size : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ shriveling : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ roots : Factor w/ 3 levels "0","1","2": 1 1 1 1 1 1 1 1 1 1 ...
# Names of all factor columns (ordered factors included, since is.factor() is
# TRUE for them). vapply() guarantees a logical vector regardless of input,
# where sapply() could return a list for an empty data frame.
categorical_vars <- names(Soybean)[vapply(Soybean, is.factor, logical(1))]
# Level counts (and NA counts) per categorical column; drop = FALSE keeps a
# data frame even if only a single column qualifies.
summary(Soybean[, categorical_vars, drop = FALSE])
## Class date plant.stand precip temp
## brown-spot : 92 5 :149 0 :354 0 : 74 0 : 80
## alternarialeaf-spot: 91 4 :131 1 :293 1 :112 1 :374
## frog-eye-leaf-spot : 91 3 :118 NA's: 36 2 :459 2 :199
## phytophthora-rot : 88 2 : 93 NA's: 38 NA's: 30
## anthracnose : 44 6 : 90
## brown-stem-rot : 44 (Other):101
## (Other) :233 NA's : 1
## hail crop.hist area.dam sever seed.tmt germ plant.growth
## 0 :435 0 : 65 0 :123 0 :195 0 :305 0 :165 0 :441
## 1 :127 1 :165 1 :227 1 :322 1 :222 1 :213 1 :226
## NA's:121 2 :219 2 :145 2 : 45 2 : 35 2 :193 NA's: 16
## 3 :218 3 :187 NA's:121 NA's:121 NA's:112
## NA's: 16 NA's: 1
##
##
## leaves leaf.halo leaf.marg leaf.size leaf.shread leaf.malf leaf.mild
## 0: 77 0 :221 0 :357 0 : 51 0 :487 0 :554 0 :535
## 1:606 1 : 36 1 : 21 1 :327 1 : 96 1 : 45 1 : 20
## 2 :342 2 :221 2 :221 NA's:100 NA's: 84 2 : 20
## NA's: 84 NA's: 84 NA's: 84 NA's:108
##
##
##
## stem lodging stem.cankers canker.lesion fruiting.bodies ext.decay
## 0 :296 0 :520 0 :379 0 :320 0 :473 0 :497
## 1 :371 1 : 42 1 : 39 1 : 83 1 :104 1 :135
## NA's: 16 NA's:121 2 : 36 2 :177 NA's:106 2 : 13
## 3 :191 3 : 65 NA's: 38
## NA's: 38 NA's: 38
##
##
## mycelium int.discolor sclerotia fruit.pods fruit.spots seed
## 0 :639 0 :581 0 :625 0 :407 0 :345 0 :476
## 1 : 6 1 : 44 1 : 20 1 :130 1 : 75 1 :115
## NA's: 38 2 : 20 NA's: 38 2 : 14 2 : 57 NA's: 92
## NA's: 38 3 : 48 4 :100
## NA's: 84 NA's:106
##
##
## mold.growth seed.discolor seed.size shriveling roots
## 0 :524 0 :513 0 :532 0 :539 0 :551
## 1 : 67 1 : 64 1 : 59 1 : 38 1 : 86
## NA's: 92 NA's:106 NA's: 92 NA's:106 2 : 15
## NA's: 31
##
##
##
# Number of missing cells in each column
missing_counts <- colSums(is.na(Soybean))
# The same counts expressed as a percentage of all rows
missing_percentage <- missing_counts / nrow(Soybean) * 100
# One row per variable, sorted from most to least missing
missing_df <- data.frame(
  Variable = names(missing_counts),
  Missing_Percentage = missing_percentage
)
missing_df <- missing_df[order(-missing_df[["Missing_Percentage"]]), ]
missing_df
## Variable Missing_Percentage
## hail hail 17.7159590
## sever sever 17.7159590
## seed.tmt seed.tmt 17.7159590
## lodging lodging 17.7159590
## germ germ 16.3982430
## leaf.mild leaf.mild 15.8125915
## fruiting.bodies fruiting.bodies 15.5197657
## fruit.spots fruit.spots 15.5197657
## seed.discolor seed.discolor 15.5197657
## shriveling shriveling 15.5197657
## leaf.shread leaf.shread 14.6412884
## seed seed 13.4699854
## mold.growth mold.growth 13.4699854
## seed.size seed.size 13.4699854
## leaf.halo leaf.halo 12.2986823
## leaf.marg leaf.marg 12.2986823
## leaf.size leaf.size 12.2986823
## leaf.malf leaf.malf 12.2986823
## fruit.pods fruit.pods 12.2986823
## precip precip 5.5636896
## stem.cankers stem.cankers 5.5636896
## canker.lesion canker.lesion 5.5636896
## ext.decay ext.decay 5.5636896
## mycelium mycelium 5.5636896
## int.discolor int.discolor 5.5636896
## sclerotia sclerotia 5.5636896
## plant.stand plant.stand 5.2708638
## roots roots 4.5387994
## temp temp 4.3923865
## crop.hist crop.hist 2.3426061
## plant.growth plant.growth 2.3426061
## stem stem 2.3426061
## date date 0.1464129
## area.dam area.dam 0.1464129
## Class Class 0.0000000
## leaves leaves 0.0000000
# Mode imputation: replace the NAs of every categorical column with that
# column's most frequent observed level.
for (var in categorical_vars) {
  # table() leaves NA out by default, so only observed levels are counted
  level_counts <- sort(table(Soybean[[var]]), decreasing = TRUE)
  mode_value <- names(level_counts)[1]
  is_missing <- is.na(Soybean[[var]])
  Soybean[[var]][is_missing] <- mode_value
}
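This analysis shows how we explored, transformed, and modeled the Glass dataset to predict glass types, and how we explored the Soybean dataset and cleaned it by imputing missing values with each variable's most frequent level. 🚀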