library(mlbench)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the Glass data set into the workspace (presumably shipped with
# mlbench, which is attached at the top of the script).
data(Glass)
# Compact structure overview: column names, types and the first values.
str(Glass)
## 'data.frame': 214 obs. of 10 variables:
## $ RI : num 1.52 1.52 1.52 1.52 1.52 ...
## $ Na : num 13.6 13.9 13.5 13.2 13.3 ...
## $ Mg : num 4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
## $ Al : num 1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
## $ Si : num 71.8 72.7 73 72.6 73.1 ...
## $ K : num 0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
## $ Ca : num 8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
## $ Ba : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Fe : num 0 0 0 0 0 0.26 0 0 0 0.11 ...
## $ Type: Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...
# Number of rows and columns.
dim(Glass)
## [1] 214 10
# First six observations, to eyeball the raw values.
head(Glass)
## RI Na Mg Al Si K Ca Ba Fe Type
## 1 1.52101 13.64 4.49 1.10 71.78 0.06 8.75 0 0.00 1
## 2 1.51761 13.89 3.60 1.36 72.73 0.48 7.83 0 0.00 1
## 3 1.51618 13.53 3.55 1.54 72.99 0.39 7.78 0 0.00 1
## 4 1.51766 13.21 3.69 1.29 72.61 0.57 8.22 0 0.00 1
## 5 1.51742 13.27 3.62 1.24 73.08 0.55 8.07 0 0.00 1
## 6 1.51596 12.79 3.61 1.62 72.97 0.64 8.07 0 0.26 1
# Per-column summary: quantiles and mean for the numeric columns, level
# counts for the factor Type.
summary(Glass)
## RI Na Mg Al
## Min. :1.511 Min. :10.73 Min. :0.000 Min. :0.290
## 1st Qu.:1.517 1st Qu.:12.91 1st Qu.:2.115 1st Qu.:1.190
## Median :1.518 Median :13.30 Median :3.480 Median :1.360
## Mean :1.518 Mean :13.41 Mean :2.685 Mean :1.445
## 3rd Qu.:1.519 3rd Qu.:13.82 3rd Qu.:3.600 3rd Qu.:1.630
## Max. :1.534 Max. :17.38 Max. :4.490 Max. :3.500
## Si K Ca Ba
## Min. :69.81 Min. :0.0000 Min. : 5.430 Min. :0.000
## 1st Qu.:72.28 1st Qu.:0.1225 1st Qu.: 8.240 1st Qu.:0.000
## Median :72.79 Median :0.5550 Median : 8.600 Median :0.000
## Mean :72.65 Mean :0.4971 Mean : 8.957 Mean :0.175
## 3rd Qu.:73.09 3rd Qu.:0.6100 3rd Qu.: 9.172 3rd Qu.:0.000
## Max. :75.41 Max. :6.2100 Max. :16.190 Max. :3.150
## Fe Type
## Min. :0.00000 1:70
## 1st Qu.:0.00000 2:76
## Median :0.00000 3:17
## Mean :0.05701 5:13
## 3rd Qu.:0.10000 6: 9
## Max. :0.51000 7:29
# Missing-value check: column-wise sum of the NA indicator matrix.
colSums(is.na(Glass))
## RI Na Mg Al Si K Ca Ba Fe Type
## 0 0 0 0 0 0 0 0 0 0
# Class balance of the response: absolute counts first, then the same
# table expressed as proportions.
type_counts <- table(Glass[["Type"]])
type_counts
##
## 1 2 3 5 6 7
## 70 76 17 13 9 29
prop.table(type_counts)
##
## 1 2 3 5 6 7
## 0.32710280 0.35514019 0.07943925 0.06074766 0.04205607 0.13551402
# Bar chart of how many observations fall into each glass type.
ggplot(data = Glass, mapping = aes(x = Type)) +
  geom_bar() +
  theme_minimal() +
  labs(title = "Distribution of Glass Types") +
  xlab("Glass Type") +
  ylab("Count")
# Names of all numeric columns in Glass. vapply() always returns a logical
# vector, whereas sapply() changes its return type on empty input.
num_vars <- names(Glass)[vapply(Glass, is.numeric, logical(1))]

# Draw one 20-bin histogram per numeric column.
for (v in num_vars) {
  # `.data[[v]]` looks the column up by its string name. It replaces
  # aes_string(), which is deprecated as of ggplot2 3.0.0 and emitted a
  # lifecycle warning here.
  p <- ggplot(Glass, aes(x = .data[[v]])) +
    geom_histogram(bins = 20, fill = "steelblue", color = "white") +
    labs(
      title = paste("Histogram of", v),
      x = v,
      y = "Count"
    ) +
    theme_minimal()
  # ggplot objects are not auto-printed inside a for loop.
  print(p)
}
# Box plots of three predictors split by glass type, one plot per
# predictor, to compare their spread and flag points beyond the whiskers.

# Na
ggplot(data = Glass, mapping = aes(x = Type, y = Na)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Na by Glass Type") +
  theme_minimal()

# Mg
ggplot(data = Glass, mapping = aes(x = Type, y = Mg)) +
  geom_boxplot(fill = "lightgreen") +
  labs(title = "Mg by Glass Type") +
  theme_minimal()

# Ca
ggplot(data = Glass, mapping = aes(x = Type, y = Ca)) +
  geom_boxplot(fill = "lightpink") +
  labs(title = "Ca by Glass Type") +
  theme_minimal()
# Keep only the numeric columns (this drops the factor column Type).
num_data <- select(Glass, where(is.numeric))
# Pairwise Pearson correlations between the numeric columns.
cor_mat <- cor(num_data, method = "pearson")
# Show the matrix rounded to two decimals.
round(cor_mat, digits = 2)
## RI Na Mg Al Si K Ca Ba Fe
## RI 1.00 -0.19 -0.12 -0.41 -0.54 -0.29 0.81 0.00 0.14
## Na -0.19 1.00 -0.27 0.16 -0.07 -0.27 -0.28 0.33 -0.24
## Mg -0.12 -0.27 1.00 -0.48 -0.17 0.01 -0.44 -0.49 0.08
## Al -0.41 0.16 -0.48 1.00 -0.01 0.33 -0.26 0.48 -0.07
## Si -0.54 -0.07 -0.17 -0.01 1.00 -0.19 -0.21 -0.10 -0.09
## K -0.29 -0.27 0.01 0.33 -0.19 1.00 -0.32 -0.04 -0.01
## Ca 0.81 -0.28 -0.44 -0.26 -0.21 -0.32 1.00 -0.11 0.12
## Ba 0.00 0.33 -0.49 0.48 -0.10 -0.04 -0.11 1.00 -0.06
## Fe 0.14 -0.24 0.08 -0.07 -0.09 -0.01 0.12 -0.06 1.00
library(corrplot)
## corrplot 0.95 loaded
# Colour-coded view of the correlation matrix: upper triangle only,
# coefficients printed in black, axis labels in black rotated by 45 degrees.
corrplot(
  corr = cor_mat,
  type = "upper",
  method = "color",
  tl.srt = 45,
  tl.col = "black",
  addCoef.col = "black"
)
Based on the histograms and boxplots above, several predictors show skewed distributions and possible outliers. Variables such as Ba, Fe, and K are strongly right-skewed, with many values near zero and a few much larger values. The boxplots also show multiple points outside the whiskers for Na and Ca across several glass types, indicating potential outliers. So the dataset does contain both skewed predictors and outlier observations.
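Because some predictors are clearly skewed, transformations could help make their distributions more symmetric. In particular, Ba, Fe, and K would be reasonable candidates. Note that all three contain exact zeros (their minimum in the summary above is 0), so a plain log transform is undefined for them; a square-root, log(x + 1), or Yeo-Johnson transform should be used instead. In the SVM model below, all predictors are centered and scaled before training. This puts the predictors on a common scale, but it does not change the shape of their distributions, so it is not a substitute for a skewness transformation. Since radial SVM is less sensitive to skewness than linear models, transformation may help but is not strictly required here.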
library(caret)
## Loading required package: lattice
# Reproducible 70/30 split of the rows. createDataPartition() is given the
# class labels, so (per caret's documentation) sampling is done within each
# level of Type.
set.seed(123)
train_index <- createDataPartition(
  y = Glass[["Type"]],
  times = 1,
  p = 0.7,
  list = FALSE
)
# Selected rows form the training set; the remaining rows form the test set.
train_data <- Glass[train_index, , drop = FALSE]
test_data <- Glass[-train_index, , drop = FALSE]
# Size (rows, columns) of each partition.
dim(train_data)
## [1] 153 10
dim(test_data)
## [1] 61 10
# Class shares inside each partition, to compare against the full data.
prop.table(table(train_data[["Type"]]))
##
## 1 2 3 5 6 7
## 0.32026144 0.35294118 0.07843137 0.06535948 0.04575163 0.13725490
prop.table(table(test_data[["Type"]]))
##
## 1 2 3 5 6 7
## 0.34426230 0.36065574 0.08196721 0.04918033 0.03278689 0.13114754
library(kernlab)
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
library(AppliedPredictiveModeling)
# Estimate candidate values for the RBF kernel parameter sigma; frac = 1
# makes sigest() use all rows of Glass.
set.seed(231)
sigDist <- sigest(Type ~ ., data = Glass, frac = 1)
sigDist
## 90% 50% 10%
## 0.03407935 0.11297847 0.62767315
# Tuning grid: sigma is fixed at the first estimate (the one labelled 90%
# above) while the cost C runs over the powers of two from 2^-2 to 2^10.
svmTuneGrid <- data.frame(
  sigma = unname(sigDist[1]),
  C = 2^seq(-2, 10)
)
svmTuneGrid
## sigma C
## 1 0.03407935 0.25
## 2 0.03407935 0.50
## 3 0.03407935 1.00
## 4 0.03407935 2.00
## 5 0.03407935 4.00
## 6 0.03407935 8.00
## 7 0.03407935 16.00
## 8 0.03407935 32.00
## 9 0.03407935 64.00
## 10 0.03407935 128.00
## 11 0.03407935 256.00
## 12 0.03407935 512.00
## 13 0.03407935 1024.00
# Tune a radial-basis SVM over the cost values in svmTuneGrid using
# 10-fold cross-validation repeated 5 times. Predictors are centered and
# scaled as part of the model fit.
# NOTE(review): the model is tuned on the full Glass data; the
# train_data/test_data split created earlier is not used in this part of
# the script. Confirm this is intended.
set.seed(1056)
svmFit <- train(
  Type ~ .,
  data = Glass,
  method = "svmRadial",
  # Full argument name: the abbreviated `preProc` only worked through
  # partial argument matching.
  preProcess = c("center", "scale"),
  tuneGrid = svmTuneGrid,
  trControl = trainControl(
    method = "repeatedcv",
    number = 10, # the default fold count, stated explicitly
    repeats = 5
  )
)
# Print the resampling results for every candidate C and the chosen model.
svmFit
## Support Vector Machines with Radial Basis Function Kernel
##
## 214 samples
## 9 predictor
## 6 classes: '1', '2', '3', '5', '6', '7'
##
## Pre-processing: centered (9), scaled (9)
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 194, 191, 194, 192, 193, 194, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 0.25 0.5462532 0.3286847
## 0.50 0.5721161 0.3689913
## 1.00 0.6364625 0.4706664
## 2.00 0.6832063 0.5506030
## 4.00 0.7028410 0.5801977
## 8.00 0.6980384 0.5744221
## 16.00 0.7015910 0.5810250
## 32.00 0.7014216 0.5834227
## 64.00 0.7121792 0.6028075
## 128.00 0.6985407 0.5863168
## 256.00 0.7029157 0.5932771
## 512.00 0.7084049 0.6017276
## 1024.00 0.7059784 0.5981677
##
## Tuning parameter 'sigma' was held constant at a value of 0.03407935
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.03407935 and C = 64.
# Plot the tuning profile of the fitted model with the x-axis drawn on a
# log2 scale (presumably cost C on x, since sigma was held constant).
plot(svmFit, scales = list(x = list(log = 2)))