# Install packages if needed
# install.packages("mlbench")
# install.packages("ggplot2")
# install.packages("GGally")
# install.packages("corrplot")
# install.packages("e1071")
# install.packages("kernlab")
# install.packages("AppliedPredictiveModeling")
# install.packages("caret")
# install.packages("reshape2")
library(mlbench)
## Warning: package 'mlbench' was built under R version 4.5.2
library(ggplot2)
library(GGally)
## Warning: package 'GGally' was built under R version 4.5.2
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.2
## corrplot 0.95 loaded
library(e1071)
##
## Attaching package: 'e1071'
## The following object is masked from 'package:ggplot2':
##
## element
library(kernlab)
## Warning: package 'kernlab' was built under R version 4.5.2
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
library(AppliedPredictiveModeling)
## Warning: package 'AppliedPredictiveModeling' was built under R version 4.5.2
library(caret)
## Warning: package 'caret' was built under R version 4.5.2
## Loading required package: lattice
# Load the Glass data set (provided by the mlbench package attached above)
# and show the type of every column.
data("Glass", package = "mlbench")
utils::str(Glass)
## 'data.frame': 214 obs. of 10 variables:
## $ RI : num 1.52 1.52 1.52 1.52 1.52 ...
## $ Na : num 13.6 13.9 13.5 13.2 13.3 ...
## $ Mg : num 4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
## $ Al : num 1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
## $ Si : num 71.8 72.7 73 72.6 73.1 ...
## $ K : num 0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
## $ Ca : num 8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
## $ Ba : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Fe : num 0 0 0 0 0 0.26 0 0 0 0.11 ...
## $ Type: Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...
# Per-column summary: five-number summary and mean for each numeric
# predictor, and the number of observations per level for the Type factor.
summary(Glass)
## RI Na Mg Al
## Min. :1.511 Min. :10.73 Min. :0.000 Min. :0.290
## 1st Qu.:1.517 1st Qu.:12.91 1st Qu.:2.115 1st Qu.:1.190
## Median :1.518 Median :13.30 Median :3.480 Median :1.360
## Mean :1.518 Mean :13.41 Mean :2.685 Mean :1.445
## 3rd Qu.:1.519 3rd Qu.:13.82 3rd Qu.:3.600 3rd Qu.:1.630
## Max. :1.534 Max. :17.38 Max. :4.490 Max. :3.500
## Si K Ca Ba
## Min. :69.81 Min. :0.0000 Min. : 5.430 Min. :0.000
## 1st Qu.:72.28 1st Qu.:0.1225 1st Qu.: 8.240 1st Qu.:0.000
## Median :72.79 Median :0.5550 Median : 8.600 Median :0.000
## Mean :72.65 Mean :0.4971 Mean : 8.957 Mean :0.175
## 3rd Qu.:73.09 3rd Qu.:0.6100 3rd Qu.: 9.172 3rd Qu.:0.000
## Max. :75.41 Max. :6.2100 Max. :16.190 Max. :3.150
## Fe Type
## Min. :0.00000 1:70
## 1st Qu.:0.00000 2:76
## Median :0.00000 3:17
## Mean :0.05701 5:13
## 3rd Qu.:0.10000 6: 9
## Max. :0.51000 7:29
# Reshape to long format: one row per (observation, predictor) pair, with
# Type kept as the identifier column, so predictors can be faceted.
Glass_long <- reshape2::melt(data = Glass, id.vars = "Type")

# Distribution of each predictor, one panel per predictor with free axes.
ggplot(data = Glass_long, mapping = aes(x = value)) +
  geom_histogram(bins = 30, fill = "blue") +
  facet_wrap(vars(variable), scales = "free") +
  theme_minimal()

# Each predictor broken down by glass type.
ggplot(data = Glass_long, mapping = aes(x = Type, y = value)) +
  geom_boxplot(fill = "green") +
  facet_wrap(vars(variable), scales = "free") +
  theme_minimal()

# Pairwise scatterplots, densities and correlations, coloured by glass type.
ggpairs(data = Glass, mapping = aes(color = Type))
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `text = text_fn(.data$x, .data$y)`.
## ℹ In group 5: `color = 6`.
## Caused by warning in `cor()`:
## ! the standard deviation is zero
## There was 1 warning in `summarise()`.
## ℹ In argument: `text = text_fn(.data$x, .data$y)`.
## ℹ In group 5: `color = 6`.
## Caused by warning in `cor()`:
## ! the standard deviation is zero
## There was 1 warning in `summarise()`.
## ℹ In argument: `text = text_fn(.data$x, .data$y)`.
## ℹ In group 5: `color = 6`.
## Caused by warning in `cor()`:
## ! the standard deviation is zero
## There was 1 warning in `summarise()`.
## ℹ In argument: `text = text_fn(.data$x, .data$y)`.
## ℹ In group 5: `color = 6`.
## Caused by warning in `cor()`:
## ! the standard deviation is zero
## There was 1 warning in `summarise()`.
## ℹ In argument: `text = text_fn(.data$x, .data$y)`.
## ℹ In group 5: `color = 6`.
## Caused by warning in `cor()`:
## ! the standard deviation is zero
## There was 1 warning in `summarise()`.
## ℹ In argument: `text = text_fn(.data$x, .data$y)`.
## ℹ In group 5: `color = 6`.
## Caused by warning in `cor()`:
## ! the standard deviation is zero
## There was 1 warning in `summarise()`.
## ℹ In argument: `text = text_fn(.data$x, .data$y)`.
## ℹ In group 5: `color = 6`.
## Caused by warning in `cor()`:
## ! the standard deviation is zero
## There was 1 warning in `summarise()`.
## ℹ In argument: `text = text_fn(.data$x, .data$y)`.
## ℹ In group 5: `color = 6`.
## Caused by warning in `cor()`:
## ! the standard deviation is zero
## There was 1 warning in `summarise()`.
## ℹ In argument: `text = text_fn(.data$x, .data$y)`.
## ℹ In group 5: `color = 6`.
## Caused by warning in `cor()`:
## ! the standard deviation is zero
## There was 1 warning in `summarise()`.
## ℹ In argument: `text = text_fn(.data$x, .data$y)`.
## ℹ In group 5: `color = 6`.
## Caused by warning in `cor()`:
## ! the standard deviation is zero
## There was 1 warning in `summarise()`.
## ℹ In argument: `text = text_fn(.data$x, .data$y)`.
## ℹ In group 5: `color = 6`.
## Caused by warning in `cor()`:
## ! the standard deviation is zero
## There was 1 warning in `summarise()`.
## ℹ In argument: `text = text_fn(.data$x, .data$y)`.
## ℹ In group 5: `color = 6`.
## Caused by warning in `cor()`:
## ! the standard deviation is zero
## There was 1 warning in `summarise()`.
## ℹ In argument: `text = text_fn(.data$x, .data$y)`.
## ℹ In group 5: `color = 6`.
## Caused by warning in `cor()`:
## ! the standard deviation is zero
## There was 1 warning in `summarise()`.
## ℹ In argument: `text = text_fn(.data$x, .data$y)`.
## ℹ In group 5: `color = 6`.
## Caused by warning in `cor()`:
## ! the standard deviation is zero
## There was 1 warning in `summarise()`.
## ℹ In argument: `text = text_fn(.data$x, .data$y)`.
## ℹ In group 5: `color = 6`.
## Caused by warning in `cor()`:
## ! the standard deviation is zero
## There was 1 warning in `summarise()`.
## ℹ In argument: `text = text_fn(.data$x, .data$y)`.
## ℹ In group 5: `color = 6`.
## Caused by warning in `cor()`:
## ! the standard deviation is zero
## There was 1 warning in `summarise()`.
## ℹ In argument: `text = text_fn(.data$x, .data$y)`.
## ℹ In group 5: `color = 6`.
## Caused by warning in `cor()`:
## ! the standard deviation is zero
## There was 1 warning in `summarise()`.
## ℹ In argument: `text = text_fn(.data$x, .data$y)`.
## ℹ In group 5: `color = 6`.
## Caused by warning in `cor()`:
## ! the standard deviation is zero
## There was 1 warning in `summarise()`.
## ℹ In argument: `text = text_fn(.data$x, .data$y)`.
## ℹ In group 5: `color = 6`.
## Caused by warning in `cor()`:
## ! the standard deviation is zero
## There was 1 warning in `summarise()`.
## ℹ In argument: `text = text_fn(.data$x, .data$y)`.
## ℹ In group 5: `color = 6`.
## Caused by warning in `cor()`:
## ! the standard deviation is zero
## There was 1 warning in `summarise()`.
## ℹ In argument: `text = text_fn(.data$x, .data$y)`.
## ℹ In group 5: `color = 6`.
## Caused by warning in `cor()`:
## ! the standard deviation is zero
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
The histograms show that several predictors are not normally distributed. RI and Si have small variability, with Si fairly symmetric and RI showing a longer right tail, while Na, Ca, K, Ba, and Fe are right-skewed, with Ba and Fe having many zero values. Boxplots by glass type show that variables such as Mg, Al, and Ba differ noticeably across classes, suggesting they are useful for classification. Pairwise plots indicate correlations among Ca, Mg, and Al, and show overlapping but structured class patterns.
b. Do there appear to be any outliers in the data? Are any predictors skewed? Show all the work!
# Select the numeric predictors by excluding the class label by name, so the
# code does not depend on Type being the 10th column.
glass_predictors <- Glass[, setdiff(names(Glass), "Type")]

# Side-by-side boxplots of the predictors on their raw scales.
boxplot(glass_predictors, main = "Boxplots of Predictors")

# Sample skewness of each predictor. vapply() iterates over the data-frame
# columns directly (apply() would first coerce the data frame to a matrix)
# and enforces exactly one numeric value per column; the result is a numeric
# vector named by predictor.
skewness_values <- vapply(glass_predictors, skewness, numeric(1))
skewness_values
## RI Na Mg Al Si K Ca
## 1.6027151 0.4478343 -1.1364523 0.8946104 -0.7202392 6.4600889 2.0184463
## Ba Fe
## 3.3686800 1.7298107
Boxplots reveal several outliers, especially in Ba, Fe, K, and Ca. These appear consistently and likely represent real chemical differences rather than errors. The skewness values confirm this: K (6.46) and Ba (3.37) are highly right-skewed, with Ca (2.02), Fe (1.73), and RI (1.60) also showing notable right skew. Al (0.89) is moderately right-skewed and Na (0.45) only mildly so, while Mg (-1.14) and Si (-0.72) are left-skewed. Overall, most predictors are skewed and several have outliers, supporting the need for transformation before modeling.
# Log-transform the most strongly right-skewed predictors, all of which
# contain zeros. log1p(x) computes log(1 + x): it is defined at zero and is
# more accurate than log(x + 1) for values close to zero.
Glass_trans <- Glass
skewed_cols <- c("Ba", "Fe", "K")
Glass_trans[skewed_cols] <- lapply(Glass[skewed_cols], log1p)

# Re-plot the predictor histograms after the transformation.
Glass_long2 <- reshape2::melt(Glass_trans, id.vars = "Type")
ggplot(Glass_long2, aes(x = value)) +
  facet_wrap(~variable, scales = "free") +
  geom_histogram(bins = 30, fill = "red") +
  theme_minimal()
Because of skewness, zero inflation, and differing scales,
transformations are appropriate. The log(x + 1) transformation applied
above reduces the right skew of Ba, Fe, and K while remaining defined at
their zero values; a power transformation such as Yeo–Johnson is an
alternative that also handles zeros. Centering and scaling ensure that
no single predictor dominates the model due to its scale.
# Estimate a range of plausible sigma values for the radial basis kernel
# from the data; the seed fixes the random sampling done by sigest().
set.seed(231)
sigDist <- kernlab::sigest(Type ~ ., data = Glass, frac = 1)
sigDist
## 90% 50% 10%
## 0.03407935 0.11297847 0.62767315
# Tuning grid: sigma is held at the first sigest() value (0.034, labelled
# 90% above) while the cost C runs over powers of two from 2^-2 to 2^10.
svmTuneGrid <- data.frame(sigma = sigDist[[1]], C = 2^seq(-2, 10))
svmTuneGrid
## sigma C
## 1 0.03407935 0.25
## 2 0.03407935 0.50
## 3 0.03407935 1.00
## 4 0.03407935 2.00
## 5 0.03407935 4.00
## 6 0.03407935 8.00
## 7 0.03407935 16.00
## 8 0.03407935 32.00
## 9 0.03407935 64.00
## 10 0.03407935 128.00
## 11 0.03407935 256.00
## 12 0.03407935 512.00
## 13 0.03407935 1024.00
# Tune a radial-kernel SVM over the cost grid using repeated
# cross-validation (5 repeats); the seed makes the resampling reproducible.
set.seed(1056)
svmFit <- train(
  Type ~ .,
  data = Glass,
  method = "svmRadial",
  # Argument name written in full: the abbreviated `preProc` only worked
  # through partial argument matching.
  preProcess = c("center", "scale"),
  tuneGrid = svmTuneGrid,
  trControl = trainControl(method = "repeatedcv", repeats = 5)
)

# Resampled accuracy against cost, with the x axis on a log2 scale.
plot(svmFit, scales = list(x = list(log = 2)))

# Selected tuning parameters.
svmFit$bestTune
## sigma C
## 9 0.03407935 64
# Full resampling results for every candidate value of C.
svmFit
## Support Vector Machines with Radial Basis Function Kernel
##
## 214 samples
## 9 predictor
## 6 classes: '1', '2', '3', '5', '6', '7'
##
## Pre-processing: centered (9), scaled (9)
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 194, 191, 194, 192, 193, 194, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 0.25 0.5462532 0.3286847
## 0.50 0.5721161 0.3689913
## 1.00 0.6364625 0.4706664
## 2.00 0.6832063 0.5506030
## 4.00 0.7028410 0.5801977
## 8.00 0.6980384 0.5744221
## 16.00 0.7015910 0.5810250
## 32.00 0.7014216 0.5834227
## 64.00 0.7121792 0.6028075
## 128.00 0.6985407 0.5863168
## 256.00 0.7029157 0.5932771
## 512.00 0.7084049 0.6017276
## 1024.00 0.7059784 0.5981677
##
## Tuning parameter 'sigma' was held constant at a value of 0.03407935
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.03407935 and C = 64.
The tuning plot shows that SVM accuracy improves as the cost parameter C increases from 0.25 to about 4, then levels off near 0.70, indicating a good trade-off between flexibility and overfitting at moderate-to-large costs; the selected value C = 64 (accuracy 0.712) is only marginally better than neighboring values. The nonlinear and overlapping patterns seen in the visualizations support the use of a radial kernel SVM, which is well-suited to this data.