# Install packages if needed
# install.packages("mlbench")
# install.packages("ggplot2")
# install.packages("GGally")
# install.packages("corrplot")
# install.packages("e1071")
# install.packages("kernlab")
# install.packages("AppliedPredictiveModeling")
# install.packages("caret")




library(mlbench)
## Warning: package 'mlbench' was built under R version 4.5.2
library(ggplot2)
library(GGally)
## Warning: package 'GGally' was built under R version 4.5.2
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.2
## corrplot 0.95 loaded
library(e1071)
## 
## Attaching package: 'e1071'
## The following object is masked from 'package:ggplot2':
## 
##     element
library(kernlab)
## Warning: package 'kernlab' was built under R version 4.5.2
## 
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
## 
##     alpha
library(AppliedPredictiveModeling)
## Warning: package 'AppliedPredictiveModeling' was built under R version 4.5.2
library(caret)
## Warning: package 'caret' was built under R version 4.5.2
## Loading required package: lattice
data(Glass)

str(Glass)
## 'data.frame':    214 obs. of  10 variables:
##  $ RI  : num  1.52 1.52 1.52 1.52 1.52 ...
##  $ Na  : num  13.6 13.9 13.5 13.2 13.3 ...
##  $ Mg  : num  4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
##  $ Al  : num  1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
##  $ Si  : num  71.8 72.7 73 72.6 73.1 ...
##  $ K   : num  0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
##  $ Ca  : num  8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
##  $ Ba  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Fe  : num  0 0 0 0 0 0.26 0 0 0 0.11 ...
##  $ Type: Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...
summary(Glass)
##        RI              Na              Mg              Al       
##  Min.   :1.511   Min.   :10.73   Min.   :0.000   Min.   :0.290  
##  1st Qu.:1.517   1st Qu.:12.91   1st Qu.:2.115   1st Qu.:1.190  
##  Median :1.518   Median :13.30   Median :3.480   Median :1.360  
##  Mean   :1.518   Mean   :13.41   Mean   :2.685   Mean   :1.445  
##  3rd Qu.:1.519   3rd Qu.:13.82   3rd Qu.:3.600   3rd Qu.:1.630  
##  Max.   :1.534   Max.   :17.38   Max.   :4.490   Max.   :3.500  
##        Si              K                Ca               Ba       
##  Min.   :69.81   Min.   :0.0000   Min.   : 5.430   Min.   :0.000  
##  1st Qu.:72.28   1st Qu.:0.1225   1st Qu.: 8.240   1st Qu.:0.000  
##  Median :72.79   Median :0.5550   Median : 8.600   Median :0.000  
##  Mean   :72.65   Mean   :0.4971   Mean   : 8.957   Mean   :0.175  
##  3rd Qu.:73.09   3rd Qu.:0.6100   3rd Qu.: 9.172   3rd Qu.:0.000  
##  Max.   :75.41   Max.   :6.2100   Max.   :16.190   Max.   :3.150  
##        Fe          Type  
##  Min.   :0.00000   1:70  
##  1st Qu.:0.00000   2:76  
##  Median :0.00000   3:17  
##  Mean   :0.05701   5:13  
##  3rd Qu.:0.10000   6: 9  
##  Max.   :0.51000   7:29
a. Use suitable visualizations (any types of data visualization you deem appropriate) to explore the predictor variables, in order to understand their distributions and the relationships among them.

Glass_long <- reshape2::melt(Glass, id.vars = "Type")

ggplot(Glass_long, aes(x = value)) +
  facet_wrap(~variable, scales = "free") +
  geom_histogram(bins = 30, fill = "blue") +
  theme_minimal()

ggplot(Glass_long, aes(x = Type, y = value)) +
  facet_wrap(~variable, scales = "free") +
  geom_boxplot(fill = "green") +
  theme_minimal()

ggpairs(Glass, aes(color = Type))
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `text = text_fn(.data$x, .data$y)`.
## ℹ In group 5: `color = 6`.
## Caused by warning in `cor()`:
## ! the standard deviation is zero
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

The histograms show that several predictors are not normally distributed. RI and Si are fairly symmetric with small variability, while Na, Ca, K, Ba, and Fe are right-skewed, with Ba and Fe having many zero values. Boxplots by glass type show that variables such as Mg, Al, and Ba differ noticeably across classes, suggesting they are useful for classification. Pairwise plots indicate correlations among Ca, Mg, and Al, and show overlapping but structured class patterns. The `cor()` warnings from `ggpairs()` simply indicate that at least one predictor is constant within one of the glass types, so the within-group correlation is undefined there.
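
Since corrplot is already loaded, a correlation-matrix plot of the numeric predictors is one more way to read the pairwise relationships at a glance (a small sketch; this was not part of the output above):

# Sketch: correlation matrix of the nine numeric predictors (Type in column 10 excluded)
cor_mat <- cor(Glass[, -10])
corrplot(cor_mat, method = "circle", type = "upper", order = "hclust")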

b. Do there appear to be any outliers in the data? Are any predictors skewed? Show all the work!

# Boxplots of the nine numeric predictors (Type in column 10 is excluded)
boxplot(Glass[, -10], main = "Boxplots of Predictors")

# Sample skewness of each predictor (e1071::skewness)
skewness_values <- apply(Glass[, -10], 2, skewness)
skewness_values
##         RI         Na         Mg         Al         Si          K         Ca 
##  1.6027151  0.4478343 -1.1364523  0.8946104 -0.7202392  6.4600889  2.0184463 
##         Ba         Fe 
##  3.3686800  1.7298107

Boxplots reveal several outliers, especially in Ba, Fe, K, and Ca. These appear consistently and likely represent real chemical differences rather than errors. The skewness values confirm this: K (6.46) and Ba (3.37) are highly right-skewed, with Ca (2.02) and Fe (1.73) also showing notable right skew. Na (0.45) is only mildly skewed, while RI and Si appear closer to symmetric. Overall, several predictors are right-skewed with outliers, supporting the need for transformation before modeling.
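
To put a rough number on the outliers, one could also count the points flagged by the usual 1.5 × IQR boxplot rule for each predictor (a sketch, not part of the original output):

# Sketch: number of points beyond the boxplot whiskers (1.5*IQR rule) per predictor
sapply(Glass[, -10], function(x) length(boxplot.stats(x)$out))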

c. Are there any relevant transformations of one or more predictors that might improve the classification model? Show all the work!

Glass_trans <- Glass

# log(x + 1) transform for the heavily right-skewed, zero-inflated predictors
Glass_trans$Ba <- log(Glass$Ba + 1)
Glass_trans$Fe <- log(Glass$Fe + 1)
Glass_trans$K  <- log(Glass$K + 1)

Glass_long2 <- reshape2::melt(Glass_trans, id.vars = "Type")

ggplot(Glass_long2, aes(x = value)) +
  facet_wrap(~variable, scales = "free") +
  geom_histogram(bins = 30, fill = "red") +
  theme_minimal()

Because of the skewness, zero inflation, and differing scales, transformations are appropriate. The log(x + 1) transform above already reduces the skewness of Ba, Fe, and K. A power transformation such as Yeo–Johnson (which, unlike Box–Cox, handles zero values) is a more general option, and centering and scaling ensure that no single predictor dominates the model simply because of its scale.
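
As a sketch of how this could be done with caret (not run above), preProcess() can estimate the Yeo–Johnson transformation together with centering and scaling, and predict() then applies it to the predictors:

# Sketch: estimate Yeo-Johnson + center/scale on the predictors, then apply it
pp <- preProcess(Glass[, -10], method = c("YeoJohnson", "center", "scale"))
Glass_yj <- predict(pp, Glass[, -10])
apply(Glass_yj, 2, skewness)  # skewness should move toward 0 for most predictors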

d. Fit an SVM model (you may refer to Chapter 4 material for details) using the following R code (this code will be discussed in detail in the following chapters):

set.seed(231)

# Estimate a reasonable range for the RBF kernel parameter sigma (kernlab::sigest)
sigDist <- sigest(Type ~ ., data = Glass, frac = 1)
sigDist
##        90%        50%        10% 
## 0.03407935 0.11297847 0.62767315
# Hold sigma at the first value returned by sigest(); vary the cost C over powers of 2
svmTuneGrid <- data.frame(
  sigma = as.vector(sigDist)[1],
  C = 2^(-2:10)
)

svmTuneGrid
##         sigma       C
## 1  0.03407935    0.25
## 2  0.03407935    0.50
## 3  0.03407935    1.00
## 4  0.03407935    2.00
## 5  0.03407935    4.00
## 6  0.03407935    8.00
## 7  0.03407935   16.00
## 8  0.03407935   32.00
## 9  0.03407935   64.00
## 10 0.03407935  128.00
## 11 0.03407935  256.00
## 12 0.03407935  512.00
## 13 0.03407935 1024.00
set.seed(1056)

svmFit <- train(
  Type ~ ., 
  data = Glass,
  method = "svmRadial",            # RBF-kernel SVM
  preProc = c("center", "scale"),  # standardize predictors
  tuneGrid = svmTuneGrid,
  trControl = trainControl(method = "repeatedcv", repeats = 5)  # 10-fold CV, repeated 5 times
)
plot(svmFit, scales = list(x = list(log = 2)))

svmFit$bestTune
##        sigma  C
## 9 0.03407935 64
svmFit
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 214 samples
##   9 predictor
##   6 classes: '1', '2', '3', '5', '6', '7' 
## 
## Pre-processing: centered (9), scaled (9) 
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 194, 191, 194, 192, 193, 194, ... 
## Resampling results across tuning parameters:
## 
##   C        Accuracy   Kappa    
##      0.25  0.5462532  0.3286847
##      0.50  0.5721161  0.3689913
##      1.00  0.6364625  0.4706664
##      2.00  0.6832063  0.5506030
##      4.00  0.7028410  0.5801977
##      8.00  0.6980384  0.5744221
##     16.00  0.7015910  0.5810250
##     32.00  0.7014216  0.5834227
##     64.00  0.7121792  0.6028075
##    128.00  0.6985407  0.5863168
##    256.00  0.7029157  0.5932771
##    512.00  0.7084049  0.6017276
##   1024.00  0.7059784  0.5981677
## 
## Tuning parameter 'sigma' was held constant at a value of 0.03407935
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.03407935 and C = 64.

The tuning plot shows that cross-validated accuracy improves as the cost parameter C increases and then levels off, with the best resampled accuracy (about 0.71) reached at C = 64, indicating a good trade-off between flexibility and overfitting. The nonlinear, overlapping class patterns seen in the earlier visualizations support the use of a radial-kernel SVM, which is well suited to this data.
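
To see which glass types the tuned model still confuses, the resampled (cross-validated) confusion matrix that caret stores for the selected tuning parameters can be inspected (a sketch; this output is not shown above):

# Sketch: cross-validated confusion matrix (values are percentages) for the chosen sigma and C
confusionMatrix(svmFit)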