Exercise2

library(mlbench)
library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

a) Exploratory Visualization of Predictors

# Load the Glass data set (mlbench is attached above) and inspect the type of
# every column.
data(Glass)
str(Glass)

## 'data.frame':    214 obs. of  10 variables:
##  $ RI  : num  1.52 1.52 1.52 1.52 1.52 ...
##  $ Na  : num  13.6 13.9 13.5 13.2 13.3 ...
##  $ Mg  : num  4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
##  $ Al  : num  1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
##  $ Si  : num  71.8 72.7 73 72.6 73.1 ...
##  $ K   : num  0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
##  $ Ca  : num  8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
##  $ Ba  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Fe  : num  0 0 0 0 0 0.26 0 0 0 0.11 ...
##  $ Type: Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...

# Number of rows (observations) and columns (variables).
dim(Glass)

## [1] 214  10

# Preview the first rows of the data.
head(Glass)

##        RI    Na   Mg   Al    Si    K   Ca Ba   Fe Type
## 1 1.52101 13.64 4.49 1.10 71.78 0.06 8.75  0 0.00    1
## 2 1.51761 13.89 3.60 1.36 72.73 0.48 7.83  0 0.00    1
## 3 1.51618 13.53 3.55 1.54 72.99 0.39 7.78  0 0.00    1
## 4 1.51766 13.21 3.69 1.29 72.61 0.57 8.22  0 0.00    1
## 5 1.51742 13.27 3.62 1.24 73.08 0.55 8.07  0 0.00    1
## 6 1.51596 12.79 3.61 1.62 72.97 0.64 8.07  0 0.26    1

# Quartiles and means for each numeric column, plus counts per level of Type.
summary(Glass)

##        RI              Na              Mg              Al       
##  Min.   :1.511   Min.   :10.73   Min.   :0.000   Min.   :0.290  
##  1st Qu.:1.517   1st Qu.:12.91   1st Qu.:2.115   1st Qu.:1.190  
##  Median :1.518   Median :13.30   Median :3.480   Median :1.360  
##  Mean   :1.518   Mean   :13.41   Mean   :2.685   Mean   :1.445  
##  3rd Qu.:1.519   3rd Qu.:13.82   3rd Qu.:3.600   3rd Qu.:1.630  
##  Max.   :1.534   Max.   :17.38   Max.   :4.490   Max.   :3.500  
##        Si              K                Ca               Ba       
##  Min.   :69.81   Min.   :0.0000   Min.   : 5.430   Min.   :0.000  
##  1st Qu.:72.28   1st Qu.:0.1225   1st Qu.: 8.240   1st Qu.:0.000  
##  Median :72.79   Median :0.5550   Median : 8.600   Median :0.000  
##  Mean   :72.65   Mean   :0.4971   Mean   : 8.957   Mean   :0.175  
##  3rd Qu.:73.09   3rd Qu.:0.6100   3rd Qu.: 9.172   3rd Qu.:0.000  
##  Max.   :75.41   Max.   :6.2100   Max.   :16.190   Max.   :3.150  
##        Fe          Type  
##  Min.   :0.00000   1:70  
##  1st Qu.:0.00000   2:76  
##  Median :0.00000   3:17  
##  Mean   :0.05701   5:13  
##  3rd Qu.:0.10000   6: 9  
##  Max.   :0.51000   7:29

# Count the missing values in each column.
colSums(is.na(Glass))

##   RI   Na   Mg   Al   Si    K   Ca   Ba   Fe Type 
##    0    0    0    0    0    0    0    0    0    0

# Number of samples in each glass type.
table(Glass$Type)

## 
##  1  2  3  5  6  7 
## 70 76 17 13  9 29

# Share of samples in each glass type (the counts divided by their total).
prop.table(table(Glass$Type))

## 
##          1          2          3          5          6          7 
## 0.32710280 0.35514019 0.07943925 0.06074766 0.04205607 0.13551402

# Bar chart of how many samples fall into each glass type.
ggplot(data = Glass, mapping = aes(x = Type)) +
  geom_bar() +
  theme_minimal() +
  ggtitle("Distribution of Glass Types") +
  xlab("Glass Type") +
  ylab("Count")

# Names of every numeric column in Glass. vapply() guarantees a logical vector
# regardless of input, unlike sapply().
num_vars <- names(Glass)[vapply(Glass, is.numeric, logical(1))]

# Draw one histogram per numeric predictor. `.data[[v]]` looks the column up by
# its string name and replaces the deprecated `aes_string()`, so the ggplot2
# deprecation warning is no longer emitted.
for (v in num_vars) {
  p <- ggplot(Glass, aes(x = .data[[v]])) +
    geom_histogram(bins = 20, fill = "steelblue", color = "white") +
    labs(
      title = paste("Histogram of", v),
      x = v,
      y = "Count"
    ) +
    theme_minimal()

  # ggplot objects are not auto-printed inside a loop, so print explicitly.
  print(p)
}

# Boxplot of one numeric predictor split by glass type.
#   var:  name of the predictor column, as a string
#   fill: fill colour of the boxes
#   data: data frame containing `Type` and the column named by `var`
# Returns the ggplot object; called at top level it is printed automatically.
boxplot_by_type <- function(var, fill, data = Glass) {
  ggplot(data, aes(x = Type, y = .data[[var]])) +
    geom_boxplot(fill = fill) +
    theme_minimal() +
    # Set the y label explicitly so it shows the column name.
    labs(title = paste(var, "by Glass Type"), y = var)
}

boxplot_by_type("Na", "lightblue")

boxplot_by_type("Mg", "lightgreen")

boxplot_by_type("Ca", "lightpink")

# Keep only the numeric columns of Glass.
num_data <- dplyr::select(Glass, where(is.numeric))

# Pairwise Pearson correlations between the numeric predictors.
cor_mat <- cor(num_data, method = "pearson")

# Show the matrix rounded to two decimals for readability.
round(cor_mat, digits = 2)

##       RI    Na    Mg    Al    Si     K    Ca    Ba    Fe
## RI  1.00 -0.19 -0.12 -0.41 -0.54 -0.29  0.81  0.00  0.14
## Na -0.19  1.00 -0.27  0.16 -0.07 -0.27 -0.28  0.33 -0.24
## Mg -0.12 -0.27  1.00 -0.48 -0.17  0.01 -0.44 -0.49  0.08
## Al -0.41  0.16 -0.48  1.00 -0.01  0.33 -0.26  0.48 -0.07
## Si -0.54 -0.07 -0.17 -0.01  1.00 -0.19 -0.21 -0.10 -0.09
## K  -0.29 -0.27  0.01  0.33 -0.19  1.00 -0.32 -0.04 -0.01
## Ca  0.81 -0.28 -0.44 -0.26 -0.21 -0.32  1.00 -0.11  0.12
## Ba  0.00  0.33 -0.49  0.48 -0.10 -0.04 -0.11  1.00 -0.06
## Fe  0.14 -0.24  0.08 -0.07 -0.09 -0.01  0.12 -0.06  1.00

library(corrplot)

## corrplot 0.95 loaded

# Heat map of the correlation matrix: upper triangle only, coloured tiles with
# the coefficients printed in black, and black axis labels rotated 45 degrees.
corrplot(
  cor_mat,
  type = "upper",
  method = "color",
  tl.col = "black",
  tl.srt = 45,
  addCoef.col = "black"
)

b) Outliers and Skewness

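Based on the histograms and boxplots above, several predictors show skewed distributions and possible outliers. Ba, Fe, and K are strongly right-skewed: most of their values are at or close to zero relative to a few much larger ones (for example, the median of Ba is 0 while its maximum is 3.15, and the third quartile of K is 0.61 while its maximum is 6.21). Mg is skewed in the opposite direction, with a mean (2.685) well below its median (3.48) and a minimum of 0. The boxplots also show multiple points outside the whiskers for Na and Ca across several glass types, indicating potential outliers. The dataset therefore contains both skewed predictors and outlying observations.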

c) Transformations

Because some predictors are clearly skewed, transformations could help make their distributions more symmetric. Ba, Fe, and K are reasonable candidates. These predictors contain exact zeros (their minimum is 0 in the summary above), so a plain log transform is undefined for them; a log(x + 1), square-root, or Yeo-Johnson transform would be needed instead. In the SVM model below, all predictors are centered and scaled before training, which puts them on a common scale but does not change the shape of their distributions. Since a radial SVM makes no distributional assumption about the predictors, a transformation may help but is not strictly required here.

library(caret)

## Loading required package: lattice

# Fix the random seed so the train/test split is reproducible.
set.seed(123)

# Select about 70% of the row indices for training; list = FALSE returns the
# indices as a matrix instead of a list.
# NOTE(review): createDataPartition presumably samples within each level of
# Type (a stratified split) — confirm against the caret documentation.
train_index <- createDataPartition(Glass$Type, p = 0.7, list = FALSE)

# The selected rows form the training set; all remaining rows form the test set.
train_data <- Glass[train_index, ]
test_data  <- Glass[-train_index, ]

# Size of the training set.
dim(train_data)

## [1] 153  10

# Size of the test set.
dim(test_data)

## [1] 61 10

# Class proportions in the training set, for comparison with the full data.
prop.table(table(train_data$Type))

## 
##          1          2          3          5          6          7 
## 0.32026144 0.35294118 0.07843137 0.06535948 0.04575163 0.13725490

# Class proportions in the test set, for comparison with the training set.
prop.table(table(test_data$Type))

## 
##          1          2          3          5          6          7 
## 0.34426230 0.36065574 0.08196721 0.04918033 0.03278689 0.13114754

d) SVM Model

library(kernlab)

## 
## Attaching package: 'kernlab'

## The following object is masked from 'package:ggplot2':
## 
##     alpha

library(AppliedPredictiveModeling)

# Fix the seed before estimating sigma.
set.seed(231)

# Estimate a range of sigma values for the radial basis kernel with
# kernlab::sigest(); frac = 1 requests that all rows be used for the estimate.
# NOTE(review): this uses the full Glass data even though train_data was
# created above — confirm whether the estimate should be restricted to the
# training set.
sigDist <- sigest(Type ~ ., data = Glass, frac = 1)

sigDist

##        90%        50%        10% 
## 0.03407935 0.11297847 0.62767315

# Tuning grid: sigma is held at the first (smallest) value returned by sigest,
# with the name stripped, while the cost C runs over the powers of two from
# 2^-2 to 2^10.
fixed_sigma <- sigDist[[1]]
cost_values <- 2^seq(-2, 10)

svmTuneGrid <- data.frame(sigma = fixed_sigma, C = cost_values)

svmTuneGrid

##         sigma       C
## 1  0.03407935    0.25
## 2  0.03407935    0.50
## 3  0.03407935    1.00
## 4  0.03407935    2.00
## 5  0.03407935    4.00
## 6  0.03407935    8.00
## 7  0.03407935   16.00
## 8  0.03407935   32.00
## 9  0.03407935   64.00
## 10 0.03407935  128.00
## 11 0.03407935  256.00
## 12 0.03407935  512.00
## 13 0.03407935 1024.00

# Fix the seed so the cross-validation resamples are reproducible.
set.seed(1056)

# Tune a radial-basis SVM over the cost grid using 10-fold cross-validation
# repeated 5 times, with the predictors centered and scaled.
# `preProcess` is spelled out in full instead of relying on partial argument
# matching of `preProc`; `number = 10` states the default fold count explicitly.
# NOTE(review): the model is tuned on the full Glass data, so train_data and
# test_data created earlier are never used — confirm whether `data` should be
# train_data, followed by a final evaluation on test_data.
svmFit <- train(
  Type ~ .,
  data = Glass,
  method = "svmRadial",
  preProcess = c("center", "scale"),
  tuneGrid = svmTuneGrid,
  trControl = trainControl(
    method = "repeatedcv",
    number = 10,
    repeats = 5
  )
)

svmFit

## Support Vector Machines with Radial Basis Function Kernel 
## 
## 214 samples
##   9 predictor
##   6 classes: '1', '2', '3', '5', '6', '7' 
## 
## Pre-processing: centered (9), scaled (9) 
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 194, 191, 194, 192, 193, 194, ... 
## Resampling results across tuning parameters:
## 
##   C        Accuracy   Kappa    
##      0.25  0.5462532  0.3286847
##      0.50  0.5721161  0.3689913
##      1.00  0.6364625  0.4706664
##      2.00  0.6832063  0.5506030
##      4.00  0.7028410  0.5801977
##      8.00  0.6980384  0.5744221
##     16.00  0.7015910  0.5810250
##     32.00  0.7014216  0.5834227
##     64.00  0.7121792  0.6028075
##    128.00  0.6985407  0.5863168
##    256.00  0.7029157  0.5932771
##    512.00  0.7084049  0.6017276
##   1024.00  0.7059784  0.5981677
## 
## Tuning parameter 'sigma' was held constant at a value of 0.03407935
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.03407935 and C = 64.

# Plot the resampling profile of the tuned model; the x axis (the cost values)
# is drawn on a log base 2 scale.
plot(svmFit, scales = list(x = list(log = 2)))

Exercise2

Wei You

2026-02-12

a) Exploratory Visualization of Predictors

b) Outliers and Skewness

c) Transformations

d) SVM Model