library(mlbench)
library(e1071)
library(corrplot)
## corrplot 0.95 loaded
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:e1071':
##
## element
library(reshape2)
library(kernlab)
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
library(caret)
## Loading required package: lattice
library(AppliedPredictiveModeling)
library(GGally)
library(gridExtra)
library(moments)
##
## Attaching package: 'moments'
## The following objects are masked from 'package:e1071':
##
## kurtosis, moment, skewness
data(Glass)
str(Glass)
## 'data.frame': 214 obs. of 10 variables:
## $ RI : num 1.52 1.52 1.52 1.52 1.52 ...
## $ Na : num 13.6 13.9 13.5 13.2 13.3 ...
## $ Mg : num 4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
## $ Al : num 1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
## $ Si : num 71.8 72.7 73 72.6 73.1 ...
## $ K : num 0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
## $ Ca : num 8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
## $ Ba : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Fe : num 0 0 0 0 0 0.26 0 0 0 0.11 ...
## $ Type: Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...
head(Glass)
## RI Na Mg Al Si K Ca Ba Fe Type
## 1 1.52101 13.64 4.49 1.10 71.78 0.06 8.75 0 0.00 1
## 2 1.51761 13.89 3.60 1.36 72.73 0.48 7.83 0 0.00 1
## 3 1.51618 13.53 3.55 1.54 72.99 0.39 7.78 0 0.00 1
## 4 1.51766 13.21 3.69 1.29 72.61 0.57 8.22 0 0.00 1
## 5 1.51742 13.27 3.62 1.24 73.08 0.55 8.07 0 0.00 1
## 6 1.51596 12.79 3.61 1.62 72.97 0.64 8.07 0 0.26 1
dim(Glass)
## [1] 214 10
summary(Glass)
## RI Na Mg Al
## Min. :1.511 Min. :10.73 Min. :0.000 Min. :0.290
## 1st Qu.:1.517 1st Qu.:12.91 1st Qu.:2.115 1st Qu.:1.190
## Median :1.518 Median :13.30 Median :3.480 Median :1.360
## Mean :1.518 Mean :13.41 Mean :2.685 Mean :1.445
## 3rd Qu.:1.519 3rd Qu.:13.82 3rd Qu.:3.600 3rd Qu.:1.630
## Max. :1.534 Max. :17.38 Max. :4.490 Max. :3.500
## Si K Ca Ba
## Min. :69.81 Min. :0.0000 Min. : 5.430 Min. :0.000
## 1st Qu.:72.28 1st Qu.:0.1225 1st Qu.: 8.240 1st Qu.:0.000
## Median :72.79 Median :0.5550 Median : 8.600 Median :0.000
## Mean :72.65 Mean :0.4971 Mean : 8.957 Mean :0.175
## 3rd Qu.:73.09 3rd Qu.:0.6100 3rd Qu.: 9.172 3rd Qu.:0.000
## Max. :75.41 Max. :6.2100 Max. :16.190 Max. :3.150
## Fe Type
## Min. :0.00000 1:70
## 1st Qu.:0.00000 2:76
## Median :0.00000 3:17
## Mean :0.05701 5:13
## 3rd Qu.:0.10000 6: 9
## Max. :0.51000 7:29
predictors <- c("RI", "Na", "Mg", "Al", "Si", "K", "Ca", "Ba", "Fe")
sapply(Glass[,predictors], sd)
## RI Na Mg Al Si K
## 0.003036864 0.816603556 1.442407845 0.499269646 0.774545795 0.652191846
## Ca Ba Fe
## 1.423153487 0.497219261 0.097438701
# Histograms of all Predictors
par(mfrow = c(3, 3), mar = c(2, 2, 2, 1))
for (pred in predictors){
  mn <- round(mean(Glass[[pred]]), 3)
  med <- round(median(Glass[[pred]]), 3)
  hist(Glass[, pred],
       main = paste(pred, "(skew =", round(skewness(Glass[[pred]]), 2), ")"),
       xlab = pred,
       col = "lightblue",
       border = "white",
       breaks = 25)
  abline(v = mean(Glass[[pred]]), col = "red", lwd = 2, lty = 2)
  abline(v = median(Glass[[pred]]), col = "navy", lwd = 2, lty = 3)
  legend("topright",
         legend = c(paste("Mean =", mn), paste("Median =", med)),
         col = c("red", "navy"), lwd = 3, lty = c(2, 3), cex = 0.7)
}
The histograms depict the distribution of each chemical measurement in the Glass dataset. Several predictors are not normally distributed, and many show right skewness. Variables such as K, Ba, and Fe contain mostly low values with a few large observations, creating long right tails. Because an RBF-kernel SVM relies on distances between centered and scaled predictors, this skewness matters: a handful of extreme values can dominate those distances.
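As a rough numeric complement to the visual check, a Shapiro-Wilk test can confirm the departure from normality. This is a minimal sketch, not part of the original analysis; it reuses the predictors vector defined above.
# Hedged sketch: Shapiro-Wilk normality test per predictor (n = 214 is
# within the test's supported range); small p-values reject normality.
shapiro_p <- sapply(Glass[, predictors], function(x) shapiro.test(x)$p.value)
round(sort(shapiro_p), 4)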
par(mfrow = c(1,1))
# Boxplots by Glass Type
par(mfrow=c(3,3), mar = c(2,2,2,1))
type_colors <- c("1" = "#E91E63", "2" = "#2196F3", "3" = "#4CAF50",
"5" = "#FF9800", "6" = "#9C27B0", "7" = "#00BCD4")
for (pred in predictors){
boxplot(Glass[,pred] ~ Glass$Type,
main = pred,
xlab = "Glass Type",
ylab = pred,
col = type_colors,
border = "gray30",
outline = TRUE)
}
Predictors such as RI, Na, Al, and Ca show noticeable differences in median values across classes. This suggests that these predictors may be useful for classification. Several predictors contain extreme values beyond the whiskers, indicating possible outliers.
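To put a number on the visual separation, one option is a nonparametric Kruskal-Wallis test per predictor; the sketch below is an illustrative addition, chosen because the test is robust to the skewness and outliers noted above.
# Hedged sketch: Kruskal-Wallis test of whether each predictor's
# distribution differs across the six glass types.
kw_p <- sapply(predictors, function(p) kruskal.test(Glass[[p]] ~ Glass$Type)$p.value)
round(sort(kw_p), 4)  # smaller p-values suggest stronger class separation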
par(mfrow = c(1,1))
# Scatterplot matrix
corr_matrix <- cor(Glass[, predictors])
round(corr_matrix, 3)
## RI Na Mg Al Si K Ca Ba Fe
## RI 1.000 -0.192 -0.122 -0.407 -0.542 -0.290 0.810 0.000 0.143
## Na -0.192 1.000 -0.274 0.157 -0.070 -0.266 -0.275 0.327 -0.241
## Mg -0.122 -0.274 1.000 -0.482 -0.166 0.005 -0.444 -0.492 0.083
## Al -0.407 0.157 -0.482 1.000 -0.006 0.326 -0.260 0.479 -0.074
## Si -0.542 -0.070 -0.166 -0.006 1.000 -0.193 -0.209 -0.102 -0.094
## K -0.290 -0.266 0.005 0.326 -0.193 1.000 -0.318 -0.043 -0.008
## Ca 0.810 -0.275 -0.444 -0.260 -0.209 -0.318 1.000 -0.113 0.125
## Ba 0.000 0.327 -0.492 0.479 -0.102 -0.043 -0.113 1.000 -0.059
## Fe 0.143 -0.241 0.083 -0.074 -0.094 -0.008 0.125 -0.059 1.000
corrplot(corr_matrix,
method = "color",
type = "lower",
addCoef.col = "black",
number.cex = 0.7,
tl.col = "black",
tl.srt = 45,
col = colorRampPalette(c("#F44336","white","#2196F3"))(200),
title = "Correlation Matrix of Predictors",
mar = c(0,0,2,0))
The correlation matrix reveals the relationships among predictors. Several pairs show strong positive or negative correlation, indicating redundancy. The strongest correlation occurs between RI and Ca (r = 0.81), indicating that higher calcium levels are strongly associated with a higher refractive index. RI also shows a moderate negative relationship with Si, suggesting that glass with more silicon tends to have a lower refractive index. Most other variables have weak correlations, meaning they vary largely independently. These results suggest that Ca and RI provide overlapping information, which could be an important factor for model building.
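One way to act on this redundancy is caret's findCorrelation(), which flags predictors whose pairwise correlations exceed a cutoff; the 0.75 threshold below is an illustrative choice, not part of the original analysis.
# Hedged sketch: flag candidates for removal before modeling.
high_corr <- findCorrelation(corr_matrix, cutoff = 0.75, names = TRUE)
high_corr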
#Top Correlations by absolute value
corr_long <- as.data.frame(as.table(corr_matrix))
corr_long <- corr_long[corr_long$Var1 != corr_long$Var2, ]
corr_long <- corr_long[!duplicated(t(apply(corr_long[,1:2], 1, sort))), ]
corr_long$AbsFreq <- abs(corr_long$Freq)
head(corr_long[order(-corr_long$AbsFreq), ], 10)
## Var1 Var2 Freq AbsFreq
## 7 Ca RI 0.8104027 0.8104027
## 5 Si RI -0.5420522 0.5420522
## 26 Ba Mg -0.4922621 0.4922621
## 22 Al Mg -0.4817985 0.4817985
## 35 Ba Al 0.4794039 0.4794039
## 25 Ca Mg -0.4437500 0.4437500
## 4 Al RI -0.4073260 0.4073260
## 17 Ba Na 0.3266029 0.3266029
## 33 K Al 0.3259584 0.3259584
## 52 Ca K -0.3178362 0.3178362
# Outlier Detection (Z-score)
z_scores <- scale(Glass[,predictors])
outlier_idx <- abs(z_scores) > 3
outlier_counts <- colSums(outlier_idx)
print(outlier_counts)
## RI Na Mg Al Si K Ca Ba Fe
## 3 2 0 3 6 3 7 6 3
outlier_report <- which(outlier_idx, arr.ind = TRUE)
colnames(outlier_report) <- c("Row", "Predictor_Col")
outlier_report <- as.data.frame(outlier_report)
outlier_report$Predictor <- predictors[outlier_report$Predictor_Col]
outlier_report$Value <- mapply(function(r, p) Glass[r,p],
outlier_report$Row, outlier_report$Predictor)
outlier_report$ZScore <- mapply(function(r, c) z_scores[r,c],
outlier_report$Row, outlier_report$Predictor_Col)
print(outlier_report[order(outlier_report$Predictor), c("Row", "Predictor", "Value", "ZScore")])
## Row Predictor Value ZScore
## X164 164 Al 3.50000 4.116199
## X172 172 Al 3.04000 3.194854
## X173 173 Al 3.02000 3.154795
## X107.4 107 Ba 3.15000 5.983182
## X164.2 164 Ba 2.20000 4.072556
## X190 190 Ba 1.68000 3.026740
## X204 204 Ba 1.71000 3.087075
## X208 208 Ba 2.88000 5.440162
## X214 214 Ba 1.67000 3.006628
## X106 106 Ca 13.24000 3.009540
## X107.3 107 Ca 13.30000 3.051700
## X108.2 108 Ca 16.19000 5.082401
## X111 111 Ca 14.68000 4.021377
## X112 112 Ca 14.96000 4.218124
## X113.1 113 Ca 14.40000 3.824631
## X132 132 Ca 13.44000 3.150073
## X146 146 Fe 0.35000 3.006923
## X163 163 Fe 0.37000 3.212180
## X175 175 Fe 0.51000 4.648981
## X172.1 172 K 6.21000 8.759606
## X173.1 173 K 6.21000 8.759606
## X202.1 202 K 2.70000 3.377754
## X107.1 107 Na 10.73000 -3.279254
## X185 185 Na 17.38000 4.864232
## X107 107 RI 1.53125 4.242726
## X108 108 RI 1.53393 5.125215
## X113 113 RI 1.52777 3.096807
## X107.2 107 Si 69.81000 -3.667872
## X108.1 108 Si 70.16000 -3.215994
## X164.1 164 Si 69.89000 -3.564585
## X185.1 185 Si 75.41000 3.562172
## X189 189 Si 70.26000 -3.086886
## X202 202 Si 75.18000 3.265224
par(mfrow = c(3,3), mar = c(2,2,2,1))
for(i in seq_along(predictors)){
pred <- predictors[i]
z_col <- abs(z_scores[, i])
is_out <- z_col > 3
plot(seq_len(nrow(Glass)), Glass[[pred]],
col = ifelse(is_out, "red", "steelblue"),
pch = ifelse(is_out, 17, 16),
cex = ifelse(is_out, 1.2, 0.5),
main = paste0(pred, " (", sum(is_out), " outlier(s))"),
xlab = "Observation Index",
ylab = pred)
}
Outliers were detected using Z-scores, with observations beyond ±3 standard deviations flagged. Ca, Si, and Ba contained the highest numbers of outliers, indicating considerable variation in these variables. K had the most extreme individual outliers, with Z-scores above 8, while Si showed both high and low outliers. These outliers may represent chemically distinct glass categories.
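Because the mean and standard deviation used above are themselves inflated by the very outliers they flag, a robust cross-check with median/MAD scaling may be worthwhile. The sketch below is an illustrative complement, not part of the original analysis.
# Hedged sketch: MAD-based robust Z-scores. Predictors that are mostly
# zeros (e.g. Ba, Fe) have MAD = 0, so they are skipped as NA here.
robust_z <- sapply(Glass[, predictors], function(x) {
  m <- mad(x)
  if (m == 0) rep(NA_real_, length(x)) else (x - median(x)) / m
})
colSums(abs(robust_z) > 3, na.rm = TRUE)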
skew_vals <- sapply(Glass[, predictors], skewness)
kurt_vals <- sapply(Glass[, predictors], kurtosis)
skew_sorted <- sort(skew_vals, decreasing = TRUE)
print(round(skew_sorted, 4))
## K Ba Ca Fe RI Al Na Si Mg
## 6.5056 3.3924 2.0327 1.7420 1.6140 0.9009 0.4510 -0.7253 -1.1445
skew_df <- data.frame(
Predictor = names(skew_vals),
Skewness = round(skew_vals, 4),
Kurtosis = round(kurt_vals, 4),
Severity = ifelse(abs(skew_vals) > 1, "High",
ifelse(abs(skew_vals) > 0.5, "Moderate", "Low"))
)
print(skew_df[order(-abs(skew_df$Skewness)), ])
## Predictor Skewness Kurtosis Severity
## K K 6.5056 56.3923 High
## Ba Ba 3.3924 15.2221 High
## Ca Ca 2.0327 9.4990 High
## Fe Fe 1.7420 5.5723 High
## RI RI 1.6140 7.7894 High
## Mg Mg -1.1445 2.5713 High
## Al Al 0.9009 4.9848 Moderate
## Si Si -0.7253 5.8711 Moderate
## Na Na 0.4510 5.9535 Low
barplot(skew_sorted,
main = "Skewness of Predictor Variables",
xlab = "Predictor",
ylab = "Skewness",
col = ifelse(abs(skew_sorted) > 1, "firebrick",
ifelse(abs(skew_sorted) > 0.5, "darkorange", "forestgreen")),
las = 2,
border = "white",
ylim = c(min(skew_sorted) - 0.5, max(skew_sorted) + 0.5))
legend("topright",
       legend = c("|skew| > 1 (High)", "|skew| > 0.5 (Moderate)", "|skew| <= 0.5 (Low)"),
       fill = c("firebrick", "darkorange", "forestgreen"), cex = 0.8)
Skewness values indicate that several predictors are moderately to highly skewed. K shows extreme positive skewness and very high kurtosis, indicating that most samples have low K values while a few contain large concentrations; the same pattern holds for Ba, Ca, and Fe. Mg shows strong negative skewness, while Na has low skewness and a roughly symmetric distribution.
skewed_preds <- c("K", "Ba", "Ca", "Fe")
par(mfrow = c(4,3), mar = c(2,2,2,1))
for(pred in skewed_preds){
orig <- Glass[[pred]]
log_t <- log1p(orig)
sqrt_t <- sqrt(orig)
skews <- c(round(skewness(orig), 2),
round(skewness(log_t), 2),
round(skewness(sqrt_t), 2))
hist(orig, breaks = 20, col = "steelblue", border = "white",
main = paste0(pred, ": Original (skew=", skews[1], ")"),
xlab = pred)
hist(log_t, breaks = 20, col = "forestgreen", border = "white",
main = paste0(pred, ": log(x+1) (skew=", skews[2], ")"),
xlab = paste("log(", pred, "+1)"))
hist(sqrt_t, breaks = 20, col = "darkorange", border = "white",
main = paste0(pred, ": sqrt(x) (skew=", skews[3], ")"),
xlab = paste("sqrt(", pred, ")"))
}
Because K, Ba, Ca, and Fe are highly right-skewed, transformations were explored. Log and square-root transformations reduce skewness by compressing large values. After transformation, the distributions become more symmetric, which can improve model training by reducing the influence of extreme observations.
par(mfrow = c(1,1))
Glass_pos <- Glass[, predictors]
Glass_pos$K <- Glass_pos$K + 0.001
Glass_pos$Ba <- Glass_pos$Ba + 0.001
Glass_pos$Fe <- Glass_pos$Fe + 0.001
Glass_pos$Mg <- Glass_pos$Mg + 0.001
pp_boxcox <- preProcess(Glass_pos, method = "BoxCox")
Glass_bc <- predict(pp_boxcox, Glass_pos)
skew_before <- round(sapply(Glass[, predictors], skewness), 3)
skew_after <- round(sapply(Glass_bc, skewness), 3)
skew_comparison <- data.frame(
Predictor = predictors,
Skew_Before = skew_before,
Skew_After = skew_after,
Improvement = round(abs(skew_before) - abs(skew_after), 3)
)
print(skew_comparison[order(-abs(skew_comparison$Skew_Before)), ])
## Predictor Skew_Before Skew_After Improvement
## K K 6.506 0.127 6.379
## Ba Ba 3.392 1.688 1.704
## Ca Ca 2.033 -0.195 1.838
## Fe Fe 1.742 0.743 0.999
## RI RI 1.614 1.577 0.037
## Mg Mg -1.144 -1.308 -0.164
## Al Al 0.901 0.092 0.809
## Si Si -0.725 -0.655 0.070
## Na Na 0.451 0.034 0.417
Box-Cox transformations show improvement in several predictors, suggesting the transformed predictors may improve classification performance. Most notably, K, Ca, Ba, and Fe showed the largest improvements, demonstrating the transformation's effectiveness for strongly right-skewed features.
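For reference, the estimated lambda values can be pulled out of the preProcess object; this sketch assumes caret's convention of storing the per-predictor BoxCoxTrans objects in the object's $bc element.
# Hedged sketch: inspect the Box-Cox lambda chosen for each predictor
# (NA means no suitable lambda was found).
sapply(pp_boxcox$bc, function(x) x$lambda)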
pp_yj <- preProcess(Glass[, predictors], method = "YeoJohnson")
Glass_yj <- predict(pp_yj, Glass[, predictors])
skew_yj <- round(sapply(Glass_yj, skewness), 3)
print(data.frame(Predictor = predictors, Skew_Raw = skew_before, Skew_YJ = skew_yj))
## Predictor Skew_Raw Skew_YJ
## RI RI 1.614 1.614
## Na Na 0.451 -0.009
## Mg Mg -1.144 -0.883
## Al Al 0.901 0.000
## Si Si -0.725 -0.725
## K K 6.506 -0.071
## Ca Ca 2.033 -0.208
## Ba Ba 3.392 3.392
## Fe Fe 1.742 1.742
The Yeo-Johnson transformation produced values similar to Box-Cox, but it handles zero and negative values directly, without the small offset added above. It almost completely corrected the skew for K, Ca, Al, and Na. RI, Si, Ba, and Fe were left unchanged, likely because no suitable lambda could be estimated for them.
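A natural next step, sketched here as an illustrative extension rather than part of the original analysis, is to chain Yeo-Johnson with centering, scaling, and a spatial-sign transform to blunt the outliers that remain in Ba and Fe; caret orders these operations automatically.
# Hedged sketch: combined preprocessing pipeline (not tuned).
pp_full <- preProcess(Glass[, predictors],
                      method = c("YeoJohnson", "center", "scale", "spatialSign"))
Glass_full <- predict(pp_full, Glass[, predictors])
round(sapply(Glass_full, skewness), 3)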
set.seed(231)
sigDist <- sigest(Type ~ ., data = Glass, frac = 1)
sigDist
## 90% 50% 10%
## 0.03407935 0.11297847 0.62767315
svmTuneGrid <- data.frame(
  sigma = as.vector(sigDist)[1],
  C = 2^(-2:10))
svmTuneGrid
## sigma C
## 1 0.03407935 0.25
## 2 0.03407935 0.50
## 3 0.03407935 1.00
## 4 0.03407935 2.00
## 5 0.03407935 4.00
## 6 0.03407935 8.00
## 7 0.03407935 16.00
## 8 0.03407935 32.00
## 9 0.03407935 64.00
## 10 0.03407935 128.00
## 11 0.03407935 256.00
## 12 0.03407935 512.00
## 13 0.03407935 1024.00
set.seed(1056)
svmFit <- train(Type ~ .,
data = Glass,
method = "svmRadial",
preProc = c("center", "scale"),
tuneGrid = svmTuneGrid,
trControl = trainControl(
method = "repeatedcv",
repeats = 5))
svmFit
## Support Vector Machines with Radial Basis Function Kernel
##
## 214 samples
## 9 predictor
## 6 classes: '1', '2', '3', '5', '6', '7'
##
## Pre-processing: centered (9), scaled (9)
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 194, 191, 194, 192, 193, 194, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 0.25 0.5462532 0.3286847
## 0.50 0.5721161 0.3689913
## 1.00 0.6364625 0.4706664
## 2.00 0.6832063 0.5506030
## 4.00 0.7028410 0.5801977
## 8.00 0.6980384 0.5744221
## 16.00 0.7015910 0.5810250
## 32.00 0.7014216 0.5834227
## 64.00 0.7121792 0.6028075
## 128.00 0.6985407 0.5863168
## 256.00 0.7029157 0.5932771
## 512.00 0.7084049 0.6017276
## 1024.00 0.7059784 0.5981677
##
## Tuning parameter 'sigma' was held constant at a value of 0.03407935
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.03407935 and C = 64.
svmFit$bestTune
## sigma C
## 9 0.03407935 64
max(svmFit$results$Accuracy)
## [1] 0.7121792
svmFit$results
## sigma C Accuracy Kappa AccuracySD KappaSD
## 1 0.03407935 0.25 0.5462532 0.3286847 0.06892510 0.1015915
## 2 0.03407935 0.50 0.5721161 0.3689913 0.10043982 0.1503148
## 3 0.03407935 1.00 0.6364625 0.4706664 0.09011066 0.1339674
## 4 0.03407935 2.00 0.6832063 0.5506030 0.08655001 0.1236686
## 5 0.03407935 4.00 0.7028410 0.5801977 0.08744430 0.1259908
## 6 0.03407935 8.00 0.6980384 0.5744221 0.08565013 0.1237030
## 7 0.03407935 16.00 0.7015910 0.5810250 0.08369320 0.1180196
## 8 0.03407935 32.00 0.7014216 0.5834227 0.08904774 0.1254737
## 9 0.03407935 64.00 0.7121792 0.6028075 0.09893126 0.1392120
## 10 0.03407935 128.00 0.6985407 0.5863168 0.09909836 0.1366574
## 11 0.03407935 256.00 0.7029157 0.5932771 0.10532758 0.1450300
## 12 0.03407935 512.00 0.7084049 0.6017276 0.10654373 0.1466388
## 13 0.03407935 1024.00 0.7059784 0.5981677 0.10801890 0.1494817
A small sigma combined with a fairly large cost (C = 64) gives the best cross-validated accuracy (about 0.712), although performance plateaus once C reaches roughly 4.
plot(svmFit, scales = list(x = list(log = 2)))
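As a sanity check before the resubstitution matrix below, confusionMatrix() applied directly to a train object aggregates the held-out fold predictions, giving a resampling-based view of per-class performance.
# Hedged sketch: cross-validated confusion matrix (cell percentages
# averaged over the repeated CV folds).
confusionMatrix(svmFit)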
pred_train <- predict(svmFit, Glass)
confusionMatrix(pred_train, Glass$Type)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 5 6 7
## 1 65 17 3 0 0 1
## 2 3 58 3 0 0 0
## 3 2 1 11 0 0 0
## 5 0 0 0 13 0 0
## 6 0 0 0 0 9 0
## 7 0 0 0 0 0 28
##
## Overall Statistics
##
## Accuracy : 0.8598
## 95% CI : (0.806, 0.9034)
## No Information Rate : 0.3551
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.809
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 5 Class: 6 Class: 7
## Sensitivity 0.9286 0.7632 0.64706 1.00000 1.00000 0.9655
## Specificity 0.8542 0.9565 0.98477 1.00000 1.00000 1.0000
## Pos Pred Value 0.7558 0.9062 0.78571 1.00000 1.00000 1.0000
## Neg Pred Value 0.9609 0.8800 0.97000 1.00000 1.00000 0.9946
## Prevalence 0.3271 0.3551 0.07944 0.06075 0.04206 0.1355
## Detection Rate 0.3037 0.2710 0.05140 0.06075 0.04206 0.1308
## Detection Prevalence 0.4019 0.2991 0.06542 0.06075 0.04206 0.1308
## Balanced Accuracy 0.8914 0.8598 0.81592 1.00000 1.00000 0.9828
The final SVM achieved a training-set accuracy of approximately 0.86, which is optimistic relative to the cross-validated estimate of about 0.71 because it is computed on the same data used for fitting. Class 3 has the lowest sensitivity, while classes 5 and 6 are classified perfectly and class 7 nearly so. Balanced accuracy is high across all classes, showing that the model handles the imbalanced classes fairly well.
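Since the matrix above is computed on the data the model was fit to, a held-out estimate would be more honest. The sketch below uses an 80/20 stratified split; the seed and split proportion are illustrative assumptions, not part of the original analysis.
# Hedged sketch: stratified holdout evaluation (seed and proportion
# are assumptions chosen for illustration).
set.seed(123)
train_idx <- createDataPartition(Glass$Type, p = 0.8, list = FALSE)
svmHold <- train(Type ~ ., data = Glass[train_idx, ],
                 method = "svmRadial",
                 preProc = c("center", "scale"),
                 tuneGrid = svmTuneGrid,
                 trControl = trainControl(method = "repeatedcv", repeats = 5))
confusionMatrix(predict(svmHold, Glass[-train_idx, ]), Glass$Type[-train_idx])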