Supervised Classification - Support Vector Machine

Libraries

library(dplyr)
library(e1071)
library(caret)
library(ggplot2)
library(gridExtra)

Data

Steel Plates Faults Data Set

A dataset of steel plate faults, classified into 7 types. It was collected to train machine-learning models for automatic pattern recognition.

The dataset consists of 27 features describing each fault (location, size, …) and 7 binary features indicating the type of fault (one of: Pastry, Z_Scratch, K_Scatch, Stains, Dirtiness, Bumps, Other_Faults). The last indicator is commonly used as a binary classification target ('common' fault vs. 'other' fault).

Import Data

steel <- read.csv("https://raw.githubusercontent.com/Nr5D/STA582/main/php9xWOpn.csv") 
str(steel)
## 'data.frame':    1941 obs. of  34 variables:
##  $ V1   : int  42 645 829 853 1289 430 413 190 330 74 ...
##  $ V2   : int  50 651 835 860 1306 441 446 200 343 90 ...
##  $ V3   : int  270900 2538079 1553913 369370 498078 100250 138468 210936 429227 779144 ...
##  $ V4   : int  270944 2538108 1553931 369415 498335 100337 138883 210956 429253 779308 ...
##  $ V5   : int  267 108 71 176 2409 630 9052 132 264 1506 ...
##  $ V6   : int  17 10 8 13 60 20 230 11 15 46 ...
##  $ V7   : int  44 30 19 45 260 87 432 20 26 167 ...
##  $ V8   : int  24220 11397 7972 18996 246930 62357 1481991 20007 29748 180215 ...
##  $ V9   : int  76 84 99 99 37 64 23 124 53 53 ...
##  $ V10  : int  108 123 125 126 126 127 199 172 148 143 ...
##  $ V11  : int  1687 1687 1623 1353 1353 1387 1687 1687 1687 1687 ...
##  $ V12  : int  1 1 1 0 0 0 0 0 0 0 ...
##  $ V13  : int  0 0 0 1 1 1 1 1 1 1 ...
##  $ V14  : int  80 80 100 290 185 40 150 150 150 150 ...
##  $ V15  : num  0.0498 0.7647 0.971 0.7287 0.0695 ...
##  $ V16  : num  0.241 0.379 0.343 0.441 0.449 ...
##  $ V17  : num  0.1818 0.2069 0.3333 0.1556 0.0662 ...
##  $ V18  : num  0.0047 0.0036 0.0037 0.0052 0.0126 0.0079 0.0196 0.0059 0.0077 0.0095 ...
##  $ V19  : num  0.471 0.6 0.75 0.538 0.283 ...
##  $ V20  : num  1 0.967 0.947 1 0.989 ...
##  $ V21  : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ V22  : num  2.43 2.03 1.85 2.25 3.38 ...
##  $ V23  : num  0.903 0.778 0.778 0.845 1.23 ...
##  $ V24  : num  1.64 1.46 1.26 1.65 2.41 ...
##  $ V25  : num  0.818 0.793 0.667 0.844 0.934 ...
##  $ V26  : num  -0.291 -0.176 -0.123 -0.157 -0.199 ...
##  $ V27  : num  0.582 0.298 0.215 0.521 1 ...
##  $ V28  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ V29  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ V30  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ V31  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ V32  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ V33  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Class: int  1 1 1 1 1 1 1 1 1 1 ...

Variable descriptions:

Code Description
V1 X_Minimum
V2 X_Maximum
V3 Y_Minimum
V4 Y_Maximum
V5 Pixels_Areas
V6 X_Perimeter
V7 Y_Perimeter
V8 Sum_of_Luminosity
V9 Minimum_of_Luminosity
V10 Maximum_of_Luminosity
V11 Length_of_Conveyer
V12 TypeOfSteel_A300
V13 TypeOfSteel_A400
V14 Steel_Plate_Thickness
V15 Edges_Index
V16 Empty_Index
V17 Square_Index
V18 Outside_X_Index
V19 Edges_X_Index
V20 Edges_Y_Index
V21 Outside_Global_Index
V22 LogOfAreas
V23 Log_X_Index
V24 Log_Y_Index
V25 Orientation_Index
V26 Luminosity_Index
V27 SigmoidOfAreas
V28 Pastry
V29 Z_Scratch
V30 K_Scatch
V31 Stains
V32 Dirtiness
V33 Bumps
Class Other_Faults
sapply(steel[,28:34],table)
##    V28  V29  V30  V31  V32  V33 Class
## 0 1783 1751 1550 1869 1886 1539  1268
## 1  158  190  391   72   55  402   673

The data provide 7 indicator variables describing the types of steel plate faults. In this guide, the response variable used for the exercise is Class. Note that in this file Class is coded 1/2 rather than 0/1 (the 1268/673 split in the table above corresponds to values 1 and 2), which is why "2" is treated as the positive class later on.

# drop the 6 individual fault indicators (V28-V33); convert the steel-type
# dummies (V12, V13) and the response Class to factors
steel <- steel %>% select(-c(28:33)) %>% mutate_at(c(12,13,28),factor)
head(steel)
##     V1   V2      V3      V4   V5 V6  V7     V8 V9 V10  V11 V12 V13 V14    V15
## 1   42   50  270900  270944  267 17  44  24220 76 108 1687   1   0  80 0.0498
## 2  645  651 2538079 2538108  108 10  30  11397 84 123 1687   1   0  80 0.7647
## 3  829  835 1553913 1553931   71  8  19   7972 99 125 1623   1   0 100 0.9710
## 4  853  860  369370  369415  176 13  45  18996 99 126 1353   0   1 290 0.7287
## 5 1289 1306  498078  498335 2409 60 260 246930 37 126 1353   0   1 185 0.0695
## 6  430  441  100250  100337  630 20  87  62357 64 127 1387   0   1  40 0.6200
##      V16    V17    V18    V19    V20 V21    V22    V23    V24    V25     V26
## 1 0.2415 0.1818 0.0047 0.4706 1.0000   1 2.4265 0.9031 1.6435 0.8182 -0.2913
## 2 0.3793 0.2069 0.0036 0.6000 0.9667   1 2.0334 0.7782 1.4624 0.7931 -0.1756
## 3 0.3426 0.3333 0.0037 0.7500 0.9474   1 1.8513 0.7782 1.2553 0.6667 -0.1228
## 4 0.4413 0.1556 0.0052 0.5385 1.0000   1 2.2455 0.8451 1.6532 0.8444 -0.1568
## 5 0.4486 0.0662 0.0126 0.2833 0.9885   1 3.3818 1.2305 2.4099 0.9338 -0.1992
## 6 0.3417 0.1264 0.0079 0.5500 1.0000   1 2.7993 1.0414 1.9395 0.8736 -0.2267
##      V27 Class
## 1 0.5822     1
## 2 0.2984     1
## 3 0.2150     1
## 4 0.5212     1
## 5 1.0000     1
## 6 0.9874     1
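
As a quick sanity check (a minimal sketch; output not shown), confirm that the fault indicators were dropped and that the response is now a two-level factor:

dim(steel)          # 1941 rows, 28 remaining columns
levels(steel$Class) # the binary target; should be "1" and "2"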

Training-Testing Split

set.seed(1016)
# stratified partition on Class: 30% of the rows go to the test set
idx <- createDataPartition(steel$Class, p=0.3, list=FALSE)
steeltrain <- steel[-idx,]
steeltest <- steel[idx,]
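
Because createDataPartition stratifies on the response, both partitions should keep roughly the same class proportions; a quick check (sketch, output omitted):

prop.table(table(steeltrain$Class))
prop.table(table(steeltest$Class))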

Data Exploration

p <- ggplot(steel,aes(x=V15,fill=Class)) + theme(legend.position="bottom")
p1 <- p + geom_density(alpha=.3)
p2 <- p + geom_boxplot()
grid.arrange(p1, p2, ncol=2)

p <- ggplot(steel,aes(x=V22,fill=Class)) + theme(legend.position="bottom")
p1 <- p + geom_density(alpha=.3)
p2 <- p + geom_boxplot()
grid.arrange(p1, p2, ncol=2)

p <- ggplot(steel,aes(x=V23,fill=Class)) + theme(legend.position="bottom")
p1 <- p + geom_density(alpha=.3)
p2 <- p + geom_boxplot()
grid.arrange(p1, p2, ncol=2)
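
The three blocks above repeat the same pattern. A small helper avoids the duplication (a sketch; the name explore and the variable shown are arbitrary choices):

explore <- function(var){
  # density plot and boxplot of one predictor, colored by Class
  p <- ggplot(steel, aes(x = .data[[var]], fill = Class)) +
    theme(legend.position = "bottom")
  grid.arrange(p + geom_density(alpha = .3), p + geom_boxplot(), ncol = 2)
}
explore("V25")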

SVM Illustration

Perfectly Separable

# Generate Random Data Set
set.seed(10)
x <- matrix(rnorm(20*2), ncol = 2)
y <- c(rep(-1,10), rep(1,10))
x[y==1,] <- x[y==1,] + 3/2
data1 <- data.frame(x=x, y=as.factor(y))

ggplot(data = data1, aes(x = x.2, y = x.1, color = y, shape = y)) + 
  geom_point(size = 2) + scale_color_manual(values=c(1,4)) +
  theme(legend.position = "none")

svmfit1 <- svm(y~., data = data1, kernel = "linear", scale = FALSE)
svmfit1
## 
## Call:
## svm(formula = y ~ ., data = data1, kernel = "linear", scale = FALSE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1 
## 
## Number of Support Vectors:  5
plot(svmfit1, data1, color.palette = rainbow, svSymbol=17, symbolPalette = c(1,4))
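
Since the kernel is linear and scale = FALSE, the separating hyperplane w.x - rho = 0 can be recovered from the fitted object and overlaid on the scatter plot (a sketch using the standard e1071 fields coefs, SV, and rho):

w <- t(svmfit1$coefs) %*% svmfit1$SV  # weight vector implied by the support vectors
b <- svmfit1$rho                      # decision boundary: w.x - b = 0
ggplot(data1, aes(x = x.2, y = x.1, color = y, shape = y)) +
  geom_point(size = 2) + scale_color_manual(values = c(1,4)) +
  geom_abline(intercept = b/w[1], slope = -w[2]/w[1]) +
  theme(legend.position = "none")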

Not Perfectly Separable

# Generate Random Data Set
x <- matrix(rnorm(20*2), ncol = 2)
y <- c(rep(-1,10), rep(1,10))
x[y==1,] <- x[y==1,] + 1
data2 <- data.frame(x=x, y=as.factor(y))

ggplot(data = data2, aes(x = x.2, y = x.1, color = y, shape = y)) + 
  geom_point(size = 2) + scale_color_manual(values=c(1,4)) +
  theme(legend.position = "none")

svmfit2 <- svm(y~., data = data2, kernel = "linear", cost = 10)
plot(svmfit2, data2, color.palette = rainbow, svSymbol=17, symbolPalette = c(1,4))
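
Here cost = 10 penalizes margin violations heavily. As a sketch of the trade-off, refitting with a much smaller cost widens the margin and typically recruits more support vectors:

svmfit2b <- svm(y~., data = data2, kernel = "linear", cost = 0.1)
summary(svmfit2b)  # usually reports more support vectors than with cost = 10
plot(svmfit2b, data2, color.palette = rainbow, svSymbol=17, symbolPalette = c(1,4))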

Non-Linear SVM

# Generate Larger random data set
x <- matrix(rnorm(200*2), ncol = 2)
x[1:100,] <- x[1:100,] + 2.5
x[101:150,] <- x[101:150,] - 2.5
y <- c(rep(1,150), rep(2,50))
data3 <- data.frame(x=x,y=as.factor(y))

ggplot(data = data3, aes(x = x.2, y = x.1, color = y, shape = y)) + 
  geom_point(size = 2) + scale_color_manual(values=c(1,4)) +
  theme(legend.position = "none")

svmfit3 <- svm(y~., data = data3, kernel = "radial", gamma = 1, cost = 1)
plot(svmfit3, data3, color.palette = rainbow, svSymbol=17, symbolPalette = c(1,4))
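
For the radial kernel, gamma and cost can be chosen by cross-validation with tune(), just as is done for the steel data below (a sketch; the grid values are arbitrary):

set.seed(10)
tune3 <- tune(svm, y~., data = data3, kernel = "radial",
              ranges = list(gamma = c(0.5, 1, 2), cost = c(0.1, 1, 10)))
tune3$best.parameters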

User-Defined Function

perform <- function(pred,data){
  # confusion matrix with class "2" (Other_Faults) as the positive class
  tabel <- caret::confusionMatrix(pred, data$Class, positive = "2")
  # keep accuracy, sensitivity, and specificity
  result <- c(tabel$overall[1],tabel$byClass[1:2])
  return(result)
}

Modeling

Model 1

model.svm1 <- svm(Class~.,data=steeltrain,kernel="linear")
pred.svm1 <- predict(model.svm1,steeltest)
perform(pred.svm1,steeltest)
##    Accuracy Sensitivity Specificity 
##   0.7444254   0.3910891   0.9317585

Model 2

model.svm2 <- svm(Class~.,data=steeltrain,kernel="polynomial",degree=1,cost=1)
pred.svm2 <- predict(model.svm2,steeltest)
perform(pred.svm2,steeltest)
##    Accuracy Sensitivity Specificity 
##   0.7461407   0.3811881   0.9396325

Model 3

model.svm3 <- svm(Class~.,data=steeltrain,kernel="polynomial",degree=2,cost=.1)
pred.svm3 <- predict(model.svm3,steeltest)
perform(pred.svm3,steeltest)
##    Accuracy Sensitivity Specificity 
##   0.7135506   0.1831683   0.9947507

Model 4

model.svm4 <- svm(Class~.,data=steeltrain,kernel="radial",gamma=0.01,cost=10)
pred.svm4 <- predict(model.svm4,steeltest)
perform(pred.svm4,steeltest)
##    Accuracy Sensitivity Specificity 
##   0.7735849   0.5148515   0.9107612

Model 5

model.svm5 <- svm(Class~.,data=steeltrain,kernel="radial",gamma=0.1,cost=5)
pred.svm5 <- predict(model.svm5,steeltest)
perform(pred.svm5,steeltest)
##    Accuracy Sensitivity Specificity 
##   0.8061750   0.6336634   0.8976378

Model 6

model.svm6 <- svm(Class~.,data=steeltrain,kernel="sigmoid",gamma=0.2,cost=0.01)
pred.svm6 <- predict(model.svm6,steeltest)
perform(pred.svm6,steeltest)
##    Accuracy Sensitivity Specificity 
##  0.66209262  0.02475248  1.00000000

Model Comparison

data.frame(model1=perform(pred.svm1,steeltest),
           model2=perform(pred.svm2,steeltest),
           model3=perform(pred.svm3,steeltest),
           model4=perform(pred.svm4,steeltest),
           model5=perform(pred.svm5,steeltest),
           model6=perform(pred.svm6,steeltest))
##                model1    model2    model3    model4    model5     model6
## Accuracy    0.7444254 0.7461407 0.7135506 0.7735849 0.8061750 0.66209262
## Sensitivity 0.3910891 0.3811881 0.1831683 0.5148515 0.6336634 0.02475248
## Specificity 0.9317585 0.9396325 0.9947507 0.9107612 0.8976378 1.00000000
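
Model 5 (radial kernel, gamma = 0.1, cost = 5) gives the best overall balance here. Model 6, with specificity 1 but sensitivity near 0, essentially predicts the majority class for almost every plate, so its accuracy is misleading.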

Hyperparameter Tuning

# tune() performs 10-fold cross-validation by default over the candidate kernels
tuningsvm <- tune(svm,Class~.,data=steeltrain,
                  ranges=list(kernel=c("linear","polynomial","radial","sigmoid")))
tuningsvm$best.model
## 
## Call:
## best.tune(method = svm, train.x = Class ~ ., data = steeltrain, ranges = list(kernel = c("linear", 
##     "polynomial", "radial", "sigmoid")))
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
## 
## Number of Support Vectors:  807
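The cross-validated error of every candidate is kept in the tune object and can be inspected directly (sketch, output omitted):

tuningsvm$performances  # CV error and dispersion for each kernel
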
tuningsvm <- tune(svm,Class~., data=steeltrain, kernel="radial",
                  ranges=list(cost = c(0.1,1,10,100)))
tuningsvm$best.model
## 
## Call:
## best.tune(method = svm, train.x = Class ~ ., data = steeltrain, ranges = list(cost = c(0.1, 
##     1, 10, 100)), kernel = "radial")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  10 
## 
## Number of Support Vectors:  697
pred.svm7 <- predict(tuningsvm$best.model,steeltest)
data.frame(model4=perform(pred.svm4,steeltest),
           model7=perform(pred.svm7,steeltest))
##                model4    model7
## Accuracy    0.7735849 0.8164666
## Sensitivity 0.5148515 0.6386139
## Specificity 0.9107612 0.9107612

Extending the SVM Material

Expanding the cost range

# cost ("biaya") grid from 0.01 to 1000 on a roughly logarithmic scale
biaya <- c(seq(0.01,0.1,by=0.01),seq(0.1,1,by=0.1),seq(1,10,by=1),seq(10,100,by=10),seq(100,1000,by=100))
tuningsvm <- tune(svm,Class~., data=steeltrain, kernel="radial",
                  ranges=list(cost = biaya))
tuningsvm$best.model
## 
## Call:
## best.tune(method = svm, train.x = Class ~ ., data = steeltrain, ranges = list(cost = biaya), 
##     kernel = "radial")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  20 
## 
## Number of Support Vectors:  680
pred.svm8 <- predict(tuningsvm$best.model,steeltest)
data.frame(model4=perform(pred.svm4,steeltest),
           model7=perform(pred.svm7,steeltest),
           model8=perform(pred.svm8,steeltest))
##                model4    model7    model8
## Accuracy    0.7735849 0.8164666 0.8044597
## Sensitivity 0.5148515 0.6386139 0.6336634
## Specificity 0.9107612 0.9107612 0.8950131
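
The wider grid selects cost = 20 by cross-validation, yet its test-set results (model 8) are slightly worse than those of the cost = 10 model (model 7): the CV-optimal setting is not guaranteed to be optimal on a single held-out test set.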
