Supervised Classification - Support Vector Machine
Library
library(dplyr)
library(e1071)
library(caret)
library(ggplot2)
library(gridExtra)
Data
Steel Plates Faults Data Set
A dataset of steel plates’ faults, classified into 7 different types. The goal was to train machine learning for automatic pattern recognition.
The dataset consists of 27 features describing each fault (location, size, …) and 7 binary features indicating the type of fault (one of 7: Pastry, Z_Scratch, K_Scatch, Stains, Dirtiness, Bumps, Other_Faults). The latter is commonly used as a binary classification target (‘common’ or ‘other’ fault).
Import Data
# Load the Steel Plates Faults data from the course repository (requires
# network access) and inspect the column types.
steel <- read.csv(
  "https://raw.githubusercontent.com/Nr5D/STA582/main/php9xWOpn.csv"
)
str(steel)
## 'data.frame': 1941 obs. of 34 variables:
## $ V1 : int 42 645 829 853 1289 430 413 190 330 74 ...
## $ V2 : int 50 651 835 860 1306 441 446 200 343 90 ...
## $ V3 : int 270900 2538079 1553913 369370 498078 100250 138468 210936 429227 779144 ...
## $ V4 : int 270944 2538108 1553931 369415 498335 100337 138883 210956 429253 779308 ...
## $ V5 : int 267 108 71 176 2409 630 9052 132 264 1506 ...
## $ V6 : int 17 10 8 13 60 20 230 11 15 46 ...
## $ V7 : int 44 30 19 45 260 87 432 20 26 167 ...
## $ V8 : int 24220 11397 7972 18996 246930 62357 1481991 20007 29748 180215 ...
## $ V9 : int 76 84 99 99 37 64 23 124 53 53 ...
## $ V10 : int 108 123 125 126 126 127 199 172 148 143 ...
## $ V11 : int 1687 1687 1623 1353 1353 1387 1687 1687 1687 1687 ...
## $ V12 : int 1 1 1 0 0 0 0 0 0 0 ...
## $ V13 : int 0 0 0 1 1 1 1 1 1 1 ...
## $ V14 : int 80 80 100 290 185 40 150 150 150 150 ...
## $ V15 : num 0.0498 0.7647 0.971 0.7287 0.0695 ...
## $ V16 : num 0.241 0.379 0.343 0.441 0.449 ...
## $ V17 : num 0.1818 0.2069 0.3333 0.1556 0.0662 ...
## $ V18 : num 0.0047 0.0036 0.0037 0.0052 0.0126 0.0079 0.0196 0.0059 0.0077 0.0095 ...
## $ V19 : num 0.471 0.6 0.75 0.538 0.283 ...
## $ V20 : num 1 0.967 0.947 1 0.989 ...
## $ V21 : num 1 1 1 1 1 1 1 1 1 1 ...
## $ V22 : num 2.43 2.03 1.85 2.25 3.38 ...
## $ V23 : num 0.903 0.778 0.778 0.845 1.23 ...
## $ V24 : num 1.64 1.46 1.26 1.65 2.41 ...
## $ V25 : num 0.818 0.793 0.667 0.844 0.934 ...
## $ V26 : num -0.291 -0.176 -0.123 -0.157 -0.199 ...
## $ V27 : num 0.582 0.298 0.215 0.521 1 ...
## $ V28 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ V29 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ V30 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ V31 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ V32 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ V33 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Class: int 1 1 1 1 1 1 1 1 1 1 ...
Keterangan Peubah :
Kode | Keterangan |
---|---|
V1 | X_Minimum |
V2 | X_Maximum |
V3 | Y_Minimum |
V4 | Y_Maximum |
V5 | Pixels_Areas |
V6 | X_Perimeter |
V7 | Y_Perimeter |
V8 | Sum_of_Luminosity |
V9 | Minimum_of_Luminosity |
V10 | Maximum_of_Luminosity |
V11 | Length_of_Conveyer |
V12 | TypeOfSteel_A300 |
V13 | TypeOfSteel_A400 |
V14 | Steel_Plate_Thickness |
V15 | Edges_Index |
V16 | Empty_Index |
V17 | Square_Index |
V18 | Outside_X_Index |
V19 | Edges_X_Index |
V20 | Edges_Y_Index |
V21 | Outside_Global_Index |
V22 | LogOfAreas |
V23 | Log_X_Index |
V24 | Log_Y_Index |
V25 | Orientation_Index |
V26 | Luminosity_Index |
V27 | SigmoidOfAreas |
V28 | Pastry |
V29 | Z_Scratch |
V30 | K_Scatch |
V31 | Stains |
V32 | Dirtiness |
V33 | Bumps |
Class | Other_Faults |
# Count the values of each fault-indicator column (columns 28 to 34).
# NOTE(review): sapply() simplifies the tables into one matrix, so the row
# labels presumably come from the first column (V28); confirm that they also
# describe the coding of Class.
steel[, 28:34] %>% sapply(table)
## V28 V29 V30 V31 V32 V33 Class
## 0 1783 1751 1550 1869 1886 1539 1268
## 1 158 190 391 72 55 402 673
Pada data tersebut, tersedia 7 peubah yang menggambarkan beberapa tipe dari “steel plates faults”. Pada panduan ini, peubah respon yang digunakan sebagai latihan adalah Class.
# Keep Class as the only response: drop the other six fault indicators
# (V28 to V33), then convert the two steel-type flags (V12, V13) and Class
# to factors. Columns are referenced by name rather than by position so the
# step does not silently change if the column order does.
steel <- steel %>%
  select(-(V28:V33)) %>%
  mutate(across(c(V12, V13, Class), factor))
head(steel)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15
## 1 42 50 270900 270944 267 17 44 24220 76 108 1687 1 0 80 0.0498
## 2 645 651 2538079 2538108 108 10 30 11397 84 123 1687 1 0 80 0.7647
## 3 829 835 1553913 1553931 71 8 19 7972 99 125 1623 1 0 100 0.9710
## 4 853 860 369370 369415 176 13 45 18996 99 126 1353 0 1 290 0.7287
## 5 1289 1306 498078 498335 2409 60 260 246930 37 126 1353 0 1 185 0.0695
## 6 430 441 100250 100337 630 20 87 62357 64 127 1387 0 1 40 0.6200
## V16 V17 V18 V19 V20 V21 V22 V23 V24 V25 V26
## 1 0.2415 0.1818 0.0047 0.4706 1.0000 1 2.4265 0.9031 1.6435 0.8182 -0.2913
## 2 0.3793 0.2069 0.0036 0.6000 0.9667 1 2.0334 0.7782 1.4624 0.7931 -0.1756
## 3 0.3426 0.3333 0.0037 0.7500 0.9474 1 1.8513 0.7782 1.2553 0.6667 -0.1228
## 4 0.4413 0.1556 0.0052 0.5385 1.0000 1 2.2455 0.8451 1.6532 0.8444 -0.1568
## 5 0.4486 0.0662 0.0126 0.2833 0.9885 1 3.3818 1.2305 2.4099 0.9338 -0.1992
## 6 0.3417 0.1264 0.0079 0.5500 1.0000 1 2.7993 1.0414 1.9395 0.8736 -0.2267
## V27 Class
## 1 0.5822 1
## 2 0.2984 1
## 3 0.2150 1
## 4 0.5212 1
## 5 1.0000 1
## 6 0.9874 1
Pembagian Training - Testing
# Split the data on Class with a fixed seed. The rows sampled by
# createDataPartition() (p = 0.3) form the test set; all remaining rows are
# used for training.
set.seed(1016)
idx <- createDataPartition(steel$Class, p = 0.3, list = FALSE)
steeltrain <- steel[-idx, ]
steeltest <- steel[idx, ]
Eksplorasi Data
# Density plot and box plot of V15 (Edges_Index) by Class, side by side.
p <- ggplot(steel, aes(x = V15, fill = Class)) +
  theme(legend.position = "bottom")
p1 <- p + geom_density(alpha = 0.3)
p2 <- p + geom_boxplot()
grid.arrange(p1, p2, ncol = 2)

# Same pair of plots for V22 (LogOfAreas).
p <- ggplot(steel, aes(x = V22, fill = Class)) +
  theme(legend.position = "bottom")
p1 <- p + geom_density(alpha = 0.3)
p2 <- p + geom_boxplot()
grid.arrange(p1, p2, ncol = 2)

# Same pair of plots for V23 (Log_X_Index).
p <- ggplot(steel, aes(x = V23, fill = Class)) +
  theme(legend.position = "bottom")
p1 <- p + geom_density(alpha = 0.3)
p2 <- p + geom_boxplot()
grid.arrange(p1, p2, ncol = 2)
Ilustrasi SVM
Terpisah Sempurna
# Generate a random two-class data set: 20 standard-normal points in two
# dimensions, with the second class (y == 1) shifted by 1.5 on both
# coordinates so the classes are intended to be separable.
set.seed(10)
x <- matrix(rnorm(20 * 2), ncol = 2)
y <- c(rep(-1, 10), rep(1, 10))
x[y == 1, ] <- x[y == 1, ] + 3 / 2
data1 <- data.frame(x = x, y = as.factor(y))
# Scatter plot of the simulated points, with colour and shape showing the
# class; the legend is hidden.
ggplot(data1, aes(x = x.2, y = x.1, colour = y, shape = y)) +
  geom_point(size = 2) +
  scale_colour_manual(values = c(1, 4)) +
  theme(legend.position = "none")
# Fit a linear-kernel SVM on the unscaled predictors and print its summary.
svmfit1 <- svm(y ~ ., data = data1, kernel = "linear", scale = FALSE)
svmfit1
##
## Call:
## svm(formula = y ~ ., data = data1, kernel = "linear", scale = FALSE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
##
## Number of Support Vectors: 5
# Plot the fitted classifier over the data; support vectors are drawn with
# plotting symbol 17.
plot(
  x = svmfit1,
  data = data1,
  svSymbol = 17,
  symbolPalette = c(1, 4),
  color.palette = rainbow
)
Tidak Terpisah Sempurna
# Generate a second random two-class data set. The second class is shifted
# by only 1 on both coordinates, so the classes are expected to overlap.
# No seed is set here: the draw continues the current random number stream.
x <- matrix(rnorm(20 * 2), ncol = 2)
y <- c(rep(-1, 10), rep(1, 10))
x[y == 1, ] <- x[y == 1, ] + 1
data2 <- data.frame(x = x, y = as.factor(y))
# Scatter plot of the second simulated data set, with colour and shape
# showing the class; the legend is hidden.
ggplot(data2, aes(x = x.2, y = x.1, colour = y, shape = y)) +
  geom_point(size = 2) +
  scale_colour_manual(values = c(1, 4)) +
  theme(legend.position = "none")
# Fit a linear-kernel SVM with cost = 10 on the second data set and plot the
# fitted classifier.
svmfit2 <- svm(y ~ ., data = data2, kernel = "linear", cost = 10)
plot(svmfit2, data2, color.palette = rainbow, svSymbol = 17,
     symbolPalette = c(1, 4))
Non-Linear SVM
# Generate a larger random data set with a non-linear class boundary:
# class 1 is made of two groups shifted by +2.5 (rows 1-100) and -2.5
# (rows 101-150); class 2 (rows 151-200) is left unshifted.
x <- matrix(rnorm(200 * 2), ncol = 2)
x[1:100, ] <- x[1:100, ] + 2.5
x[101:150, ] <- x[101:150, ] - 2.5
y <- c(rep(1, 150), rep(2, 50))
data3 <- data.frame(x = x, y = as.factor(y))
# Scatter plot of the larger simulated data set, with colour and shape
# showing the class; the legend is hidden.
ggplot(data3, aes(x = x.2, y = x.1, colour = y, shape = y)) +
  geom_point(size = 2) +
  scale_colour_manual(values = c(1, 4)) +
  theme(legend.position = "none")
# Fit a radial-kernel SVM (gamma = 1, cost = 1) on the larger data set and
# plot the fitted classifier.
svmfit3 <- svm(y ~ ., data = data3, kernel = "radial", gamma = 1, cost = 1)
plot(svmfit3, data3, color.palette = rainbow, svSymbol = 17,
     symbolPalette = c(1, 4))
User Defined Function
# Summarise classification performance for a vector of predictions.
#
# Arguments:
#   pred     Factor of predicted classes.
#   data     Data frame holding the observed classes in a column `Class`.
#   positive Level of `Class` treated as the positive class when computing
#            sensitivity and specificity. Defaults to "2", the value the
#            original function hard-coded.
#
# Returns a numeric vector with the first element of the confusion matrix's
# `overall` statistics followed by the first two `byClass` statistics
# (printed as Accuracy, Sensitivity and Specificity in this document).
perform <- function(pred, data, positive = "2") {
  tabel <- caret::confusionMatrix(pred, data$Class, positive = positive)
  c(tabel$overall[1], tabel$byClass[1:2])
}
Pemodelan
Model 1
# Model 1: linear-kernel SVM fitted on the training set and evaluated on the
# test set.
model.svm1 <- svm(Class ~ ., data = steeltrain, kernel = "linear")
pred.svm1 <- predict(model.svm1, steeltest)
perform(pred.svm1, steeltest)
## Accuracy Sensitivity Specificity
## 0.7444254 0.3910891 0.9317585
Model 2
# Model 2: polynomial-kernel SVM of degree 1 with cost = 1, evaluated on the
# test set.
model.svm2 <- svm(Class ~ ., data = steeltrain, kernel = "polynomial",
                  degree = 1, cost = 1)
pred.svm2 <- predict(model.svm2, steeltest)
perform(pred.svm2, steeltest)
## Accuracy Sensitivity Specificity
## 0.7461407 0.3811881 0.9396325
Model 3
# Model 3: polynomial-kernel SVM of degree 2 with cost = 0.1, evaluated on
# the test set.
model.svm3 <- svm(Class ~ ., data = steeltrain, kernel = "polynomial",
                  degree = 2, cost = 0.1)
pred.svm3 <- predict(model.svm3, steeltest)
perform(pred.svm3, steeltest)
## Accuracy Sensitivity Specificity
## 0.7135506 0.1831683 0.9947507
Model 4
# Model 4: radial-kernel SVM with gamma = 0.01 and cost = 10, evaluated on
# the test set.
model.svm4 <- svm(Class ~ ., data = steeltrain, kernel = "radial",
                  gamma = 0.01, cost = 10)
pred.svm4 <- predict(model.svm4, steeltest)
perform(pred.svm4, steeltest)
## Accuracy Sensitivity Specificity
## 0.7735849 0.5148515 0.9107612
Model 5
# Model 5: radial-kernel SVM with gamma = 0.1 and cost = 5, evaluated on the
# test set.
model.svm5 <- svm(Class ~ ., data = steeltrain, kernel = "radial",
                  gamma = 0.1, cost = 5)
pred.svm5 <- predict(model.svm5, steeltest)
perform(pred.svm5, steeltest)
## Accuracy Sensitivity Specificity
## 0.8061750 0.6336634 0.8976378
Model 6
# Model 6: sigmoid-kernel SVM with gamma = 0.2 and cost = 0.01. Only the
# test-set predictions are stored here; they are scored in the model
# comparison table.
model.svm6 <- svm(Class ~ ., data = steeltrain, kernel = "sigmoid",
                  gamma = 0.2, cost = 0.01)
pred.svm6 <- predict(model.svm6, steeltest)
Perbandingan Antar Model
# Score the six sets of test-set predictions with perform() and place the
# results side by side, one column per model.
do.call(
  data.frame,
  lapply(
    list(
      model1 = pred.svm1,
      model2 = pred.svm2,
      model3 = pred.svm3,
      model4 = pred.svm4,
      model5 = pred.svm5,
      model6 = pred.svm6
    ),
    function(pred) perform(pred, steeltest)
  )
)
## model1 model2 model3 model4 model5 model6
## Accuracy 0.7444254 0.7461407 0.7135506 0.7735849 0.8061750 0.66209262
## Sensitivity 0.3910891 0.3811881 0.1831683 0.5148515 0.6336634 0.02475248
## Specificity 0.9317585 0.9396325 0.9947507 0.9107612 0.8976378 1.00000000
Hyperparameter Tuning
# Tune over the kernel type with e1071::tune(), leaving the other SVM
# settings at their defaults, and show the best model found.
tuningsvm <- tune(
  svm, Class ~ .,
  data = steeltrain,
  ranges = list(kernel = c("linear", "polynomial", "radial", "sigmoid"))
)
tuningsvm$best.model
##
## Call:
## best.tune(method = svm, train.x = Class ~ ., data = steeltrain, ranges = list(kernel = c("linear",
## "polynomial", "radial", "sigmoid")))
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 807
# With the radial kernel fixed, tune the cost over a coarse grid and show
# the best model found.
tuningsvm <- tune(
  svm, Class ~ .,
  data = steeltrain,
  kernel = "radial",
  ranges = list(cost = c(0.1, 1, 10, 100))
)
tuningsvm$best.model
##
## Call:
## best.tune(method = svm, train.x = Class ~ ., data = steeltrain, ranges = list(cost = c(0.1,
## 1, 10, 100)), kernel = "radial")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 10
##
## Number of Support Vectors: 697
# Model 7: predictions from the best tuned model, compared with model 4 on
# the test set.
pred.svm7 <- predict(tuningsvm$best.model, steeltest)
data.frame(model4 = perform(pred.svm4, steeltest),
           model7 = perform(pred.svm7, steeltest))
## model4 model7
## Accuracy 0.7735849 0.8164666
## Sensitivity 0.5148515 0.6386139
## Specificity 0.9107612 0.9107612
Pengembangan Materi SVM
Memperbesar range cost
# Build a wider cost grid running from 0.01 to 1000 (linear steps within
# each decade), then tune the radial-kernel SVM over it and show the best
# model found.
biaya <- c(
  seq(0.01, 0.1, by = 0.01),
  seq(0.1, 1, by = 0.1),
  seq(1, 10, by = 1),
  seq(10, 100, by = 10),
  seq(100, 1000, by = 100)
)
tuningsvm <- tune(
  svm, Class ~ .,
  data = steeltrain,
  kernel = "radial",
  ranges = list(cost = biaya)
)
tuningsvm$best.model
##
## Call:
## best.tune(method = svm, train.x = Class ~ ., data = steeltrain, ranges = list(cost = biaya),
## kernel = "radial")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 20
##
## Number of Support Vectors: 680
# Model 8: predictions from the model tuned over the wider cost grid,
# compared with models 4 and 7 on the test set.
pred.svm8 <- predict(tuningsvm$best.model, steeltest)
data.frame(model4 = perform(pred.svm4, steeltest),
           model7 = perform(pred.svm7, steeltest),
           model8 = perform(pred.svm8, steeltest))
## model4 model7 model8
## Accuracy 0.7735849 0.8164666 0.8044597
## Sensitivity 0.5148515 0.6386139 0.6336634
## Specificity 0.9107612 0.9107612 0.8950131
Badan Informasi Geospasial, abdul.aziz@big.go.id↩︎