Support Vector Machine (SVM) was introduced by Vapnik in 1992 as an efficient classification technique for nonlinear problems. SVM differs from the classification techniques of the 1980s, such as decision trees and artificial neural networks (ANN), which were conceptually less well defined and frequently got stuck in local optima. SVM rests on a much more mature and mathematically better-founded concept than the classification techniques that preceded the 1990s (Suyanto, 2018).

Reading the data

bank <- read.csv("UniversalBank.csv")

Structure of the data

str(bank)
## 'data.frame':    5000 obs. of  14 variables:
##  $ ID                : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Age               : int  25 45 39 35 35 37 53 50 35 34 ...
##  $ Experience        : int  1 19 15 9 8 13 27 24 10 9 ...
##  $ Income            : int  49 34 11 100 45 29 72 22 81 180 ...
##  $ ZIP.Code          : int  91107 90089 94720 94112 91330 92121 91711 93943 90089 93023 ...
##  $ Family            : int  4 3 1 1 4 4 2 1 3 1 ...
##  $ CCAvg             : num  1.6 1.5 1 2.7 1 0.4 1.5 0.3 0.6 8.9 ...
##  $ Education         : int  1 1 1 2 2 2 2 3 2 3 ...
##  $ Mortgage          : int  0 0 0 0 0 155 0 0 104 0 ...
##  $ Personal.Loan     : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ Securities.Account: int  1 1 0 0 0 0 0 0 0 0 ...
##  $ CD.Account        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Online            : int  0 0 0 0 0 1 1 0 1 0 ...
##  $ CreditCard        : int  0 0 0 0 1 0 0 1 0 0 ...

Descriptive statistics of the data

library(psych)
describe(bank)
##                    vars    n     mean      sd  median  trimmed     mad
## ID                    1 5000  2500.50 1443.52  2500.5  2500.50 1853.25
## Age                   2 5000    45.34   11.46    45.0    45.38   14.83
## Experience            3 5000    20.10   11.47    20.0    20.13   14.83
## Income                4 5000    73.77   46.03    64.0    68.83   43.00
## ZIP.Code              5 5000 93152.50 2121.85 93437.0 93236.13 1968.89
## Family                6 5000     2.40    1.15     2.0     2.37    1.48
## CCAvg                 7 5000     1.94    1.75     1.5     1.65    1.33
## Education             8 5000     1.88    0.84     2.0     1.85    1.48
## Mortgage              9 5000    56.50  101.71     0.0    32.98    0.00
## Personal.Loan        10 5000     0.10    0.29     0.0     0.00    0.00
## Securities.Account   11 5000     0.10    0.31     0.0     0.01    0.00
## CD.Account           12 5000     0.06    0.24     0.0     0.00    0.00
## Online               13 5000     0.60    0.49     1.0     0.62    0.00
## CreditCard           14 5000     0.29    0.46     0.0     0.24    0.00
##                     min   max range   skew kurtosis    se
## ID                    1  5000  4999   0.00    -1.20 20.41
## Age                  23    67    44  -0.03    -1.15  0.16
## Experience           -3    43    46  -0.03    -1.12  0.16
## Income                8   224   216   0.84    -0.05  0.65
## ZIP.Code           9307 96651 87344 -12.49   485.52 30.01
## Family                1     4     3   0.16    -1.40  0.02
## CCAvg                 0    10    10   1.60     2.64  0.02
## Education             1     3     2   0.23    -1.55  0.01
## Mortgage              0   635   635   2.10     4.75  1.44
## Personal.Loan         0     1     1   2.74     5.52  0.00
## Securities.Account    0     1     1   2.59     4.69  0.00
## CD.Account            0     1     1   3.69    11.61  0.00
## Online                0     1     1  -0.39    -1.84  0.01
## CreditCard            0     1     1   0.90    -1.18  0.01
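
The summary shows a minimum Experience of -3, which is implausible for a number of working years. A quick count of the affected rows (a minimal check using only base R):

# Rows with a negative (implausible) work experience
sum(bank$Experience < 0)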

Missing-data analysis

library(Amelia)
## Loading required package: Rcpp
## Warning: package 'Rcpp' was built under R version 3.5.3
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.5, built: 2018-05-07)
## ## Copyright (C) 2005-2020 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
missmap(bank, main="Missings Map", col=c("yellow", "black"), legend=FALSE)

# The data contains no missing values (NA)
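
As a numeric cross-check of the missingness map above (a small sketch using only base R):

# Number of missing values per column; all zeros confirms there are no NAs
colSums(is.na(bank))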

Tidying the data in preparation for the training and testing sets

bank <- bank[, -c(1, 5)]                       # drop ID and ZIP.Code
bank$CreditCard <- as.factor(bank$CreditCard)  # target variable as a factor
Under <- ifelse(bank$Education == 1, 1, 0)     # dummy: undergraduate education
Grad  <- ifelse(bank$Education == 2, 1, 0)     # dummy: graduate education
bank <- cbind(bank, Under, Grad)
bank_2 <- bank[, -6]                           # drop the original Education column
bank_2[-11] <- scale(bank_2[-11])              # standardize every predictor except CreditCard (column 11)
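
To verify that only the target was left unscaled (a small check; the column positions follow from the steps above):

# CreditCard should remain a two-level factor in column 11,
# while the scaled predictors should have means of roughly zero
str(bank_2$CreditCard)
round(colMeans(bank_2[-11]), 3)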

Creating the training and testing data

library(caTools)
set.seed(1111)
spl_1 <- sample.split(bank_2$CreditCard, SplitRatio = 0.7)  # stratified 70/30 split on CreditCard

Training <- subset(bank_2, spl_1 == TRUE)   # 70% of the rows for training
Testing  <- subset(bank_2, spl_1 == FALSE)  # remaining 30% for testing
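
Since sample.split stratifies on the label, the class proportions of CreditCard should be nearly identical in the two subsets; a quick check:

# Class balance of CreditCard in each subset
prop.table(table(Training$CreditCard))
prop.table(table(Testing$CreditCard))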

Building the SVM model

library(e1071)
# Linear-kernel SVM classifier for CreditCard, fitted on the training data
models <- svm(CreditCard ~ ., data = Training, scale = T, kernel = "linear", type = "C-classification")
summary(models)
## 
## Call:
## svm(formula = CreditCard ~ ., data = Training, kernel = "linear", 
##     type = "C-classification", scale = T)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1 
##       gamma:  0.08333333 
## 
## Number of Support Vectors:  2016
## 
##  ( 1067 949 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1
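
The model above keeps the default cost of 1. A cross-validated grid search with e1071's tune() could be used to check whether another cost value performs better; the grid below is only illustrative:

# 10-fold cross-validation (tune's default) over a small, illustrative grid of cost values
set.seed(1111)
tuned <- tune(svm, CreditCard ~ ., data = Training,
              kernel = "linear", type = "C-classification",
              ranges = list(cost = c(0.1, 1, 10)))
summary(tuned)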

Evaluating the model on the testing data

pred <- predict(models, Testing[-11])  # predict on the test predictors (column 11 is CreditCard)
tabel <- table(as.numeric(pred), as.numeric(Testing$CreditCard))  # as.numeric() recodes the 0/1 factor levels as 1/2
library(caret)
## Warning: package 'caret' was built under R version 3.5.3
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.3
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
confusionMatrix(tabel)
## Confusion Matrix and Statistics
## 
##    
##        1    2
##   1 2425  863
##   2   46  166
##                                           
##                Accuracy : 0.7403          
##                  95% CI : (0.7254, 0.7547)
##     No Information Rate : 0.706           
##     P-Value [Acc > NIR] : 3.651e-06       
##                                           
##                   Kappa : 0.1857          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9814          
##             Specificity : 0.1613          
##          Pos Pred Value : 0.7375          
##          Neg Pred Value : 0.7830          
##              Prevalence : 0.7060          
##          Detection Rate : 0.6929          
##    Detection Prevalence : 0.9394          
##       Balanced Accuracy : 0.5714          
##                                           
##        'Positive' Class : 1               
##
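
Because the predictions and labels were converted with as.numeric(), the classes appear as 1 and 2 in the table above (factor level 0 becomes 1, level 1 becomes 2). Passing the factors to confusionMatrix() directly keeps the original 0/1 labels; a small variant:

# Same confusion matrix, but with the original 0/1 factor labels
confusionMatrix(pred, Testing$CreditCard)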