Naive Bayes & SVM

Libraries

The libraries used in this practicum are:

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble  3.0.4     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## v purrr   0.3.4
## Warning: package 'tibble' was built under R version 4.0.3
## Warning: package 'tidyr' was built under R version 4.0.3
## Warning: package 'dplyr' was built under R version 4.0.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x purrr::lift()   masks caret::lift()
library(knitr)
library(ggplot2)
library(tidyr)
library(e1071)
library(ROCR)
## Warning: package 'ROCR' was built under R version 4.0.3
library(rpart)
library(DMwR)
## Warning: package 'DMwR' was built under R version 4.0.3
## Loading required package: grid
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(imager)
## Warning: package 'imager' was built under R version 4.0.5
## Loading required package: magrittr
## Warning: package 'magrittr' was built under R version 4.0.3
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
## 
## Attaching package: 'imager'
## The following object is masked from 'package:magrittr':
## 
##     add
## The following object is masked from 'package:grid':
## 
##     depth
## The following object is masked from 'package:stringr':
## 
##     boundary
## The following object is masked from 'package:tidyr':
## 
##     fill
## The following objects are masked from 'package:stats':
## 
##     convolve, spectrum
## The following object is masked from 'package:graphics':
## 
##     frame
## The following object is masked from 'package:base':
## 
##     save.image

Dataset

The data used in this practicum come from Subs to be reload.csv.

Data_reload <- read.csv("Subs to be reload.csv", header = T)
str(Data_reload)
## 'data.frame':    1000000 obs. of  27 variables:
##  $ ï..msisdn         : num  6.28e+10 6.28e+10 6.29e+11 6.29e+11 6.29e+11 ...
##  $ current_tier      : chr  "Platinum" "Gold" "Gold" "Platinum" ...
##  $ available_points  : num  6210 11250 3475 9200 4925 ...
##  $ vlr_attached_p3d  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ flag_arpu_90d     : chr  "4. hvc" "4. hvc" "4. hvc" "4. hvc" ...
##  $ flag_arpu_last_30d: chr  "1. nvc" "4. hvc" "4. hvc" "3. mvc" ...
##  $ tenure_rgu        : int  619 178 1307 1307 982 1307 1307 1307 684 1219 ...
##  $ rgu_flag          : chr  "RGU 30D" "RGU 30D" "RGU 30D" "RGU 30D" ...
##  $ rld_30d           : int  0 1 1 1 0 1 0 1 1 1 ...
##  $ rld_60d           : int  0 1 1 1 0 1 1 1 1 1 ...
##  $ rld_90d           : int  1 1 1 1 1 1 0 1 1 1 ...
##  $ rld_tot_30d       : int  0 200000 170000 100000 0 50000 0 100000 62000 30000 ...
##  $ rld_tot_60d       : int  0 100000 195000 100000 0 80000 10000 100000 62000 30000 ...
##  $ rld_tot_90d       : int  12000 100000 105000 125000 112000 50000 0 100000 40000 30000 ...
##  $ reload_p90d       : int  12000 400000 470000 325000 112000 180000 10000 300000 164000 90000 ...
##  $ tot_month_rld     : int  1 3 3 3 1 3 1 3 3 3 ...
##  $ denom_30d         : int  0 100000 50000 100000 0 50000 0 100000 15000 30000 ...
##  $ denom_60d         : int  0 100000 60000 50000 0 50000 10000 100000 30000 30000 ...
##  $ denom_90d         : int  12000 100000 50000 100000 100000 50000 0 100000 10000 30000 ...
##  $ curr_balance      : num  250 0 300 0 175 ...
##  $ active_pack       : int  0 1 1 1 1 1 1 1 0 1 ...
##  $ status            : chr  "ACTIVE" "ACTIVE" "ACTIVE" "ACTIVE" ...
##  $ n_days            : int  54 346 171 346 299 348 43 176 359 136 ...
##  $ arpu_rld          : num  12000 133333 156667 108333 112000 ...
##  $ cust_flag         : chr  "Moderate" "Healthy" "Healthy" "Healthy" ...
##  $ rld_nm            : int  0 1 1 1 0 1 0 1 1 0 ...
##  $ rld_tot           : int  0 200000 150000 110000 0 100000 0 100000 50000 0 ...
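The "ï.." prefix on the msisdn column is a UTF-8 byte-order-mark artifact from read.csv. Assuming the file was saved with a BOM, a minimal fix is to re-read it with an explicit encoding:

# Re-read with a BOM-aware encoding so the first column is named
# "msisdn" rather than "ï..msisdn"
Data_reload <- read.csv("Subs to be reload.csv", header = TRUE,
                        fileEncoding = "UTF-8-BOM")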

Data Preprocessing

For this practicum, the data are restricted to subscribers with tenure_rgu < 1000, available_points > 14000, and a current_tier of Gold or Silver.

Data_reload <- Data_reload %>%
  filter(tenure_rgu < 1000) %>%
  filter(available_points > 14000) %>%
  filter(current_tier == "Gold" | current_tier == "Silver") %>%
  na.omit()
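As a quick sanity check (a sketch; the counts depend on the data), confirm what remains after filtering:

# Rows and columns remaining, and the tier breakdown, after filtering
dim(Data_reload)
table(Data_reload$current_tier)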

Variables

The naive Bayes method requires a categorical response, so we use the variable rld_nm as the response (converted to a factor) and the numeric columns below as predictors.

olahdis <- Data_reload %>%
  select(rld_nm, available_points, tenure_rgu, rld_tot_30d, rld_tot_60d,
         rld_tot_90d, reload_p90d, denom_30d, denom_60d, denom_90d,
         curr_balance, n_days, arpu_rld)
olahdis$rld_nm <- as.factor(olahdis$rld_nm)
olahdis_x <- olahdis %>% select(-rld_nm)
olahdis_x_standardize <- as.data.frame(scale(olahdis_x))
olahdis_x_standardize$rld_nm <- olahdis$rld_nm
head(olahdis_x_standardize)
##   available_points tenure_rgu rld_tot_30d rld_tot_60d rld_tot_90d reload_p90d
## 1        2.6004811  1.9196572   1.3460775  2.14810667   2.4888049  2.58934638
## 2        0.1321487 -1.2571564   0.2837051  2.14810667   0.4913592  1.23883626
## 3        0.7702555 -1.5238519   3.3123467 -0.03200377  -1.1668976  1.22672406
## 4       -0.9034608  1.3784223  -0.4147607 -0.14100929   0.7174851  0.01550421
## 5       -0.3467233  0.6332438   0.4656584  0.65836454  -0.8277087  0.18204694
## 6        0.3657449 -0.8531912  -0.1975906 -0.21367964  -0.4131445 -0.35694589
##    denom_30d  denom_60d  denom_90d curr_balance    n_days    arpu_rld rld_nm
## 1  0.3475460  0.5799248  0.6118686   -0.2048690 0.5719785  2.36849283      1
## 2 -0.4827043 -0.1669859 -0.6921495   -0.1364453 0.5828688  1.07544619      1
## 3 -0.2059542 -0.1669859 -0.9222703   -0.1869822 0.5719785  2.34674877      0
## 4  0.3475460 -0.1669859  0.6118686   -0.1969450 0.5066365 -0.09583372      0
## 5 -0.2059542  0.5799248 -0.5387356   -0.1783497 0.5828688  0.06362270      0
## 6  0.3475460  0.5799248  0.6118686   -0.1344499 0.2888299 -0.45243627      0
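Before modeling, it is worth checking the class balance of the response, since the SVM fits below rely on class weights. A minimal check (output omitted):

# Class counts and proportions for the response; class "1" is the
# minority here, which motivates the class.weights used later
table(olahdis_x_standardize$rld_nm)
prop.table(table(olahdis_x_standardize$rld_nm))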

Data Splitting

set.seed(10)
split <- createDataPartition(olahdis_x_standardize$rld_nm, p = 0.7, list = FALSE)
train_set <- olahdis_x_standardize[split, ]
test_set <- olahdis_x_standardize[-split, ]
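createDataPartition() stratifies on the response, so the train and test sets should retain roughly the same class proportions. This can be verified with:

# Class proportions in each split; both should be close to the full data
prop.table(table(train_set$rld_nm))
prop.table(table(test_set$rld_nm))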

SVM

Support Vector Machine Modeling: Linear Kernel

model.svm1 <- svm(rld_nm ~ ., data = train_set, kernel = "linear", scale = TRUE)
model.svm1
## 
## Call:
## svm(formula = rld_nm ~ ., data = train_set, kernel = "linear", scale = TRUE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1 
## 
## Number of Support Vectors:  334
pred.svm1 <- predict(model.svm1,test_set)
tabel <- caret::confusionMatrix(pred.svm1, as.factor(test_set$rld_nm), positive = "1")
tabel
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 179  63
##          1   0   0
##                                           
##                Accuracy : 0.7397          
##                  95% CI : (0.6796, 0.7938)
##     No Information Rate : 0.7397          
##     P-Value [Acc > NIR] : 0.5338          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : 5.662e-15       
##                                           
##             Sensitivity : 0.0000          
##             Specificity : 1.0000          
##          Pos Pred Value :    NaN          
##          Neg Pred Value : 0.7397          
##              Prevalence : 0.2603          
##          Detection Rate : 0.0000          
##    Detection Prevalence : 0.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : 1               
## 
With equal class weights, the linear SVM labels every test observation as class 0: its 0.7397 accuracy simply matches the no-information rate, and sensitivity is 0. Since class 1 is the minority (prevalence 0.26), we refit with class weights that penalize errors on class 1 more heavily.

model.svm1_balance <- svm(rld_nm ~ ., data = train_set, kernel = "linear",
                          class.weights = c('0' = 0.3, '1' = 0.7), scale = TRUE)
model.svm1_balance
## 
## Call:
## svm(formula = rld_nm ~ ., data = train_set, kernel = "linear", class.weights = c(`0` = 0.3, 
##     `1` = 0.7), scale = TRUE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1 
## 
## Number of Support Vectors:  420
pred.svm1_balance <- predict(model.svm1_balance,test_set)
tabel <- caret::confusionMatrix(pred.svm1_balance, as.factor(test_set$rld_nm), positive = "1")
tabel
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 133  27
##          1  46  36
##                                           
##                Accuracy : 0.6983          
##                  95% CI : (0.6363, 0.7555)
##     No Information Rate : 0.7397          
##     P-Value [Acc > NIR] : 0.93622         
##                                           
##                   Kappa : 0.2865          
##                                           
##  Mcnemar's Test P-Value : 0.03514         
##                                           
##             Sensitivity : 0.5714          
##             Specificity : 0.7430          
##          Pos Pred Value : 0.4390          
##          Neg Pred Value : 0.8313          
##              Prevalence : 0.2603          
##          Detection Rate : 0.1488          
##    Detection Prevalence : 0.3388          
##       Balanced Accuracy : 0.6572          
##                                           
##        'Positive' Class : 1               
## 

SVM Model: Sigmoid Kernel

The class weights raise sensitivity from 0 to 0.57 at some cost in overall accuracy. The remaining kernels are fitted with the same weights, starting with the sigmoid.

model.svm2 <- svm(rld_nm ~ ., data = train_set, kernel = "sigmoid",
                  class.weights = c('0' = 0.3, '1' = 0.7), scale = TRUE)
model.svm2
## 
## Call:
## svm(formula = rld_nm ~ ., data = train_set, kernel = "sigmoid", class.weights = c(`0` = 0.3, 
##     `1` = 0.7), scale = TRUE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  sigmoid 
##        cost:  1 
##      coef.0:  0 
## 
## Number of Support Vectors:  337
pred.svm2 <- predict(model.svm2,test_set)
tabel2 <- caret::confusionMatrix(pred.svm2, as.factor(test_set$rld_nm), positive = "1")
tabel2
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 119  26
##          1  60  37
##                                           
##                Accuracy : 0.6446          
##                  95% CI : (0.5808, 0.7049)
##     No Information Rate : 0.7397          
##     P-Value [Acc > NIR] : 0.999578        
##                                           
##                   Kappa : 0.2146          
##                                           
##  Mcnemar's Test P-Value : 0.000373        
##                                           
##             Sensitivity : 0.5873          
##             Specificity : 0.6648          
##          Pos Pred Value : 0.3814          
##          Neg Pred Value : 0.8207          
##              Prevalence : 0.2603          
##          Detection Rate : 0.1529          
##    Detection Prevalence : 0.4008          
##       Balanced Accuracy : 0.6261          
##                                           
##        'Positive' Class : 1               
## 

SVM Model: Radial Kernel

model.svm3 <- svm(rld_nm ~ ., data = train_set, kernel = "radial",
                  class.weights = c('0' = 0.3, '1' = 0.7), scale = TRUE)
model.svm3
## 
## Call:
## svm(formula = rld_nm ~ ., data = train_set, kernel = "radial", class.weights = c(`0` = 0.3, 
##     `1` = 0.7), scale = TRUE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
## 
## Number of Support Vectors:  436
pred.svm3<- predict(model.svm3,test_set)
tabel3 <- caret::confusionMatrix(pred.svm3, as.factor(test_set$rld_nm), positive = "1")
tabel3
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 134  25
##          1  45  38
##                                          
##                Accuracy : 0.7107         
##                  95% CI : (0.6492, 0.767)
##     No Information Rate : 0.7397         
##     P-Value [Acc > NIR] : 0.86360        
##                                          
##                   Kappa : 0.319          
##                                          
##  Mcnemar's Test P-Value : 0.02315        
##                                          
##             Sensitivity : 0.6032         
##             Specificity : 0.7486         
##          Pos Pred Value : 0.4578         
##          Neg Pred Value : 0.8428         
##              Prevalence : 0.2603         
##          Detection Rate : 0.1570         
##    Detection Prevalence : 0.3430         
##       Balanced Accuracy : 0.6759         
##                                          
##        'Positive' Class : 1              
## 

SVM Model: Polynomial Kernel

model.svm4 <- svm(rld_nm ~ ., data = train_set, kernel = "polynomial",
                  class.weights = c('0' = 0.3, '1' = 0.7), scale = TRUE)
model.svm4
## 
## Call:
## svm(formula = rld_nm ~ ., data = train_set, kernel = "polynomial", 
##     class.weights = c(`0` = 0.3, `1` = 0.7), scale = TRUE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  polynomial 
##        cost:  1 
##      degree:  3 
##      coef.0:  0 
## 
## Number of Support Vectors:  465
pred.svm4<- predict(model.svm4,test_set)
tabel4 <- caret::confusionMatrix(pred.svm4, as.factor(test_set$rld_nm), positive = "1")
tabel4
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 173  61
##          1   6   2
##                                           
##                Accuracy : 0.7231          
##                  95% CI : (0.6622, 0.7785)
##     No Information Rate : 0.7397          
##     P-Value [Acc > NIR] : 0.7472          
##                                           
##                   Kappa : -0.0025         
##                                           
##  Mcnemar's Test P-Value : 4.191e-11       
##                                           
##             Sensitivity : 0.031746        
##             Specificity : 0.966480        
##          Pos Pred Value : 0.250000        
##          Neg Pred Value : 0.739316        
##              Prevalence : 0.260331        
##          Detection Rate : 0.008264        
##    Detection Prevalence : 0.033058        
##       Balanced Accuracy : 0.499113        
##                                           
##        'Positive' Class : 1               
## 
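To compare the four kernels side by side, the key metrics can be pulled from the confusion-matrix objects above. A sketch (tabel holds the weighted linear results at this point):

# Accuracy and sensitivity for each weighted-kernel model
kernel_comp <- data.frame(
  kernel = c("linear", "sigmoid", "radial", "polynomial"),
  accuracy = c(tabel$overall[["Accuracy"]], tabel2$overall[["Accuracy"]],
               tabel3$overall[["Accuracy"]], tabel4$overall[["Accuracy"]]),
  sensitivity = c(tabel$byClass[["Sensitivity"]], tabel2$byClass[["Sensitivity"]],
                  tabel3$byClass[["Sensitivity"]], tabel4$byClass[["Sensitivity"]]))
kernel_comp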

Example SVM Output

The plot below shows the decision regions of the sigmoid-kernel model (model.svm2), projected onto the available_points and tenure_rgu axes.

plot(model.svm2, train_set, available_points ~ tenure_rgu)

Tuning SVM Hyperparameters

tune() evaluates each candidate by cross-validation (10-fold by default); here only the kernel choice is varied.

tuningsvm <- tune(svm, rld_nm ~ ., data = train_set,
                  ranges = list(kernel = c("radial", "linear", "polynomial", "sigmoid")))
tuningsvm$best.model
## 
## Call:
## best.tune(method = svm, train.x = rld_nm ~ ., data = train_set, ranges = list(kernel = c("radial", 
##     "linear", "polynomial", "sigmoid")))
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  polynomial 
##        cost:  1 
##      degree:  3 
##      coef.0:  0 
## 
## Number of Support Vectors:  314
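The search can be extended to cost and gamma as well. A sketch with illustrative (not pre-validated) values:

# Jointly tune kernel, cost, and gamma; gamma is ignored by the
# linear kernel but tune() still evaluates every combination
tuningsvm2 <- tune(svm, rld_nm ~ ., data = train_set,
                   ranges = list(kernel = c("radial", "linear"),
                                 cost = c(0.1, 1, 10),
                                 gamma = c(0.01, 0.1, 1)))
summary(tuningsvm2)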

Naive Bayes Modeling

Note that threshold is an argument of predict.naiveBayes() rather than of naiveBayes() itself, so it is most likely ignored in the call below and the model is effectively fitted with default settings.

model.nb <- naiveBayes(rld_nm ~ ., data = train_set, threshold = 0.4)
pred.nb <- predict(model.nb,test_set)
tabel <- caret::confusionMatrix(pred.nb, as.factor(test_set$rld_nm), positive = "1")
tabel
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 160  48
##          1  19  15
##                                           
##                Accuracy : 0.7231          
##                  95% CI : (0.6622, 0.7785)
##     No Information Rate : 0.7397          
##     P-Value [Acc > NIR] : 0.7472087       
##                                           
##                   Kappa : 0.1551          
##                                           
##  Mcnemar's Test P-Value : 0.0006245       
##                                           
##             Sensitivity : 0.23810         
##             Specificity : 0.89385         
##          Pos Pred Value : 0.44118         
##          Neg Pred Value : 0.76923         
##              Prevalence : 0.26033         
##          Detection Rate : 0.06198         
##    Detection Prevalence : 0.14050         
##       Balanced Accuracy : 0.56597         
##                                           
##        'Positive' Class : 1               
## 
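By default predict() assigns the class with the highest posterior probability. A sketch of retrieving the raw posteriors and applying a lower, illustrative cutoff of 0.4 on the positive class:

# Posterior class probabilities; columns are named after the factor levels
prob.nb <- predict(model.nb, test_set, type = "raw")
head(prob.nb)

# Reclassify with a 0.4 cutoff to trade specificity for sensitivity
pred.nb.04 <- factor(ifelse(prob.nb[, "1"] > 0.4, 1, 0), levels = c(0, 1))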


Model Comparison

To compare SVM, naive Bayes, and a classification tree, we repeat the 70/30 split 100 times and record each method's test-set accuracy.

olahdis1 <- Data_reload %>%
  select(rld_nm, available_points, tenure_rgu, rld_30d, rld_60d, rld_90d,
         curr_balance, n_days, arpu_rld)
olahdis1$rld_nm <- as.factor(olahdis1$rld_nm)

perulangan <- 100

df_akurasi <- data.frame("akurasi_svm" = numeric(), "akurasi_nb" = numeric(), "akurasi_pc" = numeric())

SVM_list <- vector(mode="list", length = perulangan)
NB_list<- vector(mode="list", length = perulangan)
PC_list<-vector(mode="list", length = perulangan)

for (i in 1:perulangan){

  split <- createDataPartition(olahdis1$rld_nm, p = 0.7, list = FALSE)
  train_set1 <- olahdis1[split, ]
  test_set1 <- olahdis1[-split, ]
  train_set1_pc <- train_set1 %>% mutate(bobot = ifelse(rld_nm == "0", 0.3, 0.7))
  test_set1_pc <- test_set1 %>% mutate(bobot = ifelse(rld_nm == "0", 0.3, 0.7))

  # Support Vector Machine model (radial kernel, weighted classes)
  model.svm4 <- svm(rld_nm ~ ., data = train_set1, kernel = "radial",
                    class.weights = c('0' = 0.3, '1' = 0.7))
  SVM_list[[i]] <- model.svm4

  # Naive Bayes model
  model.nb <- naiveBayes(rld_nm ~ ., data = train_set1)
  NB_list[[i]] <- model.nb

  # Classification tree; 'weights = bobot' is evaluated inside
  # train_set1_pc, where the bobot column actually lives
  pohon_classif1 <- rpart(rld_nm ~ available_points + tenure_rgu + rld_30d +
                            rld_60d + rld_90d + curr_balance + n_days + arpu_rld,
                          data = train_set1_pc, method = "class",
                          control = rpart.control(cp = 0, minsplit = 28),
                          weights = bobot)
  PC_list[[i]] <- pohon_classif1

  # SVM accuracy
  pred.svm4 <- predict(model.svm4, test_set1)
  tabel4 <- caret::confusionMatrix(pred.svm4, as.factor(test_set1$rld_nm), positive = "1")
  akurasi_svm <- tabel4$overall[["Accuracy"]]

  # Naive Bayes accuracy
  pred.nb <- predict(model.nb, test_set1)
  tabel <- caret::confusionMatrix(pred.nb, as.factor(test_set1$rld_nm), positive = "1")
  akurasi_nb <- tabel$overall[["Accuracy"]]

  # Classification tree accuracy: probability of class "1" with a 0.5 cutoff
  prob.prediksi.02 <- predict(pohon_classif1, newdata = test_set1_pc)
  prediksi.02 <- factor(ifelse(prob.prediksi.02[, 2] > 0.5, 1, 0),
                        levels = 0:1, labels = c("0", "1"))
  tabel_pc <- caret::confusionMatrix(prediksi.02, test_set1_pc$rld_nm)
  akurasi_pc <- tabel_pc$overall[["Accuracy"]]

  # Append this replicate's accuracies and report progress
  df_akurasi <- rbind(df_akurasi, c(akurasi_svm, akurasi_nb, akurasi_pc))
  cat("replicate", i, "done\n")
}

colnames(df_akurasi) <- c("SVM", "Naive Bayes", "Tree")
df_akurasi %>%
  as_tibble() %>%
  mutate(ulangan = 1:perulangan) %>%
  pivot_longer(-ulangan) %>%
  ggplot(aes(name, value)) +
  geom_boxplot() + xlab("Method") + ylab("Accuracy")
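A numeric summary of the replicates complements the boxplot. A minimal sketch:

# Mean and standard deviation of test accuracy per method
df_akurasi %>%
  pivot_longer(everything(), names_to = "method", values_to = "accuracy") %>%
  group_by(method) %>%
  summarise(mean_acc = mean(accuracy), sd_acc = sd(accuracy))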