Naive Bayes & SVM
Library
Several libraries are used in this practicum:
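The original knitted output showed only the package startup messages; the library() calls below are reconstructed from them. e1071 and rpart produce no startup output but are assumed here, since svm(), naiveBayes(), tune(), and rpart() are called later.
library(caret)     # also attaches lattice and ggplot2
library(tidyverse)
library(ROCR)
library(DMwR)      # also attaches grid
library(imager)    # also attaches magrittr
library(e1071)     # svm(), naiveBayes(), tune()
library(rpart)     # classification trees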
Dataset
The data used come from the file Subs to be reload.csv.
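The loading code is not shown in the output; a minimal sketch, assuming the file is in the working directory (the ï..msisdn column name suggests read.csv on a file with a UTF-8 byte-order mark):
Data_reload <- read.csv("Subs to be reload.csv")
str(Data_reload)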
## 'data.frame': 1000000 obs. of 27 variables:
## $ ï..msisdn : num 6.28e+10 6.28e+10 6.29e+11 6.29e+11 6.29e+11 ...
## $ current_tier : chr "Platinum" "Gold" "Gold" "Platinum" ...
## $ available_points : num 6210 11250 3475 9200 4925 ...
## $ vlr_attached_p3d : int 1 1 1 1 1 1 1 1 1 1 ...
## $ flag_arpu_90d : chr "4. hvc" "4. hvc" "4. hvc" "4. hvc" ...
## $ flag_arpu_last_30d: chr "1. nvc" "4. hvc" "4. hvc" "3. mvc" ...
## $ tenure_rgu : int 619 178 1307 1307 982 1307 1307 1307 684 1219 ...
## $ rgu_flag : chr "RGU 30D" "RGU 30D" "RGU 30D" "RGU 30D" ...
## $ rld_30d : int 0 1 1 1 0 1 0 1 1 1 ...
## $ rld_60d : int 0 1 1 1 0 1 1 1 1 1 ...
## $ rld_90d : int 1 1 1 1 1 1 0 1 1 1 ...
## $ rld_tot_30d : int 0 200000 170000 100000 0 50000 0 100000 62000 30000 ...
## $ rld_tot_60d : int 0 100000 195000 100000 0 80000 10000 100000 62000 30000 ...
## $ rld_tot_90d : int 12000 100000 105000 125000 112000 50000 0 100000 40000 30000 ...
## $ reload_p90d : int 12000 400000 470000 325000 112000 180000 10000 300000 164000 90000 ...
## $ tot_month_rld : int 1 3 3 3 1 3 1 3 3 3 ...
## $ denom_30d : int 0 100000 50000 100000 0 50000 0 100000 15000 30000 ...
## $ denom_60d : int 0 100000 60000 50000 0 50000 10000 100000 30000 30000 ...
## $ denom_90d : int 12000 100000 50000 100000 100000 50000 0 100000 10000 30000 ...
## $ curr_balance : num 250 0 300 0 175 ...
## $ active_pack : int 0 1 1 1 1 1 1 1 0 1 ...
## $ status : chr "ACTIVE" "ACTIVE" "ACTIVE" "ACTIVE" ...
## $ n_days : int 54 346 171 346 299 348 43 176 359 136 ...
## $ arpu_rld : num 12000 133333 156667 108333 112000 ...
## $ cust_flag : chr "Moderate" "Healthy" "Healthy" "Healthy" ...
## $ rld_nm : int 0 1 1 1 0 1 0 1 1 0 ...
## $ rld_tot : int 0 200000 150000 110000 0 100000 0 100000 50000 0 ...
Data Preprocessing
For this practicum, the data are restricted to subscribers with tenure_rgu < 1000, available_points > 14000, and current_tier equal to Gold or Silver (a filtering sketch follows below).
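The filtering code is not shown in the output; a dplyr sketch of the stated conditions:
Data_reload <- Data_reload %>%
  filter(tenure_rgu < 1000, available_points > 14000,
         current_tier %in% c("Gold", "Silver"))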
Variables
Naive Bayes requires a categorical response; here we use the variable rld_nm, converted to a factor below.
# Select the response and the numeric predictors
olahdis <- Data_reload %>%
  select(rld_nm, available_points, tenure_rgu, rld_tot_30d, rld_tot_60d,
         rld_tot_90d, reload_p90d, denom_30d, denom_60d, denom_90d,
         curr_balance, n_days, arpu_rld)
olahdis$rld_nm <- as.factor(olahdis$rld_nm)  # categorical response
# Standardize the predictors, then re-attach the response
olahdis_x <- olahdis %>% select(-rld_nm)
olahdis_x_standardize <- as.data.frame(scale(olahdis_x))
olahdis_x_standardize$rld_nm <- olahdis$rld_nm
head(olahdis_x_standardize)
## available_points tenure_rgu rld_tot_30d rld_tot_60d rld_tot_90d reload_p90d
## 1 2.6004811 1.9196572 1.3460775 2.14810667 2.4888049 2.58934638
## 2 0.1321487 -1.2571564 0.2837051 2.14810667 0.4913592 1.23883626
## 3 0.7702555 -1.5238519 3.3123467 -0.03200377 -1.1668976 1.22672406
## 4 -0.9034608 1.3784223 -0.4147607 -0.14100929 0.7174851 0.01550421
## 5 -0.3467233 0.6332438 0.4656584 0.65836454 -0.8277087 0.18204694
## 6 0.3657449 -0.8531912 -0.1975906 -0.21367964 -0.4131445 -0.35694589
## denom_30d denom_60d denom_90d curr_balance n_days arpu_rld rld_nm
## 1 0.3475460 0.5799248 0.6118686 -0.2048690 0.5719785 2.36849283 1
## 2 -0.4827043 -0.1669859 -0.6921495 -0.1364453 0.5828688 1.07544619 1
## 3 -0.2059542 -0.1669859 -0.9222703 -0.1869822 0.5719785 2.34674877 0
## 4 0.3475460 -0.1669859 0.6118686 -0.1969450 0.5066365 -0.09583372 0
## 5 -0.2059542 0.5799248 -0.5387356 -0.1783497 0.5828688 0.06362270 0
## 6 0.3475460 0.5799248 0.6118686 -0.1344499 0.2888299 -0.45243627 0
Data Splitting
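The partitioning code is not shown in the output; a sketch consistent with the 70/30 caret split used in the comparison section later (the seed is an assumption, added for reproducibility):
set.seed(123)  # assumed seed
split <- createDataPartition(olahdis_x_standardize$rld_nm, p = 0.7, list = FALSE)
train_set <- olahdis_x_standardize[split, ]
test_set  <- olahdis_x_standardize[-split, ]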
SVM
Linear-Kernel Support Vector Machine Model
The fitting call is not echoed in the original output; it is reconstructed here from the Call shown below:
model.svm1 <- svm(rld_nm ~ ., data = train_set, kernel = "linear", scale = TRUE)
model.svm1
##
## Call:
## svm(formula = rld_nm ~ ., data = train_set, kernel = "linear", scale = TRUE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
##
## Number of Support Vectors: 334
pred.svm1 <- predict(model.svm1, test_set)
tabel <- caret::confusionMatrix(pred.svm1, as.factor(test_set$rld_nm), positive = "1")
tabel
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 179 63
## 1 0 0
##
## Accuracy : 0.7397
## 95% CI : (0.6796, 0.7938)
## No Information Rate : 0.7397
## P-Value [Acc > NIR] : 0.5338
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 5.662e-15
##
## Sensitivity : 0.0000
## Specificity : 1.0000
## Pos Pred Value : NaN
## Neg Pred Value : 0.7397
## Prevalence : 0.2603
## Detection Rate : 0.0000
## Detection Prevalence : 0.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : 1
##
The unweighted linear SVM never predicts the positive class (sensitivity 0, Kappa 0), a symptom of the class imbalance (prevalence 0.26). Re-fitting with class weights penalizes errors on class 1 more heavily:
model.svm1_balance <- svm(rld_nm ~ ., data = train_set, kernel = "linear",
                          class.weights = c('0' = 0.3, '1' = 0.7), scale = TRUE)
model.svm1_balance
##
## Call:
## svm(formula = rld_nm ~ ., data = train_set, kernel = "linear", class.weights = c(`0` = 0.3,
## `1` = 0.7), scale = TRUE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
##
## Number of Support Vectors: 420
pred.svm1_balance <- predict(model.svm1_balance, test_set)
tabel <- caret::confusionMatrix(pred.svm1_balance, as.factor(test_set$rld_nm), positive = "1")
tabel
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 133 27
## 1 46 36
##
## Accuracy : 0.6983
## 95% CI : (0.6363, 0.7555)
## No Information Rate : 0.7397
## P-Value [Acc > NIR] : 0.93622
##
## Kappa : 0.2865
##
## Mcnemar's Test P-Value : 0.03514
##
## Sensitivity : 0.5714
## Specificity : 0.7430
## Pos Pred Value : 0.4390
## Neg Pred Value : 0.8313
## Prevalence : 0.2603
## Detection Rate : 0.1488
## Detection Prevalence : 0.3388
## Balanced Accuracy : 0.6572
##
## 'Positive' Class : 1
##
With class weights the model now detects positives (sensitivity 0.57 vs. 0.00), at the cost of some overall accuracy (0.70 vs. 0.74).
Sigmoid-Kernel SVM Model
model.svm2 <- svm(rld_nm ~ ., data = train_set, kernel = "sigmoid",
                  class.weights = c('0' = 0.3, '1' = 0.7), scale = TRUE)
model.svm2
##
## Call:
## svm(formula = rld_nm ~ ., data = train_set, kernel = "sigmoid", class.weights = c(`0` = 0.3,
## `1` = 0.7), scale = TRUE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: sigmoid
## cost: 1
## coef.0: 0
##
## Number of Support Vectors: 337
pred.svm2 <- predict(model.svm2, test_set)
tabel2 <- caret::confusionMatrix(pred.svm2, as.factor(test_set$rld_nm), positive = "1")
tabel2
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 119 26
## 1 60 37
##
## Accuracy : 0.6446
## 95% CI : (0.5808, 0.7049)
## No Information Rate : 0.7397
## P-Value [Acc > NIR] : 0.999578
##
## Kappa : 0.2146
##
## Mcnemar's Test P-Value : 0.000373
##
## Sensitivity : 0.5873
## Specificity : 0.6648
## Pos Pred Value : 0.3814
## Neg Pred Value : 0.8207
## Prevalence : 0.2603
## Detection Rate : 0.1529
## Detection Prevalence : 0.4008
## Balanced Accuracy : 0.6261
##
## 'Positive' Class : 1
##
Radial-Kernel SVM Model
model.svm3 <- svm(rld_nm ~ ., data = train_set, kernel = "radial",
                  class.weights = c('0' = 0.3, '1' = 0.7), scale = TRUE)
model.svm3
##
## Call:
## svm(formula = rld_nm ~ ., data = train_set, kernel = "radial", class.weights = c(`0` = 0.3,
## `1` = 0.7), scale = TRUE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 436
pred.svm3 <- predict(model.svm3, test_set)
tabel3 <- caret::confusionMatrix(pred.svm3, as.factor(test_set$rld_nm), positive = "1")
tabel3
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 134 25
## 1 45 38
##
## Accuracy : 0.7107
## 95% CI : (0.6492, 0.767)
## No Information Rate : 0.7397
## P-Value [Acc > NIR] : 0.86360
##
## Kappa : 0.319
##
## Mcnemar's Test P-Value : 0.02315
##
## Sensitivity : 0.6032
## Specificity : 0.7486
## Pos Pred Value : 0.4578
## Neg Pred Value : 0.8428
## Prevalence : 0.2603
## Detection Rate : 0.1570
## Detection Prevalence : 0.3430
## Balanced Accuracy : 0.6759
##
## 'Positive' Class : 1
##
The radial kernel gives the best balance so far (balanced accuracy 0.68).
Polynomial-Kernel SVM Model
model.svm4 <- svm(rld_nm ~ ., data = train_set, kernel = "polynomial",
                  class.weights = c('0' = 0.3, '1' = 0.7), scale = TRUE)
model.svm4
##
## Call:
## svm(formula = rld_nm ~ ., data = train_set, kernel = "polynomial",
## class.weights = c(`0` = 0.3, `1` = 0.7), scale = TRUE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: polynomial
## cost: 1
## degree: 3
## coef.0: 0
##
## Number of Support Vectors: 465
pred.svm4 <- predict(model.svm4, test_set)
tabel4 <- caret::confusionMatrix(pred.svm4, as.factor(test_set$rld_nm), positive = "1")
tabel4
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 173 61
## 1 6 2
##
## Accuracy : 0.7231
## 95% CI : (0.6622, 0.7785)
## No Information Rate : 0.7397
## P-Value [Acc > NIR] : 0.7472
##
## Kappa : -0.0025
##
## Mcnemar's Test P-Value : 4.191e-11
##
## Sensitivity : 0.031746
## Specificity : 0.966480
## Pos Pred Value : 0.250000
## Neg Pred Value : 0.739316
## Prevalence : 0.260331
## Detection Rate : 0.008264
## Detection Prevalence : 0.033058
## Balanced Accuracy : 0.499113
##
## 'Positive' Class : 1
##
Despite the class weights, the polynomial kernel barely detects the positive class (sensitivity 0.03, Kappa ≈ 0).
SVM Hyperparameter Tuning
tune() compares the four kernels by cross-validation and returns the best model:
tuningsvm <- tune(svm, rld_nm ~ ., data = train_set,
                  ranges = list(kernel = c("radial", "linear", "polynomial", "sigmoid")))
tuningsvm$best.model
##
## Call:
## best.tune(method = svm, train.x = rld_nm ~ ., data = train_set, ranges = list(kernel = c("radial",
## "linear", "polynomial", "sigmoid")))
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: polynomial
## cost: 1
## degree: 3
## coef.0: 0
##
## Number of Support Vectors: 314
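By default tune() minimizes 10-fold cross-validated classification error, a criterion that favors majority-class prediction under imbalance; this helps explain why the near-degenerate polynomial kernel wins. A sketch of evaluating the selected model on the test set:
pred.tune <- predict(tuningsvm$best.model, test_set)
caret::confusionMatrix(pred.tune, as.factor(test_set$rld_nm), positive = "1")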
Naive Bayes Modeling
# note: threshold is an argument of predict.naiveBayes(), not naiveBayes();
# passing it in the fitting call appears to have no effect in e1071
model.nb <- naiveBayes(rld_nm ~ ., data = train_set, threshold = 0.4)
pred.nb <- predict(model.nb, test_set)
tabel <- caret::confusionMatrix(pred.nb, as.factor(test_set$rld_nm), positive = "1")
tabel
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 160 48
## 1 19 15
##
## Accuracy : 0.7231
## 95% CI : (0.6622, 0.7785)
## No Information Rate : 0.7397
## P-Value [Acc > NIR] : 0.7472087
##
## Kappa : 0.1551
##
## Mcnemar's Test P-Value : 0.0006245
##
## Sensitivity : 0.23810
## Specificity : 0.89385
## Pos Pred Value : 0.44118
## Neg Pred Value : 0.76923
## Prevalence : 0.26033
## Detection Rate : 0.06198
## Detection Prevalence : 0.14050
## Balanced Accuracy : 0.56597
##
## 'Positive' Class : 1
##
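At the default 0.5 cutoff Naive Bayes detects few positives (sensitivity 0.24). One way to trade specificity for sensitivity is to predict the class posteriors with type = "raw" and re-threshold them; a sketch with an illustrative 0.4 cutoff:
post.nb <- predict(model.nb, test_set, type = "raw")  # posterior probability matrix
pred.nb.04 <- factor(ifelse(post.nb[, "1"] > 0.4, 1, 0), levels = 0:1)
caret::confusionMatrix(pred.nb.04, test_set$rld_nm, positive = "1")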
Model Comparison
To compare SVM, Naive Bayes, and a classification tree, we repeat a 70/30 split 100 times and record each method's test accuracy.
olahdis1 <- Data_reload %>%
  select(rld_nm, available_points, tenure_rgu, rld_30d, rld_60d, rld_90d,
         curr_balance, n_days, arpu_rld)
olahdis1$rld_nm <- as.factor(olahdis1$rld_nm)
perulangan <- 100  # number of replicates
df_akurasi <- data.frame("akurasi_svm" = numeric(), "akurasi_nb" = numeric(),
                         "akurasi_pc" = numeric())
# Store the fitted models from each replicate
SVM_list <- vector(mode = "list", length = perulangan)
NB_list  <- vector(mode = "list", length = perulangan)
PC_list  <- vector(mode = "list", length = perulangan)
for (i in 1:perulangan) {
  split <- createDataPartition(olahdis1$rld_nm, p = 0.7, list = FALSE)
  train_set1 <- olahdis1[split, ]
  test_set1  <- olahdis1[-split, ]
  # Case weights for the classification tree (0.3 for class 0, 0.7 for class 1)
  train_set1_pc <- train_set1 %>% mutate(bobot = ifelse(rld_nm == "0", 0.3, 0.7))
  test_set1_pc  <- test_set1  %>% mutate(bobot = ifelse(rld_nm == "0", 0.3, 0.7))
  # Support vector machine (radial kernel, class-weighted)
  model.svm4 <- svm(rld_nm ~ ., data = train_set1, kernel = "radial",
                    class.weights = c('0' = 0.3, '1' = 0.7))
  SVM_list[[i]] <- model.svm4
  # Naive Bayes
  model.nb <- naiveBayes(rld_nm ~ ., data = train_set1, threshold = 0.4)
  NB_list[[i]] <- model.nb
  # Classification tree; the weights must come from train_set1_pc
  # (the original weights = train_set1$bobot referenced a column that does not
  # exist there, so no weights were actually applied)
  pohon_classif1 <- rpart(rld_nm ~ available_points + tenure_rgu + rld_30d +
                            rld_60d + rld_90d + curr_balance + n_days + arpu_rld,
                          data = train_set1_pc, method = "class",
                          control = rpart.control(cp = 0, minsplit = 28),
                          weights = train_set1_pc$bobot)
  PC_list[[i]] <- pohon_classif1
  # SVM accuracy
  pred.svm4 <- predict(model.svm4, test_set1)
  tabel4 <- caret::confusionMatrix(pred.svm4, as.factor(test_set1$rld_nm), positive = "1")
  akurasi_svm <- tabel4$overall[["Accuracy"]]
  # Naive Bayes accuracy
  pred.nb <- predict(model.nb, test_set1)
  tabel <- caret::confusionMatrix(pred.nb, as.factor(test_set1$rld_nm), positive = "1")
  akurasi_nb <- tabel$overall[["Accuracy"]]
  # Classification-tree accuracy (0.5 cutoff on the class-1 probability)
  prob.prediksi.02 <- predict(pohon_classif1, newdata = test_set1_pc)
  prediksi.02 <- factor(ifelse(prob.prediksi.02[, 2] > 0.5, 1, 0),
                        levels = 0:1, labels = c("0", "1"))
  tabel_pc <- caret::confusionMatrix(prediksi.02, test_set1_pc$rld_nm)
  akurasi_pc <- tabel_pc$overall[["Accuracy"]]
  # Append this replicate's accuracies
  df_akurasi <- rbind(df_akurasi, c(akurasi_svm, akurasi_nb, akurasi_pc))
  cat("Replicate", i, "done.\n")  # paste0() alone prints nothing inside a loop
}
colnames(df_akurasi) <- c("SVM", "Naive Bayes", "Tree")
# Boxplot of test accuracy per method across the 100 replicates
df_akurasi %>%
  as_tibble() %>%
  mutate(ulangan = 1:perulangan) %>%
  pivot_longer(-ulangan) %>%
  ggplot(aes(name, value)) +
  geom_boxplot() + xlab("Method") + ylab("Accuracy")
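The boxplot compares the test-accuracy distributions of the three methods over the 100 replicates. The per-method means can also be read off directly (a sketch):
colMeans(df_akurasi)  # average test accuracy per method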