Loading the R Libraries
library(tidyverse)
library(rsample)
library(caret)
library(ROCR)
Input Data
insurance <- read.csv("carInsurance_train.csv")
insurance
This dataset is a modification of the bank marketing data from the UCI Machine Learning Datasets, reworked into a dataset about car insurance (carInsurance). Download Metadata, or see an experiment on the same dataset using a Logistic Regression model here.
Inspect Dataset
library(inspectdf)
insurance_check <- read.csv("carInsurance_train.csv")
insurance_check %>%
inspect_cat() %>%
show_plot()
From this inspectdf::inspect_cat() visualization, we can see that the dataset has many missing values and that several columns are distributed disproportionately. The CallEnd and CallStart columns also tend to be redundant variables (datetime-typed data is not very representative for the model).
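For a count of the missing values themselves, the same package also offers inspect_na(); a minimal sketch on the same data frame (illustrative, not part of the original workflow):
# sketch: per-column missing-value counts and percentages via inspectdf
insurance_check %>%
inspect_na() %>%
show_plot()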
Labeling Several Variables
Since this experiment uses a Naive Bayes model, all predictor variables will be treated as factors. Several factor classes coded 0 and 1 are relabeled with the values "No" and "Yes".
insurance$Default_labeled <-
ifelse(insurance$Default == 1, "Yes" , "No")
insurance$Household_labeled <-
ifelse(insurance$HHInsurance == 1, "Yes", "No")
insurance$CarLoan_labeled <-
ifelse(insurance$CarLoan == 1, "Yes", "No")
insurance$PrevAttempts_labeled <-
ifelse(insurance$PrevAttempts == 1, "Yes", "No") # note: counts above 1 also map to "No"
insurance$CarInsurance_labeled <-
ifelse(insurance$CarInsurance == 1, "Yes", "No")
insurance_ready <- insurance %>%
select(-c(
Outcome,
CallStart,
CallEnd,
Communication,
LastContactDay,
LastContactMonth,
Id
)) %>%
select(-c(Default, HHInsurance, CarLoan, PrevAttempts, CarInsurance))
insurance_ready
Handling Missing Values
# check missing values
dfcheck1 <- insurance_ready %>%
is.na() %>%
colSums() %>%
as.data.frame() %>%
rownames_to_column(var = "var") %>%
rename(total = 2) %>%
filter(total !=0) %>%
arrange(desc(total)) %>%
mutate(percent = total / nrow(insurance_ready)) # fraction of all rows
dfcheck1
insurance_ready$Job <- insurance_ready$Job %>%
replace(is.na(.), "unknown")
# check missing values again
dfcheck2 <- insurance_ready %>%
is.na() %>%
colSums() %>%
as.data.frame() %>%
rownames_to_column(var = "var") %>%
rename(total = 2) %>%
filter(total != 0) %>%
arrange(desc(total)) %>%
mutate(percent = total / nrow(insurance_ready)) # fraction of all rows
dfcheck2
insurance_ready$Education <- insurance_ready$Education %>%
replace(is.na(.), "unknown")
unique(insurance_ready$Education)
#> [1] "tertiary" "primary" "secondary" "unknown"
unique(insurance_ready$Job)
#> [1] "management" "blue-collar" "student" "technician"
#> [5] "admin." "services" "self-employed" "retired"
#> [9] "unknown" "housemaid" "entrepreneur" "unemployed"
Examining the Data Distribution
summary(insurance_ready)
#> Age Job Marital Education
#> Min. :18.00 Length:4000 Length:4000 Length:4000
#> 1st Qu.:32.00 Class :character Class :character Class :character
#> Median :39.00 Mode :character Mode :character Mode :character
#> Mean :41.21
#> 3rd Qu.:49.00
#> Max. :95.00
#> Balance NoOfContacts DaysPassed Default_labeled
#> Min. :-3058.0 Min. : 1.000 Min. : -1.00 Length:4000
#> 1st Qu.: 111.0 1st Qu.: 1.000 1st Qu.: -1.00 Class :character
#> Median : 551.5 Median : 2.000 Median : -1.00 Mode :character
#> Mean : 1532.9 Mean : 2.607 Mean : 48.71
#> 3rd Qu.: 1619.0 3rd Qu.: 3.000 3rd Qu.: -1.00
#> Max. :98417.0 Max. :43.000 Max. :854.00
#> Household_labeled CarLoan_labeled PrevAttempts_labeled
#> Length:4000 Length:4000 Length:4000
#> Class :character Class :character Class :character
#> Mode :character Mode :character Mode :character
#>
#>
#>
#> CarInsurance_labeled
#> Length:4000
#> Class :character
#> Mode :character
#>
#>
#>
# age, balance, NoOfContacts
From this summary() output, we can conclude that several of the numeric variables are spread far from normally (Balance in particular is heavily right-skewed). In this case, I will apply binning to all the numeric variables so they can be used in the naiveBayes() algorithm.
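To make the skew visible before binning, a quick histogram panel helps (an illustrative sketch; ggplot2 and tidyr load with the tidyverse):
# sketch: histograms of the numeric predictors to confirm the skewed spread
insurance_ready %>%
select(Age, Balance, NoOfContacts) %>%
pivot_longer(everything()) %>%
ggplot(aes(x = value)) +
geom_histogram(bins = 30) +
facet_wrap(~ name, scales = "free")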
Checking the Variance of Several Numeric Variables
unique(insurance_ready$NoOfContacts)
#> [1] 2 5 1 4 8 3 14 7 6 12 11 25 34 9 26 24 10 38 20 13 16 17 32 21 15
#> [26] 23 29 19 28 18 22 43 30 41 27
max(insurance_ready$Balance)
#> [1] 98417
# planned bins: <500, <1000, <10000, 10000+, minus
Binning the Numeric Variables
insurance_fornaive <- insurance_ready %>%
  mutate(
    Age = case_when(
      Age <= 30 ~ "18-30",
      Age <= 50 ~ "31-50",
      TRUE ~ "51+"
    ),
    NoOfContacts = case_when(
      NoOfContacts <= 2 ~ "1-2 times",
      NoOfContacts <= 10 ~ "3+ times",  # 3-10 contacts
      TRUE ~ "10+ times"                # 11 or more contacts
    )
  ) %>%
  mutate(
    Balance = case_when(
      Balance < 0 ~ "minus",
      Balance <= 500 ~ "<500",
      Balance <= 999 ~ ">=500",
      Balance <= 9999 ~ ">=1000",
      Balance <= 49999 ~ ">=10000",
      TRUE ~ ">=50000"
    )
  ) %>%
  mutate(NowContacted = case_when(DaysPassed == -1 ~ "Not Contacted",
                                  TRUE ~ "Contacted")) %>%
  mutate(
    DaysPassed = case_when(
      DaysPassed == -1 ~ "Not contacted",
      DaysPassed >= 91 & DaysPassed <= 180 ~ "91-180 Days",
      DaysPassed >= 181 & DaysPassed <= 270 ~ "181-270 Days",
      DaysPassed >= 271 & DaysPassed <= 360 ~ "271-360 Days",
      TRUE ~ "more than a year"  # note: gaps of 1-90 days also land here
    )
  )
insurance_fornaive
# preview with character columns converted to factors (not assigned here;
# naiveBayes also accepts character predictors, so the frame is left as-is)
insurance_fornaive %>%
mutate_if(is.character, as.factor)
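A quick sanity check on the binning is to count the rows per bin; a sketch for the Balance bins (count() is from dplyr):
# sketch: row counts per Balance bin
insurance_fornaive %>% count(Balance, sort = TRUE)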
Splitting the Data into Train and Test Sets
RNGkind(sample.kind = "Rounding")
set.seed(100)
# train-test splitting
library(rsample)
init <- initial_split(insurance_fornaive, prop = 0.8, strata = "CarInsurance_labeled")
train <- training(init)
test <- testing(init)
Checking the Target Class Proportion
prop.table(table(train$CarInsurance_labeled))
#>
#> No Yes
#> 0.5989372 0.4010628
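Since the split was stratified on CarInsurance_labeled, the test set should show a similar ratio; a quick check:
# the stratified split should preserve the class ratio in the test set
prop.table(table(test$CarInsurance_labeled))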
Building the Model
library(e1071)
naive <- naiveBayes(CarInsurance_labeled ~ ., data = train, laplace = 1)
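As a sanity check on what laplace = 1 does, one smoothed conditional probability can be reproduced by hand. A sketch, assuming the usual additive-smoothing formula (count + laplace) / (class total + laplace * number of levels); the result should line up with the Default_labeled entry in the model output further below:
# sketch: reproduce one Laplace-smoothed conditional probability manually
tab <- table(train$Default_labeled, train$CarInsurance_labeled)
# P(Default = "Yes" | CarInsurance = "No") with laplace = 1
(tab["Yes", "No"] + 1) / (sum(tab[, "No"]) + 1 * nrow(tab))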
Creating the Prediction Labels
test$pred <- predict(naive, newdata = test, type = "class")
Model Evaluation: Confusion Matrix
library(caret)
# align prediction and actual labels on a shared set of factor levels
u <- union(test$pred, test$CarInsurance_labeled)
t <- table(factor(test$pred, u), factor(test$CarInsurance_labeled, u))
confusionMatrix(t, positive = "Yes")
#> Confusion Matrix and Statistics
#>
#>
#> No Yes
#> No 370 151
#> Yes 110 170
#>
#> Accuracy : 0.6742
#> 95% CI : (0.6405, 0.7065)
#> No Information Rate : 0.5993
#> P-Value [Acc > NIR] : 0.00000709
#>
#> Kappa : 0.3069
#>
#> Mcnemar's Test P-Value : 0.01329
#>
#> Sensitivity : 0.5296
#> Specificity : 0.7708
#> Pos Pred Value : 0.6071
#> Neg Pred Value : 0.7102
#> Prevalence : 0.4007
#> Detection Rate : 0.2122
#> Detection Prevalence : 0.3496
#> Balanced Accuracy : 0.6502
#>
#> 'Positive' Class : Yes
#>
From this confusionMatrix() evaluation, the model predicts the positive class reasonably well (sensitivity 0.53, precision 0.61). On the other hand, model performance (given the business need: the precision metric) can likely still be improved. The approach I use this time is model tuning, selecting a subset of the predictors.
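For reference, the Pos Pred Value above comes straight from the confusion matrix counts:
# precision = TP / (TP + FP), using the counts from the matrix above
170 / (170 + 110)
#> [1] 0.6071429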
summary(naive)
#> Length Class Mode
#> apriori 2 table numeric
#> tables 12 -none- list
#> levels 2 -none- character
#> isnumeric 12 -none- logical
#> call 4 -none- call
Checking the Proportions of Each Predictor
naive
#>
#> Naive Bayes Classifier for Discrete Predictors
#>
#> Call:
#> naiveBayes.default(x = X, y = Y, laplace = laplace)
#>
#> A-priori probabilities:
#> Y
#> No Yes
#> 0.5989372 0.4010628
#>
#> Conditional probabilities:
#> Age
#> Y 18-30 31-50 51+
#> No 0.1096033 0.6649269 0.2270355
#> Yes 0.1652377 0.5471551 0.2899454
#>
#> Job
#> Y admin. blue-collar entrepreneur housemaid management retired
#> No 0.118475992 0.220250522 0.033402923 0.029749478 0.213465553 0.046450939
#> Yes 0.113016368 0.137957911 0.022603274 0.025720966 0.244738893 0.091971941
#> Job
#> Y self-employed services student technician unemployed unknown
#> No 0.037056367 0.094467641 0.015135699 0.168058455 0.024530271 0.005219207
#> Yes 0.037412315 0.073265783 0.053000779 0.161340608 0.041309431 0.007014809
#>
#> Marital
#> Y divorced married single
#> No 0.1122129 0.6189979 0.2703549
#> Yes 0.1340608 0.5214341 0.3468433
#>
#> Education
#> Y primary secondary tertiary unknown
#> No 0.15083507 0.52922756 0.28235908 0.03966597
#> Yes 0.12548714 0.45362432 0.37412315 0.04988309
#>
#> Balance
#> Y <500 >=1000 >=10000 >=500 >=50000 minus
#> No 0.171189979 0.288100209 0.019832985 0.170668058 0.001043841 0.352296451
#> Yes 0.105222136 0.384255651 0.023382697 0.179267342 0.001558846 0.310989867
#>
#> NoOfContacts
#> Y 1-2 times 10+ times 3+ times
#> No 0.65866388 0.03496868 0.30793319
#> Yes 0.74201091 0.01247077 0.24785659
#>
#> DaysPassed
#> Y 181-270 Days 271-360 Days 91-180 Days more than a year Not contacted
#> No 0.04279749 0.04540710 0.05114823 0.02661795 0.83663883
#> Yes 0.08651598 0.04442712 0.13406080 0.08261886 0.65627436
#>
#> Default_labeled
#> Y No Yes
#> No 0.98173278 0.01931106
#> Yes 0.99064692 0.01091193
#>
#> Household_labeled
#> Y No Yes
#> No 0.4253653 0.5756785
#> Yes 0.6399065 0.3616524
#>
#> CarLoan_labeled
#> Y No Yes
#> No 0.83820459 0.16283925
#> Yes 0.90491037 0.09664848
#>
#> PrevAttempts_labeled
#> Y No Yes
#> No 0.93632568 0.06471816
#> Yes 0.88854248 0.11301637
#>
#> NowContacted
#> Y Contacted Not Contacted
#> No 0.1644050 0.8366388
#> Yes 0.3452845 0.6562744
We can see that the DaysPassed predictor has many classes and is dominated by "Not contacted" values.
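The imbalance is easy to quantify; a quick sketch:
# sketch: share of each DaysPassed bin; "Not contacted" dominates
prop.table(table(insurance_fornaive$DaysPassed))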
max(insurance_fornaive$DaysPassed)
#> [1] "Not contacted"
insurance_fornaive
Model Tuning: Removing the DaysPassed Predictor (plus Job and Education)
naive2 <- naiveBayes(
  CarInsurance_labeled ~ Age + Marital + Balance + NoOfContacts +
    Default_labeled + Household_labeled + CarLoan_labeled +
    PrevAttempts_labeled + NowContacted,
  data = train, laplace = 1
)
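The same model can be written more compactly by subtracting the dropped predictors in the formula; a sketch, assuming the standard minus-term handling of R formulas:
# sketch: equivalent model via formula subtraction instead of listing predictors
naive2_alt <- naiveBayes(CarInsurance_labeled ~ . - DaysPassed - Job - Education,
                         data = train, laplace = 1)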
Re-checking the Predictor Proportions
naive2
#>
#> Naive Bayes Classifier for Discrete Predictors
#>
#> Call:
#> naiveBayes.default(x = X, y = Y, laplace = laplace)
#>
#> A-priori probabilities:
#> Y
#> No Yes
#> 0.5989372 0.4010628
#>
#> Conditional probabilities:
#> Age
#> Y 18-30 31-50 51+
#> No 0.1096033 0.6649269 0.2270355
#> Yes 0.1652377 0.5471551 0.2899454
#>
#> Marital
#> Y divorced married single
#> No 0.1122129 0.6189979 0.2703549
#> Yes 0.1340608 0.5214341 0.3468433
#>
#> Balance
#> Y <500 >=1000 >=10000 >=500 >=50000 minus
#> No 0.171189979 0.288100209 0.019832985 0.170668058 0.001043841 0.352296451
#> Yes 0.105222136 0.384255651 0.023382697 0.179267342 0.001558846 0.310989867
#>
#> NoOfContacts
#> Y 1-2 times 10+ times 3+ times
#> No 0.65866388 0.03496868 0.30793319
#> Yes 0.74201091 0.01247077 0.24785659
#>
#> Default_labeled
#> Y No Yes
#> No 0.98173278 0.01931106
#> Yes 0.99064692 0.01091193
#>
#> Household_labeled
#> Y No Yes
#> No 0.4253653 0.5756785
#> Yes 0.6399065 0.3616524
#>
#> CarLoan_labeled
#> Y No Yes
#> No 0.83820459 0.16283925
#> Yes 0.90491037 0.09664848
#>
#> PrevAttempts_labeled
#> Y No Yes
#> No 0.93632568 0.06471816
#> Yes 0.88854248 0.11301637
#>
#> NowContacted
#> Y Contacted Not Contacted
#> No 0.1644050 0.8366388
#> Yes 0.3452845 0.6562744
Model Re-Evaluation
Confusion Matrix
test$pred2 <- predict(naive2, newdata = test, type = "class")
confusionMatrix(test$pred2,
                test$CarInsurance_labeled %>% as.factor(),
                positive = "Yes")
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction No Yes
#> No 385 167
#> Yes 95 154
#>
#> Accuracy : 0.6729
#> 95% CI : (0.6392, 0.7053)
#> No Information Rate : 0.5993
#> P-Value [Acc > NIR] : 0.000009906
#>
#> Kappa : 0.2927
#>
#> Mcnemar's Test P-Value : 0.000011524
#>
#> Sensitivity : 0.4798
#> Specificity : 0.8021
#> Pos Pred Value : 0.6185
#> Neg Pred Value : 0.6975
#> Prevalence : 0.4007
#> Detection Rate : 0.1923
#> Detection Prevalence : 0.3109
#> Balanced Accuracy : 0.6409
#>
#> 'Positive' Class : Yes
#>
Based on the tuning results, the Pos Pred Value (precision) metric rose by about one percentage point (0.6071 to 0.6185), although sensitivity dropped (0.5296 to 0.4798). The model is still fairly good at predicting the positive class, but it must be admitted that the improvement is not significant.
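If precision matters more than recall for the business case, another option (not explored further here) is to raise the decision threshold on the raw class probabilities above the default 0.5; a sketch, where the 0.7 cutoff is an arbitrary illustration rather than a tuned value:
# sketch: trade recall for precision by raising the classification threshold
prob_yes <- predict(naive2, newdata = test, type = "raw")[, "Yes"]
pred_strict <- ifelse(prob_yes > 0.7, "Yes", "No")
table(Prediction = pred_strict, Reference = test$CarInsurance_labeled)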
pred_raw <- predict(naive2, newdata = test, type = "raw")
Area Under Curve (AUC)
# build the ROCR prediction object from the raw "Yes" probabilities
roc_pred <- ROCR::prediction(predictions = pred_raw[, 2], labels = test$CarInsurance_labeled)
# plot the ROC curve
plot(performance(prediction.obj = roc_pred, measure = "tpr", x.measure = "fpr"))
# AUC value
auc_score <- performance(prediction.obj = roc_pred, measure = "auc")
auc_score@y.values
#> [[1]]
#> [1] 0.6766225
Based on the ROC visualization and the AUC score (about 0.68), we can conclude that this naiveBayes() model is likely not particularly good at predicting the actual data, in the context of the CarInsurance dataset.
Reflecting on an exploration of the same dataset using a Logistic Regression model, that model's performance looked better. From this we can also conclude that every algorithm has its own strengths and limitations.