About Dataset

The data relates to the direct marketing campaigns of a Portuguese banking institution. The campaigns were conducted over the phone. Which algorithms best predict whether a client will subscribe to a term deposit, and how can that improve the efficiency of the next campaign? Use this full version of the bank marketing campaign data from the UCI Machine Learning Repository to find out!

1. Preparation

1.1 Importing Library

library(tidyverse) 
library(plotly)
library(gridExtra) 
library(dplyr)
library(GGally) 
library(caret) 
library(naivebayes) 
library(rpart)
library(rpart.plot) 
library(randomForest)
library(ROCR) 
library(e1071) 
library(repr) 
library(devtools) 

1.2 Read Data

banks <- read.csv("bank-full.csv",header=T,sep=';')
head(banks)
#>   age          job marital education default balance housing loan contact day
#> 1  58   management married  tertiary      no    2143     yes   no unknown   5
#> 2  44   technician  single secondary      no      29     yes   no unknown   5
#> 3  33 entrepreneur married secondary      no       2     yes  yes unknown   5
#> 4  47  blue-collar married   unknown      no    1506     yes   no unknown   5
#> 5  33      unknown  single   unknown      no       1      no   no unknown   5
#> 6  35   management married  tertiary      no     231     yes   no unknown   5
#>   month durations campaign pdays previous poutcome  y
#> 1   may       261        1    -1        0  unknown no
#> 2   may       151        1    -1        0  unknown no
#> 3   may        76        1    -1        0  unknown no
#> 4   may        92        1    -1        0  unknown no
#> 5   may       198        1    -1        0  unknown no
#> 6   may       139        1    -1        0  unknown no

1.3 Handling Data Types

banks <- banks %>%
  mutate(job = as.factor(job),
         marital = as.factor(marital),
         education = as.factor(education),
         default = as.factor(default),
         housing = as.factor(housing),
         loan = as.factor(loan),
         contact = as.factor(contact),
         month = as.factor(month),
         poutcome = as.factor(poutcome),
         y = as.factor(y))
str(banks)
#> 'data.frame':    45210 obs. of  17 variables:
#>  $ age      : int  58 44 33 47 33 35 28 42 58 43 ...
#>  $ job      : Factor w/ 12 levels "admin.","blue-collar",..: 5 10 3 2 12 5 5 3 6 10 ...
#>  $ marital  : Factor w/ 3 levels "divorced","married",..: 2 3 2 2 3 2 3 1 2 3 ...
#>  $ education: Factor w/ 4 levels "primary","secondary",..: 3 2 2 4 4 3 3 3 1 2 ...
#>  $ default  : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 2 1 1 ...
#>  $ balance  : int  2143 29 2 1506 1 231 447 2 121 593 ...
#>  $ housing  : Factor w/ 2 levels "no","yes": 2 2 2 2 1 2 2 2 2 2 ...
#>  $ loan     : Factor w/ 2 levels "no","yes": 1 1 2 1 1 1 2 1 1 1 ...
#>  $ contact  : Factor w/ 3 levels "cellular","telephone",..: 3 3 3 3 3 3 3 3 3 3 ...
#>  $ day      : int  5 5 5 5 5 5 5 5 5 5 ...
#>  $ month    : Factor w/ 12 levels "apr","aug","dec",..: 9 9 9 9 9 9 9 9 9 9 ...
#>  $ durations: int  261 151 76 92 198 139 217 380 50 55 ...
#>  $ campaign : int  1 1 1 1 1 1 1 1 1 1 ...
#>  $ pdays    : int  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
#>  $ previous : int  0 0 0 0 0 0 0 0 0 0 ...
#>  $ poutcome : Factor w/ 4 levels "failure","other",..: 4 4 4 4 4 4 4 4 4 4 ...
#>  $ y        : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...

1.4 Handling Missing Values

colSums(is.na(banks))
#>       age       job   marital education   default   balance   housing      loan 
#>         0         0         0         0         0         0         0         0 
#>   contact       day     month durations  campaign     pdays  previous  poutcome 
#>         0         0         0         0         0         0         0         0 
#>         y 
#>         0

Insight

As we can see, the 'durations' feature has by far the greatest importance for predicting our target, more than twice that of any other feature. However, the call duration is only known after the call has taken place, so let's remove this feature to build a realistic model.

# Dropping duration feature
banks$durations <- NULL

2. EDA

2.1 Boxplots of Numerical Features

# Creating a boxplot of each numerical variable against the target
BoxPlotAge <- ggplotly(ggplot(banks, aes(x = y, y = age)) +
  geom_boxplot(fill = "#D27685", colour = "#1F3552", alpha = 0.6) +
  scale_y_continuous(name = "Age") +
  scale_x_discrete(name = "Target") +
  ggtitle("Age") +
  theme_gray() + 
  theme(axis.text.x = element_text(angle = 1, hjust = 1)
        ,axis.title = element_text(size = rel(2), angle = 1)
       ,plot.title = element_text(size = rel(2.2))
        ,axis.text =  element_text(size = rel(1.8))
        ,axis.ticks = element_line(size = 1)))

BoxPlotBalance <- ggplotly(ggplot(banks, aes(x = y, y = balance )) +
  geom_boxplot(fill = "#D27685", colour = "#1F3552", alpha = 0.6) +
  scale_y_continuous(name = "Balance") +
  scale_x_discrete(name = "Target") +
  ggtitle("Balance") +
  theme_gray() + 
  theme(axis.text.x = element_text(angle = 1, hjust = 1)
        ,axis.title = element_text(size = rel(2), angle = 1)
       ,plot.title = element_text(size = rel(2.2))
        ,axis.text =  element_text(size = rel(1.8))
        ,axis.ticks = element_line(size = 1)))

BoxPlotDay <- ggplotly(ggplot(banks, aes(x = y, y = day )) +
  geom_boxplot(fill = "#D27685", colour = "#1F3552", alpha = 0.6) +
  scale_y_continuous(name = "Day") +
  scale_x_discrete(name = "Target") +
  ggtitle("Day") +
  theme_gray() + 
  theme(axis.text.x = element_text(angle = 1, hjust = 1)
        ,axis.title = element_text(size = rel(2), angle = 1)
       ,plot.title = element_text(size = rel(2.2))
        ,axis.text =  element_text(size = rel(1.8))
        ,axis.ticks = element_line(size = 1)))

BoxPlotCampaign <- ggplotly(ggplot(banks, aes(x = y, y = campaign )) +
  geom_boxplot(fill = "#D27685", colour = "#1F3552", alpha = 0.6) +
  scale_y_continuous(name = "Campaign") +
  scale_x_discrete(name = "Target") +
  ggtitle("Campaign") +
  theme_gray() + 
  theme(axis.text.x = element_text(angle = 1, hjust = 1)
        ,axis.title = element_text(size = rel(2), angle = 1)
       ,plot.title = element_text(size = rel(2.2))
        ,axis.text =  element_text(size = rel(1.8))
        ,axis.ticks = element_line(size = 1)))

BoxPlotpDays <- ggplotly(ggplot(banks, aes(x = y, y = pdays )) +
  geom_boxplot(fill = "#D27685", colour = "#1F3552", alpha = 0.6) +
  scale_y_continuous(name = "pdays") +
  scale_x_discrete(name = "Target") +
  ggtitle("Pdays") +
  theme_gray() + 
  theme(axis.text.x = element_text(angle = 1, hjust = 1)
        ,axis.title = element_text(size = rel(2), angle = 1)
       ,plot.title = element_text(size = rel(2.2))
        ,axis.text =  element_text(size = rel(1.8))
        ,axis.ticks = element_line(size = 1)))

BoxPlotPrevious <- ggplotly(ggplot(banks, aes(x = y, y = previous )) +
  geom_boxplot(fill = "#D27685", colour = "#1F3552", alpha = 0.6) +
  scale_y_continuous(name = "Previous") +
  scale_x_discrete(name = "Target") +
  ggtitle("Previous") +
  theme_gray() + 
  theme(axis.text.x = element_text(angle = 1, hjust = 1)
        ,axis.title = element_text(size = rel(2), angle = 1)
       ,plot.title = element_text(size = rel(2.2))
        ,axis.text =  element_text(size = rel(1.8))
        ,axis.ticks = element_line(size = 1)))

BoxPlotAge
BoxPlotBalance
BoxPlotDay
BoxPlotCampaign
BoxPlotPrevious
BoxPlotpDays
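
The six boxplot definitions above differ only in the column being plotted. As a side note, a small helper could build them without the repetition; this is just a sketch (the function name box_vs_target is hypothetical), assuming the same ggplot2/plotly setup loaded above.

# Hypothetical helper: builds the same styled boxplot for any numeric column
box_vs_target <- function(data, var, title) {
  ggplotly(ggplot(data, aes(x = y, y = .data[[var]])) +
    geom_boxplot(fill = "#D27685", colour = "#1F3552", alpha = 0.6) +
    scale_y_continuous(name = title) +
    scale_x_discrete(name = "Target") +
    ggtitle(title) +
    theme_gray())
}

# For example, the same plots as above:
box_vs_target(banks, "age", "Age")
box_vs_target(banks, "balance", "Balance")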

2.2 Visualizing Categorical Features

# Creating and storing bar charts of the frequency of each category, split by target
PlotJob <- ggplotly(ggplot(banks %>% count(job, y), aes(x = job, y = n )) +
  geom_bar(aes(fill = job), stat = "identity", color = "white") +
  scale_y_continuous(name = "Frequency") +
  scale_x_discrete(name = "Job") +
  ggtitle("Job") +
  facet_wrap(~y)
)

PlotMarital <- ggplotly(ggplot(banks %>% count(marital, y), aes(x = marital, y = n )) +
  geom_bar(aes(fill = marital), stat = "identity", color = "white") +
  scale_y_continuous(name = "Frequency") +
  scale_x_discrete(name = "Marital") +
  ggtitle("Marital")+
  facet_wrap(~y)
)

PlotEducation <- ggplotly(ggplot(banks %>% count(education, y), aes(x = education, y = n )) +
  geom_bar(aes(fill = education), stat = "identity", color = "white") +
  scale_y_continuous(name = "Frequency") +
  scale_x_discrete(name = "education") +
  ggtitle("Education") +
  facet_wrap(~y)
)


PlotDefault <- ggplotly(ggplot(banks %>% count(default, y), aes(x = default, y = n )) +
  geom_bar(aes(fill = default), stat = "identity", color = "white") +
  scale_y_continuous(name = "Frequency") +
  scale_x_discrete(name = "default") +
  ggtitle("Default") +
  facet_wrap(~y)
)

PlotHousing <- ggplotly(ggplot(banks %>% count(housing, y), aes(x = housing, y = n )) +
  geom_bar(aes(fill = housing), stat = "identity", color = "white") +
  scale_y_continuous(name = "Frequency") +
  scale_x_discrete(name = "housing") +
  ggtitle("Housing") +
  facet_wrap(~y)
)

PlotLoan <- ggplotly(ggplot(banks %>% count(loan, y), aes(x = loan, y = n )) +
  geom_bar(aes(fill = loan), stat = "identity", color = "white") +
  scale_y_continuous(name = "Frequency") +
  scale_x_discrete(name = "loan") +
  ggtitle("Loan") +
  facet_wrap(~y)
)

PlotContact <- ggplotly(ggplot(banks %>% count(contact, y), aes(x = contact, y = n )) +
  geom_bar(aes(fill = contact), stat = "identity", color = "white") +
  scale_y_continuous(name = "Frequency") +
  scale_x_discrete(name = "contact") +
  ggtitle("Contact") +
  facet_wrap(~y)
)

PlotMonth <- ggplotly(ggplot(banks %>% count(month, y), aes(x = month, y = n )) +
  geom_bar(aes(fill = month), stat = "identity", color = "white") +
  scale_y_continuous(name = "Frequency") +
  scale_x_discrete(name = "month") +
  ggtitle("Month") +
  facet_wrap(~y)
)

PlotOutCome <- ggplotly(ggplot(banks %>% count(poutcome, y), aes(x = poutcome, y = n )) +
  geom_bar(aes(fill = poutcome), stat = "identity", color = "white") +
  scale_y_continuous(name = "Frequency") +
  scale_x_discrete(name = "poutcome") +
  ggtitle("Poutcome")+
  facet_wrap(~y)
)
PlotJob
PlotMarital 
PlotEducation
PlotDefault
PlotHousing
PlotLoan 
PlotContact
PlotMonth 
PlotOutCome 

Insight

• The data shows that individuals who are married, have completed secondary education, and do not own a house are more likely to subscribe to a term deposit.

• People who have no loan and whose poutcome (outcome of the previous marketing campaign) is unknown also tend to subscribe to a term deposit. May, August, and July also look like better months to run the marketing campaign.

2.3 Analyze Correlation

ggplotly(banks %>%
  keep(is.numeric) %>%
  ggcorr(name = 'correlations'
         ,label = T
         )
)

Insight

From the chart above, the strongest correlation is between previous and pdays, at around 0.5.
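
As a quick numeric check of that reading, the pairwise correlation can also be computed directly; a minimal sketch on the same banks data:

# Pearson correlation between previous and pdays (the ggcorr plot above shows roughly 0.5)
cor(banks$previous, banks$pdays)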

2.4 Plotting Class Proportion

# counting yes and no 
proportion <- banks %>% count(y)

# Plotting class proportion

ggplotly(ggplot(proportion, aes(x = y, y = n)) +
  geom_bar(fill = c('#FD8A8A','#A8D1D1'), stat = "identity") +
  geom_text(aes(label=n)) +
  ggtitle('Class proportion'))

2.5 Dealing with the Imbalanced Class Problem

set.seed(50)
bank_oversample <- upSample(banks, banks$y) # creating a new dataset with oversampling

dim(bank_oversample)
#> [1] 79842    16
# Counting each category's frequency
proportion2 <- bank_oversample %>% count(Class)

ggplotly(ggplot(proportion2, aes(x = Class, y = n)) +
  geom_bar(fill = c('#FD8A8A','#A8D1D1'), stat = "identity") +
  geom_text(aes(label=n)) +
  ggtitle('Class proportion')
)
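
For context, caret's upSample() randomly resamples the minority class with replacement until both classes have the same size, and stores the balanced class labels in a factor column named Class (the default yname), which is why the counts above use Class instead of y. A quick sanity check:

# The two classes should now have equal counts
table(bank_oversample$Class)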
set.seed(50) # setting a seed to reproduce this model 
inTrain <- createDataPartition(banks$y, p = 0.7, list = FALSE) # Partitioning the dataset 70% train 30% test

train_noover <- banks[inTrain, ] # Creating the train dataset

test_noover <- banks[-inTrain, ] # Creating the test dataset

cat('no_oversampling train dataset dimensions: ',dim(train_noover),'\n')
#> no_oversampling train dataset dimensions:  31648 16
cat('no_oversampling test dataset dimensions: ',dim(test_noover),'\n')
#> no_oversampling test dataset dimensions:  13562 16
set.seed(50) # setting a seed to reproduce this model 
inTrain2 <- createDataPartition(bank_oversample$Class, p = 0.7, list = FALSE) # Partitioning the dataset 70% train 30% test

train_over <- bank_oversample[inTrain2, ] # Creating the train dataset

test_over <- bank_oversample[-inTrain2, ] # Creating the test dataset

cat('Oversampling train dataset dimensions: ',dim(train_over),'\n')
#> Oversampling train dataset dimensions:  55890 16
cat('Oversampling test dataset dimensions: ',dim(test_over),'\n')
#> Oversampling test dataset dimensions:  23952 16

3. Training Model

3.1 Naive Bayes

# Training the model
set.seed(50)
nb1 = naive_bayes(Class ~ . , laplace = 1, usekernel = F, data = train_over)

# Predicting on train_over dataset
nb_train_pred1 <- predict(nb1, train_over, type = "class")
# Visualizing the confusion matrix
source_url('https://github.com/wellingtsilvdev/codes-with-real-utilities/blob/master/draw_confusion_matrix.R?raw=TRUE') # sourcing a custom confusion-matrix plotting function
# Plotting confusion matrix
confusion_train <- confusionMatrix(nb_train_pred1, train_over$Class, positive = 'yes')
draw_confusion_matrix(confusion_train)

Insight

About 68% accuracy on the oversampled training set, with sensitivity and specificity well balanced.

# Predicting in test_noover dataset
nb_test_pred1 <- predict(nb1, test_noover, type = "class")

confusion_test <- confusionMatrix(nb_test_pred1, test_noover$y, positive = 'yes')
draw_confusion_matrix(confusion_test) # plotting confusionmatrix

Insight

On the test set the model reaches 68% accuracy, which is clearly not good enough. Sensitivity is 66% and specificity 68%.

3.1.1 Improve Naive Bayes Model

3.1.1.1 Treating Campaign

Split the 'campaign' variable into two categories: 'priority' (campaigns with 4 or fewer contacts) and 'not_priority' (campaigns with more than 4 contacts).

banks$campaign_cat <- as.factor(ifelse(banks$campaign<=4,'priority','not_priority'))
banks$campaign <- NULL # Removing the old feature

3.1.1.2 Treating Age

Classify age into young, mature, and old:

age ≤ 30 = young

age 31-49 = mature

age ≥ 50 = old

banks$age_cat <-  ifelse(banks$age>=50,'old',NA)
banks$age_cat <-  ifelse(banks$age<=30,'young',banks$age_cat)
banks$age_cat <-  ifelse(banks$age>30 & banks$age<50,'mature',banks$age_cat)
banks$age_cat <- as.factor(banks$age_cat)
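
The nested ifelse() calls above can be written more compactly with cut(). This is only a sketch, assuming the same cut points as the code above and whole-number ages; cut() already returns a factor.

# Equivalent bucketing: (-Inf,30] = young, (30,49] = mature, (49,Inf] = old
age_cat_alt <- cut(banks$age,
                   breaks = c(-Inf, 30, 49, Inf),
                   labels = c('young', 'mature', 'old'))
table(age_cat_alt, banks$age_cat) # the two versions should agree on every row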

3.1.1.3 Treating Previous

A previous value of 0 means the client has never been contacted before.

# Classifying previous into contacted before and non_contacted before
banks$cont_before <- as.factor(ifelse(banks$previous==0,'no','yes'))

3.1.1.4 Treating Balance

Classify clients by balance level based on wealth: negative balances are labelled 'negative', balances from 0 to 10,000 are 'lvl1', from 10,000 to 40,000 are 'lvl2', and above 40,000 are 'lvl3'.

banks$balance_lvl <- ifelse(banks$balance<1500,'negative',NA)
banks$balance_lvl <- ifelse(banks$balance<=10000 & banks$balance>=0,'lvl1',banks$balance_lvl)
banks$balance_lvl <- ifelse(banks$balance>10000 & banks$balance<=40000,'lvl2',banks$balance_lvl)
banks$balance_lvl <- ifelse(banks$balance>40000,'lvl3',banks$balance_lvl)
banks$balance_lvl <- as.factor(banks$balance_lvl)

3.1.1.5 Treating Day

Classify the day of the month into start of month, middle of month, and end of month.

banks$moth_stage <- ifelse(banks$day<=7,'start_m',NA)
banks$moth_stage <- ifelse(banks$day>=22,'final_m',banks$moth_stage)
banks$moth_stage <- ifelse(banks$day<22 & banks$day>7,'middle_m',banks$moth_stage)
banks$moth_stage <- as.factor(banks$moth_stage)
#check again
head(banks)
#>   age          job marital education default balance housing loan contact day
#> 1  58   management married  tertiary      no    2143     yes   no unknown   5
#> 2  44   technician  single secondary      no      29     yes   no unknown   5
#> 3  33 entrepreneur married secondary      no       2     yes  yes unknown   5
#> 4  47  blue-collar married   unknown      no    1506     yes   no unknown   5
#> 5  33      unknown  single   unknown      no       1      no   no unknown   5
#> 6  35   management married  tertiary      no     231     yes   no unknown   5
#>   month pdays previous poutcome  y campaign_cat age_cat cont_before balance_lvl
#> 1   may    -1        0  unknown no     priority     old          no        lvl1
#> 2   may    -1        0  unknown no     priority  mature          no        lvl1
#> 3   may    -1        0  unknown no     priority  mature          no        lvl1
#> 4   may    -1        0  unknown no     priority  mature          no        lvl1
#> 5   may    -1        0  unknown no     priority  mature          no        lvl1
#> 6   may    -1        0  unknown no     priority  mature          no        lvl1
#>   moth_stage
#> 1    start_m
#> 2    start_m
#> 3    start_m
#> 4    start_m
#> 5    start_m
#> 6    start_m
colSums(is.na(banks))
#>          age          job      marital    education      default      balance 
#>            0            0            0            0            0            0 
#>      housing         loan      contact          day        month        pdays 
#>            0            0            0            0            0            0 
#>     previous     poutcome            y campaign_cat      age_cat  cont_before 
#>            0            0            0            0            0            0 
#>  balance_lvl   moth_stage 
#>            0            0

3.1.1.6 Creating New Features from Categorical Features

# Creating a feature to represent people married and with no house
banks$no_house_married <- as.factor(ifelse(banks$marital=='married' & banks$housing=='no',1,0))

# Creating a feature to represent people with secondary education and no loan
banks$no_loan_secedu <- as.factor(ifelse(banks$education=='secondary' & banks$loan=='no',1,0))

# Creating a feature to represent people with no default and no house
banks$no_credit_no_house <- as.factor(ifelse(banks$default=='no' & banks$housing=='no',1,0))

# Creating a feature to represent people contacted by cellular who were never contacted before
banks$cell_cont_before <- as.factor(ifelse(banks$contact=='cellular' & banks$cont_before=='no',1,0))

# Creating a feature to represent people in the mature age group with wealth level lvl1
banks$mature_lvl1 <- as.factor(ifelse(banks$age_cat=='mature' & banks$balance_lvl=='lvl1',1,0))

3.1.1.7 Creating New Features from Numerical Features

# Creating a feature with relation between age and balance
banks$age_balan <- banks$balance/banks$age

# Creating a feature with relation between previous and pdays
banks$previous_pdays <- banks$previous/banks$pdays

3.1.1.8 Converting Features into Categories

banks$cat_age_balan <- ifelse(banks$age_balan>=100,'out',NA) # values of 100 or more are classified as 'out' (outliers)
banks$cat_age_balan <- ifelse(banks$age_balan<0,'out',banks$cat_age_balan) # values below 0 are also classified as 'out'
banks$cat_age_balan <- ifelse(banks$age_balan>=0 & banks$age_balan<100,'in',banks$cat_age_balan)
banks$cat_age_balan <- as.factor(banks$cat_age_balan) # converting into factors

# Let's classify previous_pdays values different from 0 as outliers
banks$cat_previous_pdays <- as.factor(ifelse(banks$previous_pdays==0,'inn','outt'))
# removing numerical variables
banks$age <- NULL
banks$balance <- NULL
banks$previous <- NULL
banks$pdays <- NULL
banks$day <- NULL
banks$age_balan <- NULL
banks$previous_pdays <- NULL

3.1.1.9 Modeling with the New Features

Train model

# Using oversampling
set.seed(50)
bank_oversample2 <- upSample(banks, banks$y)

# Creating train and test data without oversampling to do validation
set.seed(50)
inTrain <- createDataPartition(banks$y, p = 0.7, list = FALSE)

train_noover <- banks[inTrain, ]

test_noover <- banks[-inTrain, ]

# Creating train and test data with oversampling to training
set.seed(50)
inTrain2 <- createDataPartition(bank_oversample2$Class, p = 0.7, list = FALSE)

train_over <- bank_oversample2[inTrain2, ]

test_over <- bank_oversample2[-inTrain2, ]
# Training the model2
set.seed(50)
nb2 = naive_bayes(Class ~ . , laplace = 1, usekernel = F, data = train_over)

# Predicting on train_over dataset
nb_train_pred2 <- predict(nb2, train_over, type = "class")

confusion_train <- confusionMatrix(nb_train_pred2, train_over$Class, positive = 'yes')
options(repr.plot.width = 16, repr.plot.height = 12) # adjusting the plot
draw_confusion_matrix(confusion_train)

Test Model

# predicting in no oversampling test dataset
nb_test_pred2 <- predict(nb2, test_noover, type = "class")

confusion_test <- confusionMatrix(nb_test_pred2, test_noover$y, positive = 'yes')
options(repr.plot.width = 16, repr.plot.height = 12) # adjusting the plot
draw_confusion_matrix(confusion_test) # plotting confusion matrix

Insight

The model trained on the engineered features shows an improvement, but with 69% accuracy it is still not good enough.

3.1.1.10 Evaluating the Model Using ROC

# Plotting the ROC curve
probabilits <- predict(nb2, type ='prob', test_noover) 

nb2_probs <- prediction(probabilits[,2], test_noover$y)
plot(performance(nb2_probs, "tpr", "fpr"), col = "red", main = "Area Under the Curve - AUC")
abline(0,1, lty = 8, col = "grey")

#AUC
auc <- performance(nb2_probs, "auc")
value_auc <- slot(auc, "y.values")[[1]]
value_auc
#> [1] 0.7430197

Insight

AUC is an evaluation metric for classification models that describes how well the model can distinguish between the two classes.

The AUC obtained is about 0.74. Although this result is fairly good, it should still be possible to improve on it by trying other classification models.
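
To make the ROC/AUC idea concrete, here is a tiny toy example with hypothetical scores and labels (not from the bank data), using the same ROCR calls as above: a score vector that ranks every 'yes' above every 'no' gives an AUC of 1, while random scores hover around 0.5.

# Toy illustration of AUC with ROCR (hypothetical data)
toy_labels <- factor(c('no', 'no', 'no', 'yes', 'yes'), levels = c('no', 'yes'))
toy_scores <- c(0.1, 0.2, 0.3, 0.8, 0.9) # ranks every 'yes' above every 'no'
toy_pred <- prediction(toy_scores, toy_labels)
slot(performance(toy_pred, 'auc'), 'y.values')[[1]] # 1, i.e. perfect separation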

3.2 Decision Tree

3.2.1 Training Data

# Training the model
tree1 = rpart(Class ~ ., data = train_over,method='class')

# Plotting the decision tree
rpart.plot(tree1,box.palette = 'RdBu'
           ,shadow.col = 'gray'
           ,nn=T,main='Decision Tree')

3.2.2 Confusion Matrix

# Predicting on train_over dataset
tree_train_pred1 <- predict(tree1, train_over, type = "class")

confusion_train <- confusionMatrix(tree_train_pred1, train_over$Class, positive = 'yes')
options(repr.plot.width = 16, repr.plot.height = 12) # adjusting the plot
draw_confusion_matrix(confusion_train) # plotting confusion matrix

3.2.3 Predicting

# Predicting in test_noover dataset
tree_test_pred1 <- predict(tree1, test_noover, type = "class")

confusion_test <- confusionMatrix(tree_test_pred1, test_noover$y, positive = 'yes')
options(repr.plot.width = 16, repr.plot.height = 12) # adjusting the plot
draw_confusion_matrix(confusion_test)

Insight

With the decision tree model, accuracy increases to 82%, but the sensitivity is far too low at only 51%. The model is not able to generalize well to the test dataset.

3.2.4 Evaluating Model

# Plotting ROC curve for decision tree model
probabilits <- predict(tree1, type ='prob', test_noover) 

tree_probs <- prediction(probabilits[,2], test_noover$y)
plot(performance(tree_probs, "tpr", "fpr"), col = "red", main = "AUC - Decision Tree")
abline(0,1, lty = 8, col = "grey")

# AUC
auc_trees <- performance(tree_probs, "auc")
value_auc_trees <- slot(auc_trees, "y.values")[[1]]
value_auc_trees
#> [1] 0.7314527

Insight

An AUC of 73% is decent, but with a sensitivity of only 56% there is a concern that the model will produce many false negatives (FN), cases that are actually positive but are predicted as negative.

3.3 Random Forest

3.3.1 Training Data

# Training the model
set.seed(50)
forest1 = randomForest(Class ~ ., data = train_over,ntree=100,importance=T)

# Predicting on train_over dataset
forest_train_pred1 <- predict(forest1, train_over, type = "class")

options(repr.plot.width = 14, repr.plot.height = 10)
plot(forest1, main='Random Forest Error Decreasing') # Plotting the forest error rates against the number of trees

3.3.2 Visualizing Variable Importance

# Getting variable importance from the model
importance <- as.data.frame(forest1$importance)
importance$features <- row.names(importance)
importance$MeanDecreaseGini <- round(importance$MeanDecreaseGini,2)

# Plotting a bar chart of feature importance
options(repr.plot.width = 24, repr.plot.height = 10)
ggplotly(ggplot(importance, aes(x = features, y = MeanDecreaseGini)) +
  geom_bar(fill = "#8CC0DE", stat = "identity") +
  geom_text(aes(label=MeanDecreaseGini), vjust = -0.3) +
  ggtitle('Features Importance'))

3.3.3 Confusion Matrix (Training)

# Saving and plotting confusion matrix
options(repr.plot.width = 16, repr.plot.height = 12) # adjusting the plot
confusion_forest_train <- confusionMatrix(forest_train_pred1, train_over$Class, positive = 'yes')
draw_confusion_matrix(confusion_forest_train)

Insight

With the Random Forest model every metric improves: on the training set, accuracy rises to 83% and sensitivity to 77%.

3.3.4 Confusion Matrix (Test)

# Predicting in test_noover dataset
forest_test_pred1 <- predict(forest1, test_noover, type = "class")

confusion_forest_test <- confusionMatrix(forest_test_pred1, test_noover$y, positive = 'yes')
draw_confusion_matrix(confusion_forest_test)

Insight

On the test set the Random Forest model again improves across the board, with accuracy of 84% and sensitivity of 76%.

3.3.5 Evaluating Model Using ROC and AUC

# Plotting ROC curve for the random forest model
prob_forest <- predict(forest1, type ='prob', test_noover)

forest_probs <- prediction(prob_forest[,2], test_noover$y)
plot(performance(forest_probs, "tpr", "fpr"), col = "red", main = "AUC - Random Forest")
abline(0,1, lty = 8, col = "grey")

#AUC from random forests
auc_forest <- performance(forest_probs, "auc")
value_auc_forest <- slot(auc_forest, "y.values")[[1]]
value_auc_forest
#> [1] 0.8674292

Insight

In the evaluation of the random forest model, an AUC of about 86% shows that the model is good at distinguishing between the two classes.

4. Conclusion

The Random Forest model is the best of the three, outperforming both the Naive Bayes and Decision Tree models. It achieves an AUC of 86% and an accuracy of 83%, with 76% sensitivity and 85% specificity.