The data is related with direct marketing campaigns of a Portuguese banking institution. The marketing campaigns were based on phone calls. What the best algorithms to predict a term deposit and improve the next campaign efficiency? Use this full version of bank marketing campaign from UCI Machine Learning Repository to find this answer!
library(tidyverse)
library(plotly)
library(gridExtra)
library(dplyr)
library(GGally)
library(caret)
library(naivebayes)
library(rpart)
library(rpart.plot)
library(randomForest)
library(ROCR)
library(e1071)
library(repr)
library(devtools)
banks <- read.csv("bank-full.csv",header=T,sep=';')
head(banks)
#> age job marital education default balance housing loan contact day
#> 1 58 management married tertiary no 2143 yes no unknown 5
#> 2 44 technician single secondary no 29 yes no unknown 5
#> 3 33 entrepreneur married secondary no 2 yes yes unknown 5
#> 4 47 blue-collar married unknown no 1506 yes no unknown 5
#> 5 33 unknown single unknown no 1 no no unknown 5
#> 6 35 management married tertiary no 231 yes no unknown 5
#> month durations campaign pdays previous poutcome y
#> 1 may 261 1 -1 0 unknown no
#> 2 may 151 1 -1 0 unknown no
#> 3 may 76 1 -1 0 unknown no
#> 4 may 92 1 -1 0 unknown no
#> 5 may 198 1 -1 0 unknown no
#> 6 may 139 1 -1 0 unknown no
banks <- banks %>%
mutate(job = as.factor(job),
marital = as.factor(marital),
education = as.factor(education),
default = as.factor(default),
housing = as.factor(housing),
loan = as.factor(loan),
contact = as.factor(contact),
month = as.factor(month),
poutcome = as.factor(poutcome),
y = as.factor(y))
str(banks)
#> 'data.frame': 45210 obs. of 17 variables:
#> $ age : int 58 44 33 47 33 35 28 42 58 43 ...
#> $ job : Factor w/ 12 levels "admin.","blue-collar",..: 5 10 3 2 12 5 5 3 6 10 ...
#> $ marital : Factor w/ 3 levels "divorced","married",..: 2 3 2 2 3 2 3 1 2 3 ...
#> $ education: Factor w/ 4 levels "primary","secondary",..: 3 2 2 4 4 3 3 3 1 2 ...
#> $ default : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 2 1 1 ...
#> $ balance : int 2143 29 2 1506 1 231 447 2 121 593 ...
#> $ housing : Factor w/ 2 levels "no","yes": 2 2 2 2 1 2 2 2 2 2 ...
#> $ loan : Factor w/ 2 levels "no","yes": 1 1 2 1 1 1 2 1 1 1 ...
#> $ contact : Factor w/ 3 levels "cellular","telephone",..: 3 3 3 3 3 3 3 3 3 3 ...
#> $ day : int 5 5 5 5 5 5 5 5 5 5 ...
#> $ month : Factor w/ 12 levels "apr","aug","dec",..: 9 9 9 9 9 9 9 9 9 9 ...
#> $ durations: int 261 151 76 92 198 139 217 380 50 55 ...
#> $ campaign : int 1 1 1 1 1 1 1 1 1 1 ...
#> $ pdays : int -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
#> $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
#> $ poutcome : Factor w/ 4 levels "failure","other",..: 4 4 4 4 4 4 4 4 4 4 ...
#> $ y : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
colSums(is.na(banks))
#> age job marital education default balance housing loan
#> 0 0 0 0 0 0 0 0
#> contact day month durations campaign pdays previous poutcome
#> 0 0 0 0 0 0 0 0
#> y
#> 0
Insight
As we can see the feature ‘durations’ have GREAT importance to predict our target bigger than double of all another features Let’s remove this feature to do a realistic model
# Dropping duration feature
banks$durations <- NULL
# Creating a boxplot for each numerical variables vs target
BoxPlotAge <- ggplotly(ggplot(banks, aes(x = y, y = age)) +
geom_boxplot(fill = "#D27685", colour = "#1F3552", alpha = 0.6) +
scale_y_continuous(name = "Age") +
scale_x_discrete(name = "Target") +
ggtitle("Age") +
theme_gray() +
theme(axis.text.x = element_text(angle = 1, hjust = 1)
,axis.title = element_text(size = rel(2), angle = 1)
,plot.title = element_text(size = rel(2.2))
,axis.text = element_text(size = rel(1.8))
,axis.ticks = element_line(size = 1)))
BoxPlotBalance <- ggplotly(ggplot(banks, aes(x = y, y = balance )) +
geom_boxplot(fill = "#D27685", colour = "#1F3552", alpha = 0.6) +
scale_y_continuous(name = "Balance") +
scale_x_discrete(name = "Target") +
ggtitle("Balance") +
theme_gray() +
theme(axis.text.x = element_text(angle = 1, hjust = 1)
,axis.title = element_text(size = rel(2), angle = 1)
,plot.title = element_text(size = rel(2.2))
,axis.text = element_text(size = rel(1.8))
,axis.ticks = element_line(size = 1)))
BoxPlotDay <- ggplotly(ggplot(banks, aes(x = y, y = day )) +
geom_boxplot(fill = "#D27685", colour = "#1F3552", alpha = 0.6) +
scale_y_continuous(name = "Day") +
scale_x_discrete(name = "Target") +
ggtitle("Day") +
theme_gray() +
theme(axis.text.x = element_text(angle = 1, hjust = 1)
,axis.title = element_text(size = rel(2), angle = 1)
,plot.title = element_text(size = rel(2.2))
,axis.text = element_text(size = rel(1.8))
,axis.ticks = element_line(size = 1)))
BoxPlotCampaign <- ggplotly(ggplot(banks, aes(x = y, y = campaign )) +
geom_boxplot(fill = "#D27685", colour = "#1F3552", alpha = 0.6) +
scale_y_continuous(name = "Campaign") +
scale_x_discrete(name = "Target") +
ggtitle("Campaign") +
theme_gray() +
theme(axis.text.x = element_text(angle = 1, hjust = 1)
,axis.title = element_text(size = rel(2), angle = 1)
,plot.title = element_text(size = rel(2.2))
,axis.text = element_text(size = rel(1.8))
,axis.ticks = element_line(size = 1)))
BoxPlotpDays <- ggplotly(ggplot(banks, aes(x = y, y = pdays )) +
geom_boxplot(fill = "#D27685", colour = "#1F3552", alpha = 0.6) +
scale_y_continuous(name = "pdays") +
scale_x_discrete(name = "Target") +
ggtitle("Pdays") +
theme_gray() +
theme(axis.text.x = element_text(angle = 1, hjust = 1)
,axis.title = element_text(size = rel(2), angle = 1)
,plot.title = element_text(size = rel(2.2))
,axis.text = element_text(size = rel(1.8))
,axis.ticks = element_line(size = 1)))
BoxPlotPrevious <- ggplotly(ggplot(banks, aes(x = y, y = previous )) +
geom_boxplot(fill = "#D27685", colour = "#1F3552", alpha = 0.6) +
scale_y_continuous(name = "Previous") +
scale_x_discrete(name = "Target") +
ggtitle("Previous") +
theme_gray() +
theme(axis.text.x = element_text(angle = 1, hjust = 1)
,axis.title = element_text(size = rel(2), angle = 1)
,plot.title = element_text(size = rel(2.2))
,axis.text = element_text(size = rel(1.8))
,axis.ticks = element_line(size = 1)))
BoxPlotAge
BoxPlotBalance
BoxPlotDay
BoxPlotCampaign
BoxPlotPrevious
BoxPlotpDays
# Creating and storaging in variables a graphic with the freaquencys above
PlotJob <- ggplotly(ggplot(banks %>% count(job, y), aes(x = job, y = n )) +
geom_bar(aes(fill = job), stat = "identity", color = "white") +
scale_y_discrete(name = "Frequencia") +
scale_x_discrete(name = "Job") +
ggtitle("Job") +
facet_wrap(~y)
)
PlotMarital <- ggplotly(ggplot(banks %>% count(marital, y), aes(x = marital, y = n )) +
geom_bar(aes(fill = marital), stat = "identity", color = "white") +
scale_y_discrete(name = "Frequencia") +
scale_x_discrete(name = "Marital") +
ggtitle("Marital")+
facet_wrap(~y)
)
PlotEducation <- ggplotly(ggplot(banks %>% count(education, y), aes(x = education, y = n )) +
geom_bar(aes(fill = education), stat = "identity", color = "white") +
scale_y_discrete(name = "Frequencia") +
scale_x_discrete(name = "education") +
ggtitle("Education") +
facet_wrap(~y)
)
PlotDefault <- ggplotly(ggplot(banks %>% count(default, y), aes(x = default, y = n )) +
geom_bar(aes(fill = default), stat = "identity", color = "white") +
scale_y_discrete(name = "Frequencia") +
scale_x_discrete(name = "default") +
ggtitle("Default") +
facet_wrap(~y)
)
PlotHousing <- ggplotly(ggplot(banks %>% count(housing, y), aes(x = housing, y = n )) +
geom_bar(aes(fill = housing), stat = "identity", color = "white") +
scale_y_discrete(name = "Frequencia") +
scale_x_discrete(name = "housing") +
ggtitle("Housing") +
facet_wrap(~y)
)
PlotLoan <- ggplotly(ggplot(banks %>% count(loan, y), aes(x = loan, y = n )) +
geom_bar(aes(fill = loan), stat = "identity", color = "white") +
scale_y_discrete(name = "Frequencia") +
scale_x_discrete(name = "loan") +
ggtitle("Loan") +
facet_wrap(~y)
)
PlotContact <- ggplotly(ggplot(banks %>% count(contact, y), aes(x = contact, y = n )) +
geom_bar(aes(fill = contact), stat = "identity", color = "white") +
scale_y_discrete(name = "Frequencia") +
scale_x_discrete(name = "contact") +
ggtitle("Contact") +
facet_wrap(~y)
)
PlotMoonth <- ggplotly(ggplot(banks %>% count(month, y), aes(x = month, y = n )) +
geom_bar(aes(fill = month), stat = "identity", color = "white") +
scale_y_discrete(name = "Frequencia") +
scale_x_discrete(name = "month") +
ggtitle("month") +
facet_wrap(~y)
)
PlotOutCome <- ggplotly(ggplot(banks %>% count(poutcome, y), aes(x = poutcome, y = n )) +
geom_bar(aes(fill = poutcome), stat = "identity", color = "white") +
scale_y_discrete(name = "Frequencia") +
scale_x_discrete(name = "poutcome") +
ggtitle("Poutcome")+
facet_wrap(~y)
)
PlotJob
PlotMarital
PlotEducation
PlotDefault
PlotHousing
PlotLoan
PlotContact
PlotMoonth
PlotOutCome
Insight
• Dari data tersebut menunjukan individu yang menikah, telah menyelesaikan pendidikan sekunder, dan orang yang tidak memiliki rumah lebih cenderung untuk berlangganan deposito jangka panjang.
• Orang yang tidak memiliki pinjaman dan memiliki poutcome (hasil kampanye pemasaran sebelumnya) yang tidak diketahui, cenderung berlangganan deposito jangka panjang. Bulan Mei, Agustus, dan Juli juga lebih baik untuk menjalankan kampanye pemasaran.
ggplotly(banks %>%
keep(is.numeric) %>%
ggcorr(name = 'correlations'
,label = T
)
)
Insight
Dari grafik diatas korelasi kuat antara previous dengan pdays yaitu 0.5.
# counting yes and no
proportion <- banks %>% count(y)
# Ploting class proportion
ggplotly(ggplot(proportion, aes(x = y, y = n)) +
geom_bar(fill = c('#FD8A8A','#A8D1D1'), stat = "identity") +
geom_text(aes(label=n)) +
ggtitle('Class proportion'))
set.seed(50)
bank_oversample <- upSample(banks, banks$y) # creating a new dataset with oversampling
dim(bank_oversample)
#> [1] 79842 16
# Counting categorys frequency
proportion2 <- bank_oversample %>% count(Class)
ggplotly(ggplot(proportion2, aes(x = Class, y = n)) +
geom_bar(fill = c('#FD8A8A','#A8D1D1'), stat = "identity") +
geom_text(aes(label=n)) +
ggtitle('Class proportion')
)
set.seed(50) # setting a seed to reproduce this model
inTrain <- createDataPartition(banks$y, p = 0.7, list = FALSE) # Partitioning the dataset 70% train 30% test
train_noover <- banks[inTrain, ] # Creating the train dataset
test_noover <- banks[-inTrain, ] # Creating the test dataset
cat('no_oversampling train dataset dimensions: ',dim(train_noover),'\n')
#> no_oversampling train dataset dimensions: 31648 16
cat('no_oversampling test dataset dimensions: ',dim(test_noover),'\n')
#> no_oversampling test dataset dimensions: 13562 16
set.seed(50) # setting a seed to reproduce this model
inTrain2 <- createDataPartition(bank_oversample$Class, p = 0.7, list = FALSE) # Partitioning the dataset 70% train 30% test
train_over <- bank_oversample[inTrain2, ] # Creating the train dataset
test_over <- bank_oversample[-inTrain2, ] # Creating the test dataset
cat('Oversampling train dataset dimensions: ',dim(train_over),'\n')
#> Oversampling train dataset dimensions: 55890 16
cat('Oversampling test dataset dimensions: ',dim(test_over),'\n')
#> Oversampling test dataset dimensions: 23952 16
# Training the model
set.seed(50)
nb1 = naive_bayes(Class ~ . , laplace = 1, usekernel = F, data = train_over)
# Predicting on train_over dataset
nb_train_pred1 <- predict(nb1, train_over, type = "class")
#visualisasi cinfusion matrix
source_url('https://github.com/wellingtsilvdev/codes-with-real-utilities/blob/master/draw_confusion_matrix.R?raw=TRUE') # calling travel function
# Plotting confusion matrix
confusion_train <- confusionMatrix(nb_train_pred1, train_over$Class, positive = 'yes')
draw_confusion_matrix(confusion_train)
Insight
68% of accuracy with sensitivity and specificity very balanced
# Predicting in test_noover dataset
nb_test_pred1 <- predict(nb1, test_noover, type = "class")
confusion_test <- confusionMatrix(nb_test_pred1, test_noover$y, positive = 'yes')
draw_confusion_matrix(confusion_test) # plotting confusionmatrix
Insight
Dari tes model mendapatkan akurasi 68%, tentu saja model ini tidak cukup baik. Model memiliki sensitivi 66% dan specificity 68%.
Membagi data pada variabel “campaign” menjadi dua kategori yaitu “priority” (untuk campaign yang melakukan 4 atau kurang kontak) dan “not_priority” (untuk campaign yang melakukan lebih dari 4 kontak).
banks$campaign_cat <- as.factor(ifelse(banks$campaign<=4,'priority','not_priority'))
banks$campaign <- NULL # Removing the old feature
Mengklasifikasikan umur ke young, mature, dan old dengan ketentuan:
<30 = young
30-50 = mature
up 50 = old
banks$age_cat <- ifelse(banks$age>=50,'old',NA)
banks$age_cat <- ifelse(banks$age<=30,'young',banks$age_cat)
banks$age_cat <- ifelse(banks$age>30 & banks$age<50,'mature',banks$age_cat)
banks$age_cat <- as.factor(banks$age_cat)
Previous 0 berarti client belum pernah contact sebelumnya
# Classifying previous into contacted before and non_contacted before
banks$cont_before <- as.factor(ifelse(banks$previous==0,'no','yes'))
Mengklasifikasikan balance client berdasarkan tingkat kekayaan, jika saldo kurang dari 1500 maka akan dianggap negatif
banks$balance_lvl <- ifelse(banks$balance<1500,'negative',NA)
banks$balance_lvl <- ifelse(banks$balance<=10000 & banks$balance>=0,'lvl1',banks$balance_lvl)
banks$balance_lvl <- ifelse(banks$balance>10000 & banks$balance<=40000,'lvl2',banks$balance_lvl)
banks$balance_lvl <- ifelse(banks$balance>40000,'lvl3',banks$balance_lvl)
banks$balance_lvl <- as.factor(banks$balance_lvl)
Mengklasifikasikan day ke bulan, middle month dan final of month
banks$moth_stage <- ifelse(banks$day<=7,'start_m',NA)
banks$moth_stage <- ifelse(banks$day>=22,'final_m',banks$moth_stage)
banks$moth_stage <- ifelse(banks$day<22 & banks$day>7,'middle_m',banks$moth_stage)
banks$moth_stage <- as.factor(banks$moth_stage)
#check again
head(banks)
#> age job marital education default balance housing loan contact day
#> 1 58 management married tertiary no 2143 yes no unknown 5
#> 2 44 technician single secondary no 29 yes no unknown 5
#> 3 33 entrepreneur married secondary no 2 yes yes unknown 5
#> 4 47 blue-collar married unknown no 1506 yes no unknown 5
#> 5 33 unknown single unknown no 1 no no unknown 5
#> 6 35 management married tertiary no 231 yes no unknown 5
#> month pdays previous poutcome y campaign_cat age_cat cont_before balance_lvl
#> 1 may -1 0 unknown no priority old no lvl1
#> 2 may -1 0 unknown no priority mature no lvl1
#> 3 may -1 0 unknown no priority mature no lvl1
#> 4 may -1 0 unknown no priority mature no lvl1
#> 5 may -1 0 unknown no priority mature no lvl1
#> 6 may -1 0 unknown no priority mature no lvl1
#> moth_stage
#> 1 start_m
#> 2 start_m
#> 3 start_m
#> 4 start_m
#> 5 start_m
#> 6 start_m
colSums(is.na(banks))
#> age job marital education default balance
#> 0 0 0 0 0 0
#> housing loan contact day month pdays
#> 0 0 0 0 0 0
#> previous poutcome y campaign_cat age_cat cont_before
#> 0 0 0 0 0 0
#> balance_lvl moth_stage
#> 0 0
# Creating a feature to represent people married and with no house
banks$no_house_married <- as.factor(ifelse(banks$marital=='married' & banks$housing=='no',1,0))
# Creating a feature to represent people with secondary education and no loan
banks$no_loan_secedu <- as.factor(ifelse(banks$education=='secondary' & banks$loan=='no',1,0))
# Creating a feature to represent people with no default and no house
banks$no_credit_no_house <- as.factor(ifelse(banks$default=='no' & banks$housing=='no',1,0))
# Creating a feature to represent people that are concated by celluar and was never concated before
banks$cell_cont_before <- as.factor(ifelse(banks$contact=='cellular' & banks$cont_before=='no',1,0))
# Creating a feature to represent people that are in mature age and have a wealth lvl1
banks$mature_lvl1 <- as.factor(ifelse(banks$age_cat=='mature' & banks$balance_lvl=='lvl1',1,0))
# Creating a feature with relation between age and balance
banks$age_balan <- banks$balance/banks$age
# Creating a feature with relation between previous and pdays
banks$previous_pdays <- banks$previous/banks$pdays
banks$cat_age_balan <- ifelse(banks$age_balan>=100,'out',NA) # more then 100 let's classify as out(outlayer)
banks$cat_age_balan <- ifelse(banks$age_balan<0,'out',banks$cat_age_balan) # less then 0 let's classify as out
banks$cat_age_balan <- ifelse(banks$age_balan>=0 & banks$age_balan<100,'in',banks$cat_age_balan)
banks$cat_age_balan <- as.factor(banks$cat_age_balan) # converting into factors
# let's classify previous_pdays diferent from 0 as outlayers
banks$cat_previous_pdays <- as.factor(ifelse(banks$previous_pdays==0,'inn','outt'))
# removing numerical variables
banks$age <- NULL
banks$balance <- NULL
banks$previous <- NULL
banks$pdays <- NULL
banks$day <- NULL
banks$age_balan <- NULL
banks$previous_pdays <- NULL
Train model
# Using oversampling
set.seed(50)
bank_oversample2 <- upSample(banks, banks$y)
# Creating train and test data without oversampling to do validation
set.seed(50)
inTrain <- createDataPartition(banks$y, p = 0.7, list = FALSE)
train_noover <- banks[inTrain, ]
test_noover <- banks[-inTrain, ]
# Creating train and test data with oversampling to training
set.seed(50)
inTrain2 <- createDataPartition(bank_oversample2$Class, p = 0.7, list = FALSE)
train_over <- bank_oversample2[inTrain2, ]
test_over <- bank_oversample2[-inTrain2, ]
# Training the model2
set.seed(50)
nb2 = naive_bayes(Class ~ . , laplace = 1, usekernel = F, data = train_over)
# Predicting on train_over dataset
nb_train_pred2 <- predict(nb2, train_over, type = "class")
confusion_train <- confusionMatrix(nb_train_pred2, train_over$Class, positive = 'yes')
options(repr.plot.width = 16, repr.plot.height = 12) # ajusting the plot
draw_confusion_matrix(confusion_train)
Test Model
# predicting in no oversampling test dataset
nb_test_pred2 <- predict(nb2, test_noover, type = "class")
confusion_test <- confusionMatrix(nb_test_pred2, test_noover$y, positive = 'yes')
options(repr.plot.width = 16, repr.plot.height = 12) # ajusting the plot
draw_confusion_matrix(confusion_test) # plotting confusion matrix
Insight
Terlihat dari model yang telah dilakukan feature selection terjadi peningkatan, namun model dengan akurasi 69% masih belum baik.
#Plotting AUC
probabilits <- predict(nb2, type ='prob', test_noover)
nb2_probs <- prediction(probabilits[,2], test_noover$y)
plot(performance(nb2_probs, "tpr", "fpr"), col = "red", main = "Area Under the Curve - AUC")
abline(0,1, lty = 8, col = "grey")
#AUC
auc <- performance(nb2_probs, "auc")
value_auc <- slot(auc, "y.values")[[1]]
value_auc
#> [1] 0.7430197
Insight
AUC adalah ukuran evaluasi kinerja model klasifikasi yang menggambarkan seberapa baik model tersebut dapat membedakan antara dua kelas.
Hasil AUC yang diperoleh adalah 0.736184222164374 atau sekitar 73%. Meskipun hasil ini cukup baik, namun tetap dianggap dapat ditingkatkan lagi dengan mencoba model-model klasifikasi lainnya.
# Training the model
tree1 = rpart(Class ~ ., data = train_over,method='class')
# Plotting the decision tree
rpart.plot(tree1,box.palette = 'RdBu'
,shadow.col = 'gray'
,nn=T,main='Decision Tree')
### 3.2.2 Confusion Matrix
# Predicting on train_over dataset
tree_train_pred1 <- predict(tree1, train_over, type = "class")
confusion_train <- confusionMatrix(tree_train_pred1, train_over$Class, positive = 'yes')
options(repr.plot.width = 16, repr.plot.height = 12) # ajusting the plot
draw_confusion_matrix(confusion_train) # plotting confusion matrix
### 3.2.3 Predicting
# Predicting in test_noover dataset
tree_test_pred1 <- predict(tree1, test_noover, type = "class")
confusion_test <- confusionMatrix(tree_test_pred1, test_noover$y, positive = 'yes')
options(repr.plot.width = 16, repr.plot.height = 12) # ajusting the plot
draw_confusion_matrix(confusion_test)
Insight
Dalam penggunaan model decision trees, akurasi model meningkat menjadi 82%, namun sensitivitasnya terlalu kecil, hanya 51%. Model yang dibuat tidak mampu mengeneralisasi hasil pada datase
# Plotting ROC curve for decision tree model
probabilits <- predict(tree1, type ='prob', test_noover)
nb2_probs <- prediction(probabilits[,2], test_noover$y)
plot(performance(nb2_probs, "tpr", "fpr"), col = "red", main = "AUC - Decision Tree")
abline(0,1, lty = 8, col = "grey")
# AUC
auc_trees <- performance(nb2_probs, "auc")
value_auc_trees <- slot(auc_trees, "y.values")[[1]]
value_auc_trees
#> [1] 0.7314527
Insight
Nilai Auc 73% sudah baik namun dengan nilai sensitivity 56%, dikhawatirkan model akan cenderung menghasilkan lebih banyak false negative (FN) atau prediksi negatif palsu, di mana data sebenarnya positif namun diprediksi sebagai negatif.
# Training the model
set.seed(50)
forest1 = randomForest(Class ~ ., data = train_over,ntree=100,importance=T)
# Predicting on train_over dataset
forest_train_pred1 <- predict(forest1, train_over, type = "class")
options(repr.plot.width = 14, repr.plot.height = 10)
plot(forest1,main='Random Forest Error Decreassing') # Ploting the forest error
### 3.3.2 Visualize Variabel Importance
# Catting var importance from model
importance <- as.data.frame(forest1$importance)
importance$features <- row.names(importance)
importance$MeanDecreaseGini <- round(importance$MeanDecreaseGini,2)
# Ploting a bar graphic with features importance proportion
options(repr.plot.width = 24, repr.plot.height = 10)
ggplotly(ggplot(importance, aes(x = features, y = MeanDecreaseGini)) +
geom_bar(fill = "#8CC0DE", stat = "identity") +
geom_text(aes(label=MeanDecreaseGini), vjust = -0.3) +
ggtitle('Features Importance'))
# Saving and plotting confusion matrix
options(repr.plot.width = 16, repr.plot.height = 12) # ajusting the plot
confusion_forest_train <- confusionMatrix(forest_train_pred1, train_over$Class, positive = 'yes')
draw_confusion_matrix(confusion_forest_train)
Insight
Pada Model Random Forest Terjadi peningkatan di seluruh parameter yaitu pada accuracy menjadi 83% dan sensitivity menjadi 77%
# Predicting in test_noover dataset
forest_test_pred1 <- predict(forest1, test_noover, type = "class")
confusion_forest_test <- confusionMatrix(forest_test_pred1, test_noover$y, positive = 'yes')
draw_confusion_matrix(confusion_forest_test)
Insight
Pada Model Random Forest Terjadi peningkatan di seluruh parameter yaitu pada accuracy menjadi 84% dan sensitivity menjadi 76%
#Plotting ROC curve for decision tree model
prob_forest <- predict(forest1, type ='prob', test_noover)
forest_probs <- prediction(prob_forest[,2], test_noover$y)
plot(performance(forest_probs, "tpr", "fpr"), col = "red", main = "AUC - Random Forest")
abline(0,1, lty = 8, col = "grey")
#AUC from random forests
auc_forest <- performance(forest_probs, "auc")
value_auc_forest <- slot(auc_forest, "y.values")[[1]]
value_auc_forest
#> [1] 0.8674292
Insight
Pada evaluasi model random forest nilai AUC sebesar 86% artinya model Random Forest menunjukkan bahwa model tersebut baik dalam membedakan antara dua kelas atau target yang berbeda.
Model dengan teknik Random Forest merupakan model terbaik dibandingkan dengan model Naive Bayes dan Decision Trees. Model Random Forest memiliki nilai AUC sebesar 86% dan akurasi sebesar 83% dengan tingkat sensitivitas 76% dan spesifisitas 85%.