First Step

library(tidyverse)
library(GGally)
library(readxl)
library(Amelia)
library(gridExtra)
library(ggthemr)
bankdata <- read_csv("/USERS/LENOVO/Downloads/bankdata.csv")
bankdata %>% glimpse()
## Observations: 11,162
## Variables: 17
## $ age       <dbl> 59, 56, 41, 55, 54, 42, 56, 60, 37, 28, 38, 30, 29, 46, 31,…
## $ job       <chr> "admin.", "admin.", "technician", "services", "admin.", "ma…
## $ marital   <chr> "married", "married", "married", "married", "married", "sin…
## $ education <chr> "secondary", "secondary", "secondary", "secondary", "tertia…
## $ default   <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "no",…
## $ balance   <dbl> 2343, 45, 1270, 2476, 184, 0, 830, 545, 1, 5090, 100, 309, …
## $ housing   <chr> "yes", "no", "yes", "yes", "no", "yes", "yes", "yes", "yes"…
## $ loan      <chr> "no", "no", "no", "no", "no", "yes", "yes", "no", "no", "no…
## $ contact   <chr> "unknown", "unknown", "unknown", "unknown", "unknown", "unk…
## $ day       <dbl> 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8,…
## $ month     <chr> "may", "may", "may", "may", "may", "may", "may", "may", "ma…
## $ duration  <dbl> 1042, 1467, 1389, 579, 673, 562, 1201, 1030, 608, 1297, 786…
## $ campaign  <dbl> 1, 1, 1, 1, 2, 2, 1, 1, 1, 3, 1, 2, 4, 2, 2, 1, 3, 1, 2, 1,…
## $ pdays     <dbl> -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,…
## $ previous  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ poutcome  <chr> "unknown", "unknown", "unknown", "unknown", "unknown", "unk…
## $ deposit   <chr> "yes", "yes", "yes", "yes", "yes", "yes", "yes", "yes", "ye…
bankdata <- bankdata %>% 
  mutate_if(is.character, as.factor) %>% 
  mutate_if(is.double, as.integer)
bankdata %>% glimpse()
## Observations: 11,162
## Variables: 17
## $ age       <int> 59, 56, 41, 55, 54, 42, 56, 60, 37, 28, 38, 30, 29, 46, 31,…
## $ job       <fct> admin., admin., technician, services, admin., management, m…
## $ marital   <fct> married, married, married, married, married, single, marrie…
## $ education <fct> secondary, secondary, secondary, secondary, tertiary, terti…
## $ default   <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no, no,…
## $ balance   <int> 2343, 45, 1270, 2476, 184, 0, 830, 545, 1, 5090, 100, 309, …
## $ housing   <fct> yes, no, yes, yes, no, yes, yes, yes, yes, yes, yes, yes, y…
## $ loan      <fct> no, no, no, no, no, yes, yes, no, no, no, no, no, yes, no, …
## $ contact   <fct> unknown, unknown, unknown, unknown, unknown, unknown, unkno…
## $ day       <int> 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8,…
## $ month     <fct> may, may, may, may, may, may, may, may, may, may, may, may,…
## $ duration  <int> 1042, 1467, 1389, 579, 673, 562, 1201, 1030, 608, 1297, 786…
## $ campaign  <int> 1, 1, 1, 1, 2, 2, 1, 1, 1, 3, 1, 2, 4, 2, 2, 1, 3, 1, 2, 1,…
## $ pdays     <int> -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,…
## $ previous  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ poutcome  <fct> unknown, unknown, unknown, unknown, unknown, unknown, unkno…
## $ deposit   <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, yes,…
head(bankdata)
## # A tibble: 6 x 17
##     age job   marital education default balance housing loan  contact   day
##   <int> <fct> <fct>   <fct>     <fct>     <int> <fct>   <fct> <fct>   <int>
## 1    59 admi… married secondary no         2343 yes     no    unknown     5
## 2    56 admi… married secondary no           45 no      no    unknown     5
## 3    41 tech… married secondary no         1270 yes     no    unknown     5
## 4    55 serv… married secondary no         2476 yes     no    unknown     5
## 5    54 admi… married tertiary  no          184 no      no    unknown     5
## 6    42 mana… single  tertiary  no            0 yes     yes   unknown     5
## # … with 7 more variables: month <fct>, duration <int>, campaign <int>,
## #   pdays <int>, previous <int>, poutcome <fct>, deposit <fct>
bankdata %>% missmap()
## Warning: Unknown or uninitialised column: 'arguments'.

## Warning: Unknown or uninitialised column: 'arguments'.
## Warning: Unknown or uninitialised column: 'imputations'.

summary(bankdata)
##       age                 job           marital         education   
##  Min.   :18.00   management :2566   divorced:1293   primary  :1500  
##  1st Qu.:32.00   blue-collar:1944   married :6351   secondary:5476  
##  Median :39.00   technician :1823   single  :3518   tertiary :3689  
##  Mean   :41.23   admin.     :1334                   unknown  : 497  
##  3rd Qu.:49.00   services   : 923                                   
##  Max.   :95.00   retired    : 778                                   
##                  (Other)    :1794                                   
##  default        balance      housing     loan           contact    
##  no :10994   Min.   :-6847   no :5881   no :9702   cellular :8042  
##  yes:  168   1st Qu.:  122   yes:5281   yes:1460   telephone: 774  
##              Median :  550                         unknown  :2346  
##              Mean   : 1529                                         
##              3rd Qu.: 1708                                         
##              Max.   :81204                                         
##                                                                    
##       day            month         duration       campaign     
##  Min.   : 1.00   may    :2824   Min.   :   2   Min.   : 1.000  
##  1st Qu.: 8.00   aug    :1519   1st Qu.: 138   1st Qu.: 1.000  
##  Median :15.00   jul    :1514   Median : 255   Median : 2.000  
##  Mean   :15.66   jun    :1222   Mean   : 372   Mean   : 2.508  
##  3rd Qu.:22.00   nov    : 943   3rd Qu.: 496   3rd Qu.: 3.000  
##  Max.   :31.00   apr    : 923   Max.   :3881   Max.   :63.000  
##                  (Other):2217                                  
##      pdays           previous          poutcome    deposit   
##  Min.   : -1.00   Min.   : 0.0000   failure:1228   no :5873  
##  1st Qu.: -1.00   1st Qu.: 0.0000   other  : 537   yes:5289  
##  Median : -1.00   Median : 0.0000   success:1071             
##  Mean   : 51.33   Mean   : 0.8326   unknown:8326             
##  3rd Qu.: 20.75   3rd Qu.: 1.0000                            
##  Max.   :854.00   Max.   :58.0000                            
## 
ggthemr("pale", type = "outer")
## Warning: New theme missing the following elements: axis.ticks.length.x,
## axis.ticks.length.x.top, axis.ticks.length.x.bottom, axis.ticks.length.y,
## axis.ticks.length.y.left, axis.ticks.length.y.right
total_term_deposit <- bankdata %>% 
  select(deposit) %>% 
  group_by(deposit) %>%
  summarize(n=n()) %>% 
  mutate(pct=round(prop.table(n),2) * 100) %>% 
  
  ggplot(aes(x=2, y=pct, fill=deposit)) + 
  geom_bar(stat = "identity") + 
  coord_polar("y", start=0) +
  geom_label(aes(label = paste0(pct, "%")), 
           position = position_stack(vjust = 0.5), 
           colour = "black",  
           fontface = "bold") +
  theme(legend.position = "left") +
  xlim(1, 2.5)

total_job_by_deposit <- bankdata %>% 
  select(job) %>% 
  group_by(job) %>% 
  summarize(n=n()) %>% 
  mutate(percent = round(prop.table(n),2)*100) %>% 
  
  ggplot(aes(x = reorder(job, percent), y = percent)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  scale_fill_brewer(palette = "Dark") +
  theme(legend.position = "none") +
  xlab("Jobs") +
  geom_label(aes(label = paste0(percent, "%")),
           position = position_stack(vjust = 1), 
           colour = "black",  
           fontface = "bold"
            )
## Warning in pal_name(palette, type): Unknown palette Dark
grid.arrange(total_term_deposit, total_job_by_deposit, ncol = 2, nrow = 1)

ggthemr("grape", type = "outer")
## Warning: New theme missing the following elements: axis.ticks.length.x,
## axis.ticks.length.x.top, axis.ticks.length.x.bottom, axis.ticks.length.y,
## axis.ticks.length.y.left, axis.ticks.length.y.right
###############################################################################################
job_by_deposit <- bankdata %>% 
  select(job, deposit) %>% 
  dplyr::group_by(job, deposit) %>% 
  summarize(n=n()) %>% 
  mutate(percent=round(prop.table(n),2) * 100) %>% 
  
  ggplot(aes(x = reorder(job, -n), y = n, fill = deposit)) +
  geom_bar(stat="identity", width = 0.7, position = position_dodge(width = 0.8)) +
  geom_text(aes(label = sprintf("%.f%%", percent)),
            position = position_dodge(width = 0.8),
            vjust = -0.3, size = 3.5,
            colour="black", fontface="bold") +
  labs(x="", y="") + 
  labs(title="Jobs By Deposit") + 
  theme(plot.title=element_text(hjust=0.5))
################################################################################################

################################################################################################
marital_by_deposit <- bankdata %>%
  select(marital, deposit) %>%
  group_by(marital, deposit) %>%
  summarise(n=n()) %>%
  mutate(percent = round(prop.table(n),2) * 100) %>%
  
  ggplot(aes(x = reorder(marital, -n), y = n, fill = deposit)) +
  geom_bar(stat = "identity", width =  0.7,position = position_dodge(width = 0.8)) +
  theme(axis.title.x = element_text(size = 12, face = "bold"))+
  theme(legend.position = "none") +
  xlab("Jobs") +
  geom_text(aes(label= sprintf("%.f%%", percent)),
            position = position_dodge(width = 0.8),
            vjust = -0.3, size = 3.5, 
            colour="black", fontface="bold") + 
  labs(x="", y="") + 
  labs(title="Marital By Deposit") + 
  theme(plot.title=element_text(hjust=0.5))
#################################################################################################

#################################################################################################
education_by_deposit <- bankdata %>%
  select(education, deposit) %>%
  group_by(education, deposit) %>%
  summarise(n=n()) %>%
  mutate(percent = round(prop.table(n),2) * 100) %>%
  
  ggplot(aes(x = reorder(education, -n), y = n, fill = deposit)) +
  geom_bar(stat = "identity", width =  0.7,position = position_dodge(width = 0.8)) +
  theme(axis.title.x = element_text(size = 12, face = "bold"))+
  xlab("Jobs") +
  geom_text(aes(label= sprintf("%.f%%", percent)),
            position = position_dodge(width = 0.8),
            vjust = -0.3, size = 3.5, 
            colour="black", fontface="bold") + 
  labs(x="", y="") + 
  labs(title="Education By Deposit") + 
  theme(plot.title=element_text(hjust=0.5)) +
  ylab("")
###############################################################################################

###############################################################################################
month_by_deposit <- bankdata %>%
  select(month, deposit) %>%
  group_by(month, deposit) %>%
  summarize(n=n()) %>%
  mutate(percent = round(prop.table(n),2) * 100) %>%
  
  ggplot(aes(x=reorder(month, -n), y=n, fill = deposit)) + 
  geom_bar(stat = "identity", width =  0.7,position = position_dodge(width = 0.8)) +
  theme(axis.title.x = element_text(size = 12, face = "bold"))+
  xlab("Jobs") +
  geom_text(aes(label= sprintf("%.f%%", percent)),
            position = position_dodge(width = 0.8),
            vjust = -0.3, size = 3.5, 
            colour="black", fontface="bold") + 
  #theme_bw() + 
  labs(x="", y="") + 
  labs(title="Month By Deposit") + 
  theme(plot.title=element_text(hjust=0.5))
##############################################################################################

grid.arrange(job_by_deposit, marital_by_deposit, education_by_deposit, month_by_deposit, layout_matrix = rbind(c(1,1), c(2,3), c(4,4)))

Summary

loan_by_deposit <- bankdata %>%
  select(loan, deposit) %>%
  group_by(loan, deposit) %>%
  summarize(n=n()) %>%
  mutate(percent = round(prop.table(n),3) * 100) %>%
  
  ggplot(aes(x = loan, y = n, fill = deposit)) +
  geom_bar(stat = "identity", width =  0.7,position = position_dodge(width = 0.8)) +
  theme(axis.title.x = element_text(size = 12, face = "bold"))+
  xlab("Jobs") +
  geom_text(aes(label= sprintf("%.1f%%", percent)),
            position = position_dodge(width = 0.8),
            vjust = -0.3, size = 3.5, 
            colour="black", fontface="bold") + 
  labs(x="", y="") + 
  labs(title="Loan By Deposit") + 
  theme(plot.title=element_text(hjust=0.5))

housing_by_deposit <- bankdata %>%
  select(housing, deposit) %>%
  group_by(housing, deposit) %>%
  summarize(n=n()) %>%
  mutate(percent = round(prop.table(n),3) * 100) %>%
  
  ggplot(aes(x = housing, y = n, fill = deposit)) +
  geom_bar(stat = "identity", width =  0.7,position = position_dodge(width = 0.8)) +
  theme(axis.title.x = element_text(size = 12, face = "bold"))+
  xlab("Jobs") +
  geom_text(aes(label= sprintf("%.1f%%", percent)),
            position = position_dodge(width = 0.8),
            vjust = -0.3, size = 3.5, 
            colour="black", fontface="bold") + 
  labs(x="", y="") + 
  labs(title="Housing By Deposit") + 
  theme(plot.title=element_text(hjust=0.5))

default_by_deposit <- bankdata %>%
  select(default, deposit) %>%
  group_by(default, deposit) %>%
  summarize(n=n()) %>%
  mutate(percent = round(prop.table(n),3) * 100) %>%
  
  ggplot(aes(x = default, y = n, fill = deposit)) +
  geom_bar(stat = "identity", width =  0.7,position = position_dodge(width = 0.8)) +
  theme(axis.title.x = element_text(size = 12, face = "bold"))+
  xlab("Jobs") +
  geom_text(aes(label= sprintf("%.1f%%", percent)),
            position = position_dodge(width = 0.8),
            vjust = -0.3, size = 3.5, 
            colour="black", fontface="bold") + 
  labs(x="", y="") + 
  labs(title="Default By Deposit") + 
  theme(plot.title=element_text(hjust=0.5))


grid.arrange(loan_by_deposit, housing_by_deposit, default_by_deposit, layout_matrix = rbind(c(1,2,3)))

Summary

ggthemr("flat", type = "outer")

contact_by_deposit <- bankdata %>%
  select(contact, deposit) %>%
  group_by(contact, deposit) %>%
  summarize(n=n()) %>%
  mutate(percent = round(prop.table(n),2) * 100) %>%
  
  ggplot(aes(x=reorder(contact, -n,), y=n, fill = deposit)) + 
  geom_bar(stat = "identity", width =  0.7,position = position_dodge(width = 0.8)) +
  scale_fill_manual(values = c("#ff5d6c", "#8cba51")) +
  theme(axis.title.x = element_text(size = 12, face = "bold"))+
  theme(legend.position = "none") +
  xlab("Jobs") +
  geom_text(aes(label= sprintf("%.f%%", percent)),
            position = position_dodge(width = 0.8),
            vjust = -0.3, size = 3.5, 
            colour="black", fontface="bold") + 
  labs(x="", y="Count") + 
  labs(title="Contact By Deposit") + 
  theme(plot.title=element_text(hjust=0.5))

poutcome_by_deposit <- bankdata %>%
  select(poutcome, deposit) %>%
  group_by(poutcome, deposit) %>%
  summarize(n=n()) %>%
  mutate(percent = round(prop.table(n),2) * 100) %>%
  
  ggplot(aes(x=reorder(poutcome, -n), y=n, fill = deposit)) + 
  geom_bar(stat = "identity", width =  0.7, position = position_dodge(width = 0.8)) +
  scale_fill_manual(values = c("#ff5d6c", "#8cba51")) +
  theme(axis.title.x = element_text(size = 12, face = "bold"))+
  xlab("Jobs") +
  geom_text(aes(label= sprintf("%.f%%", percent)),
            position = position_dodge(width = 0.8),
            vjust = -0.3, size = 3.5, 
            colour="black", fontface="bold") + 
  #theme_bw() + 
  labs(x="", y="") + 
  labs(title="Poutcome By Deposit") + 
  theme(plot.title=element_text(hjust=0.5))

grid.arrange(contact_by_deposit, poutcome_by_deposit, layout_matrix = rbind(c(1,2)))

ggthemr("flat dark", type = "outer")
## Warning: New theme missing the following elements: axis.ticks.length.x,
## axis.ticks.length.x.top, axis.ticks.length.x.bottom, axis.ticks.length.y,
## axis.ticks.length.y.left, axis.ticks.length.y.right
age_dist <- bankdata %>%
  select(age, deposit) %>%
  
  ggplot(aes(x = age, fill = deposit)) +
  geom_histogram(alpha = 0.8, col = "grey") +
  scale_x_continuous(breaks = seq(min(18), max(95), by = 10)) +
  theme(legend.position = "right") +
  labs(x="", y="") + 
  labs(title="Age Distribution") + 
  theme(plot.title=element_text(hjust=0.5))

balance_dist <- bankdata %>%
  select(balance, deposit) %>%
  
  ggplot(aes(x = balance, fill = deposit)) +
  geom_histogram() +
  theme(legend.position = "none") +
  labs(x="", y="") + 
  labs(title="Balance Distribution") + 
  theme(plot.title=element_text(hjust=0.5))

duration_dist <- bankdata %>%
  select(duration, deposit) %>%
  
  ggplot(aes(x=duration, fill = deposit)) +
  geom_histogram(color = "grey") +
  theme(legend.position = "right") +
  labs(x="", y="") + 
  labs(title="Duration Distribution") + 
  theme(plot.title=element_text(hjust=0.5))

pdays_dist <- bankdata %>%
  select(pdays, deposit) %>%
  
  ggplot(aes(x = pdays, fill = deposit)) +
  geom_histogram(color = "grey") +
  scale_x_continuous(breaks = seq(min(0), max(500), by = 50)) +
  theme(legend.position = "none") +
  labs(x="", y=" ") + 
  labs(title="Campaign Distribution") + 
  theme(plot.title=element_text(hjust=0.5))

campaign_dist <- bankdata %>%
  select(campaign, deposit) %>%
  
  ggplot(aes(x = campaign, fill = deposit)) +
  geom_histogram(alpa = 0.5, col = "grey") +
  theme(legend.position = "none") +
  labs(x="", y="") + 
  labs(title="Campaign Distribution") + 
  theme(plot.title=element_text(hjust=0.5))
## Warning: Ignoring unknown parameters: alpa
grid.arrange(age_dist, balance_dist, duration_dist, campaign_dist, layout_matrix = rbind(c(1,2),c(3,4)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

bankdata3 <- bankdata
bankdata3$deposit <- as.numeric(bankdata3$deposit)
ggthemr(type = "outer")
## Warning: New theme missing the following elements: axis.ticks.length.x,
## axis.ticks.length.x.top, axis.ticks.length.x.bottom, axis.ticks.length.y,
## axis.ticks.length.y.left, axis.ticks.length.y.right
ggcorr(bankdata3, label = T)
## Warning in ggcorr(bankdata3, label = T): data in column(s) 'job', 'marital',
## 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome' are
## not numeric and were ignored

bankdata4 <- bankdata %>% 
  mutate(mean_duration = as.factor(ifelse(duration > mean(duration), "Above_average", "Bellow_average")),
         median_duration = as.factor(ifelse(duration> median(duration), "Above_median", "Bellow_median")),
         mean_balance = as.factor(ifelse(balance > mean(balance), "Above_average", "Bellow_average")),
         median_balance = as.factor(ifelse(balance > median(balance), "Above_median", "Bellow_median")))
duration_mean_dist <- bankdata4 %>% 
  select(mean_duration, deposit) %>% 
  group_by(mean_duration, deposit) %>% 
  summarize(n=n()) %>%
  mutate(percent = round(prop.table(n),2)*100) %>% 
  
  ggplot(aes(x = mean_duration, y = n, fill = deposit))+
  geom_bar(stat= "identity", width = 0.7,  position = position_dodge(width = 0.8)) +
  scale_fill_manual(values = c("#ce0f3d", "#333333")) +
  geom_text(aes(label = sprintf("%.1f%%", percent)),
            position = position_dodge(width = 0.8),
            vjust = -0.3, size = 3.5)


duration_median_dist <- bankdata4 %>% 
  select(median_duration, deposit) %>% 
  group_by(median_duration, deposit) %>% 
  summarize(n=n()) %>%
  mutate(percent = round(prop.table(n),2)*100) %>% 
  
  ggplot(aes(x = median_duration, y = n, fill = deposit))+
  geom_bar(stat= "identity", width = 0.7,  position = position_dodge(width = 0.8)) +
  scale_fill_manual(values = c("#ce0f3d", "#333333")) +
  geom_text(aes(label = sprintf("%.1f%%", percent)),
            position = position_dodge(width = 0.8),
            vjust = -0.3, size = 3.5)

##############################################################################################3

balance_mean_dist <- bankdata4 %>% 
  select(mean_balance, deposit) %>% 
  group_by(mean_balance, deposit) %>% 
  summarize(n=n()) %>%
  mutate(percent = round(prop.table(n),2)*100) %>% 
  
  ggplot(aes(x = mean_balance, y = n, fill = deposit))+
  geom_bar(stat= "identity", width = 0.7,  position = position_dodge(width = 0.8)) +
  scale_fill_manual(values = c("#ce0f3d", "#333333")) +
  geom_text(aes(label = sprintf("%.1f%%", percent)),
            position = position_dodge(width = 0.8),
            vjust = -0.3, size = 3.5)


balance_median_dist <- bankdata4 %>% 
  select(median_balance, deposit) %>% 
  group_by(median_balance, deposit) %>% 
  summarize(n=n()) %>%
  mutate(percent = round(prop.table(n),2)*100) %>% 
  
  ggplot(aes(x = median_balance, y = n, fill = deposit))+
  geom_bar(stat= "identity", width = 0.7,  position = position_dodge(width = 0.8)) +
  scale_fill_manual(values = c("#ce0f3d", "#333333")) +
  geom_text(aes(label = sprintf("%.1f%%", percent)),
            position = position_dodge(width = 0.8),
            vjust = -0.3, size = 3.5)


grid.arrange(duration_mean_dist, duration_median_dist, balance_mean_dist, balance_median_dist, layout_matrix = rbind(c(1,2), c(3,4)))

Analysis

  1. Numerical feature
  • Since duration, balance, previous, and pdays have positif correlation with deposit, we will use those feature.
  1. Categorical feature
  • We will also consider a few categorical features like housing, contact, poutcome.

Job analysis

We have seen that in job group, blue-collar has the lowest persentage return, and retired and student have the highest, we will see why.

ggthemr("flat", type = "outer")
## Warning: New theme missing the following elements: axis.ticks.length.x,
## axis.ticks.length.x.top, axis.ticks.length.x.bottom, axis.ticks.length.y,
## axis.ticks.length.y.left, axis.ticks.length.y.right
job_duration_analysis <- bankdata4 %>% 
  select(job, duration, deposit) %>% 
  group_by(job, deposit) %>% 
  
  ggplot(aes(x = job, y = duration, fill = deposit))+
  scale_fill_manual(values = c("#d45d79", "#6e5773")) +
  geom_boxplot()

job_balance_analysis <- bankdata4 %>% 
  select(job, balance, deposit) %>% 
  group_by(job, deposit) %>% 
  
  ggplot(aes(x = job, y = balance, fill = deposit))+
  scale_fill_manual(values = c("#d45d79", "#6e5773")) +
  geom_jitter(aes(colour = deposit))

grid.arrange(job_duration_analysis, job_balance_analysis, layout_matrix = rbind(c(1,1),c(2,2)))

job_previous_analysis <- bankdata4 %>% 
  select(job, previous, deposit) %>% 
  group_by(job, deposit) %>% 
  
  ggplot(aes(x = job, y = previous, fill = deposit))+
  geom_boxplot()

job_pdays_analysis <- bankdata4 %>% 
  select(job, pdays, deposit) %>% 
  group_by(job, deposit) %>% 
  
  ggplot(aes(x = job, y = pdays, fill = deposit))+
  geom_violin()

grid.arrange(job_previous_analysis, job_pdays_analysis, layout_matrix = rbind(c(1,1),c(2,2)))

job_housing_analysis <- bankdata4 %>% 
  select(job, housing, deposit) %>%
  filter(housing == "yes") %>% 
  group_by(job, deposit, housing) %>%
  summarize(n=n()) %>% 
  
  ggplot(aes(x = reorder(job, n), y =n, fill = deposit))+
  geom_bar(stat = "identity", width = 0.7, position = position_dodge(width = 0.8)) +
  coord_flip()

job_housing_analysis2 <- bankdata4 %>% 
  select(job, loan, deposit) %>%
  filter(loan == "no") %>% 
  group_by(job, deposit, loan) %>%
  summarize(n=n()) %>% 
  
  ggplot(aes(x = reorder(job, n), y =n, fill = deposit))+
  geom_bar(stat = "identity", position = position_dodge()) +
  coord_flip()

grid.arrange(job_housing_analysis, job_housing_analysis2, layout_matrix = rbind(c(1,2)))

ggthemr("flat", type = "outer")
## Warning: New theme missing the following elements: axis.ticks.length.x,
## axis.ticks.length.x.top, axis.ticks.length.x.bottom, axis.ticks.length.y,
## axis.ticks.length.y.left, axis.ticks.length.y.right
job_poutcome_analysis <- bankdata4 %>% 
  select(job, poutcome, deposit) %>%
  filter(poutcome == "success") %>% 
  group_by(job, deposit, poutcome) %>%
  summarize(n=n()) %>% 
  
  ggplot(aes(x = reorder(job, -n), y =n, fill = deposit))+
  geom_bar(stat = "identity", width = 0.7, position = position_dodge(width = 0.8))+
  scale_y_continuous(breaks = seq(min(0), max(300), by = 50))+
  xlab("Job") + ylab(" ")


job_contact_analysis <- bankdata4 %>% 
  select(job, contact, deposit) %>%
  filter(contact == "cellular") %>% 
  group_by(job, deposit, contact) %>%
  summarize(n=n()) %>% 
  
  ggplot(aes(x = reorder(job, -n), y =n, fill = deposit))+
  geom_bar(stat = "identity", width = 0.7, position = position_dodge(width = 0.8))+
  xlab("Job") + ylab(" ")

grid.arrange(job_poutcome_analysis, job_contact_analysis, layout_matrix = rbind(c(1,1), c(2,2)))

Clasification with random forest

  • Split data
library(caTools)
set.seed(101)
sample <- sample.split(bankdata$deposit, SplitRatio = 0.8)
train <- subset(bankdata, sample == T)
test <- subset(bankdata, sample == F)
head(train)
## # A tibble: 6 x 17
##     age job   marital education default balance housing loan  contact   day
##   <int> <fct> <fct>   <fct>     <fct>     <int> <fct>   <fct> <fct>   <int>
## 1    59 admi… married secondary no         2343 yes     no    unknown     5
## 2    56 admi… married secondary no           45 no      no    unknown     5
## 3    41 tech… married secondary no         1270 yes     no    unknown     5
## 4    55 serv… married secondary no         2476 yes     no    unknown     5
## 5    54 admi… married tertiary  no          184 no      no    unknown     5
## 6    42 mana… single  tertiary  no            0 yes     yes   unknown     5
## # … with 7 more variables: month <fct>, duration <int>, campaign <int>,
## #   pdays <int>, previous <int>, poutcome <fct>, deposit <fct>
head(test)
## # A tibble: 6 x 17
##     age job   marital education default balance housing loan  contact   day
##   <int> <fct> <fct>   <fct>     <fct>     <int> <fct>   <fct> <fct>   <int>
## 1    38 admi… single  secondary no          100 yes     no    unknown     7
## 2    46 blue… single  tertiary  no          460 yes     no    unknown     7
## 3    32 blue… single  primary   no          611 yes     no    unknown     8
## 4    43 mana… single  tertiary  no         2067 yes     no    unknown     8
## 5    37 unem… single  secondary no          381 yes     no    unknown     8
## 6    32 mana… single  tertiary  no          311 no      no    unknown    12
## # … with 7 more variables: month <fct>, duration <int>, campaign <int>,
## #   pdays <int>, previous <int>, poutcome <fct>, deposit <fct>
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
modelrndm <- randomForest(deposit ~., data = train, ntree=500, importance = TRUE)
modelrndm.pred <- predict(modelrndm, test)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
confusionMatrix(modelrndm.pred, test$deposit)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  988  93
##        yes 187 965
##                                           
##                Accuracy : 0.8746          
##                  95% CI : (0.8602, 0.8881)
##     No Information Rate : 0.5262          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7496          
##                                           
##  Mcnemar's Test P-Value : 2.732e-08       
##                                           
##             Sensitivity : 0.8409          
##             Specificity : 0.9121          
##          Pos Pred Value : 0.9140          
##          Neg Pred Value : 0.8377          
##              Prevalence : 0.5262          
##          Detection Rate : 0.4425          
##    Detection Prevalence : 0.4841          
##       Balanced Accuracy : 0.8765          
##                                           
##        'Positive' Class : no              
## 
accuracy.rf <- (991+967) / (991+91+184+967)
accuracy.rf
## [1] 0.8768473
importance(modelrndm)
##                   no        yes MeanDecreaseAccuracy MeanDecreaseGini
## age        39.402196  14.478029           39.0287401       305.045320
## job        24.563284   4.370771           20.1572279       256.255719
## marital     4.168157  11.214756           11.3239289        69.155086
## education  12.575682   6.242795           13.1233697        87.882213
## default    -2.833853   3.120639            0.7222635         4.827332
## balance    13.366418  13.634785           19.1649431       329.015624
## housing    31.812715  29.319830           40.0031618       104.722503
## loan        5.664004  13.991543           13.7720345        34.841775
## contact    53.138082  17.824721           59.5976290       159.193741
## day        52.971907   6.609632           47.3068511       290.199084
## month      98.223236  42.111764          108.1526361       572.105745
## duration  212.823083 261.827453          291.5712682      1554.625011
## campaign   14.293294  17.326700           21.9905971       125.306482
## pdays      23.043870  19.323607           28.4301387       173.228175
## previous   17.427722  12.231814           18.2975075        98.475613
## poutcome   53.542975   7.180618           40.0616674       220.840961
importance    <- importance(modelrndm)

varImportance <- data.frame(Variables = row.names(importance), 
                            Importance = round(importance[ ,'MeanDecreaseGini'],2))

# Create a rank variable based on importance
rankImportance <- varImportance %>%
  mutate(Rank = paste0('#',dense_rank(desc(Importance))))

# Use ggplot2 to visualize the relative importance of variables
ggplot(rankImportance, aes(x = reorder(Variables, Importance), 
                           y = Importance, fill = Importance)) +
  geom_bar(stat='identity') + 
  geom_text(aes(x = Variables, y = 0.5, label = Rank),
            hjust=0, vjust=0.55, size = 4, colour = 'red') +
  labs(x = 'Variables') +
  coord_flip() + 
  theme_classic()

logitmodel <- glm(deposit~., family = binomial("logit"), data = train)
summary(logitmodel)
## 
## Call:
## glm(formula = deposit ~ ., family = binomial("logit"), data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.3683  -0.6048  -0.2137   0.6203   2.7907  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -8.126e-01  2.968e-01  -2.738  0.00617 ** 
## age                -1.544e-03  3.569e-03  -0.433  0.66520    
## jobblue-collar     -3.568e-01  1.162e-01  -3.071  0.00214 ** 
## jobentrepreneur    -3.895e-01  1.912e-01  -2.037  0.04167 *  
## jobhousemaid       -4.860e-01  2.172e-01  -2.238  0.02525 *  
## jobmanagement      -3.296e-01  1.196e-01  -2.756  0.00585 ** 
## jobretired          2.962e-01  1.652e-01   1.792  0.07308 .  
## jobself-employed   -4.496e-01  1.776e-01  -2.532  0.01134 *  
## jobservices        -3.472e-01  1.338e-01  -2.596  0.00944 ** 
## jobstudent          5.142e-01  1.966e-01   2.615  0.00892 ** 
## jobtechnician      -1.266e-01  1.103e-01  -1.147  0.25140    
## jobunemployed      -1.063e-01  1.840e-01  -0.578  0.56338    
## jobunknown         -3.619e-01  3.807e-01  -0.951  0.34184    
## maritalmarried     -1.496e-01  9.530e-02  -1.570  0.11644    
## maritalsingle       8.177e-02  1.097e-01   0.746  0.45593    
## educationsecondary  1.926e-01  1.040e-01   1.852  0.06404 .  
## educationtertiary   4.766e-01  1.223e-01   3.898 9.70e-05 ***
## educationunknown    1.582e-01  1.699e-01   0.931  0.35200    
## defaultyes         -5.329e-02  2.422e-01  -0.220  0.82586    
## balance             2.511e-05  9.636e-06   2.606  0.00916 ** 
## housingyes         -6.821e-01  6.879e-02  -9.916  < 2e-16 ***
## loanyes            -5.424e-01  9.291e-02  -5.838 5.29e-09 ***
## contacttelephone   -9.747e-02  1.190e-01  -0.819  0.41294    
## contactunknown     -1.535e+00  1.075e-01 -14.279  < 2e-16 ***
## day                 2.286e-03  3.921e-03   0.583  0.55998    
## monthaug           -8.435e-01  1.236e-01  -6.822 8.97e-12 ***
## monthdec            1.655e+00  4.639e-01   3.568  0.00036 ***
## monthfeb           -1.778e-01  1.412e-01  -1.260  0.20778    
## monthjan           -1.082e+00  1.842e-01  -5.875 4.22e-09 ***
## monthjul           -9.272e-01  1.239e-01  -7.482 7.32e-14 ***
## monthjun            3.498e-01  1.463e-01   2.391  0.01679 *  
## monthmar            2.096e+00  2.565e-01   8.172 3.03e-16 ***
## monthmay           -6.545e-01  1.187e-01  -5.515 3.49e-08 ***
## monthnov           -9.151e-01  1.341e-01  -6.827 8.69e-12 ***
## monthoct            1.128e+00  1.941e-01   5.810 6.24e-09 ***
## monthsep            1.023e+00  2.220e-01   4.608 4.07e-06 ***
## duration            5.398e-03  1.375e-04  39.271  < 2e-16 ***
## campaign           -7.758e-02  1.493e-02  -5.195 2.05e-07 ***
## pdays              -4.800e-05  4.666e-04  -0.103  0.91806    
## previous            2.464e-02  1.604e-02   1.536  0.12457    
## poutcomeother       1.865e-01  1.475e-01   1.264  0.20620    
## poutcomesuccess     2.153e+00  1.541e-01  13.977  < 2e-16 ***
## poutcomeunknown    -2.267e-01  1.515e-01  -1.496  0.13456    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 12353.8  on 8928  degrees of freedom
## Residual deviance:  7361.1  on 8886  degrees of freedom
## AIC: 7447.1
## 
## Number of Fisher Scoring iterations: 5
logitmodel.pred <- predict(logitmodel, test, type = "response")
resul <- as.factor(ifelse(logitmodel.pred > 0.5, "yes", "no"))
confusionMatrix(resul, test$deposit)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  1018  220
##        yes  157  838
##                                          
##                Accuracy : 0.8312         
##                  95% CI : (0.815, 0.8465)
##     No Information Rate : 0.5262         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.6604         
##                                          
##  Mcnemar's Test P-Value : 0.001407       
##                                          
##             Sensitivity : 0.8664         
##             Specificity : 0.7921         
##          Pos Pred Value : 0.8223         
##          Neg Pred Value : 0.8422         
##              Prevalence : 0.5262         
##          Detection Rate : 0.4559         
##    Detection Prevalence : 0.5544         
##       Balanced Accuracy : 0.8292         
##                                          
##        'Positive' Class : no             
## 
exp(coef(logitmodel))
##        (Intercept)                age     jobblue-collar    jobentrepreneur 
##          0.4436880          0.9984569          0.6999163          0.6774184 
##       jobhousemaid      jobmanagement         jobretired   jobself-employed 
##          0.6150934          0.7192332          1.3446880          0.6378793 
##        jobservices         jobstudent      jobtechnician      jobunemployed 
##          0.7066707          1.6722631          0.8811210          0.8991504 
##         jobunknown     maritalmarried      maritalsingle educationsecondary 
##          0.6963704          0.8610494          1.0852050          1.2123751 
##  educationtertiary   educationunknown         defaultyes            balance 
##          1.6105738          1.1713487          0.9481016          1.0000251 
##         housingyes            loanyes   contacttelephone     contactunknown 
##          0.5055724          0.5813771          0.9071318          0.2155489 
##                day           monthaug           monthdec           monthfeb 
##          1.0022882          0.4302101          5.2334941          0.8370707 
##           monthjan           monthjul           monthjun           monthmar 
##          0.3387544          0.3956445          1.4187925          8.1322493 
##           monthmay           monthnov           monthoct           monthsep 
##          0.5197125          0.4004639          3.0882539          2.7810013 
##           duration           campaign              pdays           previous 
##          1.0054126          0.9253568          0.9999520          1.0249448 
##      poutcomeother    poutcomesuccess    poutcomeunknown 
##          1.2050365          8.6126395          0.7971803