library(tidyverse)
library(GGally)
library(readxl)
library(Amelia)
library(gridExtra)
library(ggthemr)
bankdata <- read_csv("/USERS/LENOVO/Downloads/bankdata.csv")
bankdata %>% glimpse()
## Observations: 11,162
## Variables: 17
## $ age <dbl> 59, 56, 41, 55, 54, 42, 56, 60, 37, 28, 38, 30, 29, 46, 31,…
## $ job <chr> "admin.", "admin.", "technician", "services", "admin.", "ma…
## $ marital <chr> "married", "married", "married", "married", "married", "sin…
## $ education <chr> "secondary", "secondary", "secondary", "secondary", "tertia…
## $ default <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "no",…
## $ balance <dbl> 2343, 45, 1270, 2476, 184, 0, 830, 545, 1, 5090, 100, 309, …
## $ housing <chr> "yes", "no", "yes", "yes", "no", "yes", "yes", "yes", "yes"…
## $ loan <chr> "no", "no", "no", "no", "no", "yes", "yes", "no", "no", "no…
## $ contact <chr> "unknown", "unknown", "unknown", "unknown", "unknown", "unk…
## $ day <dbl> 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8,…
## $ month <chr> "may", "may", "may", "may", "may", "may", "may", "may", "ma…
## $ duration <dbl> 1042, 1467, 1389, 579, 673, 562, 1201, 1030, 608, 1297, 786…
## $ campaign <dbl> 1, 1, 1, 1, 2, 2, 1, 1, 1, 3, 1, 2, 4, 2, 2, 1, 3, 1, 2, 1,…
## $ pdays <dbl> -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,…
## $ previous <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ poutcome <chr> "unknown", "unknown", "unknown", "unknown", "unknown", "unk…
## $ deposit <chr> "yes", "yes", "yes", "yes", "yes", "yes", "yes", "yes", "ye…
bankdata <- bankdata %>%
mutate_if(is.character, as.factor) %>%
mutate_if(is.double, as.integer)
bankdata %>% glimpse()
## Observations: 11,162
## Variables: 17
## $ age <int> 59, 56, 41, 55, 54, 42, 56, 60, 37, 28, 38, 30, 29, 46, 31,…
## $ job <fct> admin., admin., technician, services, admin., management, m…
## $ marital <fct> married, married, married, married, married, single, marrie…
## $ education <fct> secondary, secondary, secondary, secondary, tertiary, terti…
## $ default <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no, no,…
## $ balance <int> 2343, 45, 1270, 2476, 184, 0, 830, 545, 1, 5090, 100, 309, …
## $ housing <fct> yes, no, yes, yes, no, yes, yes, yes, yes, yes, yes, yes, y…
## $ loan <fct> no, no, no, no, no, yes, yes, no, no, no, no, no, yes, no, …
## $ contact <fct> unknown, unknown, unknown, unknown, unknown, unknown, unkno…
## $ day <int> 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8,…
## $ month <fct> may, may, may, may, may, may, may, may, may, may, may, may,…
## $ duration <int> 1042, 1467, 1389, 579, 673, 562, 1201, 1030, 608, 1297, 786…
## $ campaign <int> 1, 1, 1, 1, 2, 2, 1, 1, 1, 3, 1, 2, 4, 2, 2, 1, 3, 1, 2, 1,…
## $ pdays <int> -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,…
## $ previous <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ poutcome <fct> unknown, unknown, unknown, unknown, unknown, unknown, unkno…
## $ deposit <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, yes,…
head(bankdata)
## # A tibble: 6 x 17
## age job marital education default balance housing loan contact day
## <int> <fct> <fct> <fct> <fct> <int> <fct> <fct> <fct> <int>
## 1 59 admi… married secondary no 2343 yes no unknown 5
## 2 56 admi… married secondary no 45 no no unknown 5
## 3 41 tech… married secondary no 1270 yes no unknown 5
## 4 55 serv… married secondary no 2476 yes no unknown 5
## 5 54 admi… married tertiary no 184 no no unknown 5
## 6 42 mana… single tertiary no 0 yes yes unknown 5
## # … with 7 more variables: month <fct>, duration <int>, campaign <int>,
## # pdays <int>, previous <int>, poutcome <fct>, deposit <fct>
bankdata %>% missmap()
## Warning: Unknown or uninitialised column: 'arguments'.
## Warning: Unknown or uninitialised column: 'arguments'.
## Warning: Unknown or uninitialised column: 'imputations'.
summary(bankdata)
## age job marital education
## Min. :18.00 management :2566 divorced:1293 primary :1500
## 1st Qu.:32.00 blue-collar:1944 married :6351 secondary:5476
## Median :39.00 technician :1823 single :3518 tertiary :3689
## Mean :41.23 admin. :1334 unknown : 497
## 3rd Qu.:49.00 services : 923
## Max. :95.00 retired : 778
## (Other) :1794
## default balance housing loan contact
## no :10994 Min. :-6847 no :5881 no :9702 cellular :8042
## yes: 168 1st Qu.: 122 yes:5281 yes:1460 telephone: 774
## Median : 550 unknown :2346
## Mean : 1529
## 3rd Qu.: 1708
## Max. :81204
##
## day month duration campaign
## Min. : 1.00 may :2824 Min. : 2 Min. : 1.000
## 1st Qu.: 8.00 aug :1519 1st Qu.: 138 1st Qu.: 1.000
## Median :15.00 jul :1514 Median : 255 Median : 2.000
## Mean :15.66 jun :1222 Mean : 372 Mean : 2.508
## 3rd Qu.:22.00 nov : 943 3rd Qu.: 496 3rd Qu.: 3.000
## Max. :31.00 apr : 923 Max. :3881 Max. :63.000
## (Other):2217
## pdays previous poutcome deposit
## Min. : -1.00 Min. : 0.0000 failure:1228 no :5873
## 1st Qu.: -1.00 1st Qu.: 0.0000 other : 537 yes:5289
## Median : -1.00 Median : 0.0000 success:1071
## Mean : 51.33 Mean : 0.8326 unknown:8326
## 3rd Qu.: 20.75 3rd Qu.: 1.0000
## Max. :854.00 Max. :58.0000
##
ggthemr("pale", type = "outer")
## Warning: New theme missing the following elements: axis.ticks.length.x,
## axis.ticks.length.x.top, axis.ticks.length.x.bottom, axis.ticks.length.y,
## axis.ticks.length.y.left, axis.ticks.length.y.right
total_term_deposit <- bankdata %>%
select(deposit) %>%
group_by(deposit) %>%
summarize(n=n()) %>%
mutate(pct=round(prop.table(n),2) * 100) %>%
ggplot(aes(x=2, y=pct, fill=deposit)) +
geom_bar(stat = "identity") +
coord_polar("y", start=0) +
geom_label(aes(label = paste0(pct, "%")),
position = position_stack(vjust = 0.5),
colour = "black",
fontface = "bold") +
theme(legend.position = "left") +
xlim(1, 2.5)
total_job_by_deposit <- bankdata %>%
select(job) %>%
group_by(job) %>%
summarize(n=n()) %>%
mutate(percent = round(prop.table(n),2)*100) %>%
ggplot(aes(x = reorder(job, percent), y = percent)) +
geom_bar(stat = "identity") +
coord_flip() +
scale_fill_brewer(palette = "Dark") +
theme(legend.position = "none") +
xlab("Jobs") +
geom_label(aes(label = paste0(percent, "%")),
position = position_stack(vjust = 1),
colour = "black",
fontface = "bold"
)
## Warning in pal_name(palette, type): Unknown palette Dark
grid.arrange(total_term_deposit, total_job_by_deposit, ncol = 2, nrow = 1)
ggthemr("grape", type = "outer")
## Warning: New theme missing the following elements: axis.ticks.length.x,
## axis.ticks.length.x.top, axis.ticks.length.x.bottom, axis.ticks.length.y,
## axis.ticks.length.y.left, axis.ticks.length.y.right
###############################################################################################
job_by_deposit <- bankdata %>%
select(job, deposit) %>%
dplyr::group_by(job, deposit) %>%
summarize(n=n()) %>%
mutate(percent=round(prop.table(n),2) * 100) %>%
ggplot(aes(x = reorder(job, -n), y = n, fill = deposit)) +
geom_bar(stat="identity", width = 0.7, position = position_dodge(width = 0.8)) +
geom_text(aes(label = sprintf("%.f%%", percent)),
position = position_dodge(width = 0.8),
vjust = -0.3, size = 3.5,
colour="black", fontface="bold") +
labs(x="", y="") +
labs(title="Jobs By Deposit") +
theme(plot.title=element_text(hjust=0.5))
################################################################################################
################################################################################################
marital_by_deposit <- bankdata %>%
select(marital, deposit) %>%
group_by(marital, deposit) %>%
summarise(n=n()) %>%
mutate(percent = round(prop.table(n),2) * 100) %>%
ggplot(aes(x = reorder(marital, -n), y = n, fill = deposit)) +
geom_bar(stat = "identity", width = 0.7,position = position_dodge(width = 0.8)) +
theme(axis.title.x = element_text(size = 12, face = "bold"))+
theme(legend.position = "none") +
xlab("Jobs") +
geom_text(aes(label= sprintf("%.f%%", percent)),
position = position_dodge(width = 0.8),
vjust = -0.3, size = 3.5,
colour="black", fontface="bold") +
labs(x="", y="") +
labs(title="Marital By Deposit") +
theme(plot.title=element_text(hjust=0.5))
#################################################################################################
#################################################################################################
education_by_deposit <- bankdata %>%
select(education, deposit) %>%
group_by(education, deposit) %>%
summarise(n=n()) %>%
mutate(percent = round(prop.table(n),2) * 100) %>%
ggplot(aes(x = reorder(education, -n), y = n, fill = deposit)) +
geom_bar(stat = "identity", width = 0.7,position = position_dodge(width = 0.8)) +
theme(axis.title.x = element_text(size = 12, face = "bold"))+
xlab("Jobs") +
geom_text(aes(label= sprintf("%.f%%", percent)),
position = position_dodge(width = 0.8),
vjust = -0.3, size = 3.5,
colour="black", fontface="bold") +
labs(x="", y="") +
labs(title="Education By Deposit") +
theme(plot.title=element_text(hjust=0.5)) +
ylab("")
###############################################################################################
###############################################################################################
month_by_deposit <- bankdata %>%
select(month, deposit) %>%
group_by(month, deposit) %>%
summarize(n=n()) %>%
mutate(percent = round(prop.table(n),2) * 100) %>%
ggplot(aes(x=reorder(month, -n), y=n, fill = deposit)) +
geom_bar(stat = "identity", width = 0.7,position = position_dodge(width = 0.8)) +
theme(axis.title.x = element_text(size = 12, face = "bold"))+
xlab("Jobs") +
geom_text(aes(label= sprintf("%.f%%", percent)),
position = position_dodge(width = 0.8),
vjust = -0.3, size = 3.5,
colour="black", fontface="bold") +
#theme_bw() +
labs(x="", y="") +
labs(title="Month By Deposit") +
theme(plot.title=element_text(hjust=0.5))
##############################################################################################
grid.arrange(job_by_deposit, marital_by_deposit, education_by_deposit, month_by_deposit, layout_matrix = rbind(c(1,1), c(2,3), c(4,4)))
Summary
In job group, Blue-collar has the lowest persentage, from 100% observation only 36% people that open a deposit, and 67% don’t and Retired and Student are the highest persentage, from 100% observation 66% open deposit and 34% don’t.
May is the most high marketing activity but has the lowest persentage return, only 33%, while the high performance is in March, April, September, October, December.
loan_by_deposit <- bankdata %>%
select(loan, deposit) %>%
group_by(loan, deposit) %>%
summarize(n=n()) %>%
mutate(percent = round(prop.table(n),3) * 100) %>%
ggplot(aes(x = loan, y = n, fill = deposit)) +
geom_bar(stat = "identity", width = 0.7,position = position_dodge(width = 0.8)) +
theme(axis.title.x = element_text(size = 12, face = "bold"))+
xlab("Jobs") +
geom_text(aes(label= sprintf("%.1f%%", percent)),
position = position_dodge(width = 0.8),
vjust = -0.3, size = 3.5,
colour="black", fontface="bold") +
labs(x="", y="") +
labs(title="Loan By Deposit") +
theme(plot.title=element_text(hjust=0.5))
housing_by_deposit <- bankdata %>%
select(housing, deposit) %>%
group_by(housing, deposit) %>%
summarize(n=n()) %>%
mutate(percent = round(prop.table(n),3) * 100) %>%
ggplot(aes(x = housing, y = n, fill = deposit)) +
geom_bar(stat = "identity", width = 0.7,position = position_dodge(width = 0.8)) +
theme(axis.title.x = element_text(size = 12, face = "bold"))+
xlab("Jobs") +
geom_text(aes(label= sprintf("%.1f%%", percent)),
position = position_dodge(width = 0.8),
vjust = -0.3, size = 3.5,
colour="black", fontface="bold") +
labs(x="", y="") +
labs(title="Housing By Deposit") +
theme(plot.title=element_text(hjust=0.5))
default_by_deposit <- bankdata %>%
select(default, deposit) %>%
group_by(default, deposit) %>%
summarize(n=n()) %>%
mutate(percent = round(prop.table(n),3) * 100) %>%
ggplot(aes(x = default, y = n, fill = deposit)) +
geom_bar(stat = "identity", width = 0.7,position = position_dodge(width = 0.8)) +
theme(axis.title.x = element_text(size = 12, face = "bold"))+
xlab("Jobs") +
geom_text(aes(label= sprintf("%.1f%%", percent)),
position = position_dodge(width = 0.8),
vjust = -0.3, size = 3.5,
colour="black", fontface="bold") +
labs(x="", y="") +
labs(title="Default By Deposit") +
theme(plot.title=element_text(hjust=0.5))
grid.arrange(loan_by_deposit, housing_by_deposit, default_by_deposit, layout_matrix = rbind(c(1,2,3)))
Summary
ggthemr("flat", type = "outer")
contact_by_deposit <- bankdata %>%
select(contact, deposit) %>%
group_by(contact, deposit) %>%
summarize(n=n()) %>%
mutate(percent = round(prop.table(n),2) * 100) %>%
ggplot(aes(x=reorder(contact, -n,), y=n, fill = deposit)) +
geom_bar(stat = "identity", width = 0.7,position = position_dodge(width = 0.8)) +
scale_fill_manual(values = c("#ff5d6c", "#8cba51")) +
theme(axis.title.x = element_text(size = 12, face = "bold"))+
theme(legend.position = "none") +
xlab("Jobs") +
geom_text(aes(label= sprintf("%.f%%", percent)),
position = position_dodge(width = 0.8),
vjust = -0.3, size = 3.5,
colour="black", fontface="bold") +
labs(x="", y="Count") +
labs(title="Contact By Deposit") +
theme(plot.title=element_text(hjust=0.5))
poutcome_by_deposit <- bankdata %>%
select(poutcome, deposit) %>%
group_by(poutcome, deposit) %>%
summarize(n=n()) %>%
mutate(percent = round(prop.table(n),2) * 100) %>%
ggplot(aes(x=reorder(poutcome, -n), y=n, fill = deposit)) +
geom_bar(stat = "identity", width = 0.7, position = position_dodge(width = 0.8)) +
scale_fill_manual(values = c("#ff5d6c", "#8cba51")) +
theme(axis.title.x = element_text(size = 12, face = "bold"))+
xlab("Jobs") +
geom_text(aes(label= sprintf("%.f%%", percent)),
position = position_dodge(width = 0.8),
vjust = -0.3, size = 3.5,
colour="black", fontface="bold") +
#theme_bw() +
labs(x="", y="") +
labs(title="Poutcome By Deposit") +
theme(plot.title=element_text(hjust=0.5))
grid.arrange(contact_by_deposit, poutcome_by_deposit, layout_matrix = rbind(c(1,2)))
ggthemr("flat dark", type = "outer")
## Warning: New theme missing the following elements: axis.ticks.length.x,
## axis.ticks.length.x.top, axis.ticks.length.x.bottom, axis.ticks.length.y,
## axis.ticks.length.y.left, axis.ticks.length.y.right
age_dist <- bankdata %>%
select(age, deposit) %>%
ggplot(aes(x = age, fill = deposit)) +
geom_histogram(alpha = 0.8, col = "grey") +
scale_x_continuous(breaks = seq(min(18), max(95), by = 10)) +
theme(legend.position = "right") +
labs(x="", y="") +
labs(title="Age Distribution") +
theme(plot.title=element_text(hjust=0.5))
balance_dist <- bankdata %>%
select(balance, deposit) %>%
ggplot(aes(x = balance, fill = deposit)) +
geom_histogram() +
theme(legend.position = "none") +
labs(x="", y="") +
labs(title="Balance Distribution") +
theme(plot.title=element_text(hjust=0.5))
duration_dist <- bankdata %>%
select(duration, deposit) %>%
ggplot(aes(x=duration, fill = deposit)) +
geom_histogram(color = "grey") +
theme(legend.position = "right") +
labs(x="", y="") +
labs(title="Duration Distribution") +
theme(plot.title=element_text(hjust=0.5))
pdays_dist <- bankdata %>%
select(pdays, deposit) %>%
ggplot(aes(x = pdays, fill = deposit)) +
geom_histogram(color = "grey") +
scale_x_continuous(breaks = seq(min(0), max(500), by = 50)) +
theme(legend.position = "none") +
labs(x="", y=" ") +
labs(title="Campaign Distribution") +
theme(plot.title=element_text(hjust=0.5))
campaign_dist <- bankdata %>%
select(campaign, deposit) %>%
ggplot(aes(x = campaign, fill = deposit)) +
geom_histogram(alpa = 0.5, col = "grey") +
theme(legend.position = "none") +
labs(x="", y="") +
labs(title="Campaign Distribution") +
theme(plot.title=element_text(hjust=0.5))
## Warning: Ignoring unknown parameters: alpa
grid.arrange(age_dist, balance_dist, duration_dist, campaign_dist, layout_matrix = rbind(c(1,2),c(3,4)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
bankdata3 <- bankdata
bankdata3$deposit <- as.numeric(bankdata3$deposit)
ggthemr(type = "outer")
## Warning: New theme missing the following elements: axis.ticks.length.x,
## axis.ticks.length.x.top, axis.ticks.length.x.bottom, axis.ticks.length.y,
## axis.ticks.length.y.left, axis.ticks.length.y.right
ggcorr(bankdata3, label = T)
## Warning in ggcorr(bankdata3, label = T): data in column(s) 'job', 'marital',
## 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome' are
## not numeric and were ignored
bankdata4 <- bankdata %>%
mutate(mean_duration = as.factor(ifelse(duration > mean(duration), "Above_average", "Bellow_average")),
median_duration = as.factor(ifelse(duration> median(duration), "Above_median", "Bellow_median")),
mean_balance = as.factor(ifelse(balance > mean(balance), "Above_average", "Bellow_average")),
median_balance = as.factor(ifelse(balance > median(balance), "Above_median", "Bellow_median")))
duration_mean_dist <- bankdata4 %>%
select(mean_duration, deposit) %>%
group_by(mean_duration, deposit) %>%
summarize(n=n()) %>%
mutate(percent = round(prop.table(n),2)*100) %>%
ggplot(aes(x = mean_duration, y = n, fill = deposit))+
geom_bar(stat= "identity", width = 0.7, position = position_dodge(width = 0.8)) +
scale_fill_manual(values = c("#ce0f3d", "#333333")) +
geom_text(aes(label = sprintf("%.1f%%", percent)),
position = position_dodge(width = 0.8),
vjust = -0.3, size = 3.5)
duration_median_dist <- bankdata4 %>%
select(median_duration, deposit) %>%
group_by(median_duration, deposit) %>%
summarize(n=n()) %>%
mutate(percent = round(prop.table(n),2)*100) %>%
ggplot(aes(x = median_duration, y = n, fill = deposit))+
geom_bar(stat= "identity", width = 0.7, position = position_dodge(width = 0.8)) +
scale_fill_manual(values = c("#ce0f3d", "#333333")) +
geom_text(aes(label = sprintf("%.1f%%", percent)),
position = position_dodge(width = 0.8),
vjust = -0.3, size = 3.5)
##############################################################################################3
balance_mean_dist <- bankdata4 %>%
select(mean_balance, deposit) %>%
group_by(mean_balance, deposit) %>%
summarize(n=n()) %>%
mutate(percent = round(prop.table(n),2)*100) %>%
ggplot(aes(x = mean_balance, y = n, fill = deposit))+
geom_bar(stat= "identity", width = 0.7, position = position_dodge(width = 0.8)) +
scale_fill_manual(values = c("#ce0f3d", "#333333")) +
geom_text(aes(label = sprintf("%.1f%%", percent)),
position = position_dodge(width = 0.8),
vjust = -0.3, size = 3.5)
balance_median_dist <- bankdata4 %>%
select(median_balance, deposit) %>%
group_by(median_balance, deposit) %>%
summarize(n=n()) %>%
mutate(percent = round(prop.table(n),2)*100) %>%
ggplot(aes(x = median_balance, y = n, fill = deposit))+
geom_bar(stat= "identity", width = 0.7, position = position_dodge(width = 0.8)) +
scale_fill_manual(values = c("#ce0f3d", "#333333")) +
geom_text(aes(label = sprintf("%.1f%%", percent)),
position = position_dodge(width = 0.8),
vjust = -0.3, size = 3.5)
grid.arrange(duration_mean_dist, duration_median_dist, balance_mean_dist, balance_median_dist, layout_matrix = rbind(c(1,2), c(3,4)))
Job analysis
We have seen that in job group, blue-collar has the lowest persentage return, and retired and student have the highest, we will see why.
ggthemr("flat", type = "outer")
## Warning: New theme missing the following elements: axis.ticks.length.x,
## axis.ticks.length.x.top, axis.ticks.length.x.bottom, axis.ticks.length.y,
## axis.ticks.length.y.left, axis.ticks.length.y.right
job_duration_analysis <- bankdata4 %>%
select(job, duration, deposit) %>%
group_by(job, deposit) %>%
ggplot(aes(x = job, y = duration, fill = deposit))+
scale_fill_manual(values = c("#d45d79", "#6e5773")) +
geom_boxplot()
job_balance_analysis <- bankdata4 %>%
select(job, balance, deposit) %>%
group_by(job, deposit) %>%
ggplot(aes(x = job, y = balance, fill = deposit))+
scale_fill_manual(values = c("#d45d79", "#6e5773")) +
geom_jitter(aes(colour = deposit))
grid.arrange(job_duration_analysis, job_balance_analysis, layout_matrix = rbind(c(1,1),c(2,2)))
job_previous_analysis <- bankdata4 %>%
select(job, previous, deposit) %>%
group_by(job, deposit) %>%
ggplot(aes(x = job, y = previous, fill = deposit))+
geom_boxplot()
job_pdays_analysis <- bankdata4 %>%
select(job, pdays, deposit) %>%
group_by(job, deposit) %>%
ggplot(aes(x = job, y = pdays, fill = deposit))+
geom_violin()
grid.arrange(job_previous_analysis, job_pdays_analysis, layout_matrix = rbind(c(1,1),c(2,2)))
job_housing_analysis <- bankdata4 %>%
select(job, housing, deposit) %>%
filter(housing == "yes") %>%
group_by(job, deposit, housing) %>%
summarize(n=n()) %>%
ggplot(aes(x = reorder(job, n), y =n, fill = deposit))+
geom_bar(stat = "identity", width = 0.7, position = position_dodge(width = 0.8)) +
coord_flip()
job_housing_analysis2 <- bankdata4 %>%
select(job, loan, deposit) %>%
filter(loan == "no") %>%
group_by(job, deposit, loan) %>%
summarize(n=n()) %>%
ggplot(aes(x = reorder(job, n), y =n, fill = deposit))+
geom_bar(stat = "identity", position = position_dodge()) +
coord_flip()
grid.arrange(job_housing_analysis, job_housing_analysis2, layout_matrix = rbind(c(1,2)))
ggthemr("flat", type = "outer")
## Warning: New theme missing the following elements: axis.ticks.length.x,
## axis.ticks.length.x.top, axis.ticks.length.x.bottom, axis.ticks.length.y,
## axis.ticks.length.y.left, axis.ticks.length.y.right
job_poutcome_analysis <- bankdata4 %>%
select(job, poutcome, deposit) %>%
filter(poutcome == "success") %>%
group_by(job, deposit, poutcome) %>%
summarize(n=n()) %>%
ggplot(aes(x = reorder(job, -n), y =n, fill = deposit))+
geom_bar(stat = "identity", width = 0.7, position = position_dodge(width = 0.8))+
scale_y_continuous(breaks = seq(min(0), max(300), by = 50))+
xlab("Job") + ylab(" ")
job_contact_analysis <- bankdata4 %>%
select(job, contact, deposit) %>%
filter(contact == "cellular") %>%
group_by(job, deposit, contact) %>%
summarize(n=n()) %>%
ggplot(aes(x = reorder(job, -n), y =n, fill = deposit))+
geom_bar(stat = "identity", width = 0.7, position = position_dodge(width = 0.8))+
xlab("Job") + ylab(" ")
grid.arrange(job_poutcome_analysis, job_contact_analysis, layout_matrix = rbind(c(1,1), c(2,2)))
library(caTools)
set.seed(101)
sample <- sample.split(bankdata$deposit, SplitRatio = 0.8)
train <- subset(bankdata, sample == T)
test <- subset(bankdata, sample == F)
head(train)
## # A tibble: 6 x 17
## age job marital education default balance housing loan contact day
## <int> <fct> <fct> <fct> <fct> <int> <fct> <fct> <fct> <int>
## 1 59 admi… married secondary no 2343 yes no unknown 5
## 2 56 admi… married secondary no 45 no no unknown 5
## 3 41 tech… married secondary no 1270 yes no unknown 5
## 4 55 serv… married secondary no 2476 yes no unknown 5
## 5 54 admi… married tertiary no 184 no no unknown 5
## 6 42 mana… single tertiary no 0 yes yes unknown 5
## # … with 7 more variables: month <fct>, duration <int>, campaign <int>,
## # pdays <int>, previous <int>, poutcome <fct>, deposit <fct>
head(test)
## # A tibble: 6 x 17
## age job marital education default balance housing loan contact day
## <int> <fct> <fct> <fct> <fct> <int> <fct> <fct> <fct> <int>
## 1 38 admi… single secondary no 100 yes no unknown 7
## 2 46 blue… single tertiary no 460 yes no unknown 7
## 3 32 blue… single primary no 611 yes no unknown 8
## 4 43 mana… single tertiary no 2067 yes no unknown 8
## 5 37 unem… single secondary no 381 yes no unknown 8
## 6 32 mana… single tertiary no 311 no no unknown 12
## # … with 7 more variables: month <fct>, duration <int>, campaign <int>,
## # pdays <int>, previous <int>, poutcome <fct>, deposit <fct>
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
modelrndm <- randomForest(deposit ~., data = train, ntree=500, importance = TRUE)
modelrndm.pred <- predict(modelrndm, test)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
confusionMatrix(modelrndm.pred, test$deposit)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 988 93
## yes 187 965
##
## Accuracy : 0.8746
## 95% CI : (0.8602, 0.8881)
## No Information Rate : 0.5262
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7496
##
## Mcnemar's Test P-Value : 2.732e-08
##
## Sensitivity : 0.8409
## Specificity : 0.9121
## Pos Pred Value : 0.9140
## Neg Pred Value : 0.8377
## Prevalence : 0.5262
## Detection Rate : 0.4425
## Detection Prevalence : 0.4841
## Balanced Accuracy : 0.8765
##
## 'Positive' Class : no
##
accuracy.rf <- (991+967) / (991+91+184+967)
accuracy.rf
## [1] 0.8768473
importance(modelrndm)
## no yes MeanDecreaseAccuracy MeanDecreaseGini
## age 39.402196 14.478029 39.0287401 305.045320
## job 24.563284 4.370771 20.1572279 256.255719
## marital 4.168157 11.214756 11.3239289 69.155086
## education 12.575682 6.242795 13.1233697 87.882213
## default -2.833853 3.120639 0.7222635 4.827332
## balance 13.366418 13.634785 19.1649431 329.015624
## housing 31.812715 29.319830 40.0031618 104.722503
## loan 5.664004 13.991543 13.7720345 34.841775
## contact 53.138082 17.824721 59.5976290 159.193741
## day 52.971907 6.609632 47.3068511 290.199084
## month 98.223236 42.111764 108.1526361 572.105745
## duration 212.823083 261.827453 291.5712682 1554.625011
## campaign 14.293294 17.326700 21.9905971 125.306482
## pdays 23.043870 19.323607 28.4301387 173.228175
## previous 17.427722 12.231814 18.2975075 98.475613
## poutcome 53.542975 7.180618 40.0616674 220.840961
importance <- importance(modelrndm)
varImportance <- data.frame(Variables = row.names(importance),
Importance = round(importance[ ,'MeanDecreaseGini'],2))
# Create a rank variable based on importance
rankImportance <- varImportance %>%
mutate(Rank = paste0('#',dense_rank(desc(Importance))))
# Use ggplot2 to visualize the relative importance of variables
ggplot(rankImportance, aes(x = reorder(Variables, Importance),
y = Importance, fill = Importance)) +
geom_bar(stat='identity') +
geom_text(aes(x = Variables, y = 0.5, label = Rank),
hjust=0, vjust=0.55, size = 4, colour = 'red') +
labs(x = 'Variables') +
coord_flip() +
theme_classic()
logitmodel <- glm(deposit~., family = binomial("logit"), data = train)
summary(logitmodel)
##
## Call:
## glm(formula = deposit ~ ., family = binomial("logit"), data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.3683 -0.6048 -0.2137 0.6203 2.7907
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.126e-01 2.968e-01 -2.738 0.00617 **
## age -1.544e-03 3.569e-03 -0.433 0.66520
## jobblue-collar -3.568e-01 1.162e-01 -3.071 0.00214 **
## jobentrepreneur -3.895e-01 1.912e-01 -2.037 0.04167 *
## jobhousemaid -4.860e-01 2.172e-01 -2.238 0.02525 *
## jobmanagement -3.296e-01 1.196e-01 -2.756 0.00585 **
## jobretired 2.962e-01 1.652e-01 1.792 0.07308 .
## jobself-employed -4.496e-01 1.776e-01 -2.532 0.01134 *
## jobservices -3.472e-01 1.338e-01 -2.596 0.00944 **
## jobstudent 5.142e-01 1.966e-01 2.615 0.00892 **
## jobtechnician -1.266e-01 1.103e-01 -1.147 0.25140
## jobunemployed -1.063e-01 1.840e-01 -0.578 0.56338
## jobunknown -3.619e-01 3.807e-01 -0.951 0.34184
## maritalmarried -1.496e-01 9.530e-02 -1.570 0.11644
## maritalsingle 8.177e-02 1.097e-01 0.746 0.45593
## educationsecondary 1.926e-01 1.040e-01 1.852 0.06404 .
## educationtertiary 4.766e-01 1.223e-01 3.898 9.70e-05 ***
## educationunknown 1.582e-01 1.699e-01 0.931 0.35200
## defaultyes -5.329e-02 2.422e-01 -0.220 0.82586
## balance 2.511e-05 9.636e-06 2.606 0.00916 **
## housingyes -6.821e-01 6.879e-02 -9.916 < 2e-16 ***
## loanyes -5.424e-01 9.291e-02 -5.838 5.29e-09 ***
## contacttelephone -9.747e-02 1.190e-01 -0.819 0.41294
## contactunknown -1.535e+00 1.075e-01 -14.279 < 2e-16 ***
## day 2.286e-03 3.921e-03 0.583 0.55998
## monthaug -8.435e-01 1.236e-01 -6.822 8.97e-12 ***
## monthdec 1.655e+00 4.639e-01 3.568 0.00036 ***
## monthfeb -1.778e-01 1.412e-01 -1.260 0.20778
## monthjan -1.082e+00 1.842e-01 -5.875 4.22e-09 ***
## monthjul -9.272e-01 1.239e-01 -7.482 7.32e-14 ***
## monthjun 3.498e-01 1.463e-01 2.391 0.01679 *
## monthmar 2.096e+00 2.565e-01 8.172 3.03e-16 ***
## monthmay -6.545e-01 1.187e-01 -5.515 3.49e-08 ***
## monthnov -9.151e-01 1.341e-01 -6.827 8.69e-12 ***
## monthoct 1.128e+00 1.941e-01 5.810 6.24e-09 ***
## monthsep 1.023e+00 2.220e-01 4.608 4.07e-06 ***
## duration 5.398e-03 1.375e-04 39.271 < 2e-16 ***
## campaign -7.758e-02 1.493e-02 -5.195 2.05e-07 ***
## pdays -4.800e-05 4.666e-04 -0.103 0.91806
## previous 2.464e-02 1.604e-02 1.536 0.12457
## poutcomeother 1.865e-01 1.475e-01 1.264 0.20620
## poutcomesuccess 2.153e+00 1.541e-01 13.977 < 2e-16 ***
## poutcomeunknown -2.267e-01 1.515e-01 -1.496 0.13456
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 12353.8 on 8928 degrees of freedom
## Residual deviance: 7361.1 on 8886 degrees of freedom
## AIC: 7447.1
##
## Number of Fisher Scoring iterations: 5
logitmodel.pred <- predict(logitmodel, test, type = "response")
resul <- as.factor(ifelse(logitmodel.pred > 0.5, "yes", "no"))
confusionMatrix(resul, test$deposit)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 1018 220
## yes 157 838
##
## Accuracy : 0.8312
## 95% CI : (0.815, 0.8465)
## No Information Rate : 0.5262
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6604
##
## Mcnemar's Test P-Value : 0.001407
##
## Sensitivity : 0.8664
## Specificity : 0.7921
## Pos Pred Value : 0.8223
## Neg Pred Value : 0.8422
## Prevalence : 0.5262
## Detection Rate : 0.4559
## Detection Prevalence : 0.5544
## Balanced Accuracy : 0.8292
##
## 'Positive' Class : no
##
exp(coef(logitmodel))
## (Intercept) age jobblue-collar jobentrepreneur
## 0.4436880 0.9984569 0.6999163 0.6774184
## jobhousemaid jobmanagement jobretired jobself-employed
## 0.6150934 0.7192332 1.3446880 0.6378793
## jobservices jobstudent jobtechnician jobunemployed
## 0.7066707 1.6722631 0.8811210 0.8991504
## jobunknown maritalmarried maritalsingle educationsecondary
## 0.6963704 0.8610494 1.0852050 1.2123751
## educationtertiary educationunknown defaultyes balance
## 1.6105738 1.1713487 0.9481016 1.0000251
## housingyes loanyes contacttelephone contactunknown
## 0.5055724 0.5813771 0.9071318 0.2155489
## day monthaug monthdec monthfeb
## 1.0022882 0.4302101 5.2334941 0.8370707
## monthjan monthjul monthjun monthmar
## 0.3387544 0.3956445 1.4187925 8.1322493
## monthmay monthnov monthoct monthsep
## 0.5197125 0.4004639 3.0882539 2.7810013
## duration campaign pdays previous
## 1.0054126 0.9253568 0.9999520 1.0249448
## poutcomeother poutcomesuccess poutcomeunknown
## 1.2050365 8.6126395 0.7971803