The data is related to the direct marketing campaigns of a Portuguese banking institution. The marketing campaigns were based on phone calls. Often, more than one contact with the same client was required in order to assess whether the product (a bank term deposit) would be subscribed (‘yes’) or not (‘no’).
library(dplyr)
library(tidyverse)
library(ggplot2)

# Load the campaign data. The file is semicolon-delimited, so the separator
# has to be given explicitly.
bank <- read.csv("bank-additional-full.csv", sep = ";")

# Check how many rows and columns were read.
dim(bank)
#> [1] 41188 21
Our telemarketing dataset contains 41188 rows and 21 columns.
# Preview the first six rows to sanity-check the import.
head(bank, n = 6L)
#> age job marital education default housing loan contact month
#> 1 56 housemaid married basic.4y no no no telephone may
#> 2 57 services married high.school unknown no no telephone may
#> 3 37 services married high.school no yes no telephone may
#> 4 40 admin. married basic.6y no no no telephone may
#> 5 56 services married high.school no no yes telephone may
#> 6 45 services married basic.9y unknown no no telephone may
#> day_of_week duration campaign pdays previous poutcome emp.var.rate
#> 1 mon 261 1 999 0 nonexistent 1.1
#> 2 mon 149 1 999 0 nonexistent 1.1
#> 3 mon 226 1 999 0 nonexistent 1.1
#> 4 mon 151 1 999 0 nonexistent 1.1
#> 5 mon 307 1 999 0 nonexistent 1.1
#> 6 mon 198 1 999 0 nonexistent 1.1
#> cons.price.idx cons.conf.idx euribor3m nr.employed y
#> 1 93.994 -36.4 4.857 5191 no
#> 2 93.994 -36.4 4.857 5191 no
#> 3 93.994 -36.4 4.857 5191 no
#> 4 93.994 -36.4 4.857 5191 no
#> 5 93.994 -36.4 4.857 5191 no
#> 6 93.994 -36.4 4.857 5191 no
Column explanation
# Inspect the storage type of every column.
str(object = bank)
#> 'data.frame': 41188 obs. of 21 variables:
#> $ age : int 56 57 37 40 56 45 59 41 24 25 ...
#> $ job : chr "housemaid" "services" "services" "admin." ...
#> $ marital : chr "married" "married" "married" "married" ...
#> $ education : chr "basic.4y" "high.school" "high.school" "basic.6y" ...
#> $ default : chr "no" "unknown" "no" "no" ...
#> $ housing : chr "no" "no" "yes" "no" ...
#> $ loan : chr "no" "no" "no" "no" ...
#> $ contact : chr "telephone" "telephone" "telephone" "telephone" ...
#> $ month : chr "may" "may" "may" "may" ...
#> $ day_of_week : chr "mon" "mon" "mon" "mon" ...
#> $ duration : int 261 149 226 151 307 198 139 217 380 50 ...
#> $ campaign : int 1 1 1 1 1 1 1 1 1 1 ...
#> $ pdays : int 999 999 999 999 999 999 999 999 999 999 ...
#> $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
#> $ poutcome : chr "nonexistent" "nonexistent" "nonexistent" "nonexistent" ...
#> $ emp.var.rate : num 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
#> $ cons.price.idx: num 94 94 94 94 94 ...
#> $ cons.conf.idx : num -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
#> $ euribor3m : num 4.86 4.86 4.86 4.86 4.86 ...
#> $ nr.employed : num 5191 5191 5191 5191 5191 ...
#> $ y : chr "no" "no" "no" "no" ...
From the result above, we find that some columns are not stored in the correct data type: the categorical columns are stored as character (chr). We need to convert them to factors.
# Convert every character column to a factor so the categorical variables
# (job, marital, education, ...) are handled as categories, not free text.
# across(where(...)) is the current replacement for the superseded
# mutate_if().
bank <- bank %>%
  mutate(across(where(is.character), as.factor))

# Confirm the new column types.
str(bank)
#> 'data.frame': 41188 obs. of 21 variables:
#> $ age : int 56 57 37 40 56 45 59 41 24 25 ...
#> $ job : Factor w/ 12 levels "admin.","blue-collar",..: 4 8 8 1 8 8 1 2 10 8 ...
#> $ marital : Factor w/ 4 levels "divorced","married",..: 2 2 2 2 2 2 2 2 3 3 ...
#> $ education : Factor w/ 8 levels "basic.4y","basic.6y",..: 1 4 4 2 4 3 6 8 6 4 ...
#> $ default : Factor w/ 3 levels "no","unknown",..: 1 2 1 1 1 2 1 2 1 1 ...
#> $ housing : Factor w/ 3 levels "no","unknown",..: 1 1 3 1 1 1 1 1 3 3 ...
#> $ loan : Factor w/ 3 levels "no","unknown",..: 1 1 1 1 3 1 1 1 1 1 ...
#> $ contact : Factor w/ 2 levels "cellular","telephone": 2 2 2 2 2 2 2 2 2 2 ...
#> $ month : Factor w/ 10 levels "apr","aug","dec",..: 7 7 7 7 7 7 7 7 7 7 ...
#> $ day_of_week : Factor w/ 5 levels "fri","mon","thu",..: 2 2 2 2 2 2 2 2 2 2 ...
#> $ duration : int 261 149 226 151 307 198 139 217 380 50 ...
#> $ campaign : int 1 1 1 1 1 1 1 1 1 1 ...
#> $ pdays : int 999 999 999 999 999 999 999 999 999 999 ...
#> $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
#> $ poutcome : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 2 2 2 2 2 ...
#> $ emp.var.rate : num 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
#> $ cons.price.idx: num 94 94 94 94 94 ...
#> $ cons.conf.idx : num -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
#> $ euribor3m : num 4.86 4.86 4.86 4.86 4.86 ...
#> $ nr.employed : num 5191 5191 5191 5191 5191 ...
#> $ y : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
Every column has now been converted to the desired data type.
# Count the missing values in each column.
bank %>%
  is.na() %>%
  colSums()
#> age job marital education default
#> 0 0 0 0 0
#> housing loan contact month day_of_week
#> 0 0 0 0 0
#> duration campaign pdays previous poutcome
#> 0 0 0 0 0
#> emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed
#> 0 0 0 0 0
#> y
#> 0
There are no missing values in the data.
# Distribution of call duration for each marital status.
ggplot(bank, aes(marital, duration)) +
  geom_boxplot(aes(fill = marital)) +
  labs(
    title = "Duration Call by Marital",
    x = "Marital",
    y = "Duration"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5),
    # The fill legend only repeats the x axis, so hide it. The documented
    # value is lowercase "none".
    legend.position = "none"
  )

# Average call duration for each month.
duration_month <- bank %>%
  group_by(month) %>%
  summarise(avg = mean(duration))

# Put the months in calendar order. The order is derived from the built-in
# month.abb constant and restricted to the months actually present, because
# any month left out of a hand-written level list (for example "mar") would
# silently become NA.
calendar_months <- tolower(month.abb)
months_present <- as.character(duration_month$month)
duration_month$month <- factor(
  months_present,
  levels = calendar_months[calendar_months %in% months_present]
)
# Line chart of the average call duration per month. `month` is already a
# factor at this point, so it is mapped directly; group = 1 joins the
# discrete x positions into a single line.
ggplot(duration_month, aes(x = month, y = avg)) +
  geom_line(group = 1, color = "#16d9e8") +
  geom_point() +
  labs(
    x = "Month",
    y = NULL,
    title = "Average Duration"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

# Number of records per job, most frequent job first.
job <- bank %>%
  group_by(job) %>%
  summarise(freq = n()) %>%
  arrange(desc(freq))
# Horizontal bar chart of the record count per job. Bars are ordered by
# count and shaded along a gradient so larger groups stand out.
job %>%
  ggplot(aes(freq, reorder(job, freq), fill = freq)) +
  geom_col() +
  scale_fill_gradient(low = "#5e7496", high = "#16d9e8") +
  labs(
    title = "Most Job in Direct Marketing Campaigns",
    x = "Total",
    y = NULL
  ) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))