Introduction:

This multivariate data can be found on UCI Machine Learning Repository. http://archive.ics.uci.edu/ml/datasets/Bank+Marketing

The data is related with direct marketing campaigns (phone calls) of a Portuguese banking institution. The classification goal is to predict if the client will subscribe a term deposit (variable y).

library(ggplot2)
library(dplyr)
library(caTools)
library(caret)
library(randomForest)

#Display the first 6 rows of the data
head(Bank_Data)

##   X age       job marital   education default housing loan   contact month
## 1 0  56 housemaid married    basic.4y      no      no   no telephone   may
## 2 1  57  services married high.school unknown      no   no telephone   may
## 3 2  37  services married high.school      no     yes   no telephone   may
## 4 3  40    admin. married    basic.6y      no      no   no telephone   may
## 5 4  56  services married high.school      no      no  yes telephone   may
## 6 5  45  services married    basic.9y unknown      no   no telephone   may
##   day_of_week duration campaign pdays previous    poutcome emp.var.rate
## 1         mon      261        1   999        0 nonexistent          1.1
## 2         mon      149        1   999        0 nonexistent          1.1
## 3         mon      226        1   999        0 nonexistent          1.1
## 4         mon      151        1   999        0 nonexistent          1.1
## 5         mon      307        1   999        0 nonexistent          1.1
## 6         mon      198        1   999        0 nonexistent          1.1
##   cons.price.idx cons.conf.idx euribor3m nr.employed   y.
## 1         93.994         -36.4     4.857        5191 no\n
## 2         93.994         -36.4     4.857        5191 no\n
## 3         93.994         -36.4     4.857        5191 no\n
## 4         93.994         -36.4     4.857        5191 no\n
## 5         93.994         -36.4     4.857        5191 no\n
## 6         93.994         -36.4     4.857        5191 no\n

#Delete the X column

Data <- subset(Bank_Data, select = -X)

head(Data)

##   age       job marital   education default housing loan   contact month
## 1  56 housemaid married    basic.4y      no      no   no telephone   may
## 2  57  services married high.school unknown      no   no telephone   may
## 3  37  services married high.school      no     yes   no telephone   may
## 4  40    admin. married    basic.6y      no      no   no telephone   may
## 5  56  services married high.school      no      no  yes telephone   may
## 6  45  services married    basic.9y unknown      no   no telephone   may
##   day_of_week duration campaign pdays previous    poutcome emp.var.rate
## 1         mon      261        1   999        0 nonexistent          1.1
## 2         mon      149        1   999        0 nonexistent          1.1
## 3         mon      226        1   999        0 nonexistent          1.1
## 4         mon      151        1   999        0 nonexistent          1.1
## 5         mon      307        1   999        0 nonexistent          1.1
## 6         mon      198        1   999        0 nonexistent          1.1
##   cons.price.idx cons.conf.idx euribor3m nr.employed   y.
## 1         93.994         -36.4     4.857        5191 no\n
## 2         93.994         -36.4     4.857        5191 no\n
## 3         93.994         -36.4     4.857        5191 no\n
## 4         93.994         -36.4     4.857        5191 no\n
## 5         93.994         -36.4     4.857        5191 no\n
## 6         93.994         -36.4     4.857        5191 no\n

Check the dimention of the data

dim(Data)

## [1] 41188    21

The data consist of 41188 Observations and 21 features.

Output

Data %>% count(y.)

##      y.     n
## 1  no\n 36548
## 2 yes\n  4640

There are 36548 individuals did not subscribe to a term deposit, while 4640 did.

Output <- prop.table(table(Data$y.))
Output_Perc <- as.data.frame(Output)
Output_Perc

##    Var1      Freq
## 1  no\n 0.8873458
## 2 yes\n 0.1126542

As shown above, only 0.11 = 11% of the total observations which is 41188 have signed to a term deposit. This is a small percentage and indicate a not successful campaign. Our goal is increasing the percentage of people to sign to a term deposit. This can be done by studying and learning more about our clients and analyze the data carefully to find out who is the better candidate for the campaign.

Check for missing values

Missing_data <- sum(is.na(Data))
Missing_data

## [1] 0

Check the type of each column

#names(Data)
sapply(Data, class)

##            age            job        marital      education        default 
##      "integer"    "character"    "character"    "character"    "character" 
##        housing           loan        contact          month    day_of_week 
##    "character"    "character"    "character"    "character"    "character" 
##       duration       campaign          pdays       previous       poutcome 
##      "integer"      "integer"      "integer"      "integer"    "character" 
##   emp.var.rate cons.price.idx  cons.conf.idx      euribor3m    nr.employed 
##      "numeric"      "numeric"      "numeric"      "numeric"      "numeric" 
##             y. 
##    "character"

To understand the features of the data, I am creating the table below to display the definition and the type of each feature.

Feature	Definition	Type
age	Age	numeric
job	Type of job	categorical
marital	Marital status (‘divorced’, ‘married’, ‘single’, ‘unknown’ ; note: ‘divorced’ means divorced or widowed)	categorical
education	Education Level	categorical
default	has credit in default? ( ‘no’, ‘yes’, ‘unknown’)	categorical
housing	has housing loan? ( ‘no’, ‘yes’, ‘unknown’)	categorical
loan	has personal loan? ( ‘no’, ‘yes’, ‘unknown’)	categorical
contact	contact communication type ( ‘cellular’,‘telephone’)	Categorical
month	last contact month of year ( ‘jan’, ‘feb’, ‘mar’, …, ‘nov’, ‘dec’)	Categorical
day_of_week	last contact day of the week ( ‘mon’,‘tue’,‘wed’,‘thu’,‘fri’)	categorical
duration	last contact duration, in seconds	Numeric
campaign	number of contacts performed during this campaign and for this client ( includes last contact)	Numeric
pdays	number of days that passed by after the client was last contacted from a previous campaign ( 999 means client was not previously contacted)	Numeric
previous	number of contacts performed before this campaign and for this client	Numeric
poutcome	outcome of the previous marketing campaign ( ‘failure’,‘nonexistent’,‘success’)	categorical
emp.var.rate	employment variation rate - quarterly indicator	Numeric
cons.price.idx	consumer price index - monthly indicator	Numeric
cons.conf.idx	consumer confidence index - monthly indicator	Numeric
euribor3m	euribor 3 month rate - daily indicator	Numeric
nr.employed	number of employees - quarterly indicator	Numeric
y.	has the client subscribed a term deposit?	Binary

summary(Data)

##       age            job              marital           education        
##  Min.   :17.00   Length:41188       Length:41188       Length:41188      
##  1st Qu.:32.00   Class :character   Class :character   Class :character  
##  Median :38.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :40.02                                                           
##  3rd Qu.:47.00                                                           
##  Max.   :98.00                                                           
##    default            housing              loan             contact         
##  Length:41188       Length:41188       Length:41188       Length:41188      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##     month           day_of_week           duration         campaign     
##  Length:41188       Length:41188       Min.   :   0.0   Min.   : 1.000  
##  Class :character   Class :character   1st Qu.: 102.0   1st Qu.: 1.000  
##  Mode  :character   Mode  :character   Median : 180.0   Median : 2.000  
##                                        Mean   : 258.3   Mean   : 2.568  
##                                        3rd Qu.: 319.0   3rd Qu.: 3.000  
##                                        Max.   :4918.0   Max.   :56.000  
##      pdays          previous       poutcome          emp.var.rate     
##  Min.   :  0.0   Min.   :0.000   Length:41188       Min.   :-3.40000  
##  1st Qu.:999.0   1st Qu.:0.000   Class :character   1st Qu.:-1.80000  
##  Median :999.0   Median :0.000   Mode  :character   Median : 1.10000  
##  Mean   :962.5   Mean   :0.173                      Mean   : 0.08189  
##  3rd Qu.:999.0   3rd Qu.:0.000                      3rd Qu.: 1.40000  
##  Max.   :999.0   Max.   :7.000                      Max.   : 1.40000  
##  cons.price.idx  cons.conf.idx     euribor3m      nr.employed  
##  Min.   :92.20   Min.   :-50.8   Min.   :0.634   Min.   :4964  
##  1st Qu.:93.08   1st Qu.:-42.7   1st Qu.:1.344   1st Qu.:5099  
##  Median :93.75   Median :-41.8   Median :4.857   Median :5191  
##  Mean   :93.58   Mean   :-40.5   Mean   :3.621   Mean   :5167  
##  3rd Qu.:93.99   3rd Qu.:-36.4   3rd Qu.:4.961   3rd Qu.:5228  
##  Max.   :94.77   Max.   :-26.9   Max.   :5.045   Max.   :5228  
##       y.           
##  Length:41188      
##  Class :character  
##  Mode  :character  
##                    
##                    
##

Summary function provides some information about each feature such as Min, Max, Mean, Median, 1st Quartile and 3rd Quartile.

#columns = list(data_f.columns)
Columns <- data.frame(colnames(Data))

Sum <- sum(duplicated(Data))

#Make a copy of Data

Bank.Data <- Data

Feature Aggregation

1- Change the output to 0 for “no” and 1 for “yes”.

Bank.Data= fastDummies::dummy_cols(Bank.Data, select_columns="y.", remove_selected_columns = TRUE)
#summary(Bank.Data$y.)

1- Age

#Re-Arrange the age column to 3 groups:
#0 represent the Age from (0 to 35), 1 represent the observations from (35 to 50), 2 represents the observations from 50 and older.

Bank.Data <- Bank.Data %>% mutate(Age = case_when(age < 35 ~ '0',
age >= 35  & age < 50 ~ '1',
age >= 50 ~ '2'))

2- Job

Job <- table(Data$job, Data$y.)
Job

##                
##                 no\n yes\n
##   admin.        9070  1352
##   blue-collar   8616   638
##   entrepreneur  1332   124
##   housemaid      954   106
##   management    2596   328
##   retired       1286   434
##   self-employed 1272   149
##   services      3646   323
##   student        600   275
##   technician    6013   730
##   unemployed     870   144
##   unknown        293    37

We can include from the table above that admins, blue- collar, management, technician, retiree and students were able to subscribe more than other jobs.

31% of the total observation for students have subscribed to a term deposit.

We can see here about 25% of the total observation for retired people subscribed to a term deposit. This confirms what we included earlier about in age feature, since retired people are most likely older than 60. However, they are good candidate to subscribe.

Unique.Jobs <- as.data.frame( table((Bank_Data$job)))

jobOutput <- prop.table(table(Data$job, Data$y.))
jobOutput_Perc <- as.data.frame(jobOutput)
jobOutput_Perc

##             Var1  Var2         Freq
## 1         admin.  no\n 0.2202097698
## 2    blue-collar  no\n 0.2091871419
## 3   entrepreneur  no\n 0.0323395164
## 4      housemaid  no\n 0.0231620860
## 5     management  no\n 0.0630280664
## 6        retired  no\n 0.0312226862
## 7  self-employed  no\n 0.0308827814
## 8       services  no\n 0.0885209284
## 9        student  no\n 0.0145673497
## 10    technician  no\n 0.1459891230
## 11    unemployed  no\n 0.0211226571
## 12       unknown  no\n 0.0071137224
## 13        admin. yes\n 0.0328250947
## 14   blue-collar yes\n 0.0154899485
## 15  entrepreneur yes\n 0.0030105856
## 16     housemaid yes\n 0.0025735651
## 17    management yes\n 0.0079634845
## 18       retired yes\n 0.0105370496
## 19 self-employed yes\n 0.0036175585
## 20      services yes\n 0.0078420899
## 21       student yes\n 0.0066767020
## 22    technician yes\n 0.0177236088
## 23    unemployed yes\n 0.0034961639
## 24       unknown yes\n 0.0008983199

#Factorizing the column (job) to zero and one by using dummy variable:

Bank.Data= fastDummies::dummy_cols(Bank.Data, select_columns="job", remove_selected_columns = TRUE, remove_first_dummy = TRUE)

3- Marital Status

#Change Martial feature from categorical to numerical values
Data$marital <- as.numeric(as.factor(Data$marital))
Divorced <- table(Data [Data$marital==1, ]$y.)
Single <- table(Data [Data$marital==2, ]$y.)
Married <- table(Data [Data$marital==3, ]$y.)
Unknown <- table(Data [Data$marital==4, ]$y.)

Marital_Y <- rbind(Divorced, Single, Married, Unknown )

Marital_Y

##           no\n yes\n
## Divorced  4136   476
## Single   22396  2532
## Married   9948  1620
## Unknown     68    12

14% of married people in the data have subscribed to a term deposit.

10.16 % of single people in the data have subscribed to a term deposit.

10.3 % of divorced people in the data have subscribed to a term deposit.

15% of people who did not declare their marital status in the data have subscribed to a term deposit.

Bank.Data <- Bank.Data %>% mutate(marital= recode(marital, 
married = '0',
divorced = '1',
single = '2',
unknown = '3'
))

4- Education

#Factorizing the column (Education) to zero and one by using dummy variable:

Bank.Data= fastDummies::dummy_cols(Bank.Data, select_columns="education", remove_selected_columns = TRUE, remove_first_dummy = TRUE)

Duration

Bank.Data <- Bank.Data %>% mutate(duration = case_when(
duration < 700 ~ '0',
duration >= 700  & duration < 1400 ~ '1',
duration >= 1400  & duration < 2100 ~ '2',
duration >= 2100  & duration < 2800 ~ '3',
duration >= 2800  & duration < 3500 ~ '4',
duration >= 3500  & duration < 4200 ~ '5',
duration >= 4200 ~ '6'

))

pdays

Bank.Data <- Bank.Data %>% mutate(pdays = case_when(
pdays < 8 ~ '0',
pdays >= 8  & pdays < 17 ~ '1',
pdays >= 17  & pdays < 999 ~ '2',
pdays >= 999 ~ '3'
))

Housing

housing <- table(Data$housing, Data$y.)
housing

##          
##            no\n yes\n
##   no      16596  2026
##   unknown   883   107
##   yes     19069  2507

10.88% of people without housing loan in the data have subscribed to a term deposit.

11.62% of people with housing loan in the data have subscribed to a term deposit.

10.80% of people with unknown response about housing loan in the data have subscribed to a term deposit.

Bank.Data <- Bank.Data %>% mutate(housing= recode(housing, yes = '0', no = '1', unknown = '3'))

Default

Default <- table(Data$default, Data$y.)
Default

##          
##            no\n yes\n
##   no      28391  4197
##   unknown  8154   443
##   yes         3     0

12.88% of people who don’t have a credit in line in the data have subscribed to a term deposit.

0% of people who have a credit in line in the data have subscribed to a term deposit.

5.15% of people with unknown response on whether they have a credit in line or not have subscribed to a term deposit.

Bank.Data <- Bank.Data %>% mutate(default= recode(default, yes = '0', no = '1', unknown = '3'))

Loan

Loan <- table(Data$loan, Data$y.)
Loan

##          
##            no\n yes\n
##   no      30100  3850
##   unknown   883   107
##   yes      5565   683

11.34% of people who don’t have a personal loan have subscribed to a term deposit.

10.93% of people with personal loan have subscribed to a term deposit.

10.8% of people with unknown response on whether they have a personal loan or not have subscribed to a term deposit.

Bank.Data <- Bank.Data %>% mutate(loan= recode(loan, yes = '0', no = '1', unknown = '3'))

poutocme

Bank.Data <- Bank.Data %>% mutate(poutcome= recode(loan, failure = '0', nonexistent = '1', success = '3'))

cons.price.indx

Bank.Data <- Bank.Data %>% mutate(cons.price.idx = case_when(
cons.price.idx >= 90  & cons.price.idx < 93 ~ '0',
cons.price.idx >= 93  & cons.price.idx < 94 ~ '1',
cons.price.idx >= 94 ~ '2'
))

cons.conf.idx

Bank.Data <- Bank.Data %>% mutate(cons.conf.idx = case_when(
cons.conf.idx >= -51  & cons.conf.idx < -47 ~ '0',
cons.conf.idx >= -47  & cons.conf.idx < -43 ~ '1',
cons.conf.idx >= -43  & cons.conf.idx < -39 ~ '2',
cons.conf.idx >= -39  & cons.conf.idx < -35 ~ '3',
cons.conf.idx >= -35  & cons.conf.idx < -31 ~ '4',
cons.conf.idx >= -31 ~ '5'
))

Graphs

#hist(Age relevant to y)
ggplot(data=Data,aes(x=age,fill=y.,))+geom_bar()

Although the highest population of people who are willing to subscribe to a term deposit are ranges between 24 to 60 years old, we can obviously see people older than 60 years old and under 24 years old subscribed to a term deposit

library(ggplot2)
Data$y. <- as.factor(Data$y.)
Data$default <- as.factor(Data$default)
Data$housing <- as.factor(Data$housing)
Data$loan <- as.factor(Data$loan)


ggplot(Data,aes(x= default,fill= y.)) +
  theme_bw() +
  facet_wrap(~marital) +
  geom_bar()+
  labs(y ="Freq",
       title = "The Subscribe status by Marital Status based on default")

ggplot(Data,aes(x= housing,fill= y.)) +
  geom_bar()+
  labs(y ="count",
       title = "Subscription to a term deposit based on housing loan")

ggplot(Data,aes(x= housing,fill= y.)) +
  theme_bw() +
  facet_wrap(~marital) +
  geom_bar()+
  labs(y ="Freq",
       title = "The Subscribe status by Marital Status based on housing")

ggplot(Data,aes(x= loan,fill= y.)) +
  geom_bar()+
  labs(y ="count",
       title = "Subscription to a term deposit based on loan")

ggplot(Data, aes(age, colour = y.)) +
  geom_density()

ggplot(Data, aes(duration , colour = y.)) +
  geom_density()

ggplot(Data, aes(pdays , colour = y.)) +
  geom_density()

Bank Campiagn data Analysis

Feature Engineering

Gehad Gad

July 19, 2021

Introduction:

Feature Aggregation

Graphs