library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.1 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(leaps)
library(olsrr)
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
library(bestglm)
library(ROCR)
library(dplyr)
library(ROSE)
## Loaded ROSE 0.0-4
The data Bank Marketing that we’re going to explore is provided at https://archive.ics.uci.edu/ml/datasets/bank+marketing. The data records marketing campaigns through phone call at one of local bank in Portuguese.
Use read.csv() from base R to read bank-additional.csv that has been saved in local folder. Define it as BankMarketingData_Actual -
BankMarketingData_Actual = read.csv("bank-additional.csv", header=TRUE, sep = ";")
Display the dimensions of this data set
dim(BankMarketingData_Actual)
## [1] 4119 21
The dimension of bank data set is 4119 rows and 21 columns to be observed.
Display the column names of the dataset -
names(BankMarketingData_Actual)
## [1] "age" "job" "marital" "education"
## [5] "default" "housing" "loan" "contact"
## [9] "month" "day_of_week" "duration" "campaign"
## [13] "pdays" "previous" "poutcome" "emp.var.rate"
## [17] "cons.price.idx" "cons.conf.idx" "euribor3m" "nr.employed"
## [21] "y"
Inspect the type and some characteristics of each variable or column.
str(BankMarketingData_Actual)
## 'data.frame': 4119 obs. of 21 variables:
## $ age : int 30 39 25 38 47 32 32 41 31 35 ...
## $ job : chr "blue-collar" "services" "services" "services" ...
## $ marital : chr "married" "single" "married" "married" ...
## $ education : chr "basic.9y" "high.school" "high.school" "basic.9y" ...
## $ default : chr "no" "no" "no" "no" ...
## $ housing : chr "yes" "no" "yes" "unknown" ...
## $ loan : chr "no" "no" "no" "unknown" ...
## $ contact : chr "cellular" "telephone" "telephone" "telephone" ...
## $ month : chr "may" "may" "jun" "jun" ...
## $ day_of_week : chr "fri" "fri" "wed" "fri" ...
## $ duration : int 487 346 227 17 58 128 290 44 68 170 ...
## $ campaign : int 2 4 1 3 1 3 4 2 1 1 ...
## $ pdays : int 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : int 0 0 0 0 0 2 0 0 1 0 ...
## $ poutcome : chr "nonexistent" "nonexistent" "nonexistent" "nonexistent" ...
## $ emp.var.rate : num -1.8 1.1 1.4 1.4 -0.1 -1.1 -1.1 -0.1 -0.1 1.1 ...
## $ cons.price.idx: num 92.9 94 94.5 94.5 93.2 ...
## $ cons.conf.idx : num -46.2 -36.4 -41.8 -41.8 -42 -37.5 -37.5 -42 -42 -36.4 ...
## $ euribor3m : num 1.31 4.86 4.96 4.96 4.19 ...
## $ nr.employed : num 5099 5191 5228 5228 5196 ...
## $ y : chr "no" "no" "no" "no" ...
Inspect NA values in the data set and Impute them -
anyNA(BankMarketingData_Actual)
## [1] FALSE
Looks like there are no NA values in the data set.
#Data Transformations
In data preparation process, one of the most essential step to do is transforming some variables type into the right form ex: categorical variable as factor. A proper data transformation must be applied to make it easier for us to wrangle the data and visualization process later on.
#Define the clean Bank Marketing Data
BankMarketingData_Clean <- BankMarketingData_Actual
#Add target variable 'Subscribed' for the variable 'y'
BankMarketingData_Clean$Subscribed <- ifelse(BankMarketingData_Actual$y == 'yes', 1, 0)
BankMarketingData_Clean$Subscribed <- factor(BankMarketingData_Clean$Subscribed)
#Convert categorical into factor
BankMarketingData_Clean$job <- factor(BankMarketingData_Clean$job)
BankMarketingData_Clean$marital <- factor(BankMarketingData_Clean$marital)
BankMarketingData_Clean$education <- factor(BankMarketingData_Clean$education)
BankMarketingData_Clean$default <- factor(BankMarketingData_Clean$default)
BankMarketingData_Clean$housing <- factor(BankMarketingData_Clean$housing)
BankMarketingData_Clean$loan <- factor(BankMarketingData_Clean$loan)
BankMarketingData_Clean$contact <- factor(BankMarketingData_Clean$contact)
BankMarketingData_Clean$month <- factor(BankMarketingData_Clean$month)
BankMarketingData_Clean$day_of_week <- factor(BankMarketingData_Clean$day_of_week)
BankMarketingData_Clean$poutcome <- factor(BankMarketingData_Clean$poutcome)
str(BankMarketingData_Clean)
## 'data.frame': 4119 obs. of 22 variables:
## $ age : int 30 39 25 38 47 32 32 41 31 35 ...
## $ job : Factor w/ 12 levels "admin.","blue-collar",..: 2 8 8 8 1 8 1 3 8 2 ...
## $ marital : Factor w/ 4 levels "divorced","married",..: 2 3 2 2 2 3 3 2 1 2 ...
## $ education : Factor w/ 8 levels "basic.4y","basic.6y",..: 3 4 4 3 7 7 7 7 6 3 ...
## $ default : Factor w/ 3 levels "no","unknown",..: 1 1 1 1 1 1 1 2 1 2 ...
## $ housing : Factor w/ 3 levels "no","unknown",..: 3 1 3 2 3 1 3 3 1 1 ...
## $ loan : Factor w/ 3 levels "no","unknown",..: 1 1 1 2 1 1 1 1 1 1 ...
## $ contact : Factor w/ 2 levels "cellular","telephone": 1 2 2 2 1 1 1 1 1 2 ...
## $ month : Factor w/ 10 levels "apr","aug","dec",..: 7 7 5 5 8 10 10 8 8 7 ...
## $ day_of_week : Factor w/ 5 levels "fri","mon","thu",..: 1 1 5 1 2 3 2 2 4 3 ...
## $ duration : int 487 346 227 17 58 128 290 44 68 170 ...
## $ campaign : int 2 4 1 3 1 3 4 2 1 1 ...
## $ pdays : int 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : int 0 0 0 0 0 2 0 0 1 0 ...
## $ poutcome : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 1 2 2 1 2 ...
## $ emp.var.rate : num -1.8 1.1 1.4 1.4 -0.1 -1.1 -1.1 -0.1 -0.1 1.1 ...
## $ cons.price.idx: num 92.9 94 94.5 94.5 93.2 ...
## $ cons.conf.idx : num -46.2 -36.4 -41.8 -41.8 -42 -37.5 -37.5 -42 -42 -36.4 ...
## $ euribor3m : num 1.31 4.86 4.96 4.96 4.19 ...
## $ nr.employed : num 5099 5191 5228 5228 5196 ...
## $ y : chr "no" "no" "no" "no" ...
## $ Subscribed : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
#Also, removing the y as we have transformed the value of y into Subscribed
BankMarketingData_Clean <- subset( BankMarketingData_Clean, select = -y )
Important note: this attribute highly affects the output target (e.g., if duration=0 then y=‘no’). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model. Because of this
BankMarketingData_Clean <- subset( BankMarketingData_Clean, select = -duration )
length(unique(BankMarketingData_Clean$day))
## [1] 5
#Wrangling & Exploratory Data
summary(BankMarketingData_Clean$Subscribed)
## 0 1
## 3668 451
After doing data transformation and preparation,we want to analyze the characteristic of customer segment of this bank using variables: job, marital, education, age, balance, loan, housing by using table()/aggregate() functions.
Before that, create new column range age to categories each customer into a categorical variable. We can wrangling the data by using function case_when() from library(dplyr)-
BankMarketingData_Clean$rangeage <- as.factor(case_when(
BankMarketingData_Clean$age < 23 ~ "18-22",
BankMarketingData_Clean$age > 22 & BankMarketingData_Clean$age < 36 ~ "23-35",
BankMarketingData_Clean$age > 35 & BankMarketingData_Clean$age < 51 ~ "36-50",
TRUE ~ "above_50"
))
head(BankMarketingData_Clean)
## age job marital education default housing loan contact
## 1 30 blue-collar married basic.9y no yes no cellular
## 2 39 services single high.school no no no telephone
## 3 25 services married high.school no yes no telephone
## 4 38 services married basic.9y no unknown unknown telephone
## 5 47 admin. married university.degree no yes no cellular
## 6 32 services single university.degree no no no cellular
## month day_of_week campaign pdays previous poutcome emp.var.rate
## 1 may fri 2 999 0 nonexistent -1.8
## 2 may fri 4 999 0 nonexistent 1.1
## 3 jun wed 1 999 0 nonexistent 1.4
## 4 jun fri 3 999 0 nonexistent 1.4
## 5 nov mon 1 999 0 nonexistent -0.1
## 6 sep thu 3 999 2 failure -1.1
## cons.price.idx cons.conf.idx euribor3m nr.employed Subscribed rangeage
## 1 92.893 -46.2 1.313 5099.1 0 23-35
## 2 93.994 -36.4 4.855 5191.0 0 36-50
## 3 94.465 -41.8 4.962 5228.1 0 23-35
## 4 94.465 -41.8 4.959 5228.1 0 36-50
## 5 93.200 -42.0 4.191 5195.8 0 36-50
## 6 94.199 -37.5 0.884 4963.6 0 23-35
sort(round(prop.table(table(BankMarketingData_Clean$job))*100, 2), decreasing = T)
##
## admin. blue-collar technician services management
## 24.57 21.46 16.78 9.54 7.87
## retired self-employed entrepreneur unemployed housemaid
## 4.03 3.86 3.59 2.69 2.67
## student unknown
## 1.99 0.95
sort(round(prop.table(table(BankMarketingData_Clean$education))*100, 2), decreasing = T)
##
## university.degree high.school basic.9y professional.course
## 30.69 22.36 13.94 12.99
## basic.4y basic.6y unknown illiterate
## 10.42 5.54 4.05 0.02
sort(round(prop.table(table(BankMarketingData_Clean$rangeage))*100, 2), decreasing = T)
##
## 36-50 23-35 above_50 18-22
## 41.78 39.48 18.14 0.61
round(prop.table(table(BankMarketingData_Clean$rangeage, BankMarketingData_Clean$Subscribed))*100, 2)
##
## 0 1
## 18-22 0.46 0.15
## 23-35 35.23 4.25
## 36-50 37.99 3.79
## above_50 15.37 2.77
sort(round(prop.table(table(BankMarketingData_Clean$marital))*100, 2), decreasing = T)
##
## married single divorced unknown
## 60.91 27.99 10.83 0.27
round(prop.table(table(BankMarketingData_Clean$marital, BankMarketingData_Clean$Subscribed))*100, 2)
##
## 0 1
## divorced 9.78 1.04
## married 54.79 6.12
## single 24.23 3.76
## unknown 0.24 0.02
#Initial run of the logistic regression, we found out that loan=unknown do not have any significance and #the summary is providing ‘NA’ for all the output parameters. So removing it
BankMarketingData_Clean <- BankMarketingData_Clean %>% dplyr::filter(BankMarketingData_Clean$loan != 'unknown')
Exploratory Data Analysis
library(dplyr)
library(ggplot2)
ggplot(BankMarketingData_Actual,aes(job))+geom_bar(aes(fill=y))+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
Admin and Technicians are most clients with job description that subscribed a term deposit.
ggplot(BankMarketingData_Actual,aes(job))+geom_bar(aes(fill=contact)) + theme(axis.text.x = element_text(angle = 90, vjust = 0.9, hjust=1))
table(BankMarketingData_Actual$contact,BankMarketingData_Actual$y)
##
## no yes
## cellular 2277 375
## telephone 1391 76
ggplot(BankMarketingData_Actual,aes(previous))+geom_bar(aes(fill=y))
Previous number of contacts performed before this campaign and for this client has less affect on subscription of product compared to clients who has no previous number of contacts performed.
table(BankMarketingData_Actual$poutcome,BankMarketingData_Actual$y)
##
## no yes
## failure 387 67
## nonexistent 3231 292
## success 50 92
Among the clients who subscribed product, outcome of previous contacted clients with nonexistent category has most subscriptions.
#—————————————————————-# # Train - Test Split# #—————————————————————-#
#split the data into training and validation sets(70- 30 split)
set.seed(123)
train = sample(1:nrow(BankMarketingData_Clean), nrow(BankMarketingData_Clean)/1.43) #Row indexes
test = (-train)#Row indexes for test set
BankMarketingDataTrain <- BankMarketingData_Clean[train, ]
BankMarketingDataTest <- BankMarketingData_Clean[test, ]
#—————————————————————-# # Train - balancing by Oversampling technique# #—————————————————————-#
table(BankMarketingDataTrain$Subscribed)
##
## 0 1
## 2480 326
BankMarketingDataTrain <- ovun.sample(Subscribed ~ ., data = BankMarketingDataTrain, method = "over",N =4960 )$data
table(BankMarketingDataTrain$Subscribed)
##
## 0 1
## 2480 2480
#—————————————————————-# # Model Selection, training and Predict# #—————————————————————-#
#Summary of the train dataset
str(BankMarketingDataTrain)
## 'data.frame': 4960 obs. of 21 variables:
## $ age : int 48 50 38 35 38 34 42 50 29 56 ...
## $ job : Factor w/ 12 levels "admin.","blue-collar",..: 2 5 10 2 2 10 5 10 2 6 ...
## $ marital : Factor w/ 4 levels "divorced","married",..: 2 2 2 2 1 1 2 2 3 2 ...
## $ education : Factor w/ 8 levels "basic.4y","basic.6y",..: 1 7 7 3 8 6 7 7 3 1 ...
## $ default : Factor w/ 3 levels "no","unknown",..: 1 1 1 1 1 1 2 1 1 1 ...
## $ housing : Factor w/ 3 levels "no","unknown",..: 1 1 3 3 1 1 3 3 1 1 ...
## $ loan : Factor w/ 3 levels "no","unknown",..: 1 1 1 1 1 1 1 3 1 1 ...
## $ contact : Factor w/ 2 levels "cellular","telephone": 2 1 1 1 2 2 1 1 2 1 ...
## $ month : Factor w/ 10 levels "apr","aug","dec",..: 5 10 7 8 5 7 2 10 5 8 ...
## $ day_of_week : Factor w/ 5 levels "fri","mon","thu",..: 2 3 5 4 1 5 3 5 2 1 ...
## $ campaign : int 4 4 1 2 1 1 5 2 1 2 ...
## $ pdays : int 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : int 0 3 0 0 0 0 0 0 0 1 ...
## $ poutcome : Factor w/ 3 levels "failure","nonexistent",..: 2 1 2 2 2 2 2 2 2 1 ...
## $ emp.var.rate : num 1.4 -1.1 -1.8 -0.1 1.4 1.1 1.4 -3.4 1.4 -0.1 ...
## $ cons.price.idx: num 94.5 94.2 92.9 93.2 94.5 ...
## $ cons.conf.idx : num -41.8 -37.5 -46.2 -42 -41.8 -36.4 -36.1 -29.8 -41.8 -42 ...
## $ euribor3m : num 4.865 0.878 1.334 4.153 4.967 ...
## $ nr.employed : num 5228 4964 5099 5196 5228 ...
## $ Subscribed : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ rangeage : Factor w/ 4 levels "18-22","23-35",..: 3 3 3 2 3 2 3 3 2 4 ...
m1 <- glm(formula = as.factor(Subscribed) ~ . , data = BankMarketingDataTrain, family = binomial)
summary(m1)
##
## Call:
## glm(formula = as.factor(Subscribed) ~ ., family = binomial, data = BankMarketingDataTrain)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.94595 -0.85529 0.03802 0.79478 2.11609
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.133e+01 7.417e+01 -0.827 0.408273
## age 4.526e-02 8.952e-03 5.055 4.29e-07 ***
## jobblue-collar -2.641e-01 1.308e-01 -2.019 0.043535 *
## jobentrepreneur -5.122e-01 2.265e-01 -2.261 0.023748 *
## jobhousemaid -4.832e-01 2.471e-01 -1.956 0.050482 .
## jobmanagement -2.132e-01 1.436e-01 -1.484 0.137737
## jobretired -1.555e-01 2.083e-01 -0.747 0.455338
## jobself-employed -9.062e-01 2.210e-01 -4.100 4.14e-05 ***
## jobservices -2.012e-01 1.428e-01 -1.409 0.158896
## jobstudent -2.466e-01 2.461e-01 -1.002 0.316403
## jobtechnician -5.354e-02 1.183e-01 -0.453 0.650853
## jobunemployed 3.896e-01 2.171e-01 1.795 0.072683 .
## jobunknown 2.806e-01 3.349e-01 0.838 0.402025
## maritalmarried 1.113e-01 1.205e-01 0.924 0.355590
## maritalsingle 4.721e-01 1.379e-01 3.423 0.000619 ***
## maritalunknown 1.231e+00 4.923e-01 2.500 0.012415 *
## educationbasic.6y -3.614e-01 1.999e-01 -1.808 0.070563 .
## educationbasic.9y -5.037e-02 1.519e-01 -0.332 0.740125
## educationhigh.school -6.188e-02 1.522e-01 -0.407 0.684316
## educationilliterate -1.340e+01 3.247e+02 -0.041 0.967098
## educationprofessional.course -1.602e-01 1.721e-01 -0.931 0.352025
## educationuniversity.degree -1.413e-01 1.547e-01 -0.914 0.360942
## educationunknown -9.730e-02 2.089e-01 -0.466 0.641452
## defaultunknown -1.118e-01 9.958e-02 -1.123 0.261542
## defaultyes -1.146e+01 3.247e+02 -0.035 0.971852
## housingyes -2.802e-02 7.243e-02 -0.387 0.698889
## loanyes -3.269e-02 9.273e-02 -0.352 0.724467
## contacttelephone -7.066e-01 1.486e-01 -4.755 1.98e-06 ***
## monthaug 2.106e-01 2.641e-01 0.797 0.425252
## monthdec 9.232e-01 4.830e-01 1.912 0.055937 .
## monthjul 4.361e-01 1.873e-01 2.329 0.019877 *
## monthjun 6.913e-01 2.664e-01 2.594 0.009475 **
## monthmar 1.704e+00 3.853e-01 4.423 9.73e-06 ***
## monthmay -2.191e-01 1.620e-01 -1.352 0.176349
## monthnov -1.786e-01 2.175e-01 -0.821 0.411425
## monthoct -4.901e-01 3.154e-01 -1.554 0.120217
## monthsep -5.047e-01 3.835e-01 -1.316 0.188201
## day_of_weekmon 4.903e-02 1.124e-01 0.436 0.662680
## day_of_weekthu -7.163e-02 1.129e-01 -0.635 0.525655
## day_of_weektue -2.115e-01 1.180e-01 -1.793 0.073049 .
## day_of_weekwed 1.097e-01 1.146e-01 0.958 0.338258
## campaign -9.706e-02 1.951e-02 -4.975 6.52e-07 ***
## pdays -2.879e-04 5.297e-04 -0.544 0.586731
## previous 4.622e-02 1.451e-01 0.319 0.750017
## poutcomenonexistent 4.999e-01 2.065e-01 2.421 0.015492 *
## poutcomesuccess 1.487e+00 5.302e-01 2.804 0.005042 **
## emp.var.rate -5.680e-01 2.997e-01 -1.895 0.058058 .
## cons.price.idx 8.839e-01 4.936e-01 1.791 0.073334 .
## cons.conf.idx 3.949e-02 1.774e-02 2.226 0.026003 *
## euribor3m 5.630e-02 2.491e-01 0.226 0.821203
## nr.employed -4.131e-03 5.904e-03 -0.700 0.484111
## rangeage23-35 -3.338e-01 4.046e-01 -0.825 0.409316
## rangeage36-50 -7.039e-01 4.375e-01 -1.609 0.107612
## rangeageabove_50 -1.068e+00 4.989e-01 -2.141 0.032278 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6876.0 on 4959 degrees of freedom
## Residual deviance: 5146.2 on 4906 degrees of freedom
## AIC: 5254.2
##
## Number of Fisher Scoring iterations: 11
library(fastDummies)
dummy_data <- fastDummies::dummy_cols(BankMarketingData_Clean[,c(1:19,21)],remove_first_dummy = TRUE)
names(dummy_data)
## [1] "age" "job"
## [3] "marital" "education"
## [5] "default" "housing"
## [7] "loan" "contact"
## [9] "month" "day_of_week"
## [11] "campaign" "pdays"
## [13] "previous" "poutcome"
## [15] "emp.var.rate" "cons.price.idx"
## [17] "cons.conf.idx" "euribor3m"
## [19] "nr.employed" "rangeage"
## [21] "job_blue-collar" "job_entrepreneur"
## [23] "job_housemaid" "job_management"
## [25] "job_retired" "job_self-employed"
## [27] "job_services" "job_student"
## [29] "job_technician" "job_unemployed"
## [31] "job_unknown" "marital_married"
## [33] "marital_single" "marital_unknown"
## [35] "education_basic.6y" "education_basic.9y"
## [37] "education_high.school" "education_illiterate"
## [39] "education_professional.course" "education_university.degree"
## [41] "education_unknown" "default_unknown"
## [43] "default_yes" "housing_unknown"
## [45] "housing_yes" "loan_unknown"
## [47] "loan_yes" "contact_telephone"
## [49] "month_aug" "month_dec"
## [51] "month_jul" "month_jun"
## [53] "month_mar" "month_may"
## [55] "month_nov" "month_oct"
## [57] "month_sep" "day_of_week_mon"
## [59] "day_of_week_thu" "day_of_week_tue"
## [61] "day_of_week_wed" "poutcome_nonexistent"
## [63] "poutcome_success" "rangeage_23-35"
## [65] "rangeage_36-50" "rangeage_above_50"
keep <-
c("campaign","pdays","previous","emp.var.rate","cons.price.idx","cons.conf.idx","euribor3m","nr.employed", "job_blue-collar",
"job_entrepreneur", "job_housemaid" , "job_management" ,
"job_retired", "job_self-employed" , "job_services" ,
"job_student" , "job_technician" , "job_unemployed" ,
"job_unknown", "marital_married" , "marital_single" ,
"marital_unknown" , "education_basic.6y" , "education_basic.9y" ,
"education_high.school" , "education_professional.course",
"education_university.degree",
"housing_yes" ,
"loan_yes" , "contact_telephone" ,
"month_aug" , "month_dec" , "month_jul" ,
"month_jun" , "month_mar" , "month_may" ,
"month_nov" , "month_oct" , "month_sep" ,
"day_of_week_mon" , "day_of_week_thu" , "day_of_week_tue" ,
"day_of_week_wed" , "poutcome_nonexistent" , "poutcome_success" ,
"rangeage_23-35" , "rangeage_36-50" , "rangeage_above_50")
final <- dummy_data[keep]
final <- cbind(final,Subscribed = BankMarketingData_Clean$Subscribed )
str(final)
## 'data.frame': 4014 obs. of 49 variables:
## $ campaign : int 2 4 1 1 3 4 2 1 1 1 ...
## $ pdays : int 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : int 0 0 0 0 2 0 0 1 0 0 ...
## $ emp.var.rate : num -1.8 1.1 1.4 -0.1 -1.1 -1.1 -0.1 -0.1 1.1 1.4 ...
## $ cons.price.idx : num 92.9 94 94.5 93.2 94.2 ...
## $ cons.conf.idx : num -46.2 -36.4 -41.8 -42 -37.5 -37.5 -42 -42 -36.4 -42.7 ...
## $ euribor3m : num 1.313 4.855 4.962 4.191 0.884 ...
## $ nr.employed : num 5099 5191 5228 5196 4964 ...
## $ job_blue-collar : int 1 0 0 0 0 0 0 0 1 0 ...
## $ job_entrepreneur : int 0 0 0 0 0 0 1 0 0 0 ...
## $ job_housemaid : int 0 0 0 0 0 0 0 0 0 0 ...
## $ job_management : int 0 0 0 0 0 0 0 0 0 0 ...
## $ job_retired : int 0 0 0 0 0 0 0 0 0 0 ...
## $ job_self-employed : int 0 0 0 0 0 0 0 0 0 0 ...
## $ job_services : int 0 1 1 0 1 0 0 1 0 1 ...
## $ job_student : int 0 0 0 0 0 0 0 0 0 0 ...
## $ job_technician : int 0 0 0 0 0 0 0 0 0 0 ...
## $ job_unemployed : int 0 0 0 0 0 0 0 0 0 0 ...
## $ job_unknown : int 0 0 0 0 0 0 0 0 0 0 ...
## $ marital_married : int 1 0 1 1 0 0 1 0 1 0 ...
## $ marital_single : int 0 1 0 0 1 1 0 0 0 1 ...
## $ marital_unknown : int 0 0 0 0 0 0 0 0 0 0 ...
## $ education_basic.6y : int 0 0 0 0 0 0 0 0 0 1 ...
## $ education_basic.9y : int 1 0 0 0 0 0 0 0 1 0 ...
## $ education_high.school : int 0 1 1 0 0 0 0 0 0 0 ...
## $ education_professional.course: int 0 0 0 0 0 0 0 1 0 0 ...
## $ education_university.degree : int 0 0 0 1 1 1 1 0 0 0 ...
## $ housing_yes : int 1 0 1 1 0 1 1 0 0 1 ...
## $ loan_yes : int 0 0 0 0 0 0 0 0 0 0 ...
## $ contact_telephone : int 0 1 1 0 0 0 0 0 1 0 ...
## $ month_aug : int 0 0 0 0 0 0 0 0 0 0 ...
## $ month_dec : int 0 0 0 0 0 0 0 0 0 0 ...
## $ month_jul : int 0 0 0 0 0 0 0 0 0 1 ...
## $ month_jun : int 0 0 1 0 0 0 0 0 0 0 ...
## $ month_mar : int 0 0 0 0 0 0 0 0 0 0 ...
## $ month_may : int 1 1 0 0 0 0 0 0 1 0 ...
## $ month_nov : int 0 0 0 1 0 0 1 1 0 0 ...
## $ month_oct : int 0 0 0 0 0 0 0 0 0 0 ...
## $ month_sep : int 0 0 0 0 1 1 0 0 0 0 ...
## $ day_of_week_mon : int 0 0 0 1 0 1 1 0 0 0 ...
## $ day_of_week_thu : int 0 0 0 0 1 0 0 0 1 1 ...
## $ day_of_week_tue : int 0 0 0 0 0 0 0 1 0 0 ...
## $ day_of_week_wed : int 0 0 1 0 0 0 0 0 0 0 ...
## $ poutcome_nonexistent : int 1 1 1 1 0 1 1 0 1 1 ...
## $ poutcome_success : int 0 0 0 0 0 0 0 0 0 0 ...
## $ rangeage_23-35 : int 1 0 1 0 1 1 0 1 1 1 ...
## $ rangeage_36-50 : int 0 1 0 1 0 0 1 0 0 0 ...
## $ rangeage_above_50 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Subscribed : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
#----------------------------------------
# Model Selection using AIC and BIC
#----------------------------------------
#Full Model
glm_full_bw <- glm(Subscribed ~ ., data = final, family = "binomial")
#Null Model
glm_null_bw <- glm(Subscribed ~ 1, data = final, family = "binomial")
# stepwise selection with AIC & BIC
#**************************************
#**Null to Full step wise selection****
#**************************************
bw_AIC<-step(glm_null_bw, scope = list(upper=glm_full_bw),
direction="both",test="Chisq", trace = F)
# stepwise selection with BIC
bw_BIC<-step(glm_null_bw, scope = list(upper=glm_full_bw),
direction="both",test="Chisq", trace = F, k=log(nrow(final)))
summary(bw_AIC)
##
## Call:
## glm(formula = Subscribed ~ nr.employed + poutcome_success + month_may +
## month_mar + month_jun + contact_telephone + cons.conf.idx +
## poutcome_nonexistent + campaign + month_jul + month_dec +
## rangeage_above_50, family = "binomial", data = final)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9620 -0.3864 -0.3387 -0.2570 2.7017
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 51.9375860 4.0099142 12.952 < 2e-16 ***
## nr.employed -0.0103416 0.0007956 -12.999 < 2e-16 ***
## poutcome_success 1.7553656 0.2444095 7.182 6.87e-13 ***
## month_may -0.2733815 0.1618741 -1.689 0.0912 .
## month_mar 1.5990053 0.3445086 4.641 3.46e-06 ***
## month_jun 0.9247649 0.1989563 4.648 3.35e-06 ***
## contact_telephone -0.6659446 0.1675220 -3.975 7.03e-05 ***
## cons.conf.idx 0.0289765 0.0113571 2.551 0.0107 *
## poutcome_nonexistent 0.4221315 0.1749164 2.413 0.0158 *
## campaign -0.0764859 0.0341694 -2.238 0.0252 *
## month_jul 0.4402753 0.1875209 2.348 0.0189 *
## month_dec 0.9165702 0.4767709 1.922 0.0545 .
## rangeage_above_50 0.2026515 0.1373453 1.475 0.1401
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2783.7 on 4013 degrees of freedom
## Residual deviance: 2200.5 on 4001 degrees of freedom
## AIC: 2226.5
##
## Number of Fisher Scoring iterations: 6
summary(bw_BIC)
##
## Call:
## glm(formula = Subscribed ~ nr.employed + poutcome_success + month_may +
## month_mar, family = "binomial", data = final)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7811 -0.4282 -0.3269 -0.2704 2.5795
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 50.9718976 3.4744157 14.671 < 2e-16 ***
## nr.employed -0.0103048 0.0006771 -15.219 < 2e-16 ***
## poutcome_success 1.5340903 0.2089098 7.343 2.08e-13 ***
## month_may -0.7701215 0.1311712 -5.871 4.33e-09 ***
## month_mar 1.2320744 0.3273017 3.764 0.000167 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2783.7 on 4013 degrees of freedom
## Residual deviance: 2247.7 on 4009 degrees of freedom
## AIC: 2257.7
##
## Number of Fisher Scoring iterations: 5
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:olsrr':
##
## cement
## The following object is masked from 'package:dplyr':
##
## select
table(final$Subscribed)
##
## 0 1
## 3572 442
names(final)
## [1] "campaign" "pdays"
## [3] "previous" "emp.var.rate"
## [5] "cons.price.idx" "cons.conf.idx"
## [7] "euribor3m" "nr.employed"
## [9] "job_blue-collar" "job_entrepreneur"
## [11] "job_housemaid" "job_management"
## [13] "job_retired" "job_self-employed"
## [15] "job_services" "job_student"
## [17] "job_technician" "job_unemployed"
## [19] "job_unknown" "marital_married"
## [21] "marital_single" "marital_unknown"
## [23] "education_basic.6y" "education_basic.9y"
## [25] "education_high.school" "education_professional.course"
## [27] "education_university.degree" "housing_yes"
## [29] "loan_yes" "contact_telephone"
## [31] "month_aug" "month_dec"
## [33] "month_jul" "month_jun"
## [35] "month_mar" "month_may"
## [37] "month_nov" "month_oct"
## [39] "month_sep" "day_of_week_mon"
## [41] "day_of_week_thu" "day_of_week_tue"
## [43] "day_of_week_wed" "poutcome_nonexistent"
## [45] "poutcome_success" "rangeage_23-35"
## [47] "rangeage_36-50" "rangeage_above_50"
## [49] "Subscribed"
final <- ovun.sample(Subscribed ~ nr.employed+poutcome_success+month_mar+month_jun+
contact_telephone+cons.conf.idx+poutcome_nonexistent+
campaign+month_jul, data = final, method = "over",N =7144 )$data
# LDA modeling.
lda.model <- lda(Subscribed~nr.employed+poutcome_success+month_mar+month_jun+
contact_telephone+cons.conf.idx+poutcome_nonexistent+
campaign+month_jul,data=final)
# View the output
lda.model
## Call:
## lda(Subscribed ~ nr.employed + poutcome_success + month_mar +
## month_jun + contact_telephone + cons.conf.idx + poutcome_nonexistent +
## campaign + month_jul, data = final)
##
## Prior probabilities of groups:
## 0 1
## 0.5 0.5
##
## Group means:
## nr.employed poutcome_success month_mar month_jun contact_telephone
## 0 5175.341 0.01399776 0.005599104 0.1254199 0.3776596
## 1 5092.313 0.20408735 0.060190370 0.1564950 0.1685330
## cons.conf.idx poutcome_nonexistent campaign month_jul
## 0 -40.59569 0.8810190 2.604143 0.1774916
## 1 -39.63989 0.6508959 1.982083 0.1229003
##
## Coefficients of linear discriminants:
## LD1
## nr.employed -0.01001799
## poutcome_success 0.87928936
## month_mar 1.14702120
## month_jun 1.02105109
## contact_telephone -0.66060768
## cons.conf.idx 0.03732923
## poutcome_nonexistent 0.28023953
## campaign -0.06764702
## month_jul 0.51384993
# Predicting for the testing dataset we created
predictions.lda <- predict(lda.model,final)
# Make confusion matrix for the LDA predictions to compare accuracy
caret::confusionMatrix(as.factor(predictions.lda$class),as.factor(final$Subscribed))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2979 1285
## 1 593 2287
##
## Accuracy : 0.7371
## 95% CI : (0.7267, 0.7473)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4742
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.8340
## Specificity : 0.6403
## Pos Pred Value : 0.6986
## Neg Pred Value : 0.7941
## Prevalence : 0.5000
## Detection Rate : 0.4170
## Detection Prevalence : 0.5969
## Balanced Accuracy : 0.7371
##
## 'Positive' Class : 0
##
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
set.seed(123)
train = sample(1:nrow(final), nrow(final)/1.43) #Row indexes
test = (-train)#Row indexes for test set
final_train <- final[train, ]
final_test <- final[test, ]
table(final_train$Subscribed)
##
## 0 1
## 2504 2491
lda.fit = train(Subscribed ~ nr.employed+poutcome_success+month_mar+month_jun+
contact_telephone+cons.conf.idx+poutcome_nonexistent+
campaign+month_jul, data=final_train, method="lda",
trControl = trainControl(method = "cv", number=10))
pred.subscribed = predict(lda.fit, final_test)
table(pred.subscribed, final_test$Subscribed)
##
## pred.subscribed 0 1
## 0 866 375
## 1 202 706
caret::confusionMatrix(as.factor(pred.subscribed), final_test$Subscribed)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 866 375
## 1 202 706
##
## Accuracy : 0.7315
## 95% CI : (0.7122, 0.7501)
## No Information Rate : 0.503
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4635
##
## Mcnemar's Test P-Value : 8.041e-13
##
## Sensitivity : 0.8109
## Specificity : 0.6531
## Pos Pred Value : 0.6978
## Neg Pred Value : 0.7775
## Prevalence : 0.4970
## Detection Rate : 0.4030
## Detection Prevalence : 0.5775
## Balanced Accuracy : 0.7320
##
## 'Positive' Class : 0
##
#GLM (round 1)
m2_lr <- glm(formula = Subscribed ~ age + contact + month +
campaign + previous + poutcome + cons.price.idx + nr.employed + euribor3m,
data = BankMarketingDataTrain, family = binomial)
summary(m2_lr)
##
## Call:
## glm(formula = Subscribed ~ age + contact + month + campaign +
## previous + poutcome + cons.price.idx + nr.employed + euribor3m,
## family = binomial, data = BankMarketingDataTrain)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.90894 -0.87807 -0.04373 0.79103 1.89388
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 126.055130 27.516299 4.581 4.63e-06 ***
## age 0.013358 0.003147 4.244 2.19e-05 ***
## contacttelephone -0.507317 0.121688 -4.169 3.06e-05 ***
## monthaug 0.087021 0.188543 0.462 0.644407
## monthdec 0.772218 0.452188 1.708 0.087685 .
## monthjul 0.493718 0.174726 2.826 0.004718 **
## monthjun 1.184741 0.173783 6.817 9.27e-12 ***
## monthmar 1.288961 0.343682 3.750 0.000177 ***
## monthmay -0.412287 0.141822 -2.907 0.003648 **
## monthnov -0.336431 0.203893 -1.650 0.098936 .
## monthoct -0.474688 0.302404 -1.570 0.116482
## monthsep -0.940073 0.315678 -2.978 0.002902 **
## campaign -0.096496 0.019202 -5.025 5.02e-07 ***
## previous 0.126083 0.124863 1.010 0.312604
## poutcomenonexistent 0.557963 0.191976 2.906 0.003656 **
## poutcomesuccess 1.805582 0.202834 8.902 < 2e-16 ***
## cons.price.idx -0.334611 0.153994 -2.173 0.029789 *
## nr.employed -0.018847 0.002895 -6.511 7.45e-11 ***
## euribor3m 0.387363 0.149421 2.592 0.009530 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6876.0 on 4959 degrees of freedom
## Residual deviance: 5248.9 on 4941 degrees of freedom
## AIC: 5286.9
##
## Number of Fisher Scoring iterations: 5
#Train
pred1 <- predict(m2_lr, newdata = BankMarketingDataTrain, type = "response")
probs1 <- ifelse(pred1 >= 0.5, 1, 0)
caret::confusionMatrix(as.factor(probs1), BankMarketingDataTrain$Subscribed)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2132 920
## 1 348 1560
##
## Accuracy : 0.7444
## 95% CI : (0.732, 0.7564)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4887
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.8597
## Specificity : 0.6290
## Pos Pred Value : 0.6986
## Neg Pred Value : 0.8176
## Prevalence : 0.5000
## Detection Rate : 0.4298
## Detection Prevalence : 0.6153
## Balanced Accuracy : 0.7444
##
## 'Positive' Class : 0
##
#Test
pred2 <- predict(m2_lr, newdata = BankMarketingDataTest, type = "response")
probs2 <- ifelse(pred2 >= 0.4, 1, 0)
caret::confusionMatrix(as.factor(probs2), BankMarketingDataTest$Subscribed)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 842 43
## 1 250 73
##
## Accuracy : 0.7575
## 95% CI : (0.7323, 0.7814)
## No Information Rate : 0.904
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2227
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.7711
## Specificity : 0.6293
## Pos Pred Value : 0.9514
## Neg Pred Value : 0.2260
## Prevalence : 0.9040
## Detection Rate : 0.6970
## Detection Prevalence : 0.7326
## Balanced Accuracy : 0.7002
##
## 'Positive' Class : 0
##
m2_lr2 <- glm(formula = Subscribed ~ as.factor(rangeage) + contact + month + as.factor(job)+
campaign + poutcome + cons.price.idx + nr.employed +euribor3m,
data = BankMarketingDataTrain, family = binomial)
summary(m2_lr2)
##
## Call:
## glm(formula = Subscribed ~ as.factor(rangeage) + contact + month +
## as.factor(job) + campaign + poutcome + cons.price.idx + nr.employed +
## euribor3m, family = binomial, data = BankMarketingDataTrain)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.95806 -0.86986 -0.03686 0.80326 1.92327
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 122.564308 27.949116 4.385 1.16e-05 ***
## as.factor(rangeage)23-35 -0.043214 0.391498 -0.110 0.912107
## as.factor(rangeage)36-50 -0.016903 0.395046 -0.043 0.965871
## as.factor(rangeage)above_50 0.165904 0.401777 0.413 0.679660
## contacttelephone -0.471699 0.124602 -3.786 0.000153 ***
## monthaug 0.076278 0.191365 0.399 0.690188
## monthdec 0.869647 0.465017 1.870 0.061464 .
## monthjul 0.487041 0.177432 2.745 0.006052 **
## monthjun 1.173814 0.175713 6.680 2.38e-11 ***
## monthmar 1.251839 0.345589 3.622 0.000292 ***
## monthmay -0.377939 0.143218 -2.639 0.008318 **
## monthnov -0.313542 0.207266 -1.513 0.130343
## monthoct -0.556329 0.304027 -1.830 0.067269 .
## monthsep -0.824743 0.320804 -2.571 0.010145 *
## as.factor(job)blue-collar -0.298847 0.104451 -2.861 0.004222 **
## as.factor(job)entrepreneur -0.554484 0.222384 -2.493 0.012654 *
## as.factor(job)housemaid -0.391206 0.227923 -1.716 0.086090 .
## as.factor(job)management -0.263646 0.140095 -1.882 0.059848 .
## as.factor(job)retired 0.064718 0.185398 0.349 0.727030
## as.factor(job)self-employed -0.889387 0.216666 -4.105 4.05e-05 ***
## as.factor(job)services -0.226602 0.131027 -1.729 0.083733 .
## as.factor(job)student -0.196072 0.234462 -0.836 0.403008
## as.factor(job)technician -0.114132 0.106087 -1.076 0.282002
## as.factor(job)unemployed 0.255759 0.207661 1.232 0.218094
## as.factor(job)unknown 0.388467 0.311388 1.248 0.212201
## campaign -0.094880 0.019167 -4.950 7.42e-07 ***
## poutcomenonexistent 0.382481 0.113620 3.366 0.000762 ***
## poutcomesuccess 1.805916 0.206555 8.743 < 2e-16 ***
## cons.price.idx -0.311260 0.155841 -1.997 0.045793 *
## nr.employed -0.018416 0.002939 -6.266 3.69e-10 ***
## euribor3m 0.365744 0.151624 2.412 0.015857 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6876.0 on 4959 degrees of freedom
## Residual deviance: 5220.3 on 4929 degrees of freedom
## AIC: 5282.3
##
## Number of Fisher Scoring iterations: 5
#Test
pred_lr2 <- predict(m2_lr2, newdata = BankMarketingDataTest, type = "response")
probs_lr2 <- ifelse(pred_lr2 >= 0.4, 1, 0)
caret::confusionMatrix(as.factor(probs_lr2), BankMarketingDataTest$Subscribed)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 816 44
## 1 276 72
##
## Accuracy : 0.7351
## 95% CI : (0.7092, 0.7598)
## No Information Rate : 0.904
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1943
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.7473
## Specificity : 0.6207
## Pos Pred Value : 0.9488
## Neg Pred Value : 0.2069
## Prevalence : 0.9040
## Detection Rate : 0.6755
## Detection Prevalence : 0.7119
## Balanced Accuracy : 0.6840
##
## 'Positive' Class : 0
##
#ROC Curve and AUC
pred_roc <- prediction(predict(m2_lr, BankMarketingDataTrain, type = "response"),
BankMarketingDataTrain$Subscribed) #Predicted Probability and True Classification
# area under curve
auc <- round(as.numeric(performance(pred_roc, measure = "auc")@y.values),3)
# some important statistics
false.rates <-performance(pred_roc, "fpr","fnr")
accuracy <-performance(pred_roc, "acc","err")
perf <- performance(pred_roc, "tpr","fpr")
#plotting the ROC curve and computing AUC
plot(perf,colorize = T, main = "ROC Curve", cex = 0.2)
text(0.5,0.5, paste("AUC:", auc))
# computing threshold for cutoff to best trade off sensitivity and specificity
#first sensitivity
plot(unlist(performance(pred_roc, "sens")@x.values), unlist(performance(pred_roc, "sens")@y.values),
type="l", lwd=2,
ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff\n","AUC: ",auc))
par(new=TRUE) # plot another line in same plot
#second specificity
plot(unlist(performance(pred_roc, "spec")@x.values), unlist(performance(pred_roc, "spec")@y.values),
type="l", lwd=2, col='red', ylab="", xlab="")
axis(4, at=seq(0,1,0.2)) #specificity axis labels
mtext("Specificity",side=4, col='red')
#find where the lines intersect
min.diff <-which.min(abs(unlist(performance(pred_roc, "sens")@y.values) -
unlist(performance(pred_roc, "spec")@y.values)))
min.x<-unlist(performance(pred_roc, "sens")@x.values)[min.diff]
min.y<-unlist(performance(pred_roc, "spec")@y.values)[min.diff]
optimal <-min.x #this is the optimal points to best trade off sensitivity and specificity
abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,2)), pos = 4)
#—————————————————————-# # Model training non-parametric #—————————————————————-#
library(e1071)
model_svm<-svm(Subscribed ~ .,data = BankMarketingDataTrain)
summary(model_svm)
##
## Call:
## svm(formula = Subscribed ~ ., data = BankMarketingDataTrain)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 2880
##
## ( 1437 1443 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
library(caret)
pred1<-predict(model_svm,BankMarketingDataTest)
confusionMatrix(as.factor(BankMarketingDataTest$Subscribed),as.factor(pred1))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 978 114
## 1 49 67
##
## Accuracy : 0.8651
## 95% CI : (0.8445, 0.8838)
## No Information Rate : 0.8502
## P-Value [Acc > NIR] : 0.07771
##
## Kappa : 0.3784
##
## Mcnemar's Test P-Value : 5.362e-07
##
## Sensitivity : 0.9523
## Specificity : 0.3702
## Pos Pred Value : 0.8956
## Neg Pred Value : 0.5776
## Prevalence : 0.8502
## Detection Rate : 0.8096
## Detection Prevalence : 0.9040
## Balanced Accuracy : 0.6612
##
## 'Positive' Class : 0
##
library(rpart)
model_dt<-rpart(Subscribed ~ .,data=BankMarketingDataTrain)
library(rpart.plot)
rpart.plot(model_dt)
pred<-predict(model_dt,BankMarketingDataTest,type = "class")
confusionMatrix(as.factor(BankMarketingDataTest$Subscribed),as.factor(pred))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 959 133
## 1 53 63
##
## Accuracy : 0.846
## 95% CI : (0.8244, 0.8659)
## No Information Rate : 0.8377
## P-Value [Acc > NIR] : 0.2304
##
## Kappa : 0.3221
##
## Mcnemar's Test P-Value : 6.932e-09
##
## Sensitivity : 0.9476
## Specificity : 0.3214
## Pos Pred Value : 0.8782
## Neg Pred Value : 0.5431
## Prevalence : 0.8377
## Detection Rate : 0.7939
## Detection Prevalence : 0.9040
## Balanced Accuracy : 0.6345
##
## 'Positive' Class : 0
##
table(BankMarketingDataTest$Subscribed)
##
## 0 1
## 1092 116
Accuracy is looking good with almost 84% and sensitivity is ~95% i.e True Positive Rate is also good.
Using advanced models - Bagging
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
model_rf<-train(Subscribed ~ .,data=BankMarketingDataTrain,method="rf",ntree =20)
pred2<-predict(model_rf,BankMarketingDataTest)
confusionMatrix(as.factor(BankMarketingDataTest$Subscribed),as.factor(pred2))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1037 55
## 1 83 33
##
## Accuracy : 0.8858
## 95% CI : (0.8665, 0.9032)
## No Information Rate : 0.9272
## P-Value [Acc > NIR] : 1.00000
##
## Kappa : 0.2624
##
## Mcnemar's Test P-Value : 0.02154
##
## Sensitivity : 0.9259
## Specificity : 0.3750
## Pos Pred Value : 0.9496
## Neg Pred Value : 0.2845
## Prevalence : 0.9272
## Detection Rate : 0.8584
## Detection Prevalence : 0.9040
## Balanced Accuracy : 0.6504
##
## 'Positive' Class : 0
##
library(ada)
model_ada<-ada(Subscribed ~ .,data=BankMarketingDataTrain,loss="exponential",type="discrete",iter=50 )
summary(model_ada)
## Call:
## ada(Subscribed ~ ., data = BankMarketingDataTrain, loss = "exponential",
## type = "discrete", iter = 50)
##
## Loss: exponential Method: discrete Iteration: 50
##
## Training Results
##
## Accuracy: 0.844 Kappa: 0.687
pred_ada<-predict(model_ada,BankMarketingDataTest)
confusionMatrix(as.factor(BankMarketingDataTest$Subscribed),as.factor(pred_ada))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 960 132
## 1 49 67
##
## Accuracy : 0.8502
## 95% CI : (0.8288, 0.8698)
## No Information Rate : 0.8353
## P-Value [Acc > NIR] : 0.0861
##
## Kappa : 0.3461
##
## Mcnemar's Test P-Value : 1.094e-09
##
## Sensitivity : 0.9514
## Specificity : 0.3367
## Pos Pred Value : 0.8791
## Neg Pred Value : 0.5776
## Prevalence : 0.8353
## Detection Rate : 0.7947
## Detection Prevalence : 0.9040
## Balanced Accuracy : 0.6441
##
## 'Positive' Class : 0
##
#—Try VIMP
library(fastDummies)
dummy_data <- fastDummies::dummy_cols(BankMarketingData_Clean[,c(1:19,21)],remove_first_dummy = TRUE)
names(dummy_data)
## [1] "age" "job"
## [3] "marital" "education"
## [5] "default" "housing"
## [7] "loan" "contact"
## [9] "month" "day_of_week"
## [11] "campaign" "pdays"
## [13] "previous" "poutcome"
## [15] "emp.var.rate" "cons.price.idx"
## [17] "cons.conf.idx" "euribor3m"
## [19] "nr.employed" "rangeage"
## [21] "job_blue-collar" "job_entrepreneur"
## [23] "job_housemaid" "job_management"
## [25] "job_retired" "job_self-employed"
## [27] "job_services" "job_student"
## [29] "job_technician" "job_unemployed"
## [31] "job_unknown" "marital_married"
## [33] "marital_single" "marital_unknown"
## [35] "education_basic.6y" "education_basic.9y"
## [37] "education_high.school" "education_illiterate"
## [39] "education_professional.course" "education_university.degree"
## [41] "education_unknown" "default_unknown"
## [43] "default_yes" "housing_unknown"
## [45] "housing_yes" "loan_unknown"
## [47] "loan_yes" "contact_telephone"
## [49] "month_aug" "month_dec"
## [51] "month_jul" "month_jun"
## [53] "month_mar" "month_may"
## [55] "month_nov" "month_oct"
## [57] "month_sep" "day_of_week_mon"
## [59] "day_of_week_thu" "day_of_week_tue"
## [61] "day_of_week_wed" "poutcome_nonexistent"
## [63] "poutcome_success" "rangeage_23-35"
## [65] "rangeage_36-50" "rangeage_above_50"
keep <-
c("campaign","pdays","previous","emp.var.rate","cons.price.idx","cons.conf.idx","euribor3m","nr.employed", "job_blue-collar",
"job_entrepreneur", "job_housemaid" , "job_management" ,
"job_retired", "job_self-employed" , "job_services" ,
"job_student" , "job_technician" , "job_unemployed" ,
"job_unknown", "marital_married" , "marital_single" ,
"marital_unknown" , "education_basic.6y" , "education_basic.9y" ,
"education_high.school" , "education_professional.course",
"education_university.degree",
"housing_yes" ,
"loan_yes" , "contact_telephone" ,
"month_aug" , "month_dec" , "month_jul" ,
"month_jun" , "month_mar" , "month_may" ,
"month_nov" , "month_oct" , "month_sep" ,
"day_of_week_mon" , "day_of_week_thu" , "day_of_week_tue" ,
"day_of_week_wed" , "poutcome_nonexistent" , "poutcome_success" ,
"rangeage_23-35" , "rangeage_36-50" , "rangeage_above_50")
final <- dummy_data[keep]
final <- cbind(final,Subscribed = BankMarketingData_Clean$Subscribed )
str(final)
## 'data.frame': 4014 obs. of 49 variables:
## $ campaign : int 2 4 1 1 3 4 2 1 1 1 ...
## $ pdays : int 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : int 0 0 0 0 2 0 0 1 0 0 ...
## $ emp.var.rate : num -1.8 1.1 1.4 -0.1 -1.1 -1.1 -0.1 -0.1 1.1 1.4 ...
## $ cons.price.idx : num 92.9 94 94.5 93.2 94.2 ...
## $ cons.conf.idx : num -46.2 -36.4 -41.8 -42 -37.5 -37.5 -42 -42 -36.4 -42.7 ...
## $ euribor3m : num 1.313 4.855 4.962 4.191 0.884 ...
## $ nr.employed : num 5099 5191 5228 5196 4964 ...
## $ job_blue-collar : int 1 0 0 0 0 0 0 0 1 0 ...
## $ job_entrepreneur : int 0 0 0 0 0 0 1 0 0 0 ...
## $ job_housemaid : int 0 0 0 0 0 0 0 0 0 0 ...
## $ job_management : int 0 0 0 0 0 0 0 0 0 0 ...
## $ job_retired : int 0 0 0 0 0 0 0 0 0 0 ...
## $ job_self-employed : int 0 0 0 0 0 0 0 0 0 0 ...
## $ job_services : int 0 1 1 0 1 0 0 1 0 1 ...
## $ job_student : int 0 0 0 0 0 0 0 0 0 0 ...
## $ job_technician : int 0 0 0 0 0 0 0 0 0 0 ...
## $ job_unemployed : int 0 0 0 0 0 0 0 0 0 0 ...
## $ job_unknown : int 0 0 0 0 0 0 0 0 0 0 ...
## $ marital_married : int 1 0 1 1 0 0 1 0 1 0 ...
## $ marital_single : int 0 1 0 0 1 1 0 0 0 1 ...
## $ marital_unknown : int 0 0 0 0 0 0 0 0 0 0 ...
## $ education_basic.6y : int 0 0 0 0 0 0 0 0 0 1 ...
## $ education_basic.9y : int 1 0 0 0 0 0 0 0 1 0 ...
## $ education_high.school : int 0 1 1 0 0 0 0 0 0 0 ...
## $ education_professional.course: int 0 0 0 0 0 0 0 1 0 0 ...
## $ education_university.degree : int 0 0 0 1 1 1 1 0 0 0 ...
## $ housing_yes : int 1 0 1 1 0 1 1 0 0 1 ...
## $ loan_yes : int 0 0 0 0 0 0 0 0 0 0 ...
## $ contact_telephone : int 0 1 1 0 0 0 0 0 1 0 ...
## $ month_aug : int 0 0 0 0 0 0 0 0 0 0 ...
## $ month_dec : int 0 0 0 0 0 0 0 0 0 0 ...
## $ month_jul : int 0 0 0 0 0 0 0 0 0 1 ...
## $ month_jun : int 0 0 1 0 0 0 0 0 0 0 ...
## $ month_mar : int 0 0 0 0 0 0 0 0 0 0 ...
## $ month_may : int 1 1 0 0 0 0 0 0 1 0 ...
## $ month_nov : int 0 0 0 1 0 0 1 1 0 0 ...
## $ month_oct : int 0 0 0 0 0 0 0 0 0 0 ...
## $ month_sep : int 0 0 0 0 1 1 0 0 0 0 ...
## $ day_of_week_mon : int 0 0 0 1 0 1 1 0 0 0 ...
## $ day_of_week_thu : int 0 0 0 0 1 0 0 0 1 1 ...
## $ day_of_week_tue : int 0 0 0 0 0 0 0 1 0 0 ...
## $ day_of_week_wed : int 0 0 1 0 0 0 0 0 0 0 ...
## $ poutcome_nonexistent : int 1 1 1 1 0 1 1 0 1 1 ...
## $ poutcome_success : int 0 0 0 0 0 0 0 0 0 0 ...
## $ rangeage_23-35 : int 1 0 1 0 1 1 0 1 1 1 ...
## $ rangeage_36-50 : int 0 1 0 1 0 0 1 0 0 0 ...
## $ rangeage_above_50 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Subscribed : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
index <- caret::createDataPartition(final$Subscribed, p=0.8, list=FALSE)
X_train <- final[, 1:48]
y_train <- as.factor(final[, 49])
X_test <- final[, 1:48]
y_test <- as.factor(final[, 49])
model_classy <- train(X_train, y_train, method='LogitBoost',preProcess=c("center", "scale"))
feature_importance <- varImp(model_classy, scale=FALSE)
row.names(feature_importance$importance %>% filter(X0>0.6))
## [1] "pdays" "previous" "emp.var.rate"
## [4] "euribor3m" "nr.employed" "contact_telephone"
## [7] "poutcome_nonexistent"
plot(feature_importance)
predictions<-predict(object=model_classy,X_test)
table(predictions)
## predictions
## 0 1
## 3873 141
confusionMatrix(predictions,y_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 3527 346
## 1 45 96
##
## Accuracy : 0.9026
## 95% CI : (0.893, 0.9116)
## No Information Rate : 0.8899
## P-Value [Acc > NIR] : 0.004878
##
## Kappa : 0.2916
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9874
## Specificity : 0.2172
## Pos Pred Value : 0.9107
## Neg Pred Value : 0.6809
## Prevalence : 0.8899
## Detection Rate : 0.8787
## Detection Prevalence : 0.9649
## Balanced Accuracy : 0.6023
##
## 'Positive' Class : 0
##
keep_2 <-
c("pdays" , "previous" , "emp.var.rate" , "euribor3m" ,
"nr.employed" , "contact_telephone" , "poutcome_nonexistent")
final_2 <- dummy_data[keep_2]
final_2 <- cbind(final_2,Subscribed = BankMarketingData_Clean$Subscribed )
str(final_2)
## 'data.frame': 4014 obs. of 8 variables:
## $ pdays : int 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : int 0 0 0 0 2 0 0 1 0 0 ...
## $ emp.var.rate : num -1.8 1.1 1.4 -0.1 -1.1 -1.1 -0.1 -0.1 1.1 1.4 ...
## $ euribor3m : num 1.313 4.855 4.962 4.191 0.884 ...
## $ nr.employed : num 5099 5191 5228 5196 4964 ...
## $ contact_telephone : int 0 1 1 0 0 0 0 0 1 0 ...
## $ poutcome_nonexistent: int 1 1 1 1 0 1 1 0 1 1 ...
## $ Subscribed : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
index <- caret::createDataPartition(final_2$Subscribed, p=0.8, list=FALSE)
X_train <- final_2[index, 1:7]
y_train <- as.factor(final_2[index, 8])
X_test <- final_2[-index, 1:7]
y_test <- as.factor(final_2[-index, 8])
df_tmp <- cbind(X_train,Subscribed = y_train)
table(df_tmp$Subscribed)
##
## 0 1
## 2858 354
df_tmp <- ovun.sample(Subscribed ~ ., data = df_tmp, method = "over",N =(2858 * 2) )$data
table(df_tmp$Subscribed)
##
## 0 1
## 2858 2858
counts <- table(BankMarketingData_Clean$Subscribed)
barplot(counts, main="Target Distribution",col=c("lightblue","blue"),
xlab="Term Deposit Subscription" )
counts <- table(BankMarketingData_Clean$marital)
barplot(counts, main="Target Distribution",col=c("yellow","orange","red","blue"),
xlab="Term Deposit Subscription" )
model_classy <- train(df_tmp[,1:7], df_tmp$Subscribed, method='ada',preProcess=c("center", "scale"))
# LogitBoost
feature_importance <- varImp(model_classy, scale=FALSE)
# row.names(feature_importance$importance %>% filter(X0>0.6))
plot(feature_importance)
predictions<-predict(object=model_classy,X_test)
table(predictions)
## predictions
## 0 1
## 665 137
confusionMatrix(predictions,y_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 627 38
## 1 87 50
##
## Accuracy : 0.8441
## 95% CI : (0.8172, 0.8686)
## No Information Rate : 0.8903
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3588
##
## Mcnemar's Test P-Value : 1.761e-05
##
## Sensitivity : 0.8782
## Specificity : 0.5682
## Pos Pred Value : 0.9429
## Neg Pred Value : 0.3650
## Prevalence : 0.8903
## Detection Rate : 0.7818
## Detection Prevalence : 0.8292
## Balanced Accuracy : 0.7232
##
## 'Positive' Class : 0
##
model_logitboost <- train(df_tmp[,1:7], df_tmp$Subscribed, method='LogitBoost',
preProcess=c("center", "scale"))
# LogitBoost
predictions<-predict(object=model_logitboost,X_test)
table(predictions)
## predictions
## 0 1
## 657 145
confusionMatrix(predictions,y_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 625 32
## 1 89 56
##
## Accuracy : 0.8491
## 95% CI : (0.8225, 0.8732)
## No Information Rate : 0.8903
## P-Value [Acc > NIR] : 0.9999
##
## Kappa : 0.3985
##
## Mcnemar's Test P-Value : 3.564e-07
##
## Sensitivity : 0.8754
## Specificity : 0.6364
## Pos Pred Value : 0.9513
## Neg Pred Value : 0.3862
## Prevalence : 0.8903
## Detection Rate : 0.7793
## Detection Prevalence : 0.8192
## Balanced Accuracy : 0.7559
##
## 'Positive' Class : 0
##