Introduction of the Problem Statement
In this assignment, your group will analyse data related to the direct marketing campaign (in this case, phone calls) of a Portuguese banking institution. The goals are to • understand which of the observed variables are most associated with the chance of subscribing to a term deposit (feature importance); • how the important variables relate to the predicted probability that a client will subscribe to a term deposit (feature effects); • build a model that could be useful in determining which future clients are likely to respond a term deposit, if contacted (prediction/deployment). The raw data contain N = 41,188 observations on the following 21 variables:
You can also embed plots, for example:
## corrplot 0.92 loaded
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:testthat':
##
## matches
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
## Loading required package: TeachingDemos
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
## Package 'sm', version 2.2-5.7: type help(sm) for summary information
Check the structure, dimension and summary of training data set
## [1] 37070 21
## 'data.frame': 37070 obs. of 21 variables:
## $ age : int 57 37 40 56 45 59 25 41 25 29 ...
## $ job : chr "services" "services" "admin." "services" ...
## $ marital : chr "married" "married" "married" "married" ...
## $ education : chr "high.school" "high.school" "basic.6y" "high.school" ...
## $ default : chr "unknown" "no" "no" "no" ...
## $ housing : chr "no" "yes" "no" "no" ...
## $ loan : chr "no" "no" "no" "yes" ...
## $ contact : chr "telephone" "telephone" "telephone" "telephone" ...
## $ month : chr "may" "may" "may" "may" ...
## $ day_of_week : chr "mon" "mon" "mon" "mon" ...
## $ duration : int 149 226 151 307 198 139 50 55 222 137 ...
## $ campaign : int 1 1 1 1 1 1 1 1 1 1 ...
## $ pdays : int 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : chr "nonexistent" "nonexistent" "nonexistent" "nonexistent" ...
## $ emp_var_rate : num 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
## $ cons_price_idx: num 94 94 94 94 94 ...
## $ cons_conf_idx : num -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
## $ euribor3m : num 4.86 4.86 4.86 4.86 4.86 ...
## $ nr_employed : num 5191 5191 5191 5191 5191 ...
## $ subscribed : chr "no" "no" "no" "no" ...
## NULL
## age job marital education
## Min. :17.00 Length:37070 Length:37070 Length:37070
## 1st Qu.:32.00 Class :character Class :character Class :character
## Median :38.00 Mode :character Mode :character Mode :character
## Mean :40.04
## 3rd Qu.:47.00
## Max. :98.00
## default housing loan contact
## Length:37070 Length:37070 Length:37070 Length:37070
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## month day_of_week duration campaign
## Length:37070 Length:37070 Min. : 0.0 Min. : 1.000
## Class :character Class :character 1st Qu.: 102.0 1st Qu.: 1.000
## Mode :character Mode :character Median : 180.0 Median : 2.000
## Mean : 258.5 Mean : 2.565
## 3rd Qu.: 320.0 3rd Qu.: 3.000
## Max. :4918.0 Max. :56.000
## pdays previous poutcome emp_var_rate
## Min. : 0.0 Min. :0.0000 Length:37070 Min. :-3.40000
## 1st Qu.:999.0 1st Qu.:0.0000 Class :character 1st Qu.:-1.80000
## Median :999.0 Median :0.0000 Mode :character Median : 1.10000
## Mean :962.2 Mean :0.1737 Mean : 0.08046
## 3rd Qu.:999.0 3rd Qu.:0.0000 3rd Qu.: 1.40000
## Max. :999.0 Max. :6.0000 Max. : 1.40000
## cons_price_idx cons_conf_idx euribor3m nr_employed
## Min. :92.20 Min. :-50.80 Min. :0.634 Min. :4964
## 1st Qu.:93.08 1st Qu.:-42.70 1st Qu.:1.344 1st Qu.:5099
## Median :93.75 Median :-41.80 Median :4.857 Median :5191
## Mean :93.58 Mean :-40.49 Mean :3.620 Mean :5167
## 3rd Qu.:93.99 3rd Qu.:-36.40 3rd Qu.:4.961 3rd Qu.:5228
## Max. :94.77 Max. :-26.90 Max. :5.045 Max. :5228
## subscribed
## Length:37070
## Class :character
## Mode :character
##
##
##
## age job marital education default
## 0 0 0 0 0
## housing loan contact month day_of_week
## 0 0 0 0 0
## duration campaign pdays previous poutcome
## 0 0 0 0 0
## emp_var_rate cons_price_idx cons_conf_idx euribor3m nr_employed
## 0 0 0 0 0
## subscribed
## 0
## [1] "There are no null values"
## [1] "Total duplicate rows"
## [1] 9
Converting Character variables into Factors
bank_data_train[,"age"] <- as.numeric(bank_data_train[,"age"])
bank_data_train[,"subscribed"] <- as.factor(bank_data_train[,"subscribed"])
# Storing the Categorical and Numerical columns separately
cat_var <- names(bank_data_train %>% select_if(~!is.numeric(.)))
num_var <- names(bank_data_train %>% select_if(~is.numeric(.)))
# 12 categorical variables; 10 numeric variables
# Convert categorical variables into factor
bank_data_train[, cat_var] <- lapply(bank_data_train[,cat_var], as.factor)
sapply(bank_data_train[, cat_var], table)
## $job
##
## admin. blue-collar entrepreneur housemaid management
## 9334 8365 1314 969 2632
## retired self-employed services student technician
## 1541 1273 3589 773 6047
## unemployed unknown
## 917 307
##
## $marital
##
## divorced married single unknown
## 4145 22444 10396 76
##
## $education
##
## basic.4y basic.6y basic.9y high.school
## 3814 2073 5456 8526
## illiterate professional.course university.degree unknown
## 18 4702 10931 1541
##
## $default
##
## no unknown yes
## 29292 7766 3
##
## $housing
##
## no unknown yes
## 16774 896 19391
##
## $loan
##
## no unknown yes
## 30559 896 5606
##
## $contact
##
## cellular telephone
## 23458 13603
##
## $month
##
## apr aug dec jul jun mar may nov oct sep
## 2380 5542 162 6418 4766 495 12451 3683 647 517
##
## $day_of_week
##
## fri mon thu tue wed
## 7033 7660 7747 7296 7325
##
## $poutcome
##
## failure nonexistent success
## 3840 31979 1242
##
## $subscribed
##
## no yes
## 32886 4175
Providing specific order to the levels for Month column
# Providing specific Order to Month factors
bank_data_train$month <- factor(bank_data_train$month,
levels = c("jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"))
# plot densities
sm.density.compare(bank_data_train$age, bank_data_train$subscribed, xlab="Age")
title(main="Distribution of Responses by Age")
colfill<-c(2:(2+length(levels(bank_data_train$subscribed))))
legend("topright", inset = .05, title = "Respone",
legend= levels(bank_data_train$subscribed), fill = colfill, horiz=TRUE)
bank_data_train["age_group"] <- cut(bank_data_train$age, c(17, 25, 34, 44, 54, 64, 74, Inf),
c("<= 24 yrs", "25-34 yrs",
"35-44 yrs", "45-54 years", "55-64 yrs",
"65-74 yrs", "75+ yrs"), include.lowest=TRUE)
# Plotting the responses by age_group
age_gp_df <- data.frame(table(bank_data_train$age_group, bank_data_train$subscribed))
colnames(age_gp_df) <- c("age_group","subscribed","customers")
ggplot(data=age_gp_df, aes(x = age_group, y = customers, fill = subscribed))+
geom_bar(stat = 'identity', position = 'dodge')+
labs(X="Number of Responses",
y=NULL,
title="Age wise Subsciption summary")
Histograms for Categorical Data
# Histogram of categorical variables
bank_train_cat <- bank_data_train[, cat_var]
bank_train_num <- bank_data_train[, num_var]
plotHist <- function(data_in, i) {
data <- data.frame(x=data_in[[i]])
p <- ggplot(data=data, aes(x=factor(x))) + stat_count() + xlab(colnames(data_in)[i]) + theme_light() +
theme(axis.text.x = element_text(angle = 90, hjust =1))
return (p)
}
doPlots <- function(data_in, fun, ii, ncol=3) {
pp <- list()
for (i in ii) {
p <- fun(data_in=data_in, i=i)
pp <- c(pp, list(p))
}
do.call("grid.arrange", c(pp, ncol=ncol))
}
doPlots(bank_train_cat, fun = plotHist, ii = 1:4, ncol = 2)
doPlots(bank_train_cat, fun = plotHist, ii = 5:8, ncol = 2)
Pair-plots for Numerical Data
# pairplot
ggpairs(bank_train_num, lower = list(continuous = wrap("points",size=0.001)))
Correlation Plot for Continuous Variables We can also convert the binary response variable to 0/1 and test correlation with numerical variables. This is called poin-biserial correlation and can be approximated to Pearson’s correlation coefficient
## Storing the numerical data in a new DataFrame
df111 <- copy(bank_train_num)
# converting the target variable (subscribed) to numeric, only for the purpose of bi-serial correlation
df111$response <- (as.numeric(bank_data_train$subscribed))-1
correlations <- cor(na.omit(df111))
row_indic <- apply(correlations, 1, function(x) sum(x > 0.4 | x < -0.4) > 1)
# onlykeeping the rows where there's enough correlation between variables for the plot
correlations<- correlations[row_indic ,row_indic ]
corrplot(correlations, method="number")
Chi-Square Test for Continuous Variables
## Trustworthy Chi-square results ordered in descending value of Chi-square
chisq.test(bank_data_train$poutcome, bank_data_train$subscribed) # X-squared = 3899.9, df = 2, p-value < 2.2e-16
##
## Pearson's Chi-squared test
##
## data: bank_data_train$poutcome and bank_data_train$subscribed
## X-squared = 3899.9, df = 2, p-value < 2.2e-16
chisq.test(bank_data_train$month, bank_data_train$subscribed) # X-squared = 2768.6, df = 9, p-value < 2.2e-16
##
## Pearson's Chi-squared test
##
## data: bank_data_train$month and bank_data_train$subscribed
## X-squared = 2768.6, df = 9, p-value < 2.2e-16
chisq.test(bank_data_train$age_group, bank_data_train$subscribed) # X-squared = 1116.2, df = 6, p-value < 2.2e-16
##
## Pearson's Chi-squared test
##
## data: bank_data_train$age_group and bank_data_train$subscribed
## X-squared = 1116.2, df = 6, p-value < 2.2e-16
chisq.test(bank_data_train$job, bank_data_train$subscribed) # X-squared = 879.01, df = 11, p-value < 2.2e-16
##
## Pearson's Chi-squared test
##
## data: bank_data_train$job and bank_data_train$subscribed
## X-squared = 879.09, df = 11, p-value < 2.2e-16
chisq.test(bank_data_train$contact, bank_data_train$subscribed) # X-squared = 790.62, df = 1, p-value < 2.2e-16
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: bank_data_train$contact and bank_data_train$subscribed
## X-squared = 790.62, df = 1, p-value < 2.2e-16
chisq.test(bank_data_train$marital, bank_data_train$subscribed) # X-squared = 109.72, df = 3, p-value < 2.2e-16
##
## Pearson's Chi-squared test
##
## data: bank_data_train$marital and bank_data_train$subscribed
## X-squared = 109.72, df = 3, p-value < 2.2e-16
chisq.test(bank_data_train$day_of_week, bank_data_train$subscribed) # X-squared = 23.704, df = 4, p-value = 9.154e-05
##
## Pearson's Chi-squared test
##
## data: bank_data_train$day_of_week and bank_data_train$subscribed
## X-squared = 23.704, df = 4, p-value = 9.154e-05
chisq.test(bank_data_train$housing, bank_data_train$subscribed) # X-squared = 6.6319, df = 2, p-value = 0.0363
##
## Pearson's Chi-squared test
##
## data: bank_data_train$housing and bank_data_train$subscribed
## X-squared = 6.6892, df = 2, p-value = 0.03527
chisq.test(bank_data_train$loan, bank_data_train$subscribed) # X-squared = 3.2153, df = 2, p-value = 0.2004
##
## Pearson's Chi-squared test
##
## data: bank_data_train$loan and bank_data_train$subscribed
## X-squared = 3.2159, df = 2, p-value = 0.2003
Observing Response to the Marketing Campaigns for some features in the data-set
Call Duration vs Response
# plot densities
sm.density.compare(bank_data_train$duration, bank_data_train$subscribed, xlab="Duration of Call")
title(main="Distribution of Calls Duration w.r.t Response")
colfill<-c(2:(2+length(levels(bank_data_train$subscribed))))
legend("topright", inset = .05, title = "Response",
legend= levels(bank_data_train$subscribed), fill = colfill, horiz=TRUE)
Loan vs Subscribed
loan_df <- data.frame(table(bank_data_train$loan, bank_data_train$subscribed))
colnames(loan_df) <- c("loan","subscribed","customers")
ggplot(data=bank_data_train, aes(x = bank_data_train$loan, y = bank_data_train$subscribed, fill = subscribed))+
geom_bar(stat = 'identity', position = 'dodge')+
labs(X="Do they have a pior Loan?",
y=NULL,
title="Loan vs. Response")
This doesn’t tell much since the distribution is fairly equal in terms of percentage as 88.6% people not having a loan declining the offer to invest in the term deposit compared to 89.4% of people having a prior loan declined to invest*
poutcome_df <- data.frame(table(bank_data_train$poutcome, bank_data_train$subscribed))
colnames(poutcome_df) <- c("poutcome","subscribed","customers")
ggplot(data=poutcome_df, aes(x = poutcome, y = customers, fill = subscribed))+
geom_bar(stat = 'identity', position = 'dodge')+
labs(X="Result of Precious outreach",
y=NULL,
title="Success of previous marketing reach")
This observation tells us that if the customer had responded positively to prior marketing campaign and invested in the Bank (through investments/loans), they are likely to invest this time again. Where as large portion of the customer base wasn’t contacted prior to this market and have responded in a better proportion compared to the people who responded negatively to prior marketing campaign. Hence the Bank could focus on these two segments - poutcome : success & unknown for better success rate*
From above chi-square results we can select poutcome, month, age_group, job, and contact variables for our model.They have highest Chi-squared values and significant p-values to reject the hypothesis and conclude that subscribed(Y) is dependent on these categorical variables. Since we will be keeping age_group as an important variable, we can drop numerical age from our computations
bank_data_train <- within(bank_data_train, rm(age))
Preparing a Full model with all the variables to check multi-collinearity (VIF), redundant variables (HMISC::REDUN) and Variable importance plots (VIP)
# FULL model
full_mod <- glm(formula = subscribed ~ ., data = bank_data_train, family = binomial(link = "logit"))
summary(full_mod)
##
## Call:
## glm(formula = subscribed ~ ., family = binomial(link = "logit"),
## data = bank_data_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6.0167 -0.2969 -0.1857 -0.1329 3.3893
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.952e+02 4.036e+01 -4.838 1.31e-06 ***
## jobblue-collar -1.959e-01 8.432e-02 -2.323 0.020153 *
## jobentrepreneur -1.247e-01 1.326e-01 -0.941 0.346746
## jobhousemaid 3.370e-02 1.530e-01 0.220 0.825632
## jobmanagement 2.013e-02 8.949e-02 0.225 0.822064
## jobretired 1.430e-01 1.286e-01 1.112 0.266215
## jobself-employed -9.018e-02 1.235e-01 -0.730 0.465199
## jobservices -1.214e-01 9.126e-02 -1.331 0.183309
## jobstudent 9.859e-02 1.263e-01 0.781 0.434910
## jobtechnician 4.214e-02 7.487e-02 0.563 0.573549
## jobunemployed 2.892e-02 1.363e-01 0.212 0.831943
## jobunknown 1.734e-02 2.425e-01 0.072 0.942977
## maritalmarried 7.376e-03 7.260e-02 0.102 0.919078
## maritalsingle 2.154e-02 8.304e-02 0.259 0.795370
## maritalunknown 5.223e-02 4.201e-01 0.124 0.901055
## educationbasic.6y 1.856e-01 1.280e-01 1.450 0.147149
## educationbasic.9y 4.842e-02 1.011e-01 0.479 0.632122
## educationhigh.school 1.282e-01 9.776e-02 1.312 0.189644
## educationilliterate 1.157e+00 7.622e-01 1.518 0.128897
## educationprofessional.course 1.507e-01 1.078e-01 1.398 0.162013
## educationuniversity.degree 2.601e-01 9.820e-02 2.649 0.008076 **
## educationunknown 1.519e-01 1.269e-01 1.197 0.231357
## defaultunknown -2.863e-01 7.130e-02 -4.015 5.94e-05 ***
## defaultyes -7.357e+00 1.135e+02 -0.065 0.948313
## housingunknown -9.684e-02 1.470e-01 -0.659 0.509937
## housingyes 1.014e-02 4.366e-02 0.232 0.816411
## loanunknown NA NA NA NA
## loanyes -1.091e-01 6.145e-02 -1.776 0.075708 .
## contacttelephone -6.155e-01 8.034e-02 -7.661 1.84e-14 ***
## monthapr -1.869e+00 1.523e-01 -12.271 < 2e-16 ***
## monthmay -2.358e+00 1.282e-01 -18.397 < 2e-16 ***
## monthjun -2.285e+00 2.205e-01 -10.365 < 2e-16 ***
## monthjul -1.753e+00 1.601e-01 -10.948 < 2e-16 ***
## monthaug -1.097e+00 1.336e-01 -8.212 < 2e-16 ***
## monthsep -1.625e+00 1.627e-01 -9.988 < 2e-16 ***
## monthoct -1.770e+00 1.582e-01 -11.186 < 2e-16 ***
## monthnov -2.327e+00 1.518e-01 -15.329 < 2e-16 ***
## monthdec -1.598e+00 2.249e-01 -7.105 1.21e-12 ***
## day_of_weekmon -1.111e-01 6.985e-02 -1.591 0.111629
## day_of_weekthu 4.879e-02 6.775e-02 0.720 0.471428
## day_of_weektue 1.265e-01 6.930e-02 1.825 0.067933 .
## day_of_weekwed 1.522e-01 6.956e-02 2.188 0.028668 *
## duration 4.699e-03 7.863e-05 59.763 < 2e-16 ***
## campaign -3.772e-02 1.207e-02 -3.125 0.001777 **
## pdays -9.539e-04 2.273e-04 -4.196 2.72e-05 ***
## previous -4.553e-02 6.318e-02 -0.721 0.471129
## poutcomenonexistent 4.357e-01 1.002e-01 4.350 1.36e-05 ***
## poutcomesuccess 9.488e-01 2.215e-01 4.283 1.84e-05 ***
## emp_var_rate -1.618e+00 1.506e-01 -10.743 < 2e-16 ***
## cons_price_idx 1.915e+00 2.668e-01 7.178 7.06e-13 ***
## cons_conf_idx 1.516e-02 8.168e-03 1.856 0.063391 .
## euribor3m 3.791e-01 1.371e-01 2.765 0.005694 **
## nr_employed 2.712e-03 3.284e-03 0.826 0.408927
## age_group25-34 yrs -1.764e-01 1.015e-01 -1.737 0.082313 .
## age_group35-44 yrs -3.696e-01 1.087e-01 -3.400 0.000675 ***
## age_group45-54 years -2.956e-01 1.158e-01 -2.552 0.010725 *
## age_group55-64 yrs -1.437e-01 1.310e-01 -1.097 0.272574
## age_group65-74 yrs 1.115e-02 1.939e-01 0.057 0.954160
## age_group75+ yrs 3.448e-01 2.187e-01 1.577 0.114867
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 26093 on 37060 degrees of freedom
## Residual deviance: 15355 on 37003 degrees of freedom
## AIC: 15471
##
## Number of Fisher Scoring iterations: 10
(dev_1 <- full_mod$deviance/full_mod$df.residual) # 0.415)
## [1] 0.4149741
AIC(full_mod) # 15471.29
## [1] 15471.29
BIC(full_mod) # 15965.47
## [1] 15965.47
# let's use HMISC::REDUN package to solve this
# (redun <- Hmisc::redun(~., data = bank_data_train, nk = 4, r2 = 'adjusted'))
(redun <- Hmisc::redun(~., data = bank_data_train, nk = 0))
##
## Redundancy Analysis
##
## Hmisc::redun(formula = ~., data = bank_data_train, nk = 0)
##
## n: 37061 p: 21 nk: 0
##
## Number of NAs: 0
##
## Transformation of target variables forced to be linear
##
## R-squared cutoff: 0.9 Type: ordinary
##
## R^2 with which each variable can be predicted from all other variables:
##
## job marital education default housing
## 0.452 0.234 0.469 0.138 1.000
## loan contact month day_of_week duration
## 1.000 0.722 0.938 0.022 0.196
## campaign pdays previous poutcome emp_var_rate
## 0.046 0.919 0.833 0.907 0.996
## cons_price_idx cons_conf_idx euribor3m nr_employed subscribed
## 0.989 0.854 0.995 0.995 0.358
## age_group
## 0.460
##
## Rendundant variables:
##
## housing emp_var_rate euribor3m pdays
##
## Predicted from variables:
##
## job marital education default loan contact month day_of_week duration campaign previous poutcome cons_price_idx cons_conf_idx nr_employed subscribed age_group
##
## Variable Deleted R^2 R^2 after later deletions
## 1 housing 1.000 1 1 1
## 2 emp_var_rate 0.996 0.996 0.996
## 3 euribor3m 0.995 0.995
## 4 pdays 0.919
# based on this, we can drop variables - ['housing', 'emp_var_rate', 'euribor3m', 'pdays'] in first iteration
bank_df2 = dplyr::select(bank_data_train, -c('housing', 'emp_var_rate', 'euribor3m', 'pdays'))
(redun <- Hmisc::redun(~., data = bank_df2, nk = 0))
##
## Redundancy Analysis
##
## Hmisc::redun(formula = ~., data = bank_df2, nk = 0)
##
## n: 37061 p: 17 nk: 0
##
## Number of NAs: 0
##
## Transformation of target variables forced to be linear
##
## R-squared cutoff: 0.9 Type: ordinary
##
## R^2 with which each variable can be predicted from all other variables:
##
## job marital education default loan
## 0.452 0.234 0.469 0.138 0.003
## contact month day_of_week duration campaign
## 0.696 0.657 0.018 0.196 0.043
## previous poutcome cons_price_idx cons_conf_idx nr_employed
## 0.812 0.807 0.671 0.556 0.719
## subscribed age_group
## 0.354 0.458
##
## No redundant variables
## lets fit a full model on this df and check VIF
full_mod_2 <- glm(formula = subscribed ~ ., data = bank_df2, family = binomial(link = "logit"))
summary(full_mod_2)
##
## Call:
## glm(formula = subscribed ~ ., family = binomial(link = "logit"),
## data = bank_df2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6.0254 -0.3052 -0.1901 -0.1316 3.0676
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 8.947e+01 4.418e+00 20.249 < 2e-16 ***
## jobblue-collar -2.222e-01 8.415e-02 -2.640 0.008290 **
## jobentrepreneur -1.298e-01 1.319e-01 -0.984 0.325103
## jobhousemaid 1.462e-02 1.526e-01 0.096 0.923632
## jobmanagement 1.580e-02 8.918e-02 0.177 0.859377
## jobretired 1.489e-01 1.277e-01 1.166 0.243683
## jobself-employed -9.994e-02 1.233e-01 -0.810 0.417761
## jobservices -1.389e-01 9.091e-02 -1.528 0.126453
## jobstudent 1.317e-01 1.257e-01 1.047 0.294917
## jobtechnician 2.137e-02 7.444e-02 0.287 0.774070
## jobunemployed 2.052e-02 1.357e-01 0.151 0.879802
## jobunknown 8.029e-02 2.402e-01 0.334 0.738187
## maritalmarried 1.483e-02 7.232e-02 0.205 0.837536
## maritalsingle 2.799e-02 8.272e-02 0.338 0.735116
## maritalunknown 5.771e-02 4.202e-01 0.137 0.890772
## educationbasic.6y 1.790e-01 1.278e-01 1.401 0.161098
## educationbasic.9y 5.564e-02 1.009e-01 0.552 0.581258
## educationhigh.school 1.161e-01 9.755e-02 1.190 0.234160
## educationilliterate 1.152e+00 7.440e-01 1.549 0.121472
## educationprofessional.course 1.314e-01 1.075e-01 1.222 0.221732
## educationuniversity.degree 2.439e-01 9.802e-02 2.488 0.012841 *
## educationunknown 1.520e-01 1.269e-01 1.198 0.231037
## defaultunknown -2.966e-01 7.115e-02 -4.169 3.06e-05 ***
## defaultyes -7.520e+00 1.134e+02 -0.066 0.947106
## loanunknown -8.263e-02 1.447e-01 -0.571 0.568001
## loanyes -1.156e-01 6.111e-02 -1.892 0.058480 .
## contacttelephone -3.338e-01 7.235e-02 -4.613 3.97e-06 ***
## monthapr -1.175e+00 1.238e-01 -9.492 < 2e-16 ***
## monthmay -1.945e+00 1.178e-01 -16.513 < 2e-16 ***
## monthjun -6.414e-01 1.275e-01 -5.032 4.85e-07 ***
## monthjul -9.131e-01 1.311e-01 -6.964 3.31e-12 ***
## monthaug -1.061e+00 1.326e-01 -8.003 1.22e-15 ***
## monthsep -1.651e+00 1.587e-01 -10.408 < 2e-16 ***
## monthoct -1.270e+00 1.502e-01 -8.452 < 2e-16 ***
## monthnov -1.465e+00 1.315e-01 -11.134 < 2e-16 ***
## monthdec -1.169e+00 2.209e-01 -5.291 1.22e-07 ***
## day_of_weekmon -1.235e-01 6.948e-02 -1.777 0.075538 .
## day_of_weekthu 2.725e-02 6.729e-02 0.405 0.685546
## day_of_weektue 1.140e-01 6.881e-02 1.656 0.097659 .
## day_of_weekwed 1.270e-01 6.914e-02 1.836 0.066292 .
## duration 4.677e-03 7.826e-05 59.760 < 2e-16 ***
## campaign -4.286e-02 1.214e-02 -3.532 0.000413 ***
## previous 6.496e-02 5.962e-02 1.090 0.275899
## poutcomenonexistent 5.104e-01 9.923e-02 5.143 2.70e-07 ***
## poutcomesuccess 1.802e+00 8.996e-02 20.028 < 2e-16 ***
## cons_price_idx -2.513e-01 4.523e-02 -5.555 2.77e-08 ***
## cons_conf_idx 1.059e-02 5.409e-03 1.959 0.050146 .
## nr_employed -1.328e-02 3.817e-04 -34.789 < 2e-16 ***
## age_group25-34 yrs -1.873e-01 1.010e-01 -1.854 0.063796 .
## age_group35-44 yrs -3.745e-01 1.083e-01 -3.460 0.000541 ***
## age_group45-54 years -2.911e-01 1.153e-01 -2.525 0.011584 *
## age_group55-64 yrs -1.311e-01 1.302e-01 -1.007 0.313953
## age_group65-74 yrs 5.325e-02 1.929e-01 0.276 0.782512
## age_group75+ yrs 3.343e-01 2.186e-01 1.529 0.126166
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 26093 on 37060 degrees of freedom
## Residual deviance: 15493 on 37007 degrees of freedom
## AIC: 15601
##
## Number of Fisher Scoring iterations: 10
(dev_2 <- full_mod_2$deviance/full_mod_2$df.residual) # 0.4187)
## [1] 0.4186463
AIC(full_mod_2) # 15600.84
## [1] 15600.84
BIC(full_mod_2) # 16060.94
## [1] 16060.94
# checking VIFs using the full Model 2
car::vif(full_mod_2) # VIF scores generated
## GVIF Df GVIF^(1/(2*Df))
## job 9.059826 11 1.105364
## marital 1.510063 3 1.071106
## education 3.350200 7 1.090197
## default 1.148644 2 1.035253
## loan 1.006765 2 1.001687
## contact 1.821452 1 1.349612
## month 5.211980 9 1.096058
## day_of_week 1.059986 4 1.007309
## duration 1.239855 1 1.113488
## campaign 1.050150 1 1.024768
## previous 4.010475 1 2.002617
## poutcome 3.949355 2 1.409716
## cons_price_idx 1.950385 1 1.396562
## cons_conf_idx 2.351205 1 1.533364
## nr_employed 2.356913 1 1.535224
## age_group 4.491173 6 1.133348
job) with VIF score > 5. We can choose to drop this one to reduce any further multi-collinearity.# job has a high VIF (9.05) - we can consider to drop it.
bank_df3 <- dplyr::select(bank_df2, -c('job'))
full_mod_3 <- glm(formula = subscribed ~ ., data = bank_df3, family = binomial(link = "logit"))
summary(full_mod_3)
##
## Call:
## glm(formula = subscribed ~ ., family = binomial(link = "logit"),
## data = bank_df3)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6.0144 -0.3054 -0.1902 -0.1319 3.0394
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 8.968e+01 4.406e+00 20.353 < 2e-16 ***
## maritalmarried 6.824e-03 7.220e-02 0.095 0.924700
## maritalsingle 3.975e-02 8.233e-02 0.483 0.629233
## maritalunknown 3.974e-02 4.208e-01 0.094 0.924750
## educationbasic.6y 1.503e-01 1.265e-01 1.188 0.234806
## educationbasic.9y 2.972e-02 9.903e-02 0.300 0.764129
## educationhigh.school 1.691e-01 8.968e-02 1.886 0.059324 .
## educationilliterate 1.171e+00 7.408e-01 1.581 0.113791
## educationprofessional.course 2.137e-01 9.742e-02 2.194 0.028234 *
## educationuniversity.degree 3.280e-01 8.660e-02 3.787 0.000152 ***
## educationunknown 2.077e-01 1.224e-01 1.697 0.089659 .
## defaultunknown -3.129e-01 7.093e-02 -4.411 1.03e-05 ***
## defaultyes -7.481e+00 1.133e+02 -0.066 0.947346
## loanunknown -8.459e-02 1.447e-01 -0.585 0.558699
## loanyes -1.119e-01 6.106e-02 -1.833 0.066816 .
## contacttelephone -3.330e-01 7.225e-02 -4.609 4.05e-06 ***
## monthapr -1.187e+00 1.239e-01 -9.574 < 2e-16 ***
## monthmay -1.966e+00 1.178e-01 -16.681 < 2e-16 ***
## monthjun -6.502e-01 1.275e-01 -5.098 3.44e-07 ***
## monthjul -9.179e-01 1.311e-01 -7.001 2.55e-12 ***
## monthaug -1.046e+00 1.326e-01 -7.891 3.00e-15 ***
## monthsep -1.651e+00 1.587e-01 -10.398 < 2e-16 ***
## monthoct -1.264e+00 1.502e-01 -8.417 < 2e-16 ***
## monthnov -1.465e+00 1.315e-01 -11.138 < 2e-16 ***
## monthdec -1.159e+00 2.207e-01 -5.251 1.51e-07 ***
## day_of_weekmon -1.245e-01 6.941e-02 -1.794 0.072750 .
## day_of_weekthu 2.641e-02 6.723e-02 0.393 0.694464
## day_of_weektue 1.160e-01 6.871e-02 1.689 0.091257 .
## day_of_weekwed 1.290e-01 6.905e-02 1.868 0.061739 .
## duration 4.669e-03 7.811e-05 59.773 < 2e-16 ***
## campaign -4.337e-02 1.214e-02 -3.574 0.000352 ***
## previous 6.757e-02 5.956e-02 1.135 0.256550
## poutcomenonexistent 5.173e-01 9.914e-02 5.218 1.81e-07 ***
## poutcomesuccess 1.809e+00 8.987e-02 20.131 < 2e-16 ***
## cons_price_idx -2.486e-01 4.519e-02 -5.501 3.77e-08 ***
## cons_conf_idx 1.128e-02 5.404e-03 2.087 0.036870 *
## nr_employed -1.337e-02 3.785e-04 -35.334 < 2e-16 ***
## age_group25-34 yrs -2.387e-01 9.327e-02 -2.559 0.010498 *
## age_group35-44 yrs -4.276e-01 9.968e-02 -4.290 1.79e-05 ***
## age_group45-54 years -3.355e-01 1.071e-01 -3.134 0.001726 **
## age_group55-64 yrs -1.138e-01 1.170e-01 -0.973 0.330684
## age_group65-74 yrs 1.742e-01 1.595e-01 1.092 0.274832
## age_group75+ yrs 4.843e-01 1.854e-01 2.612 0.008990 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 26093 on 37060 degrees of freedom
## Residual deviance: 15510 on 37018 degrees of freedom
## AIC: 15596
##
## Number of Fisher Scoring iterations: 10
(dev_3 <- full_mod_3$deviance/full_mod_3$df.residual) # 0.41898)
## [1] 0.4189805
AIC(full_mod_3) # 15595.82
## [1] 15595.82
BIC(full_mod_3) # 15962.19
## [1] 15962.19
## checking for VIF and redundancy in the new model/data
car::vif(full_mod_3)
## GVIF Df GVIF^(1/(2*Df))
## marital 1.456355 3 1.064661
## education 1.390601 7 1.023832
## default 1.141040 2 1.033535
## loan 1.005349 2 1.001335
## contact 1.817153 1 1.348018
## month 5.059042 9 1.094246
## day_of_week 1.055524 4 1.006778
## duration 1.236788 1 1.112110
## campaign 1.049627 1 1.024513
## previous 4.002795 1 2.000699
## poutcome 3.939368 2 1.408824
## cons_price_idx 1.949601 1 1.396281
## cons_conf_idx 2.348971 1 1.532635
## nr_employed 2.320086 1 1.523183
## age_group 1.935101 6 1.056555
(redun <- Hmisc::redun(~., data = bank_df3, nk = 0))
##
## Redundancy Analysis
##
## Hmisc::redun(formula = ~., data = bank_df3, nk = 0)
##
## n: 37061 p: 16 nk: 0
##
## Number of NAs: 0
##
## Transformation of target variables forced to be linear
##
## R-squared cutoff: 0.9 Type: ordinary
##
## R^2 with which each variable can be predicted from all other variables:
##
## marital education default loan contact
## 0.215 0.145 0.130 0.002 0.696
## month day_of_week duration campaign previous
## 0.654 0.017 0.195 0.043 0.811
## poutcome cons_price_idx cons_conf_idx nr_employed subscribed
## 0.807 0.670 0.555 0.716 0.354
## age_group
## 0.251
##
## No redundant variables
y <- bank_df3$subscribed # observed classes
prob <- predict(full_mod_3, type = "response") # predicted probabilities
classes <- ifelse(prob > 0.5, "yes", "no") # classification based on 0.5 threshold
(cm <- table("actual" = y, "predicted" = classes)) # confusion matrix
## predicted
## actual no yes
## no 32028 858
## yes 2454 1721
(accuracy_full_model <- (cm[1,1] + cm[2,2])/length(y)) # 0.9106338 # initial accuracy
## [1] 0.9106338
#plotting ROC curve for Initial model
plot(roc1 <- pROC::roc(y, predictor = prob))
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
roc1 # Area under the curve: 0.9333
##
## Call:
## roc.default(response = y, predictor = prob)
##
## Data: prob in 32886 controls (y no) < 4175 cases (y yes).
## Area under the curve: 0.9333
vip::vi(full_mod_3) # see ?vip::vi for details
## # A tibble: 42 x 3
## Variable Importance Sign
## <chr> <dbl> <chr>
## 1 duration 59.8 POS
## 2 nr_employed 35.3 NEG
## 3 poutcomesuccess 20.1 POS
## 4 monthmay 16.7 NEG
## 5 monthnov 11.1 NEG
## 6 monthsep 10.4 NEG
## 7 monthapr 9.57 NEG
## 8 monthoct 8.42 NEG
## 9 monthaug 7.89 NEG
## 10 monthjul 7.00 NEG
## # ... with 32 more rows
#Therefore, our Intial model for further comparison will be the above model, i.e. full_mod_3.***
#i.e. based on bank_df3 after removing redundant variables like housing, emp_var_rate, euribor3m, pdays, job and age***
### ANALYZING relationships between most important categorical variables (based on VIP of initial model)
xtabs(~ poutcome + month + age_group + subscribed, data = bank_df3, drop.unused.levels = TRUE)
## , , age_group = <= 24 yrs, subscribed = no
##
## month
## poutcome mar apr may jun jul aug sep oct nov dec
## failure 1 18 58 20 6 16 10 6 5 1
## nonexistent 13 82 324 133 378 31 9 15 16 3
## success 1 0 7 6 0 9 2 3 3 2
##
## , , age_group = 25-34 yrs, subscribed = no
##
## month
## poutcome mar apr may jun jul aug sep oct nov dec
## failure 18 131 607 33 21 45 23 33 202 5
## nonexistent 87 355 3116 1169 1923 1510 52 58 787 10
## success 6 13 44 17 8 18 12 10 18 2
##
## , , age_group = 35-44 yrs, subscribed = no
##
## month
## poutcome mar apr may jun jul aug sep oct nov dec
## failure 7 231 505 28 13 31 16 16 235 5
## nonexistent 27 491 3822 1507 1718 1354 31 59 914 9
## success 4 7 26 11 6 13 5 9 18 3
##
## , , age_group = 45-54 years, subscribed = no
##
## month
## poutcome mar apr may jun jul aug sep oct nov dec
## failure 9 104 231 12 10 22 18 16 194 2
## nonexistent 16 266 2089 934 1197 1290 27 20 624 9
## success 1 8 19 6 3 11 3 4 16 2
##
## , , age_group = 55-64 yrs, subscribed = no
##
## month
## poutcome mar apr may jun jul aug sep oct nov dec
## failure 10 36 69 5 7 22 14 13 50 5
## nonexistent 15 100 702 361 533 507 19 44 206 9
## success 2 9 6 3 4 5 2 4 5 3
##
## , , age_group = 65-74 yrs, subscribed = no
##
## month
## poutcome mar apr may jun jul aug sep oct nov dec
## failure 6 6 4 3 6 17 6 10 4 3
## nonexistent 6 25 5 5 6 33 17 19 6 4
## success 1 2 1 0 0 3 5 5 0 0
##
## , , age_group = 75+ yrs, subscribed = no
##
## month
## poutcome mar apr may jun jul aug sep oct nov dec
## failure 1 3 5 4 2 6 3 5 1 3
## nonexistent 12 11 5 0 1 8 9 16 4 3
## success 1 0 2 0 0 2 1 3 2 0
##
## , , age_group = <= 24 yrs, subscribed = yes
##
## month
## poutcome mar apr may jun jul aug sep oct nov dec
## failure 3 4 5 8 10 7 4 5 6 0
## nonexistent 10 36 32 32 31 14 8 14 6 4
## success 5 1 11 10 11 6 12 10 8 1
##
## , , age_group = 25-34 yrs, subscribed = yes
##
## month
## poutcome mar apr may jun jul aug sep oct nov dec
## failure 14 12 53 17 11 13 13 11 17 2
## nonexistent 75 126 215 135 167 121 31 52 78 6
## success 29 15 28 31 17 42 32 21 28 6
##
## , , age_group = 35-44 yrs, subscribed = yes
##
## month
## poutcome mar apr may jun jul aug sep oct nov dec
## failure 8 21 31 11 8 11 7 12 16 3
## nonexistent 32 68 179 101 126 103 18 30 64 8
## success 11 15 26 24 13 29 25 16 16 5
##
## , , age_group = 45-54 years, subscribed = yes
##
## month
## poutcome mar apr may jun jul aug sep oct nov dec
## failure 3 14 20 10 4 13 5 8 10 2
## nonexistent 10 56 103 58 74 93 18 15 37 8
## success 6 16 17 13 13 15 7 13 11 5
##
## , , age_group = 55-64 yrs, subscribed = yes
##
## month
## poutcome mar apr may jun jul aug sep oct nov dec
## failure 1 10 7 4 6 11 4 9 9 4
## nonexistent 10 41 47 33 51 45 5 15 20 9
## success 6 7 11 6 6 20 14 13 13 5
##
## , , age_group = 65-74 yrs, subscribed = yes
##
## month
## poutcome mar apr may jun jul aug sep oct nov dec
## failure 1 2 2 1 3 10 1 3 3 1
## nonexistent 8 18 2 4 8 11 5 13 13 1
## success 4 2 7 2 4 7 7 9 4 2
##
## , , age_group = 75+ yrs, subscribed = yes
##
## month
## poutcome mar apr may jun jul aug sep oct nov dec
## failure 2 2 2 1 2 3 4 3 4 0
## nonexistent 11 11 3 6 5 8 6 5 2 3
## success 2 5 3 2 6 7 7 2 8 4
null_mod <- glm(formula = subscribed ~ 1, data = bank_df3, family = binomial(link = "logit"))
summary(null_mod)
##
## Call:
## glm(formula = subscribed ~ 1, family = binomial(link = "logit"),
## data = bank_df3)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.4889 -0.4889 -0.4889 -0.4889 2.0897
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.06393 0.01643 -125.6 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 26093 on 37060 degrees of freedom
## Residual deviance: 26093 on 37060 degrees of freedom
## AIC: 26095
##
## Number of Fisher Scoring iterations: 4
(dev_null <- null_mod$deviance/null_mod$df.residual) # 0.7040681
## [1] 0.7040681
AIC(null_mod) # 26094.76
## [1] 26094.76
BIC(null_mod) # 26103.28
## [1] 26103.28
# Confusion matrix (i.e., 2x2 contingency table of classification results)
y <- bank_df3$subscribed # observed classes
prob <- predict(null_mod, type = "response") # predicted probabilities
classes <- ifelse(prob > 0.5, "yes", "no") # classification based on 0.5 threshold
(cm <- table("actual" = y, "predicted" = classes)) # confusion matrix
## predicted
## actual no
## no 32886
## yes 4175
(accuracy_null <- (cm[1,1])/length(y)) # 0.9078276 # poorer accuracy than full model (initial model)
## [1] 0.8873479
#### We can clearly see, null model always predict the values as "NO". Although it has a good accuracy, this model is clearly not useful because it means no matter what the marketing team does, people will never subscribe to the Term-deposit plans.
### Reduced models based on 3 shortlisted variables
```r
# first reduced model can be constructed using Variable Importance Plot (VIP) - we will select the top 3 variables for this model
red_mod_1 <- glm(formula = subscribed ~ duration + nr_employed + poutcome,
data = bank_df3, family = "binomial"(link = "logit"))
summary(red_mod_1)
##
## Call:
## glm(formula = subscribed ~ duration + nr_employed + poutcome,
## family = binomial(link = "logit"), data = bank_df3)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.8826 -0.3577 -0.2040 -0.1465 2.9663
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 7.326e+01 1.499e+00 48.88 < 2e-16 ***
## duration 4.444e-03 7.391e-05 60.12 < 2e-16 ***
## nr_employed -1.506e-02 2.977e-04 -50.60 < 2e-16 ***
## poutcomenonexistent 4.637e-01 6.232e-02 7.44 1.01e-13 ***
## poutcomesuccess 1.977e+00 8.656e-02 22.84 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 26093 on 37060 degrees of freedom
## Residual deviance: 16508 on 37056 degrees of freedom
## AIC: 16518
##
## Number of Fisher Scoring iterations: 6
(dev_red_1 <- red_mod_1$deviance/red_mod_1$df.residual) # 0.4454975
## [1] 0.4454975
AIC(red_mod_1) # 16518.36
## [1] 16518.36
BIC(red_mod_1) # 16560.96
## [1] 16560.96
# Confusion matrix (i.e., 2x2 contingency table of classification results)
y <- bank_df3$subscribed # observed classes
prob <- predict(red_mod_1, type = "response") # predicted probabilities
classes <- ifelse(prob > 0.5, "yes", "no") # classification based on 0.5 threshold
print("Confusion Matrix for the model")
## [1] "Confusion Matrix for the model"
(cm <- table("actual" = y, "predicted" = classes)) # confusion matrix
## predicted
## actual no yes
## no 32080 806
## yes 2610 1565
(accuracy_red_1 <- (cm[1,1] + cm[2,2])/length(y)) # 0.9078276 # poorer accuracy than full model (initial model)
## [1] 0.9078276
#plotting ROC curve for Initial model
plot(roc1 <- pROC::roc(y, predictor = prob))
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
roc1 # Area under the curve: 0.9333
##
## Call:
## roc.default(response = y, predictor = prob)
##
## Data: prob in 32886 controls (y no) < 4175 cases (y yes).
## Area under the curve: 0.9201
#### We observe that even with just 3 predictor variables, this model performs good in terms of accuracy. But since this is an imbalanced data-set, accuracy is not a good metric. We can look into PR-curve, or other methods of evaluating the logistic regression classifier.
# Feature Selection using step-AIC
## Q3 - Using the stepAIC() function from package MASS, fit several additional models for comparison.
### 3.1 stepAIC() || direction = "both"
```r
### Q3.1 Starting from your initial model, use stepAIC() and specify direction = "both" (the default).
### Is the resulting model different? If it is, how so (e.g., did stepAIC() drop terms, include new ones,
### or both)? Which model seems better in terms of prediction accuracy?
(model_aic_both <- MASS::stepAIC(full_mod_3, direction = "both", trace = 0))
##
## Call: glm(formula = subscribed ~ education + default + contact + month +
## day_of_week + duration + campaign + poutcome + cons_price_idx +
## cons_conf_idx + nr_employed + age_group, family = binomial(link = "logit"),
## data = bank_df3)
##
## Coefficients:
## (Intercept) educationbasic.6y
## 89.138320 0.152900
## educationbasic.9y educationhigh.school
## 0.031208 0.173305
## educationilliterate educationprofessional.course
## 1.171630 0.215315
## educationuniversity.degree educationunknown
## 0.332245 0.215660
## defaultunknown defaultyes
## -0.310539 -7.473116
## contacttelephone monthapr
## -0.336962 -1.194644
## monthmay monthjun
## -1.971394 -0.657007
## monthjul monthaug
## -0.923678 -1.041207
## monthsep monthoct
## -1.656586 -1.268692
## monthnov monthdec
## -1.464629 -1.172465
## day_of_weekmon day_of_weekthu
## -0.123507 0.027428
## day_of_weektue day_of_weekwed
## 0.115619 0.129283
## duration campaign
## 0.004668 -0.043830
## poutcomenonexistent poutcomesuccess
## 0.434092 1.822442
## cons_price_idx cons_conf_idx
## -0.236366 0.011390
## nr_employed age_group25-34 yrs
## -0.013468 -0.250453
## age_group35-44 yrs age_group45-54 years
## -0.448503 -0.362089
## age_group55-64 yrs age_group65-74 yrs
## -0.141381 0.144161
## age_group75+ yrs
## 0.455487
##
## Degrees of Freedom: 37060 Total (i.e. Null); 37024 Residual
## Null Deviance: 26090
## Residual Deviance: 15520 AIC: 15590
(dev_mod_aic_both <- model_aic_both$deviance/model_aic_both$df.residual) # 0.4190575
## [1] 0.4190575
AIC(model_aic_both) # 15589.19
## [1] 15589.19
BIC(model_aic_both) # 15904.44
## [1] 15904.44
## Variable importance using vip for aic model
vip::vi(model_aic_both) # see ?vip::vi for details
## # A tibble: 36 x 3
## Variable Importance Sign
## <chr> <dbl> <chr>
## 1 duration 59.8 POS
## 2 nr_employed 36.2 NEG
## 3 poutcomesuccess 20.4 POS
## 4 monthmay 16.7 NEG
## 5 monthnov 11.1 NEG
## 6 monthsep 10.4 NEG
## 7 monthapr 9.65 NEG
## 8 monthoct 8.45 NEG
## 9 monthaug 7.86 NEG
## 10 monthjul 7.05 NEG
## # ... with 26 more rows
# it has dropped a few variables compared to full_mod3
# the resulting model is different than full_mod3 because of num of variables, numb of categories used for categorical variables
# and the residual deviance and AIC/BIC dropped for the step_AIC("both") model
## total variables (after dummy encoding - 36)
# Confusion matrix (i.e., 2x2 contingency table of classification results)
y <- bank_df3$subscribed # observed classes
prob <- predict(model_aic_both, type = "response") # predicted probabilities
classes <- ifelse(prob > 0.5, "yes", "no") # classification based on 0.5 threshold
(cm <- table("actual" = y, "predicted" = classes)) # confusion matrix
## predicted
## actual no yes
## no 32034 852
## yes 2454 1721
(accuracy_aic_both <- (cm[1,1] + cm[2,2])/length(y)) # 0.9107957
## [1] 0.9107957
#plotting ROC curve for bi-directional AIC model
plot(roc1 <- pROC::roc(y, predictor = prob))
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
roc1 # Area under the curve: 0.9332
##
## Call:
## roc.default(response = y, predictor = prob)
##
## Data: prob in 32886 controls (y no) < 4175 cases (y yes).
## Area under the curve: 0.9332
## Q3.2
# Starting with an intercept only model, use stepAIC() to perform forward selection
# (i.e., direction = "forward"). What is the final model? Does it differ substantially from your initial model or the
# stepAIC() model from the previous step? How many parameters does this model have?
(model_aic_forward <- MASS::stepAIC(full_mod_3, direction = "forward", trace = 0))
##
## Call: glm(formula = subscribed ~ marital + education + default + loan +
## contact + month + day_of_week + duration + campaign + previous +
## poutcome + cons_price_idx + cons_conf_idx + nr_employed +
## age_group, family = binomial(link = "logit"), data = bank_df3)
##
## Coefficients:
## (Intercept) maritalmarried
## 89.684393 0.006824
## maritalsingle maritalunknown
## 0.039751 0.039741
## educationbasic.6y educationbasic.9y
## 0.150256 0.029716
## educationhigh.school educationilliterate
## 0.169113 1.171426
## educationprofessional.course educationuniversity.degree
## 0.213736 0.327990
## educationunknown defaultunknown
## 0.207686 -0.312879
## defaultyes loanunknown
## -7.481081 -0.084594
## loanyes contacttelephone
## -0.111924 -0.332986
## monthapr monthmay
## -1.186603 -1.965796
## monthjun monthjul
## -0.650159 -0.917945
## monthaug monthsep
## -1.046334 -1.650737
## monthoct monthnov
## -1.264229 -1.465101
## monthdec day_of_weekmon
## -1.159153 -0.124546
## day_of_weekthu day_of_weektue
## 0.026407 0.116044
## day_of_weekwed duration
## 0.128997 0.004669
## campaign previous
## -0.043373 0.067570
## poutcomenonexistent poutcomesuccess
## 0.517332 1.809184
## cons_price_idx cons_conf_idx
## -0.248578 0.011278
## nr_employed age_group25-34 yrs
## -0.013373 -0.238683
## age_group35-44 yrs age_group45-54 years
## -0.427605 -0.335473
## age_group55-64 yrs age_group65-74 yrs
## -0.113836 0.174224
## age_group75+ yrs
## 0.484292
##
## Degrees of Freedom: 37060 Total (i.e. Null); 37018 Residual
## Null Deviance: 26090
## Residual Deviance: 15510 AIC: 15600
(dev_mod_aic_forward <- model_aic_forward$deviance/model_aic_forward$df.residual) # 0.4189805
## [1] 0.4189805
AIC(model_aic_forward) # 15595.82
## [1] 15595.82
BIC(model_aic_forward) # 15962.19
## [1] 15962.19
## Variable importance using vip for aic model
vip::vi(model_aic_forward) # see ?vip::vi for details
## # A tibble: 42 x 3
## Variable Importance Sign
## <chr> <dbl> <chr>
## 1 duration 59.8 POS
## 2 nr_employed 35.3 NEG
## 3 poutcomesuccess 20.1 POS
## 4 monthmay 16.7 NEG
## 5 monthnov 11.1 NEG
## 6 monthsep 10.4 NEG
## 7 monthapr 9.57 NEG
## 8 monthoct 8.42 NEG
## 9 monthaug 7.89 NEG
## 10 monthjul 7.00 NEG
## # ... with 32 more rows
# it has dropped a few variables compared to full_mod3
# the resulting model is different than full_mod3 because of num of variables, numb of categories used for categorical variables
# and the residual deviance and AIC/BIC dropped for the step_AIC("forward") model
## total variables (after dummy encoding - 42)
# Confusion matrix (i.e., 2x2 contingency table of classification results)
y <- bank_df3$subscribed # observed classes
prob <- predict(model_aic_forward, type = "response") # predicted probabilities
classes <- ifelse(prob > 0.5, "yes", "no") # classification based on 0.5 threshold
(cm <- table("actual" = y, "predicted" = classes)) # confusion matrix
## predicted
## actual no yes
## no 32028 858
## yes 2454 1721
(accuracy_aic_forward <- (cm[1,1] + cm[2,2])/length(y)) # 0.9106338 # slightly poorer accuracy than step_AIC(both)
## [1] 0.9106338
#plotting ROC curve for forward AIC model
plot(roc1 <- pROC::roc(y, predictor = prob))
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
roc1 # Area under the curve: 0.9333
##
## Call:
## roc.default(response = y, predictor = prob)
##
## Data: prob in 32886 controls (y no) < 4175 cases (y yes).
## Area under the curve: 0.9333
## Q3.3
# Starting with a full model (i.e., a model with all possible predictors of interest), perform backward
# elimination by calling stepAIC() with direction = "backward. What is the result of the final
# model? Does it differ substantially from any of the previous models? If so, how?
(model_aic_backward <- MASS::stepAIC(full_mod_3, direction = "backward", trace = 0))
##
## Call: glm(formula = subscribed ~ education + default + contact + month +
## day_of_week + duration + campaign + poutcome + cons_price_idx +
## cons_conf_idx + nr_employed + age_group, family = binomial(link = "logit"),
## data = bank_df3)
##
## Coefficients:
## (Intercept) educationbasic.6y
## 89.138320 0.152900
## educationbasic.9y educationhigh.school
## 0.031208 0.173305
## educationilliterate educationprofessional.course
## 1.171630 0.215315
## educationuniversity.degree educationunknown
## 0.332245 0.215660
## defaultunknown defaultyes
## -0.310539 -7.473116
## contacttelephone monthapr
## -0.336962 -1.194644
## monthmay monthjun
## -1.971394 -0.657007
## monthjul monthaug
## -0.923678 -1.041207
## monthsep monthoct
## -1.656586 -1.268692
## monthnov monthdec
## -1.464629 -1.172465
## day_of_weekmon day_of_weekthu
## -0.123507 0.027428
## day_of_weektue day_of_weekwed
## 0.115619 0.129283
## duration campaign
## 0.004668 -0.043830
## poutcomenonexistent poutcomesuccess
## 0.434092 1.822442
## cons_price_idx cons_conf_idx
## -0.236366 0.011390
## nr_employed age_group25-34 yrs
## -0.013468 -0.250453
## age_group35-44 yrs age_group45-54 years
## -0.448503 -0.362089
## age_group55-64 yrs age_group65-74 yrs
## -0.141381 0.144161
## age_group75+ yrs
## 0.455487
##
## Degrees of Freedom: 37060 Total (i.e. Null); 37024 Residual
## Null Deviance: 26090
## Residual Deviance: 15520 AIC: 15590
(dev_mod_aic_backward <- model_aic_backward$deviance/model_aic_backward$df.residual) # 0.4190575
## [1] 0.4190575
AIC(model_aic_backward) # 15589.19
## [1] 15589.19
BIC(model_aic_backward) # 15904.44
## [1] 15904.44
## Variable importance using vip for aic model
vip::vi(model_aic_backward) # see ?vip::vi for details
## # A tibble: 36 x 3
## Variable Importance Sign
## <chr> <dbl> <chr>
## 1 duration 59.8 POS
## 2 nr_employed 36.2 NEG
## 3 poutcomesuccess 20.4 POS
## 4 monthmay 16.7 NEG
## 5 monthnov 11.1 NEG
## 6 monthsep 10.4 NEG
## 7 monthapr 9.65 NEG
## 8 monthoct 8.45 NEG
## 9 monthaug 7.86 NEG
## 10 monthjul 7.05 NEG
## # ... with 26 more rows
# it has dropped a few variables compared to full_mod3
# the resulting model is different than full_mod3 because of num of variables, numb of categories used for categorical variables
# and the residual deviance and AIC/BIC dropped for the step_AIC("backward") model
## total variables (after dummy encoding - 36)
# Confusion matrix (i.e., 2x2 contingency table of classification results)
y <- bank_df3$subscribed # observed classes
prob <- predict(model_aic_backward, type = "response") # predicted probabilities
classes <- ifelse(prob > 0.5, "yes", "no") # classification based on 0.5 threshold
(cm <- table("actual" = y, "predicted" = classes)) # confusion matrix
## predicted
## actual no yes
## no 32034 852
## yes 2454 1721
(accuracy_aic_backward <- (cm[1,1] + cm[2,2])/length(y)) # 0.9107957 # same accuracy than step_AIC(both)
## [1] 0.9107957
#plotting ROC curve for backward AIC model
plot(roc1 <- pROC::roc(y, predictor = prob))
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
roc1 # Area under the curve: 0.9332
##
## Call:
## roc.default(response = y, predictor = prob)
##
## Data: prob in 32886 controls (y no) < 4175 cases (y yes).
## Area under the curve: 0.9332
# Explore models that include 2-way interaction effects.
# Does it seem necessary or beneficial to include any 2-way interaction effects in this example?
# If so, which interaction effects seem relevant and predictive
## USING MARS (earth) to include 2-degree interaction effects
library(earth)
model_mars <- earth(subscribed ~ ., data = bank_df3,
degree = 2, glm = list(family = binomial(link = "logit")))
# Print summary of model fit
summary(model_mars)
## Call: earth(formula=subscribed~., data=bank_df3,
## glm=list(family=binomial(link="logit")), degree=2)
##
## GLM coefficients
## yes
## (Intercept) 1.91642807
## monthmar 2.37066546
## monthmay -0.77436546
## poutcomesuccess 2.08293961
## h(1059-duration) -0.00392420
## h(duration-1059) -0.00142287
## h(1-previous) 0.37376110
## h(nr_employed-5076.2) -0.03712948
## h(5099.1-nr_employed) 0.02331550
## h(nr_employed-5099.1) 0.02505227
## monthmar * h(duration-187) -0.00145544
## monthmar * h(187-duration) -0.01299746
## monthmar * h(5099.1-nr_employed) -0.02576873
## monthmay * h(duration-306) 0.00048880
## monthmay * h(306-duration) -0.00860072
## monthoct * h(nr_employed-5099.1) 0.07088895
## day_of_weekmon * h(5099.1-nr_employed) -0.00558505
## h(180-duration) * poutcomesuccess -0.00844786
## h(duration-180) * poutcomesuccess -0.00120520
## h(1059-duration) * h(cons_conf_idx- -40) 0.00004305
## h(275-duration) * h(5099.1-nr_employed) -0.00010386
## h(duration-275) * h(5099.1-nr_employed) -0.00001851
## h(355-duration) * h(nr_employed-5099.1) -0.00014894
## h(duration-355) * h(nr_employed-5099.1) 0.00000939
## h(3-campaign) * h(5099.1-nr_employed) 0.00084014
## h(campaign-3) * h(5099.1-nr_employed) -0.00250433
## h(92.963-cons_price_idx) * h(5099.1-nr_employed) -0.03118132
## h(cons_price_idx-92.963) * h(5099.1-nr_employed) -0.00871802
##
## GLM (family binomial, link logit):
## nulldev df dev df devratio AIC iters converged
## 26092.8 37060 13766.1 37033 0.472 13820 9 1
##
## Earth selected 28 of 30 terms, and 11 of 44 predictors
## Termination condition: RSq changed by less than 0.001 at 30 terms
## Importance: duration, nr_employed, poutcomesuccess, monthmar, monthoct, ...
## Number of terms at each degree of interaction: 1 9 18
## Earth GCV 0.05871542 RSS 2168.016 GRSq 0.4126521 RSq 0.4147897
plot(model_mars) # not terribly useful
(dev_model_mars <- 13766.1/37033) # 0.3717252
## [1] 0.3717252
## Variable importance using vip for mars model
vip::vi(model_mars)
## # A tibble: 44 x 2
## Variable Importance
## <chr> <dbl>
## 1 duration 27
## 2 nr_employed 26
## 3 poutcomesuccess 24
## 4 monthmar 22
## 5 monthoct 18
## 6 monthmay 17
## 7 previous 14
## 8 campaign 13
## 9 day_of_weekmon 8
## 10 cons_price_idx 8
## # ... with 34 more rows
# Confusion matrix (i.e., 2x2 contingency table of classification results)
y <- bank_df3$subscribed # observed classes
prob <- predict(model_mars, type = "response") # predicted probabilities
classes <- ifelse(prob > 0.5, "yes", "no") # classification based on 0.5 threshold
(cm <- table("actual" = y, "predicted" = classes)) # confusion matrix
## predicted
## actual no yes
## no 31818 1068
## yes 2077 2098
(accuracy_model_mars <- (cm[1,1] + cm[2,2])/length(y)) # 0.9151129 # slightly better accuracy than step_AIC(both)
## [1] 0.9151399
library(pROC)
#plotting ROC curve for MARS model
plot(roc1 <- pROC::roc(y, predictor = prob))
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
roc1 # Area under the curve: 0.9414
##
## Call:
## roc.default(response = y, predictor = prob)
##
## Data: prob in 32886 controls (y no) < 4175 cases (y yes).
## Area under the curve: 0.9414
## Plottting Variable Importance Plots
library(vip)
vip_full <- vip(full_mod_3, geom = "point", include_type = TRUE)
vip_aic_both <- vip(model_aic_both, geom = "point", include_type = TRUE)
vip_aic_forward <- vip(model_aic_forward, geom = "point", include_type = TRUE)
vip_aic_backward <- vip(model_aic_backward, geom = "point", include_type = TRUE)
vip_mars <- vip(model_mars, geom = "point", include_type = TRUE)
gridExtra::grid.arrange(vip_aic_both, vip_mars, nrow = 1)
## Based on best accuracy, less false positive and false negatives, and highest AUC - ROC value,
## We can chose the MARS model just based on performance.
## based on interpretability, we should go ahead with step-AIC (both directions) model since
## it has very sligthly lesser accuracy than MARS but doesn't involve interaction effect and can be explained easily
## also it has significanlty lesses important variables than other models.
# Generating partial dependence plots for explainability of the aic_forward model
library(pdp)
# Set theme for ggplot2 graphics
theme_set(theme_bw())
# These plots will be linear on the logit scale (why?) but nonlinear on the probability scale
partial(model_aic_both, pred.var = "duration", prob = TRUE, plot = TRUE, rug = TRUE,
plot.engine = "ggplot2") +
geom_rug(data = bank_df3, aes(x = duration), alpha = 0.2, inherit.aes = FALSE)
## Warning: Use of `object[[1L]]` is discouraged. Use `.data[[1L]]` instead.
## Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]`
## instead.
## Warning: Use of `x.rug[[1L]]` is discouraged. Use `.data[[1L]]` instead.
library(pdp)
pd1 <- partial(model_aic_both, pred.var = "duration", plot = TRUE, plot.engine = "ggplot2") + ggtitle("Logit scale")
pd2 <- partial(model_aic_both, pred.var = "duration", plot = TRUE, prob = TRUE, plot.engine = "ggplot2") + ggtitle("Probability scale")
gridExtra::grid.arrange(pd1, pd2, nrow = 1)
## Warning: Use of `object[[1L]]` is discouraged. Use `.data[[1L]]` instead.
## Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]`
## instead.
## Warning: Use of `object[[1L]]` is discouraged. Use `.data[[1L]]` instead.
## Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]`
## instead.
## as we can see, the higher the duration of the calls for marketing campaigns,
## higher would be the probability of the person subscribing
partial(model_aic_both, pred.var = "poutcome", prob = TRUE, plot = TRUE, rug = TRUE,
plot.engine = "ggplot2") +
geom_rug(data = bank_df3, aes(x = poutcome), alpha = 0.2, inherit.aes = FALSE)
## Warning: Use of `object[[1L]]` is discouraged. Use `.data[[1L]]` instead.
## Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]` instead.
## Plotting against original values
poutcome_df <- data.frame(table(bank_df3$poutcome, bank_df3$subscribed))
colnames(poutcome_df) <- c("poutcome","subscribed","customers")
library(ggplot2)
ggplot(data=poutcome_df, aes(x = poutcome, y = customers, fill = subscribed))+
geom_bar(stat = 'identity', position = 'dodge')+
labs(X="Result of Precious outreach",
y=NULL,
title="Success of previous marketing reach")
## It is clear that if a customer was reached earlier and they refused, they are much likely to refuse again,
## whereas if they agreed, they are more likely to subscribe this time as well.
## similarly, we can say that the important factors which will be useful for future marketing campaigns are
## duration of the call - higher the better, Call duration around 300-400s with effective persuasion could increase the chance of campaign success,
## number of employees - higher the better - aas they can reach more people for subscribing to the plan/investment
## outcome of the previous marketing campaign - successful/failure is a strong predictor,
# The campaign should concentrate on new customer as seen on chart previous campaign chart,
## that most customer who responded positively are customer who never been called prior of the campaign.
## if they were contacted between March - June last year, and consumer price index,
## if they were contacted on telephone, they are less likely to subscribe.
#Question 6
## Considering poutcome and months as variables causing leakage, we have
## to prevent leakage we have created bins for age group variable
full_mod_4 <- glm(formula = subscribed ~ .-poutcome - month, data = bank_df3, family = binomial(link="logit"))
model_aic_both_2 <- MASS::stepAIC(full_mod_4,direction = "both", trace = 0)
(dev_mod_aic_both_2 <- model_aic_both_2$deviance/model_aic_both_2$df.residual) # 0.4190575
## [1] 0.4458151
AIC(model_aic_both_2)
## [1] 16565.87
BIC(model_aic_both_2)
## [1] 16804.44
## Variable importance using vip for aic model
vip::vi(model_aic_both_2) # see ?vip::vi for details
## # A tibble: 27 x 3
## Variable Importance Sign
## <chr> <dbl> <chr>
## 1 duration 60.3 POS
## 2 nr_employed 42.3 NEG
## 3 contacttelephone 11.8 NEG
## 4 cons_conf_idx 10.4 POS
## 5 age_group35-44 yrs 6.45 NEG
## 6 defaultunknown 5.85 NEG
## 7 educationuniversity.degree 5.05 POS
## 8 age_group45-54 years 4.95 NEG
## 9 previous 4.66 POS
## 10 age_group25-34 yrs 4.00 NEG
## # ... with 17 more rows
# it has dropped a few variables compared to full_mod3
# the resulting model is different than full_mod3 because of num of variables, numb of categories used for categorical variables
# and the residual deviance and AIC/BIC dropped for the step_AIC("both") model
## total variables (after dummy encoding - 36)
# Confusion matrix (i.e., 2x2 contingency table of classification results)
y <- bank_df3$subscribed # observed classes
prob <- predict(model_aic_both_2, type = "response") # predicted probabilities
classes <- ifelse(prob > 0.5, "yes", "no") # classification based on 0.5 threshold
(cm <- table("actual" = y, "predicted" = classes)) # confusion matrix
## predicted
## actual no yes
## no 32078 808
## yes 2652 1523
(accuracy_aic_both <- (cm[1,1] + cm[2,2])/length(y)) # 0.9107957
## [1] 0.9066404
#plotting ROC curve for bi-directional AIC model
plot(roc1 <- pROC::roc(y, predictor = prob))
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
roc1 # Area under the curve: 0.9332
##
## Call:
## roc.default(response = y, predictor = prob)
##
## Data: prob in 32886 controls (y no) < 4175 cases (y yes).
## Area under the curve: 0.9201
# Top 3 predictor variables are duration, nr_employed, contacttlephone
vip_aic_both_2 <- vip(model_aic_both_2, geom = "point", include_type = TRUE)
vip_aic_both_2
partial(model_aic_both_2, pred.var = "nr_employed", prob = TRUE, plot = TRUE, rug = TRUE,
plot.engine = "ggplot2") +
geom_rug(data = bank_df3, aes(x = nr_employed), alpha = 0.2, inherit.aes = FALSE)
## Warning: Use of `object[[1L]]` is discouraged. Use `.data[[1L]]` instead.
## Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]`
## instead.
## Warning: Use of `x.rug[[1L]]` is discouraged. Use `.data[[1L]]` instead.
partial(model_aic_both_2, pred.var = "duration", prob = TRUE, plot = TRUE, rug = TRUE,
plot.engine = "ggplot2") +
geom_rug(data = bank_df3, aes(x = duration), alpha = 0.2, inherit.aes = FALSE)
## Warning: Use of `object[[1L]]` is discouraged. Use `.data[[1L]]` instead.
## Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]`
## instead.
## Warning: Use of `x.rug[[1L]]` is discouraged. Use `.data[[1L]]` instead.
data12<-read.csv('C:\\Users\\ajayd\\OneDrive\\Desktop\\bank_new.csv')
# Dimensions
dim(data12)
## [1] 4118 21
# STructure
str(data12)
## 'data.frame': 4118 obs. of 21 variables:
## $ age : int 56 41 24 54 50 59 57 44 33 55 ...
## $ job : chr "housemaid" "blue-collar" "technician" "retired" ...
## $ marital : chr "married" "married" "single" "married" ...
## $ education : chr "basic.4y" "unknown" "professional.course" "basic.9y" ...
## $ default : chr "no" "unknown" "no" "unknown" ...
## $ housing : chr "no" "no" "yes" "yes" ...
## $ loan : chr "no" "no" "no" "yes" ...
## $ contact : chr "telephone" "telephone" "telephone" "telephone" ...
## $ month : chr "may" "may" "may" "may" ...
## $ day_of_week : chr "mon" "mon" "mon" "mon" ...
## $ duration : int 261 217 380 174 353 386 212 188 273 349 ...
## $ campaign : int 1 1 1 1 1 1 1 1 1 1 ...
## $ pdays : int 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : chr "nonexistent" "nonexistent" "nonexistent" "nonexistent" ...
## $ emp_var_rate : num 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
## $ cons_price_idx: num 94 94 94 94 94 ...
## $ cons_conf_idx : num -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
## $ euribor3m : num 4.86 4.86 4.86 4.86 4.86 ...
## $ nr_employed : num 5191 5191 5191 5191 5191 ...
## $ subscribed : chr "no" "no" "no" "no" ...
# Summary of the Data
summary(data12)
## age job marital education
## Min. :18.00 Length:4118 Length:4118 Length:4118
## 1st Qu.:32.00 Class :character Class :character Class :character
## Median :38.00 Mode :character Mode :character Mode :character
## Mean :39.88
## 3rd Qu.:47.00
## Max. :98.00
## default housing loan contact
## Length:4118 Length:4118 Length:4118 Length:4118
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## month day_of_week duration campaign
## Length:4118 Length:4118 Min. : 0.0 Min. : 1.000
## Class :character Class :character 1st Qu.: 104.0 1st Qu.: 1.000
## Mode :character Mode :character Median : 179.0 Median : 2.000
## Mean : 256.6 Mean : 2.594
## 3rd Qu.: 316.0 3rd Qu.: 3.000
## Max. :3253.0 Max. :37.000
## pdays previous poutcome emp_var_rate
## Min. : 0.0 Min. :0.0000 Length:4118 Min. :-3.40000
## 1st Qu.:999.0 1st Qu.:0.0000 Class :character 1st Qu.:-1.80000
## Median :999.0 Median :0.0000 Mode :character Median : 1.10000
## Mean :964.5 Mean :0.1661 Mean : 0.09473
## 3rd Qu.:999.0 3rd Qu.:0.0000 3rd Qu.: 1.40000
## Max. :999.0 Max. :7.0000 Max. : 1.40000
## cons_price_idx cons_conf_idx euribor3m nr_employed
## Min. :92.20 Min. :-50.80 Min. :0.634 Min. :4964
## 1st Qu.:93.08 1st Qu.:-42.70 1st Qu.:1.344 1st Qu.:5099
## Median :93.44 Median :-41.80 Median :4.857 Median :5196
## Mean :93.58 Mean :-40.63 Mean :3.632 Mean :5168
## 3rd Qu.:93.99 3rd Qu.:-36.40 3rd Qu.:4.961 3rd Qu.:5228
## Max. :94.77 Max. :-26.90 Max. :5.045 Max. :5228
## subscribed
## Length:4118
## Class :character
## Mode :character
##
##
##
# Verify NA values
colSums(sapply(data12, is.na))
## age job marital education default
## 0 0 0 0 0
## housing loan contact month day_of_week
## 0 0 0 0 0
## duration campaign pdays previous poutcome
## 0 0 0 0 0
## emp_var_rate cons_price_idx cons_conf_idx euribor3m nr_employed
## 0 0 0 0 0
## subscribed
## 0
# There are no null values
# Remove duplicate rows
nrow(data12) - nrow(unique(data12)) # 9
## [1] 1
data12[,"age"] <- as.numeric(data12[,"age"])
colnames(data12)# Remove duplicate rows
## [1] "age" "job" "marital" "education"
## [5] "default" "housing" "loan" "contact"
## [9] "month" "day_of_week" "duration" "campaign"
## [13] "pdays" "previous" "poutcome" "emp_var_rate"
## [17] "cons_price_idx" "cons_conf_idx" "euribor3m" "nr_employed"
## [21] "subscribed"
nrow(data12) - nrow(unique(data12))
## [1] 1
data12 <- distinct(data12)
#dim(bank_data_train)
#bank_data_train[duplicated(bank_data_train),]
data12[,"age"] <- as.numeric(data12[,"age"])
# age distribution and grouping
library(sm)
# plot densities
sm.density.compare(data12$age, data12$subscribed, xlab="Age")
title(main="Distribution of Responses by Age")
colfill<-c(2:(2+length(levels(data12$subscribed))))
#legend('locator(1)', levels(data12), fill=colfill)
# add legend via mouse click
#colfill<-c(2:(2+length(levels(cyl.f))))
#legend(locator(1), levels(cyl.f), fill=colfill)
# grouping Age - we know min Age = 17, & max age = 98
data12["age_group"] <- cut(data12$age, c(17, 25, 34, 44, 54, 64, 74, Inf),
c(" Upto 24 years", "25 to 34 years",
"35 to 44 years", "45 to 54 years", "55 to 64 years",
"65 to 74 years", "75 years and over"), include.lowest=TRUE)
# Plotting the responses by age_group
age_gp_df <- data.frame(table(data12$age_group, data12$subscribed))
colnames(age_gp_df) <- c("age_group","subscribed","customers")
library(ggplot2)
ggplot(data=age_gp_df, aes(x = age_group, y = customers, fill = subscribed))+
geom_bar(stat = 'identity', position = 'dodge')+
labs(X="Number of Responses",
y=NULL,
title="Age wise Subsciption summary")
# Storing the Categorical and Numerical columns separately
cat_var <- names(data12 %>% select_if(~!is.numeric(.)))
num_var <- names(data12 %>% select_if(~is.numeric(.)))
# 12 categorical variables; 10 numeric variables
# Convert categorical variables into factor
data12[, cat_var] <- lapply(data12[,cat_var], as.factor)
sapply(data12[, cat_var], table)
## $job
##
## admin. blue-collar entrepreneur housemaid management
## 1086 888 142 91 292
## retired self-employed services student technician
## 178 148 378 102 692
## unemployed unknown
## 97 23
##
## $marital
##
## divorced married single unknown
## 466 2478 1169 4
##
## $education
##
## basic.4y basic.6y basic.9y high.school
## 362 218 589 986
## professional.course university.degree unknown
## 538 1235 189
##
## $default
##
## no unknown
## 3287 830
##
## $housing
##
## no unknown yes
## 1842 94 2181
##
## $loan
##
## no unknown yes
## 3381 94 642
##
## $contact
##
## cellular telephone
## 2678 1439
##
## $month
##
## apr aug dec jul jun mar may nov oct sep
## 251 634 20 751 552 51 1317 417 71 53
##
## $day_of_week
##
## fri mon thu tue wed
## 794 852 871 791 809
##
## $poutcome
##
## failure nonexistent success
## 412 3574 131
##
## $subscribed
##
## no yes
## 3653 464
##
## $age_group
##
## Upto 24 years 25 to 34 years 35 to 44 years 45 to 54 years
## 173 1335 1338 862
## 55 to 64 years 65 to 74 years 75 years and over
## 354 28 27
# Providing specific Order to Month factors
data12$month <- factor(data12$month,
levels = c("jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"))
# age distribution and grouping
library(sm)
library(dplyr)
data12<-distinct(data12)
(redun <- Hmisc::redun(~., data = data12, nk = 0))
##
## Redundancy Analysis
##
## Hmisc::redun(formula = ~., data = data12, nk = 0)
##
## n: 4117 p: 22 nk: 0
##
## Number of NAs: 0
##
## Transformation of target variables forced to be linear
##
## R-squared cutoff: 0.9 Type: ordinary
##
## R^2 with which each variable can be predicted from all other variables:
##
## age job marital education default
## 0.941 0.444 0.246 0.458 0.151
## housing loan contact month day_of_week
## 1.000 1.000 0.733 0.938 0.033
## duration campaign pdays previous poutcome
## 0.217 0.062 0.928 0.825 0.919
## emp_var_rate cons_price_idx cons_conf_idx euribor3m nr_employed
## 0.996 0.989 0.860 0.995 0.995
## subscribed age_group
## 0.376 0.938
##
## Rendundant variables:
##
## housing emp_var_rate euribor3m age pdays
##
## Predicted from variables:
##
## job marital education default loan contact month day_of_week duration campaign previous poutcome cons_price_idx cons_conf_idx nr_employed subscribed age_group
##
## Variable Deleted R^2 R^2 after later deletions
## 1 housing 1.000 1 1 1 1
## 2 emp_var_rate 0.996 0.996 0.996 0.996
## 3 euribor3m 0.995 0.995 0.995
## 4 age 0.941 0.94
## 5 pdays 0.927
# based on this, we can drop variables - ['housing','age', 'emp_var_rate', 'euribor3m', 'pdays'] in first iteration
data12 = dplyr::select(data12, -c('housing','age', 'emp_var_rate', 'euribor3m', 'pdays'))
(redun <- Hmisc::redun(~., data = data12, nk = 0))
##
## Redundancy Analysis
##
## Hmisc::redun(formula = ~., data = data12, nk = 0)
##
## n: 4117 p: 17 nk: 0
##
## Number of NAs: 0
##
## Transformation of target variables forced to be linear
##
## R-squared cutoff: 0.9 Type: ordinary
##
## R^2 with which each variable can be predicted from all other variables:
##
## job marital education default loan
## 0.443 0.230 0.458 0.149 0.013
## contact month day_of_week duration campaign
## 0.709 0.672 0.025 0.216 0.059
## previous poutcome cons_price_idx cons_conf_idx nr_employed
## 0.803 0.799 0.682 0.570 0.708
## subscribed age_group
## 0.362 0.464
##
## No redundant variables
## Looks good - No redundant variables
## lets fit a full model on this df and check VIF
full_mod_8 <- glm(formula = subscribed ~ ., data = data12, family = binomial(link = "logit"))
model_aic_both_2 <- MASS::stepAIC(full_mod_8,direction = "both", trace = 0)
(dev_mod_aic_both_2 <- model_aic_both_2$deviance/model_aic_both_2$df.residual)
## [1] 0.414179
summary(full_mod_8)
##
## Call:
## glm(formula = subscribed ~ ., family = binomial(link = "logit"),
## data = data12)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.7993 -0.3004 -0.1859 -0.1199 3.3225
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 8.541e+01 1.411e+01 6.055 1.40e-09 ***
## jobblue-collar -4.726e-01 2.595e-01 -1.821 0.068536 .
## jobentrepreneur -4.573e-01 4.121e-01 -1.110 0.267185
## jobhousemaid -1.931e+00 8.547e-01 -2.260 0.023848 *
## jobmanagement -6.966e-01 2.940e-01 -2.370 0.017803 *
## jobretired -1.178e+00 4.708e-01 -2.503 0.012332 *
## jobself-employed -6.788e-01 3.851e-01 -1.762 0.077988 .
## jobservices -4.624e-01 2.679e-01 -1.726 0.084406 .
## jobstudent -3.198e-02 3.536e-01 -0.090 0.927945
## jobtechnician -4.898e-01 2.303e-01 -2.127 0.033437 *
## jobunemployed -2.475e-02 3.763e-01 -0.066 0.947554
## jobunknown -4.511e+00 1.503e+00 -3.002 0.002679 **
## maritalmarried -8.937e-02 2.262e-01 -0.395 0.692773
## maritalsingle -1.530e-01 2.516e-01 -0.608 0.543276
## maritalunknown -1.155e+01 3.725e+02 -0.031 0.975268
## educationbasic.6y 3.116e-01 4.067e-01 0.766 0.443639
## educationbasic.9y -8.798e-02 3.220e-01 -0.273 0.784700
## educationhigh.school -1.574e-01 3.216e-01 -0.489 0.624584
## educationprofessional.course 3.108e-01 3.470e-01 0.896 0.370505
## educationuniversity.degree 1.508e-01 3.179e-01 0.474 0.635324
## educationunknown 5.220e-01 3.991e-01 1.308 0.190855
## defaultunknown -3.731e-01 2.225e-01 -1.677 0.093598 .
## loanunknown 1.839e-01 4.384e-01 0.419 0.674864
## loanyes 4.055e-01 1.706e-01 2.377 0.017450 *
## contacttelephone -2.383e-01 2.316e-01 -1.029 0.303376
## monthapr -1.200e+00 3.908e-01 -3.070 0.002139 **
## monthmay -2.006e+00 3.711e-01 -5.405 6.49e-08 ***
## monthjun -1.001e+00 4.026e-01 -2.485 0.012953 *
## monthjul -1.054e+00 4.185e-01 -2.520 0.011748 *
## monthaug -1.053e+00 4.224e-01 -2.493 0.012671 *
## monthsep -1.807e+00 5.066e-01 -3.566 0.000362 ***
## monthoct -1.280e+00 4.683e-01 -2.734 0.006255 **
## monthnov -1.431e+00 4.164e-01 -3.436 0.000589 ***
## monthdec -1.478e+00 6.363e-01 -2.323 0.020154 *
## day_of_weekmon -2.423e-01 2.087e-01 -1.161 0.245712
## day_of_weekthu -7.189e-03 2.031e-01 -0.035 0.971758
## day_of_weektue -3.307e-01 2.209e-01 -1.497 0.134315
## day_of_weekwed 2.451e-01 2.029e-01 1.208 0.227018
## duration 4.963e-03 2.457e-04 20.201 < 2e-16 ***
## campaign -7.261e-02 4.063e-02 -1.787 0.073951 .
## previous -8.831e-02 1.705e-01 -0.518 0.604507
## poutcomenonexistent 4.390e-01 2.925e-01 1.501 0.133433
## poutcomesuccess 1.877e+00 2.864e-01 6.556 5.52e-11 ***
## cons_price_idx -1.781e-01 1.436e-01 -1.240 0.214827
## cons_conf_idx 2.131e-02 1.715e-02 1.243 0.213940
## nr_employed -1.362e-02 1.127e-03 -12.088 < 2e-16 ***
## age_group25 to 34 years -1.920e-01 3.111e-01 -0.617 0.537060
## age_group35 to 44 years -2.467e-01 3.310e-01 -0.745 0.456046
## age_group45 to 54 years -3.991e-01 3.550e-01 -1.124 0.260973
## age_group55 to 64 years 3.277e-02 4.063e-01 0.081 0.935712
## age_group65 to 74 years 8.509e-01 7.039e-01 1.209 0.226734
## age_group75 years and over 1.151e+00 7.354e-01 1.565 0.117561
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2899.4 on 4116 degrees of freedom
## Residual deviance: 1673.3 on 4065 degrees of freedom
## AIC: 1777.3
##
## Number of Fisher Scoring iterations: 13
(dev_2 <- full_mod_8$deviance/full_mod_8$df.residual)
## [1] 0.4116262
AIC(full_mod_8) # 15600.84
## [1] 1777.26
BIC(full_mod_8) # 16060.94
## [1] 2106.05
# checking VIFs using the full Model 2
library(car)
vif(full_mod_8) # VIF scores generated
## GVIF Df GVIF^(1/(2*Df))
## job 13.922559 11 1.127164
## marital 1.485918 3 1.068233
## education 3.666261 6 1.114342
## default 1.121762 1 1.059132
## loan 1.062537 2 1.015280
## contact 2.003222 1 1.415352
## month 6.116761 9 1.105849
## day_of_week 1.162667 4 1.019018
## duration 1.289509 1 1.135565
## campaign 1.067903 1 1.033394
## previous 3.808494 1 1.951536
## poutcome 3.992456 2 1.413546
## cons_price_idx 2.061920 1 1.435939
## cons_conf_idx 2.447758 1 1.564531
## nr_employed 2.261897 1 1.503960
## age_group 5.800441 6 1.157769
# removing the leakage variables viz. poutcomes, month on the basis if the training model
data12 <- dplyr::select(bank_df2, -c('poutcome','month'))
full_mod_9 <- glm(formula = subscribed ~ ., data = data12, family = binomial(link = "logit"))
summary(full_mod_9)
##
## Call:
## glm(formula = subscribed ~ ., family = binomial(link = "logit"),
## data = data12)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.9172 -0.3474 -0.2035 -0.1359 3.1822
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 6.192e+01 3.690e+00 16.780 < 2e-16 ***
## jobblue-collar -3.039e-01 8.142e-02 -3.732 0.000190 ***
## jobentrepreneur -2.107e-01 1.278e-01 -1.649 0.099214 .
## jobhousemaid 5.089e-02 1.454e-01 0.350 0.726315
## jobmanagement -6.061e-02 8.608e-02 -0.704 0.481377
## jobretired 1.642e-01 1.235e-01 1.329 0.183960
## jobself-employed -1.298e-01 1.200e-01 -1.082 0.279084
## jobservices -2.374e-01 8.801e-02 -2.697 0.006993 **
## jobstudent 1.891e-01 1.209e-01 1.564 0.117841
## jobtechnician 9.677e-05 7.160e-02 0.001 0.998922
## jobunemployed 1.421e-01 1.299e-01 1.094 0.273915
## jobunknown 1.242e-01 2.313e-01 0.537 0.591137
## maritalmarried 2.637e-02 6.979e-02 0.378 0.705573
## maritalsingle 8.730e-02 7.958e-02 1.097 0.272628
## maritalunknown 1.028e-01 3.958e-01 0.260 0.795139
## educationbasic.6y 1.531e-01 1.241e-01 1.234 0.217382
## educationbasic.9y 9.661e-03 9.774e-02 0.099 0.921259
## educationhigh.school 8.198e-02 9.447e-02 0.868 0.385494
## educationilliterate 1.359e+00 7.098e-01 1.914 0.055623 .
## educationprofessional.course 1.573e-01 1.039e-01 1.514 0.130018
## educationuniversity.degree 3.050e-01 9.454e-02 3.227 0.001252 **
## educationunknown 1.546e-01 1.233e-01 1.254 0.209877
## defaultunknown -3.828e-01 6.913e-02 -5.537 3.07e-08 ***
## defaultyes -7.892e+00 1.132e+02 -0.070 0.944415
## loanunknown -1.203e-01 1.413e-01 -0.851 0.394641
## loanyes -1.162e-01 5.890e-02 -1.974 0.048433 *
## contacttelephone -6.739e-01 6.215e-02 -10.843 < 2e-16 ***
## day_of_weekmon -4.911e-02 6.711e-02 -0.732 0.464321
## day_of_weekthu 6.069e-02 6.517e-02 0.931 0.351790
## day_of_weektue 1.651e-01 6.611e-02 2.497 0.012521 *
## day_of_weekwed 1.189e-01 6.669e-02 1.783 0.074651 .
## duration 4.548e-03 7.592e-05 59.905 < 2e-16 ***
## campaign -3.998e-02 1.197e-02 -3.341 0.000835 ***
## previous 1.368e-01 3.197e-02 4.280 1.87e-05 ***
## cons_price_idx 3.970e-02 3.846e-02 1.032 0.302011
## cons_conf_idx 3.597e-02 3.860e-03 9.317 < 2e-16 ***
## nr_employed -1.313e-02 3.168e-04 -41.434 < 2e-16 ***
## age_group25-34 yrs -2.479e-01 9.820e-02 -2.524 0.011595 *
## age_group35-44 yrs -4.452e-01 1.049e-01 -4.244 2.20e-05 ***
## age_group45-54 years -3.336e-01 1.118e-01 -2.983 0.002851 **
## age_group55-64 yrs -1.150e-01 1.260e-01 -0.912 0.361661
## age_group65-74 yrs 6.626e-02 1.874e-01 0.354 0.723689
## age_group75+ yrs 3.976e-01 2.128e-01 1.868 0.061735 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 26093 on 37060 degrees of freedom
## Residual deviance: 16468 on 37018 degrees of freedom
## AIC: 16554
##
## Number of Fisher Scoring iterations: 10
(dev_3 <- full_mod_9$deviance/full_mod_9$df.residual) # 0.41898)
## [1] 0.444857
AIC(full_mod_9) # 15595.82
## [1] 16553.72
BIC(full_mod_9)
## [1] 16920.09
vif(full_mod_9)
## GVIF Df GVIF^(1/(2*Df))
## job 8.680997 11 1.103220
## marital 1.485943 3 1.068236
## education 3.254095 7 1.087933
## default 1.132008 2 1.031484
## loan 1.004695 2 1.001172
## contact 1.437132 1 1.198805
## day_of_week 1.027927 4 1.003449
## duration 1.209917 1 1.099962
## campaign 1.035337 1 1.017515
## previous 1.335185 1 1.155502
## cons_price_idx 1.444268 1 1.201777
## cons_conf_idx 1.278745 1 1.130816
## nr_employed 1.683525 1 1.297507
## age_group 4.325551 6 1.129805
## Variable importance using vip for mars model
vip::vi(full_mod_9)
## # A tibble: 42 x 3
## Variable Importance Sign
## <chr> <dbl> <chr>
## 1 duration 59.9 POS
## 2 nr_employed 41.4 NEG
## 3 contacttelephone 10.8 NEG
## 4 cons_conf_idx 9.32 POS
## 5 defaultunknown 5.54 NEG
## 6 previous 4.28 POS
## 7 age_group35-44 yrs 4.24 NEG
## 8 jobblue-collar 3.73 NEG
## 9 campaign 3.34 NEG
## 10 educationuniversity.degree 3.23 POS
## # ... with 32 more rows
# Confusion matrix (i.e., 2x2 contingency table of classification results)
y2 <- data12$subscribed # observed classes
prob8 <- predict(full_mod_9, type = "response") # predicted probabilities
classes <- ifelse(prob8 > 0.5, "yes", "no") # classification based on 0.5 threshold
(cm <- table("actual" = y2, "predicted" = classes)) # confusion matrix
## predicted
## actual no yes
## no 32052 834
## yes 2631 1544
(accuracy_model <- (cm[1,1] + cm[2,2])/length(y2))
## [1] 0.9065055
library(pROC)
#plotting ROC curve for our final model
plot(roc1 <- pROC::roc(y2, predictor = prob8))
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
roc1 # Area under the curve: 0.9414
##
## Call:
## roc.default(response = y2, predictor = prob8)
##
## Data: prob8 in 32886 controls (y2 no) < 4175 cases (y2 yes).
## Area under the curve: 0.921
## Plottting Variable Importance Plots
library(vip)
vip_aic_both_9 <- vip(full_mod_9, geom = "point", include_type = TRUE)
# Based on the plot, duration is the most important factor influencing our model
# Function to compute the Brier score; the Brier score is an example of a
# *proper scoring rule* and is a better performance metric than AUROC when
# probabilities are of interest (which they usually are)
bs <- function(y2, prob8) { # this is a relative measure, like log loss/binomial deviance, SSE/RMSE, AIC/BIC, etc.
mean((prob8 - y2) ^ 2) # y should be in {0, 1}
}
# Compute Brier score for testing model
y01 <- ifelse(y == "yes", 1, 0)
fit0 <- glm(subscribed ~ 1, data = na.omit(data12), family = binomial(link = "logit"))
bs(y01, prob = predict(fit0, type = "response"))
## [1] 0.09996162
bs(y01, prob = predict(full_mod_9, type = "response"))
## [1] 0.06665323
#bs(y01, prob = predict(lr.enet, newx = X, type = "response", s = "lambda.min")[, 1L, drop = TRUE])
## caliberation curve
# Check calibration of the model
rms::val.prob(prob, y = y01)
## Dxy C (ROC) R2 D D:Chi-sq
## 8.401664e-01 9.200832e-01 4.508033e-01 2.585438e-01 9.582892e+03
## D:p U U:Chi-sq U:p Q
## NA -5.396508e-05 -2.921297e-09 1.000000e+00 2.585978e-01
## Brier Intercept Slope Emax E90
## 6.676875e-02 6.185606e-08 1.000000e+00 2.065730e-01 6.045818e-02
## Eavg S:z S:p
## 2.314654e-02 2.823507e+00 4.750134e-03
library(rms)
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:TeachingDemos':
##
## cnvrt.coords, subplot
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following object is masked from 'package:testthat':
##
## describe
## The following objects are masked from 'package:base':
##
## format.pval, units
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
##
## Attaching package: 'rms'
## The following objects are masked from 'package:car':
##
## Predict, vif
# The Probability does not seems to be well calibrated for the training model
# We can see the deviaiton in the graph
head(data12)
## job marital education default loan contact day_of_week
## 1 services married high.school unknown no telephone mon
## 2 services married high.school no no telephone mon
## 3 admin. married basic.6y no no telephone mon
## 4 services married high.school no yes telephone mon
## 5 services married basic.9y unknown no telephone mon
## 6 admin. married professional.course no no telephone mon
## duration campaign previous cons_price_idx cons_conf_idx nr_employed
## 1 149 1 0 93.994 -36.4 5191
## 2 226 1 0 93.994 -36.4 5191
## 3 151 1 0 93.994 -36.4 5191
## 4 307 1 0 93.994 -36.4 5191
## 5 198 1 0 93.994 -36.4 5191
## 6 139 1 0 93.994 -36.4 5191
## subscribed age_group
## 1 no 55-64 yrs
## 2 no 35-44 yrs
## 3 no 35-44 yrs
## 4 no 55-64 yrs
## 5 no 45-54 years
## 6 no 55-64 yrs
# Function to compute lift and cumulative gain charts
lift <- function(prob8, y2, pos.class = NULL, cumulative = TRUE) {
if (!all(sort(unique(y2)) == c(0, 1))) {
if (is.null(pos.class)) {
stop("A value for `pos.class` is required whenever `y` is not a 0/1 ",
"outcome.", call. = FALSE)
}
y2 <- ifelse(y2 == pos.class, 1, 0)
}
ord <- order(prob8, decreasing = TRUE)
prob <- prob8[ord]
y2 <- y2[ord]
prop <- seq_along(y2) / length(y2)
lift <- if (isTRUE(cumulative)) {
cumsum(y2)
} else {
(cumsum(y2) / seq_along(y2)) / mean(y2)
}
structure(list("lift" = lift, "prop" = prop, "cumulative" = cumulative,
"y" = y), class = "lift")
}
# Cumulative gain chart; what does this plot tell usTable0 <- data.frame(prob1,test_subscribed)
Table0 <- data.frame(prob8,y2)
top_500_households <- Table0 %>%
arrange(-prob8) %>%
top_n(n = 500,wt = prob8)
head(top_500_households)
## prob8 y2
## 21680 1.0000000 no
## 36472 0.9999997 no
## 19980 0.9999986 yes
## 32432 0.9999973 no
## 12513 0.9999939 yes
## 7026 0.9999928 yes
l <- lift(prob8, y = y2, pos.class = "yes")
plot(l[["prop"]], l[["lift"]], type = "l",
xlab = "Proportion of sample", ylab = "Cumulative lift",
las = 1, lwd = 2, col = 2)
abline(0, sum(y == "yes"), lty = 2)
# After 30% of the sample, the cumulative lift is getting constant