# This code chunk loads the required packages; the chunk options specify
# whether the R code, warnings, and output will be included in the output files.
#if (!require("knitr")) {
# install.packages("knitr")
library(knitr)
#}
#if (!require("MASS")) {
# install.packages("MASS")
library(MASS)
#}
#install.packages("glmnet")
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-7
#if (!require("TSstudio")) {
# install.packages("TSstudio")
library(TSstudio)
#}
#if (!require("gmodels")) {
# install.packages("gmodels")
library(gmodels)
#}
#if (!require("recipes")) {
# install.packages("recipes")
library(recipes)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## Attaching package: 'recipes'
## The following object is masked from 'package:Matrix':
##
## update
## The following object is masked from 'package:stats':
##
## step
#if (!require("tidyverse")) install.packages("tidyverse")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ ggplot2 3.4.2 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ stringr::fixed() masks recipes::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tidyr::pack() masks Matrix::pack()
## ✖ dplyr::select() masks MASS::select()
## ✖ tidyr::unpack() masks Matrix::unpack()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#if (!require("GGally")) {
# install.packages("GGally")
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
#install.packages("neuralnet")
library(neuralnet)
##
## Attaching package: 'neuralnet'
##
## The following object is masked from 'package:dplyr':
##
## compute
#install.packages("pROC")
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following object is masked from 'package:gmodels':
##
## ci
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
#install.packages("rpart")
library(rpart)
#installed.packages("pander")
#library(pander)
#install.packages("CRAN")
library(digest)
library(Rcpp)
#install.packages("pander")
library(pander)
##
## Attaching package: 'pander'
##
## The following object is masked from 'package:GGally':
##
## wrap
#install.packages("rpart.plot")
library(rpart.plot)
#reading in dataset
BankMarketingNA = read.csv("https://pengdsci.github.io/datasets/BankMarketing/BankMarketingCSV.csv")[ , -1]
#getting dimensions to make sure all observations were included
dim(BankMarketingNA)
## [1] 45211 17
The goal for this data set is to predict which customers will subscribe to a term deposit based on characteristics of the customer and the methods used by the marketing campaign. The data set contains 45,211 observations and 17 variables, and no missing or null values were found in any variable. The variables and their descriptions are listed below:
1 - age: age of customer (numeric)
2 - job : type of job (categorical: “admin.”,“unknown”,“unemployed”,“management”,“housemaid”,“entrepreneur”,“student”, “blue-collar”, “self-employed”, “retired”, “technician”, “services”)
3 - marital : marital status (categorical: “married”, “divorced”, “single”; note: “divorced” means divorced or widowed)
4 - education: education level (categorical: “unknown”, “secondary”, “primary”, “tertiary”)
5 - default: has credit in default? (binary: “yes”, “no”)
6 - balance: average yearly balance, in euros (numeric)
7 - housing: has housing loan? (binary: “yes”, “no”)
8 - loan: has personal loan? (binary: “yes”, “no”)
9 - contact: contact communication type (categorical: “unknown”, “telephone”, “cellular”)
10 - day: last contact day of the month (numeric)
11 - month: last contact month of year (categorical: “jan”, “feb”, “mar”, …, “nov”, “dec”)
12 - duration: last contact duration, in seconds (numeric)
13 - campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
14 - pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric, -1 means client was not previously contacted)
15 - previous: number of contacts performed before this campaign and for this client (numeric)
16 - poutcome: outcome of the previous marketing campaign (categorical: “unknown”, “other”, “failure”, “success”)
Output variable:
17 - y: has the client subscribed to a term deposit? (binary: “yes”, “no”)
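A quick check supports the claim that there are no missing values; this is a minimal sketch (anyNA(BankMarketingNA) would also work):
#counting missing values in each column
colSums(is.na(BankMarketingNA))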
#getting a look at the data set
summary(BankMarketingNA)
## age job marital education
## Min. :18.00 Length:45211 Length:45211 Length:45211
## 1st Qu.:33.00 Class :character Class :character Class :character
## Median :39.00 Mode :character Mode :character Mode :character
## Mean :40.94
## 3rd Qu.:48.00
## Max. :95.00
## default balance housing loan
## Length:45211 Min. : -8019 Length:45211 Length:45211
## Class :character 1st Qu.: 72 Class :character Class :character
## Mode :character Median : 448 Mode :character Mode :character
## Mean : 1362
## 3rd Qu.: 1428
## Max. :102127
## contact day month duration
## Length:45211 Min. : 1.00 Length:45211 Min. : 0.0
## Class :character 1st Qu.: 8.00 Class :character 1st Qu.: 103.0
## Mode :character Median :16.00 Mode :character Median : 180.0
## Mean :15.81 Mean : 258.2
## 3rd Qu.:21.00 3rd Qu.: 319.0
## Max. :31.00 Max. :4918.0
## campaign pdays previous poutcome
## Min. : 1.000 Min. : -1.0 Min. : 0.0000 Length:45211
## 1st Qu.: 1.000 1st Qu.: -1.0 1st Qu.: 0.0000 Class :character
## Median : 2.000 Median : -1.0 Median : 0.0000 Mode :character
## Mean : 2.764 Mean : 40.2 Mean : 0.5803
## 3rd Qu.: 3.000 3rd Qu.: -1.0 3rd Qu.: 0.0000
## Max. :63.000 Max. :871.0 Max. :275.0000
## y
## Length:45211
## Class :character
## Mode :character
##
##
##
#shortening the names of the variables
BankMarketingNA$edu <- BankMarketingNA$education
BankMarketingNA$hous. <- BankMarketingNA$housing
BankMarketingNA$mar <- BankMarketingNA$marital
BankMarketingNA$pout <- BankMarketingNA$poutcome
BankMarketingNA$cont <- BankMarketingNA$contact
BankMarketingNA$def <- BankMarketingNA$default
#getting a look at frequency tables for categorical variables
CrossTable(BankMarketingNA$mar)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 45211
##
##
## | divorced | married | single |
## |-----------|-----------|-----------|
## | 5207 | 27214 | 12790 |
## | 0.115 | 0.602 | 0.283 |
## |-----------|-----------|-----------|
##
##
##
##
CrossTable(BankMarketingNA$edu)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 45211
##
##
## | primary | secondary | tertiary | unknown |
## |------------|------------|------------|------------|
## | 6851 | 23202 | 13301 | 1857 |
## | 0.152 | 0.513 | 0.294 | 0.041 |
## |------------|------------|------------|------------|
##
##
##
##
CrossTable(BankMarketingNA$def)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 45211
##
##
## | no | yes |
## |-----------|-----------|
## | 44396 | 815 |
## | 0.982 | 0.018 |
## |-----------|-----------|
##
##
##
##
CrossTable(BankMarketingNA$hous.)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 45211
##
##
## | no | yes |
## |-----------|-----------|
## | 20081 | 25130 |
## | 0.444 | 0.556 |
## |-----------|-----------|
##
##
##
##
CrossTable(BankMarketingNA$loan)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 45211
##
##
## | no | yes |
## |-----------|-----------|
## | 37967 | 7244 |
## | 0.840 | 0.160 |
## |-----------|-----------|
##
##
##
##
CrossTable(BankMarketingNA$cont)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 45211
##
##
## | cellular | telephone | unknown |
## |------------|------------|------------|
## | 29285 | 2906 | 13020 |
## | 0.648 | 0.064 | 0.288 |
## |------------|------------|------------|
##
##
##
##
CrossTable(BankMarketingNA$month)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 45211
##
##
## | apr | aug | dec | feb | jan |
## |-----------|-----------|-----------|-----------|-----------|
## | 2932 | 6247 | 214 | 2649 | 1403 |
## | 0.065 | 0.138 | 0.005 | 0.059 | 0.031 |
## |-----------|-----------|-----------|-----------|-----------|
##
##
## | jul | jun | mar | may | nov |
## |-----------|-----------|-----------|-----------|-----------|
## | 6895 | 5341 | 477 | 13766 | 3970 |
## | 0.153 | 0.118 | 0.011 | 0.304 | 0.088 |
## |-----------|-----------|-----------|-----------|-----------|
##
##
## | oct | sep |
## |-----------|-----------|
## | 738 | 579 |
## | 0.016 | 0.013 |
## |-----------|-----------|
##
##
##
##
CrossTable(BankMarketingNA$pout)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 45211
##
##
## | failure | other | success | unknown |
## |-----------|-----------|-----------|-----------|
## | 4901 | 1840 | 1511 | 36959 |
## | 0.108 | 0.041 | 0.033 | 0.817 |
## |-----------|-----------|-----------|-----------|
##
##
##
##
After confirming that there were no missing or null values, each variable's distribution was examined individually to identify outliers. We created box plots for the numerical variables and found outliers in age, balance, duration, campaign, pdays, and previous; the outliers appear as the individual points beyond the whiskers in the box plots below. The only variable whose box plot shows no outliers is day, which also appears roughly symmetric, meaning the day variable will not need a transformation, unlike the other variables.
par(mfrow=c(3,2))
#changing units for duration to minutes instead of seconds
BankMarketingNA$duration_min = round(BankMarketingNA$duration/60, 2)
#looking at boxplot for numerical variables that showed potential outliers
boxplot(BankMarketingNA$age)
title("Age Boxplot")
boxplot(BankMarketingNA$balance)
title("Balance Boxplot")
boxplot(BankMarketingNA$duration_min)
title("Duration Boxplot")
boxplot(BankMarketingNA$campaign)
title("Campaign Boxplot")
boxplot(BankMarketingNA$pdays)
title("Pdays Boxplot")
boxplot(BankMarketingNA$day)
title("Day Boxplot")
Boxplots for EDA
Before making any transformations, normality was checked for each variable using histograms and the p-values of normality tests. The histogram of age was right-skewed due to the outliers, so a log transformation was applied to produce a more normal distribution and reduce the influence of the outliers. The balance variable was also right-skewed, but because some values were negative or zero, the log transformation could not be used; instead, the cube root of each value was taken. After the cube root transformation, the distribution was still right-skewed, but less so than before. Because raising a negative number to the power 1/3 with ^ returns NaN in R, this transformation produced missing values for the negative balances, and those observations were removed from the data. Finally, the duration variable was also right-skewed, but after a log transformation its distribution looked more normal and its outliers were reduced as well.
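For reference, a signed cube root would keep the negative balances instead of producing NaN values; the sketch below is an alternative that this report does not use.
#signed cube root: preserves the sign and avoids NaN for negative inputs
signed_cbrt = function(x) sign(x)*abs(x)^(1/3)
#BankMarketingNA$new_bal = signed_cbrt(BankMarketingNA$balance) #alternative to the transformation below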
#getting histograms to check normality of numerical features
par(mfrow=c(3,2))
####
hist(BankMarketingNA$age, main="Age")
hist(BankMarketingNA$balance, main = "Balance")
hist(BankMarketingNA$duration_min, main = "Duration")
hist(BankMarketingNA$campaign, main = "Campaign")
hist(BankMarketingNA$day, main = "Day")
hist(BankMarketingNA$previous, main = "Previous")
Histograms for EDA
#transforming variables as needed
#log transformation on age to get rid of right skew
v_age <- c(BankMarketingNA$age)
BankMarketingNA$trans_age <- log(v_age + 1)
#cube root transformation for balance to fix right skew despite negative and zero values
#caution: ^(1/3) returns NaN for negative balances, creating the missing values removed later
BankMarketingNA$new_bal = BankMarketingNA$balance^(1/3)
#transforming duration_min with the log transformation to get rid of right skew
v_duration_min <- c(BankMarketingNA$duration_min)
BankMarketingNA$dur_min <- log(v_duration_min + 1)
#grouping pdays
BankMarketingNA$grp_pd <- ifelse(BankMarketingNA$pdays == -1, "-1", "1+")
#grouping previous variable
BankMarketingNA$grp_pre <- ifelse(BankMarketingNA$previous == 0, "0", "1+")
#grouping months
month = gsub(" ", "",BankMarketingNA$month) # remove all white space in the string!!!
BankMarketingNA$grp_mon <- ifelse(month %in% c('dec', 'jan', 'feb'), 'Wint',
ifelse(month %in% c('sep', 'oct', 'nov'), 'Fall',
ifelse(month %in% c('apr','mar','may'), 'Spri', 'Summ')))
#table(BankMarketing$grp_mon)
#grouping jobs
job = gsub(" ", "",BankMarketingNA$job) # remove all white space in the string!!!
BankMarketingNA$grp_job <- ifelse(job %in% c("unknown", "unemployed", "retired", "student"),'Unk./Unemp.' ,
ifelse(job %in% c("admin.", "management", "self-employed", "entrepreneur"), 'Bus.',
'B.-Col.-Serv.'
))
#table(BankMarketing$grp_job)
#grouping campaign
BankMarketingNA$grp_cmpn <- ifelse(BankMarketingNA$campaign == 1 ,'1',
ifelse(BankMarketingNA$campaign == 2, '2',
ifelse(BankMarketingNA$campaign == 3, '3',
ifelse(BankMarketingNA$campaign >= 4, '4-63', BankMarketingNA$campaign
))))
#table(BankMarketing$grp_cmpn)
#rechecking normality of transformed variables
par(mfrow=c(2,2))
hist(BankMarketingNA$trans_age, main = "Transformed Age")
hist(BankMarketingNA$new_bal, main = "Transformed Balance")
hist(BankMarketingNA$dur_min, main = "Transformed Duration")
Histograms for Transformed Features
The campaign variable was more difficult because none of the log, square root, or cube root transformations normalized its distribution. Instead, the variable was grouped into categories to remove the sparse values. Similarly, for the pdays and previous variables, about 80% of customers had not been previously contacted, so each was grouped into two categories, not contacted and contacted, again to remove the sparse groups. The new groups and the number of observations in each group can be seen below.
par(mfrow=c(3,1))
pre_sum <- BankMarketingNA %>% count(grp_pre)
pre_n <- t(pre_sum$n)
kable(x = pre_n, col.names = c("Not Previously Contacted (0)", "Previously Contacted (1+)"), caption = "Newly Formed Groups for Variable: Previous")
| Not Previously Contacted (0) | Previously Contacted (1+) |
|---|---|
| 36954 | 8257 |
pd_sum <- BankMarketingNA %>% count(grp_pd)
pd_n <- t(pd_sum$n)
kable(x = pd_n, col.names = c("Not Previously Contacted (-1)", "Previously Contacted one or more days ago"), caption = "Newly Formed Groups for Variable: Pdays")
| Not Previously Contacted (-1) | Previously Contacted one or more days ago |
|---|---|
| 36954 | 8257 |
cmpn_sum <- BankMarketingNA %>% count(grp_cmpn)
cmpn_n <- t(cmpn_sum$n)
kable(x = cmpn_n, col.names = c("Contacted 1 Time", "Contacted 2 Times", "Contacted 3 Times", "Contacted 4 or More Times"), caption = "Newly Formed Groups for Variable: Campaign")
| Contacted 1 Time | Contacted 2 Times | Contacted 3 Times | Contacted 4 or More Times |
|---|---|---|---|
| 17544 | 12505 | 5521 | 9641 |
Next, pairwise comparisons were performed with a pairwise scatter plot for each numeric variable, excluding the newly grouped variables campaign, previous, and pdays. The scatter plots below all show a similar pattern, with the red density curves following different paths than the blue curves, providing evidence of only small correlations between the compared numerical variables. Furthermore, the correlation values listed in grey are fairly low as well. A low correlation for each comparison indicates that all of the variables should be included in subsequent models and algorithms, because each provides distinct information.
#creating a data set with only numeric variables, and outcome
numericVar = BankMarketingNA[, c("new_bal", "dur_min", "trans_age", "day", "y")]
#taking a random sample of 400 observations
randsamp.numericVar <- sample_n(numericVar, 400)
#pairwise comparison
ggpairs(randsamp.numericVar,
columns = 1:4,
aes(color = y, # Color by group (cat. variable)
alpha = 0.5))
## Warning: Removed 23 rows containing non-finite values (`stat_density()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 23 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 23 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 23 rows containing missing values
## Warning: Removed 23 rows containing missing values (`geom_point()`).
## Removed 23 rows containing missing values (`geom_point()`).
## Removed 23 rows containing missing values (`geom_point()`).
include_graphics("https://github.com/MT24STA490/STA-551/blob/main/pairwise-plot.png?raw=true")
Pairwise Plots
None of the categorical variables contained missing or null values, so the next step was to reduce the number of levels by grouping the variables with many categories. Both job and month had more than 10 categories, so month was grouped by season and job was grouped into broader job types. One new category, Unknown/Unemployed, includes students, retired, unemployed, and unknown; another, Blue-collar/Services, includes blue-collar, services, housemaid, and technician; the last, Business, includes admin., entrepreneur, management, and self-employed. Below are the newly grouped categorical variables and the number of observations included in each new group.
#putting new groups of variables into tables for display
par(mfrow=c(2,1))
pre_sum <- BankMarketingNA %>% count(grp_mon)
pre_n <- t(pre_sum$n)
kable(x = pre_n, col.names = c("Fall", "Spring", "Summer", "Winter"), caption = "Newly Formed Groups for Variable: Month")
| Fall | Spring | Summer | Winter |
|---|---|---|---|
| 5287 | 17175 | 18483 | 4266 |
pd_sum <- BankMarketingNA %>% count(grp_job)
pd_n <- t(pd_sum$n)
kable(x = pd_n, col.names = c("Blue Collar / Service", "Business", "Unknown / Unemployed"), caption = "Newly Formed Groups for Variable: Job")
| Blue Collar / Service | Business | Unknown / Unemployed |
|---|---|---|
| 22723 | 17695 | 4793 |
Next, a pairwise comparison was performed in which each categorical variable was compared to the output variable y in a mosaic plot, including the newly grouped previous and pdays variables. All of the plots suggest that the outcome depends on these variables, as the category heights within each plot are unequal. Because the outcome varies between the groups of each variable, these variables could provide significant information in a predictive model and should all be considered in future models.
#looking at boxplots and pairwise comparisons for categorical variables
par(mfrow = c(3,2))
mosaicplot(mar ~ y, data=BankMarketingNA,col=c("Blue","Red"), main="marital vs. outcome")
mosaicplot(grp_job ~ y, data=BankMarketingNA,col=c("Blue","Red"), main="job vs. outcome")
mosaicplot(edu ~ y, data=BankMarketingNA,col=c("Blue","Red"), main="education vs. outcome")
mosaicplot(def ~ y, data=BankMarketingNA,col=c("Blue","Red"), main="default vs. outcome")
mosaicplot(hous. ~ y, data=BankMarketingNA,col=c("Blue","Red"), main="housing vs. outcome")
mosaicplot(loan ~ y, data=BankMarketingNA,col=c("Blue","Red"), main="loan vs. outcome")
Pairwise Comparison
mosaicplot(cont ~ y, data=BankMarketingNA,col=c("Blue","Red"), main="contact vs. outcome")
mosaicplot(grp_mon ~ y, data=BankMarketingNA,col=c("Blue","Red"), main="season vs outcome")
mosaicplot(pout ~ y, data=BankMarketingNA,col=c("Blue","Red"), main="poutcome vs outcome")
mosaicplot(grp_pre ~ y, data=BankMarketingNA,col=c("Blue","Red"), main="previous vs. outcome")
mosaicplot(grp_pd ~ y, data=BankMarketingNA,col=c("Blue","Red"), main="pdays vs. outcome")
mosaicplot(grp_cmpn ~ y, data=BankMarketingNA,col=c("Blue","Red"), main="campaign vs. outcome")
Pairwise Comparison
Now that the variables have been engineered for our needs, the next step is modeling. We chose a logistic regression approach because the response variable is binary. We start with the full model, which includes all of the available predictors regardless of their p-values and significance. We then fit a reduced model containing only the significant predictors, defined as those with p-values less than 0.05; any variable exceeding this threshold was excluded from the reduced model. Together, these two models serve as the bounds for the stepwise search that produces the final model.
#assignment 3
#making y numeric
y <- gsub(" ", "",BankMarketingNA$y)
BankMarketingNA$y_num <- ifelse(y %in% c('yes'), 1, 0)
#creating the full model using glm (if the model fails to converge, ridge regression via glmnet is an option)
full.model =glm(BankMarketingNA$y_num ~ mar + grp_job + edu + def + hous. + loan + cont + grp_mon + pout + grp_pre + grp_pd + trans_age + day + dur_min + new_bal + grp_cmpn, # model formula, the response variable is on the left side.
family=binomial, # this must be specified since the response is binary!
data=BankMarketingNA)
#printing parameter estimates for full model
coefficient.table = summary(full.model)$coef
kable(coefficient.table, caption = "Significance tests of logistic regression full model")
| | Estimate | Std. Error | z value | Pr(>|z|) |
|---|---|---|---|---|
| (Intercept) | -6.0250432 | 1.0850496 | -5.5527816 | 0.0000000 |
| mar married | -0.1985343 | 0.0598173 | -3.3190110 | 0.0009034 |
| mar single | 0.0643846 | 0.0687361 | 0.9366924 | 0.3489168 |
| grp_jobBus. | 0.1519907 | 0.0449950 | 3.3779429 | 0.0007303 |
| grp_jobUnk./Unemp. | 0.5323866 | 0.0570103 | 9.3384219 | 0.0000000 |
| edu secondary | 0.2250788 | 0.0615388 | 3.6575087 | 0.0002547 |
| edu tertiary | 0.4154247 | 0.0687663 | 6.0411049 | 0.0000000 |
| edu unknown | 0.2909910 | 0.1006219 | 2.8919258 | 0.0038289 |
| def yes | -0.2418033 | 0.2354876 | -1.0268193 | 0.3045055 |
| hous. yes | -0.9128193 | 0.0430473 | -21.2050323 | 0.0000000 |
| loan yes | -0.5008711 | 0.0618657 | -8.0961013 | 0.0000000 |
| cont telephone | -0.0136247 | 0.0743607 | -0.1832243 | 0.8546220 |
| cont unknown | -1.1597910 | 0.0592143 | -19.5863366 | 0.0000000 |
| grp_monSpri | 0.0800941 | 0.0581875 | 1.3764838 | 0.1686719 |
| grp_monSumm | -0.2744039 | 0.0569068 | -4.8219879 | 0.0000014 |
| grp_monWint | -0.2552634 | 0.0691594 | -3.6909409 | 0.0002234 |
| pout other | 0.2874105 | 0.0903786 | 3.1800730 | 0.0014724 |
| pout success | 2.3486974 | 0.0810564 | 28.9760971 | 0.0000000 |
| pout unknown | 1.2522823 | 1.0254055 | 1.2212557 | 0.2219892 |
| grp_pre1+ | 1.4891726 | 1.0244034 | 1.4536973 | 0.1460302 |
| trans_age | -0.1465316 | 0.0838891 | -1.7467291 | 0.0806843 |
| day | -0.0057074 | 0.0022398 | -2.5481238 | 0.0108304 |
| dur_min | 2.1338470 | 0.0323923 | 65.8752105 | 0.0000000 |
| new_bal | 0.0225097 | 0.0033625 | 6.6942925 | 0.0000000 |
| grp_cmpn2 | -0.3786628 | 0.0443444 | -8.5391391 | 0.0000000 |
| grp_cmpn3 | -0.3080132 | 0.0598627 | -5.1453307 | 0.0000003 |
| grp_cmpn4-63 | -0.5046849 | 0.0567881 | -8.8871523 | 0.0000000 |
#reduced model by p-value and AIC criterion
#dropped: def, cont, grp_mon, grp_pre, grp_pd, grp_cmpn, trans_age, and day
reduced.model = glm(BankMarketingNA$y_num ~ mar + grp_job + edu + hous. + loan + pout + dur_min + new_bal,
family = binomial,
data = BankMarketingNA)
#printing parameter estimates for reduced model
coefficient.table = summary(reduced.model)$coef
kable(coefficient.table, caption = "Significance Tests of logistic regression reduced model")
| | Estimate | Std. Error | z value | Pr(>|z|) |
|---|---|---|---|---|
| (Intercept) | -5.4278622 | 0.1130642 | -48.006885 | 0.0000000 |
| mar married | -0.1824294 | 0.0585650 | -3.114989 | 0.0018395 |
| mar single | 0.1764423 | 0.0616811 | 2.860555 | 0.0042290 |
| grp_jobBus. | 0.1805590 | 0.0442184 | 4.083343 | 0.0000444 |
| grp_jobUnk./Unemp. | 0.5943619 | 0.0547011 | 10.865630 | 0.0000000 |
| edu secondary | 0.2800343 | 0.0598823 | 4.676414 | 0.0000029 |
| edu tertiary | 0.4876643 | 0.0668516 | 7.294724 | 0.0000000 |
| edu unknown | 0.2744635 | 0.0989194 | 2.774617 | 0.0055267 |
| hous. yes | -0.9228536 | 0.0384500 | -24.001414 | 0.0000000 |
| loan yes | -0.5037262 | 0.0608733 | -8.274994 | 0.0000000 |
| pout other | 0.2260754 | 0.0898139 | 2.517154 | 0.0118307 |
| pout success | 2.2696256 | 0.0802112 | 28.295626 | 0.0000000 |
| pout unknown | -0.6519499 | 0.0549714 | -11.859808 | 0.0000000 |
| dur_min | 2.0871488 | 0.0314338 | 66.398144 | 0.0000000 |
| new_bal | 0.0242147 | 0.0032708 | 7.403219 | 0.0000000 |
Using the full and reduced models as bounds, the final model was selected by stepwise AIC and ended up with 13 of the 16 available predictors. It excluded the default variable, which indicates whether the customer has credit in default; the previous variable, which indicates how many times the customer was contacted before this campaign; and pdays, which indicates how many days passed since the customer was last contacted before this campaign. These variables did not significantly improve the prediction of whether the customer subscribed to the term deposit.
final.model = stepAIC(full.model,
scope=list(lower=formula(reduced.model), upper=formula(full.model)),
data = BankMarketingNA,
direction = "backward",
trace = 0) # trace = 0 suppresses the detailed selection process
#stepAIC dropped def, grp_pre, and grp_pd
#printing final model coefficients
final.model.coef = summary(final.model)$coef
kable(final.model.coef , caption = "Summary Table of Significant Tests")
| | Estimate | Std. Error | z value | Pr(>|z|) |
|---|---|---|---|---|
| (Intercept) | -4.5392208 | 0.3516724 | -12.9075249 | 0.0000000 |
| mar married | -0.1962908 | 0.0597952 | -3.2827176 | 0.0010281 |
| mar single | 0.0660461 | 0.0687212 | 0.9610730 | 0.3365155 |
| grp_jobBus. | 0.1520453 | 0.0449911 | 3.3794556 | 0.0007263 |
| grp_jobUnk./Unemp. | 0.5341695 | 0.0569901 | 9.3730185 | 0.0000000 |
| edu secondary | 0.2256504 | 0.0615350 | 3.6670244 | 0.0002454 |
| edu tertiary | 0.4165539 | 0.0687592 | 6.0581558 | 0.0000000 |
| edu unknown | 0.2916336 | 0.1006156 | 2.8984939 | 0.0037496 |
| hous. yes | -0.9117506 | 0.0430296 | -21.1889126 | 0.0000000 |
| loan yes | -0.5027728 | 0.0617813 | -8.1379507 | 0.0000000 |
| cont telephone | -0.0134104 | 0.0743609 | -0.1803422 | 0.8568839 |
| cont unknown | -1.1602671 | 0.0592103 | -19.5956967 | 0.0000000 |
| grp_monSpri | 0.0797174 | 0.0581543 | 1.3707895 | 0.1704406 |
| grp_monSumm | -0.2754371 | 0.0568831 | -4.8421587 | 0.0000013 |
| grp_monWint | -0.2559840 | 0.0691383 | -3.7024937 | 0.0002135 |
| pout other | 0.2868420 | 0.0903877 | 3.1734613 | 0.0015063 |
| pout success | 2.3493013 | 0.0810550 | 28.9840362 | 0.0000000 |
| pout unknown | -0.2369468 | 0.0576687 | -4.1087592 | 0.0000398 |
| trans_age | -0.1471512 | 0.0838811 | -1.7542844 | 0.0793818 |
| day | -0.0057369 | 0.0022398 | -2.5613819 | 0.0104257 |
| dur_min | 2.1337476 | 0.0323876 | 65.8815797 | 0.0000000 |
| new_bal | 0.0227796 | 0.0033506 | 6.7985578 | 0.0000000 |
| grp_cmpn2 | -0.3786729 | 0.0443409 | -8.5400410 | 0.0000000 |
| grp_cmpn3 | -0.3072668 | 0.0598581 | -5.1332530 | 0.0000003 |
| grp_cmpn4-63 | -0.5041658 | 0.0567725 | -8.8804611 | 0.0000000 |
Looking at the coefficients of the final model above, variables with negative coefficients reduce the chance that a customer subscribes to the term deposit when that category applies. For example, the coefficient for marital status is negative for married customers, indicating that married customers are less likely to subscribe. The same holds for customers with housing loans or personal loans, those contacted by telephone or by unknown means, those last contacted in the winter or summer, older customers, those contacted multiple times during this campaign, those contacted later in the month, and those with an unknown outcome from the previous campaign.
Variables with positive coefficients increase the chance that a customer subscribes when that category applies. This includes customers who are single; those with jobs classified as business, unemployed, or unknown; those with secondary, tertiary, or unknown education; those contacted in the spring; those whose previous campaign outcome was success or other; those with a higher yearly balance; and those whose contact conversations lasted longer. For example, customers contacted during this campaign in the spring were more likely to subscribe to the term deposit than those contacted in the reference season, fall.
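Because the coefficients above are on the log-odds scale, exponentiating them gives odds ratios, which are often easier to read; a brief sketch:
#odds ratios for the final model
round(exp(coef(final.model)), 3)
#e.g. exp(-0.912) is about 0.40 for hous. yes: holding the other predictors
#fixed, a housing loan cuts the odds of subscribing by roughly 60%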
#predictive analysis
#creating a new data frame with two hypothetical customers; all model predictors
#must be supplied in the data frame (note the leading spaces in the character
#levels, which match the raw data)
mynewdata = data.frame(trans_age = c(3, 4), #using my own values here, on the transformed scales
mar = c(" married", " single"),
grp_job = c("Bus.","Unk./Unemp."),
edu = c(" primary", " tertiary"),
dur_min = c(2, 4),
new_bal = c(100, 40),
loan = c(" no", " yes"),
grp_mon = c("Wint", "Summ"),
day = c(2, 1),
cont = c(" unknown", " unknown"),
hous. = c(" yes", " no"),
pout = c(" unknown", " unknown"),
grp_cmpn = c("1", "3"))
pred.success.prob = predict(final.model, newdata = mynewdata, type="response")
##
## threshold probability
cut.off.prob = 0.5
pred.response = ifelse(pred.success.prob > cut.off.prob, 1, 0) # This predicts the response
## Add the new predicted response to Mynewdata
mynewdata$Pred.Response = pred.response
##printing prediction table
kable(mynewdata, caption = "Predicted Value of Response Variable ")
| trans_age | mar | grp_job | edu | dur_min | new_bal | loan | Pred.Response |
|---|---|---|---|---|---|---|---|
| 3 | married | Bus. | primary | 2 | 100 | no | 0 |
| 4 | single | Unk./Unemp. | tertiary | 4 | 40 | yes | 1 |
We can now use this model to predict a customer's subscription outcome from information about the customer and the campaign. Note that the inputs above are on the transformed scales: trans_age values of 3 and 4 correspond to ages of roughly 19 and 54, dur_min values of 2 and 4 correspond to conversations of roughly 6 and 54 minutes, and new_bal values of 100 and 40 correspond to yearly balances of roughly 1,000,000 and 64,000 euros. In the first case, the model predicted that a married customer with a business job, primary education, the larger balance, no personal loan, and the shorter conversation would not subscribe to the term deposit. In the second case, it predicted that a single customer with an unknown job or unemployed, tertiary education, the smaller balance, an existing personal loan, and the longer conversation would subscribe.
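For reference, the transformed inputs invert back to the original scales with simple arithmetic:
#back-transforming the illustrative inputs
exp(c(3, 4)) - 1 #trans_age -> age in years: about 19 and 54
exp(c(2, 4)) - 1 #dur_min -> duration in minutes: about 6.4 and 53.6
c(100, 40)^3 #new_bal -> balance in euros: 1,000,000 and 64,000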
After building several models, we want to see how they perform. We can use cross-validation, an optimal cut-off probability, test accuracy, and an ROC curve to help determine the best model. To start, the data set is split into two random groups: a training set with 70% of the observations and a testing set with the remaining 30%. The training set is used to fit the model, so it gets the larger share of the data to give a reliable fit; the testing set is then used for an unbiased evaluation.
#Assignment 4
## Cross-Validation and Performance of Logistic Predictive Models
BankMarketing <- na.omit(BankMarketingNA)
nn = dim(BankMarketing)[1]
train.id = sample(1:nn, round(nn*0.7), replace = FALSE)
training = BankMarketing[train.id,]
testing = BankMarketing[-train.id,]
The next objective is to find the optimal cut-off probability. We consider several candidates, in this case 20 different values, and use 5-fold cross-validation to identify the cut-off with the highest average accuracy; this procedure (output not shown) gave an optimal value of 0.48. The optimal cut-off probability is then used to compute the accuracy of the model.
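The cross-validation chunk itself is not shown in this report; the following is a minimal sketch of the procedure just described, where the candidate grid and fold construction are assumptions:
#5-fold cross-validation over 20 candidate cut-off probabilities (sketch)
cut.offs = seq(0.3, 0.6, length = 20)
fold.size = floor(dim(training)[1]/5)
acc = matrix(0, nrow = 5, ncol = length(cut.offs))
for (f in 1:5){
valid.id = ((f-1)*fold.size + 1):(f*fold.size)
cv.model = glm(y_num ~ mar + grp_job + edu + hous. + loan + cont + grp_mon + pout + trans_age + day + dur_min + new_bal + grp_cmpn, family = binomial, data = training[-valid.id,])
cv.prob = predict(cv.model, newdata = training[valid.id,], type = "response")
for (j in 1:length(cut.offs)){
acc[f,j] = mean(as.numeric(cv.prob > cut.offs[j]) == training$y_num[valid.id], na.rm = TRUE)
}
}
avg.acc = apply(acc, 2, mean)
cut.offs[which.max(avg.acc)] #the reported optimal value: about 0.48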
#checking how the final model from before performs on the test subset
test.model = glm(y_num ~ mar + grp_job + edu + hous. + loan + cont + grp_mon + pout + trans_age + day + dur_min + new_bal + grp_cmpn, family = binomial(link = logit), data = training)
newdata2 = testing #test data set for prediction
pred.prob.test = predict.glm(test.model, newdata2, type = "response")
testing$test.status = as.numeric(pred.prob.test > 0.48) #plug in optimal cut-off value
a11 = sum(testing$test.status == testing$y_num, na.rm=TRUE)
test.accuracy = a11/length(pred.prob.test) #percentage of correct predictions
kable(as.data.frame(test.accuracy), align='c', caption = "Test Accuracy")
| test.accuracy |
|---|
| 0.8982628 |
#test accuracy is close to the training accuracy, so the model is not underfitting
#workflow: model -> predicted probabilities -> cut-off with highest accuracy (all by cross-validation)
Here we have tested the performance of the model on the test data set and obtained the test accuracy. The test observations are plugged into the model fitted on the training data, each predicted probability is compared to the optimal cut-off, and the number of correctly predicted observations is divided by the total number of test observations. Because the test accuracy was about the same as the accuracy found with the training data, the model is not underfitting, and because this accuracy is high, the model does a fairly good job of predicting the outcome.
#sensitivity and Specificity
test.model = glm(y_num ~ mar + grp_job + edu + hous. + loan + cont + grp_mon + pout + trans_age + day + dur_min + new_bal + grp_cmpn, family = binomial(link = logit), data = training)
newdata3 = testing
pred.prob.test = predict.glm(test.model, newdata3, type = "response")
testing$test.status = as.numeric(pred.prob.test > 0.48)
### components for defining various measures
p0.a0 = sum(testing$test.status ==0 & testing$y_num==0)
p0.a1 = sum(testing$test.status ==0 & testing$y_num ==1)
p1.a0 = sum(testing$test.status ==1 & testing$y_num ==0)
p1.a1 = sum(testing$test.status ==1 & testing$y_num ==1)
###
sensitivity = p1.a1 / (p1.a1 + p0.a1)
specificity = p0.a0 / (p0.a0 + p1.a0)
###
precision = p1.a1 / (p1.a1 + p1.a0)
recall = sensitivity
F1 = 2*precision*recall/(precision + recall)
metric.list = cbind(sensitivity = sensitivity,
specificity = specificity,
precision = precision,
recall = recall,
F1 = F1)
kable(as.data.frame(metric.list), align='c', caption = "Local Performance Metrics")
| sensitivity | specificity | precision | recall | F1 |
|---|---|---|---|---|
| 0.3665529 | 0.9692771 | 0.6144165 | 0.3665529 | 0.4591706 |
The table above provides more measures of how the model performs. Sensitivity is the proportion of customers who actually subscribed that the model predicted to subscribe; it is the same quantity as recall, which explains their identical values. Specificity is the proportion of customers who did not subscribe that the model predicted not to subscribe. The low sensitivity makes the model look poor, but sensitivity is not a fair measure here because of the imbalance between subscribers and non-subscribers in the data: over 85% of the observations are customers who did not subscribe to the term deposit, which makes it difficult for the model to learn which customers truly will subscribe, and which also explains why the specificity is so high. Precision, the proportion of customers predicted to subscribe who actually did subscribe, is a more informative measure for this data set. Finally, the F1 score combines recall and precision as their harmonic mean, making it a popular summary of model performance since it blends two reliable measures; by this measure, the model's performance is not strong.
#ROC curve
cut.off.seq = seq(0.01, 0.99, length = 100)
sensitivity.vec = NULL
specificity.vec = NULL
training.model = glm(y_num ~ mar + grp_job + edu + hous. + loan + cont + grp_mon + pout + trans_age + day + dur_min + new_bal + grp_cmpn, family = binomial(link = logit), data = training)
newdata4 = training
pred.prob.train = predict.glm(training.model, newdata4, type = "response")
for (i in 1:100){
training$train.status = as.numeric(pred.prob.train > cut.off.seq[i])
### components for defining various measures
p0.a0 = sum(training$train.status ==0 & training$y_num==0, na.rm = TRUE)
p0.a1 = sum(training$train.status ==0 & training$y_num ==1, na.rm = TRUE)
p1.a0 = sum(training$train.status ==1 & training$y_num==0, na.rm = TRUE)
p1.a1 = sum(training$train.status ==1 & training$y_num ==1, na.rm = TRUE)
###
sensitivity.vec[i] = p1.a1 / (p1.a1 + p0.a1)
specificity.vec[i] = p0.a0 / (p0.a0 + p1.a0)
}
one.minus.spec = c(1,1 - specificity.vec)
sens.vec = c(1,sensitivity.vec)
##
##better way - need pROC
prediction = pred.prob.train
category = training$y_num == 1
ROCobj <- roc(category, prediction)
## Setting levels: control = FALSE, case = TRUE
## Setting direction: controls < cases
AUC = round(auc(ROCobj),4)
par(pty = "s") # make a square figure
plot(one.minus.spec, sens.vec, type = "l", xlim = c(0,1),
xlab = "1 - specificity",
ylab = "sensitivity",
main = "ROC curve of Logistic Bank Marketing Model",
lwd = 2,
col = "blue")
segments(0,0,1,1, col = "red", lty = 2, lwd = 2)
AUC = round(sum(sens.vec[-101]*(one.minus.spec[-101]-one.minus.spec[-1])),4) #Riemann-sum approximation
text(0.8, 0.3, paste("AUC = ", AUC), col = "blue")
include_graphics("https://github.com/MT24STA490/STA-551/blob/main/Logistic%20ROC%20Curve.PNG?raw=true")
One last way to measure the performance of a model, and a good way to compare one model against another, which will help later when we have more models, is the receiver operating characteristic (ROC) curve. The graph above plots sensitivity against 1 minus specificity, that is, the true positive rate against the false positive rate. These values come from the training data rather than the test data used for the local metrics above, because the ROC curve serves here as a global measure for comparing models. The area under the curve (AUC) summarizes how well the model separates the two outcome classes. The curve sits well above the dotted diagonal line, which represents a 50-50 chance of predicting the outcome, telling us that the model is doing well; still, the ROC curve and AUC are most useful for comparing competing models, which is what we will do later in the paper.
The neural network method is another way to build a model for the bank marketing data set. It uses forward propagation and backward propagation to adjust the network's weights so as to minimize prediction error. To start this process, the numeric features are rescaled to the interval from 0 to 1, and the categorical features are encoded as dummy variables, because the method we are using accepts only numeric inputs. Each new dummy variable is then renamed, and all of the features are assembled into the model formula shown below.
#feature conversion for neuralnet
#re-reading the raw data set (kept for reference; not used below)
EDA_data = read.csv("https://pengdsci.github.io/datasets/BankMarketing/BankMarketingCSV.csv")[ , -1]
#min-max scaling of the numeric features to the interval [0, 1]
minmax = function(x) (x - min(x, na.rm = TRUE))/(max(x, na.rm = TRUE) - min(x, na.rm = TRUE))
BankMarketing$trans_age.scaled = minmax(BankMarketing$trans_age)
BankMarketing$day.scaled = minmax(BankMarketing$day)
BankMarketing$dur_min.scaled = minmax(BankMarketing$dur_min)
BankMarketing$new_bal.scaled = minmax(BankMarketing$new_bal)
#extracting names for categorical variables
BankMtx0 = model.matrix(~ grp_job + edu + grp_mon + grp_pd + grp_cmpn + grp_pre + mar + def + hous. + loan + cont + pout, data = BankMarketing)
colnames(BankMtx0)
## [1] "(Intercept)" "grp_jobBus." "grp_jobUnk./Unemp."
## [4] "edu secondary" "edu tertiary" "edu unknown"
## [7] "grp_monSpri" "grp_monSumm" "grp_monWint"
## [10] "grp_pd1+" "grp_cmpn2" "grp_cmpn3"
## [13] "grp_cmpn4-63" "grp_pre1+" "mar married"
## [16] "mar single" "def yes" "hous. yes"
## [19] "loan yes" "cont telephone" "cont unknown"
## [22] "pout other" "pout success" "pout unknown"
BankMtx = model.matrix(~ grp_job + edu + grp_mon + grp_pd + grp_cmpn + grp_pre + mar + def + hous. + loan + cont + pout + trans_age.scaled + day.scaled + dur_min.scaled + new_bal.scaled + y_num, data = BankMarketing)
colnames(BankMtx)
## [1] "(Intercept)" "grp_jobBus." "grp_jobUnk./Unemp."
## [4] "edu secondary" "edu tertiary" "edu unknown"
## [7] "grp_monSpri" "grp_monSumm" "grp_monWint"
## [10] "grp_pd1+" "grp_cmpn2" "grp_cmpn3"
## [13] "grp_cmpn4-63" "grp_pre1+" "mar married"
## [16] "mar single" "def yes" "hous. yes"
## [19] "loan yes" "cont telephone" "cont unknown"
## [22] "pout other" "pout success" "pout unknown"
## [25] "trans_age.scaled" "day.scaled" "dur_min.scaled"
## [28] "new_bal.scaled" "y_num"
#renaming column names
colnames(BankMtx)[2] <- "JobBus"
colnames(BankMtx)[3] <- "JobUnkUnemp"
colnames(BankMtx)[4] <- "Edu2"
colnames(BankMtx)[5] <- "Edu3"
colnames(BankMtx)[6] <- "EduUnk"
colnames(BankMtx)[7] <- "MonSpr"
colnames(BankMtx)[8] <- "MonSumm"
colnames(BankMtx)[9] <- "MonWint"
colnames(BankMtx)[10] <- "pd0"
colnames(BankMtx)[11] <- "cmpn2"
colnames(BankMtx)[12] <- "cmpn3"
colnames(BankMtx)[13] <- "cmpn4to63"
colnames(BankMtx)[14] <- "pre1plus"
colnames(BankMtx)[15] <- "marMar"
colnames(BankMtx)[16] <- "marSin"
colnames(BankMtx)[17] <- "defYes"
colnames(BankMtx)[18] <- "housyes"
colnames(BankMtx)[19] <- "loanyes"
colnames(BankMtx)[20] <- "contTel"
colnames(BankMtx)[21] <- "contUnk"
colnames(BankMtx)[22] <- "poutOther"
colnames(BankMtx)[23] <- "poutSuc"
colnames(BankMtx)[24] <- "poutUnk"
colnames(BankMtx)[25] <- "age"
colnames(BankMtx)[26] <- "day"
colnames(BankMtx)[27] <- "durMin"
colnames(BankMtx)[28] <- "balance"
colnames(BankMtx)[29] <- "y"
#short-cutting the formula
columnNames = colnames(BankMtx)
colnames(BankMtx)
## [1] "(Intercept)" "JobBus" "JobUnkUnemp" "Edu2" "Edu3"
## [6] "EduUnk" "MonSpr" "MonSumm" "MonWint" "pd0"
## [11] "cmpn2" "cmpn3" "cmpn4to63" "pre1plus" "marMar"
## [16] "marSin" "defYes" "housyes" "loanyes" "contTel"
## [21] "contUnk" "poutOther" "poutSuc" "poutUnk" "age"
## [26] "day" "durMin" "balance" "y"
columnList = paste(columnNames[-c(1,length(columnNames))], collapse = "+")
columnList = paste(c(columnNames[length(columnNames)],"~",columnList), collapse="")
modelFormula = formula(columnList)
Neural Network Formula:
y = JobBus + JobUnkUnemp + Edu2 + Edu3 + EduUnk + MonSpr + MonSumm + MonWint + pd0 + cmpn2 + cmpn3 + cmpn4to63 + pre1plus + marMar + marSin + defYes + housyes + loanyes + contTel + contUnk + poutOther + poutSuc + poutUnk + age + day + durMin + balance
Single Layer Neural Network Model
By putting this formula through the neuralnet back-propagation procedure, we obtain the fitted weights and can build the neural network model. The figure gives a visual representation of the network using those weights to predict the final outcome. The next step after building the model is to use cross-validation to choose the optimal cut-off score.
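The chunk that splits the data and fits the network is not shown in this report; the following is a minimal sketch consistent with how trainDat, testDat, and NetworkModel are used below, where the 70/30 split and the single hidden unit are assumptions:
#fitting a single-layer neural network (sketch; split and network size assumed)
train.id.nn = sample(1:nrow(BankMtx), round(0.7*nrow(BankMtx)), replace = FALSE)
trainDat = BankMtx[train.id.nn, ] #these remain matrices, as noted in the chunks below
testDat = BankMtx[-train.id.nn, ]
NetworkModel = neuralnet(modelFormula,
data = as.data.frame(trainDat), #neuralnet expects a data frame
hidden = 1, #a single hidden layer, matching the figure caption above
linear.output = FALSE) #sigmoid output for the binary response
plot(NetworkModel) #draws the network with its fitted weights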
#cross-validation on the neural network model
#fitting the corresponding logistic model to inspect coefficients for the scaled features
logiModel = glm(factor(y) ~ grp_job + edu + grp_mon + grp_pd + grp_cmpn + grp_pre + mar + def + hous. + loan + cont + pout + trans_age.scaled + day.scaled + dur_min.scaled + new_bal.scaled, family = binomial, data = BankMarketing)
pander::pander(summary(logiModel)$coefficients)
| | Estimate | Std. Error | z value | Pr(>|z|) |
|---|---|---|---|---|
| (Intercept) | -6.462 | 1.036 | -6.235 | 4.528e-10 |
| grp_jobBus. | 0.152 | 0.045 | 3.378 | 0.0007303 |
| grp_jobUnk./Unemp. | 0.5324 | 0.05701 | 9.338 | 9.778e-21 |
| edu secondary | 0.2251 | 0.06154 | 3.658 | 0.0002547 |
| edu tertiary | 0.4154 | 0.06877 | 6.041 | 1.531e-09 |
| edu unknown | 0.291 | 0.1006 | 2.892 | 0.003829 |
| grp_monSpri | 0.08009 | 0.05819 | 1.376 | 0.1687 |
| grp_monSumm | -0.2744 | 0.05691 | -4.822 | 1.421e-06 |
| grp_monWint | -0.2553 | 0.06916 | -3.691 | 0.0002234 |
| grp_pd1+ | 1.489 | 1.024 | 1.454 | 0.146 |
| grp_cmpn2 | -0.3787 | 0.04434 | -8.539 | 1.352e-17 |
| grp_cmpn3 | -0.308 | 0.05986 | -5.145 | 2.67e-07 |
| grp_cmpn4-63 | -0.5047 | 0.05679 | -8.887 | 6.27e-19 |
| mar married | -0.1985 | 0.05982 | -3.319 | 0.0009034 |
| mar single | 0.06438 | 0.06874 | 0.9367 | 0.3489 |
| def yes | -0.2418 | 0.2355 | -1.027 | 0.3045 |
| hous. yes | -0.9128 | 0.04305 | -21.21 | 8.581e-100 |
| loan yes | -0.5009 | 0.06187 | -8.096 | 5.675e-16 |
| cont telephone | -0.01362 | 0.07436 | -0.1832 | 0.8546 |
| cont unknown | -1.16 | 0.05921 | -19.59 | 2.022e-85 |
| pout other | 0.2874 | 0.09038 | 3.18 | 0.001472 |
| pout success | 2.349 | 0.08106 | 28.98 | 1.317e-184 |
| pout unknown | 1.252 | 1.025 | 1.221 | 0.222 |
| trans_age.scaled | -0.2374 | 0.1359 | -1.747 | 0.08068 |
| day.scaled | -0.1712 | 0.0672 | -2.548 | 0.01083 |
| dur_min.scaled | 9.428 | 0.1431 | 65.88 | 0 |
| new_bal.scaled | 1.052 | 0.1572 | 6.694 | 2.167e-11 |
As seen in the 5-fold cross-validation performance graph (not shown here), there are several suggested optimal cut-off points; in our case, we use the mean of the reported scores. Now that we have an optimal cut-off score, we can test the model's performance with the test accuracy rate.
#testing model performance
nn.results <- predict(NetworkModel, testDat)
results <- data.frame(actual = testDat[, "y"], prediction = nn.results > .90)
confMatrix = table(results$prediction, results$actual) # confusion matrix
accuracy = sum(results$actual == results$prediction)/length(results$prediction)
kable(as.data.frame(accuracy), align='c') # prints the accuracy table below
| accuracy |
|---|
| 0.9912102 |
The test accuracy of about 99% suggests the model is performing very well, although a percentage this high, given the imbalance between customers who subscribed and those who did not, can also be a warning sign that the model is not truly learning to identify the customers who will subscribe to the term deposit.
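As context for that figure, the no-information baseline is easy to compute:
#accuracy of always predicting "no subscription"
mean(BankMarketing$y_num == 0) #over 85% of customers did not subscribe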
#Roc Analysis
#based on training set
#used for model selection
nn.results = predict(NetworkModel, trainDat) # Keep in mind that trainDat is a matrix!
cut0 = seq(0,1, length = 20) #we can use as many as we want, the more we use, the smoother the curve will be
SenSpe = matrix(0, ncol = length(cut0), nrow = 2, byrow = FALSE)
for (i in 1:length(cut0)){
a = sum(trainDat[,"y"] == 1 & (nn.results > cut0[i]))
d = sum(trainDat[,"y"] == 0 & (nn.results < cut0[i]))
b = sum(trainDat[,"y"] == 0 & (nn.results > cut0[i]))
c = sum(trainDat[,"y"] == 1 & (nn.results < cut0[i]))
sen = a/(a + c)
spe = d/(b + d)
SenSpe[,i] = c(sen, spe)
}
# plotting ROC
plot(1-SenSpe[2,], SenSpe[1,], type ="b", xlim=c(0,1), ylim=c(0,1),
xlab = "1 - specificity", ylab = "Sensitivity", lty = 1,
main = "Neural Network ROC Curve", col = "blue")
abline(0,1, lty = 2, col = "red") #adds straight line
legend("bottomright", c("ROC of the model", "Random guessing"), lty=c(1,2),
col = c("blue", "red"), bty = "n", cex = 0.8)
## Calculate AUC
xx = 1-SenSpe[2,]
yy = SenSpe[1,]
width = xx[-length(xx)] - xx[-1]
height = yy[-1]
## A better approx of ROC, need library {pROC}
prediction = as.vector(nn.results) #pROC expects a numeric vector, not a matrix
category = trainDat[,"y"] == 1
ROCobj <- roc(category, prediction)
## Setting levels: control = FALSE, case = TRUE
## Setting direction: controls < cases
AUC = auc(ROCobj)[1]
##
###
AUC = mean(c(sum(width*height), sum(width*yy[-length(yy)]))) #average of the two Riemann sums
text(0.8, 0.3, paste("AUC = ", round(AUC,4)), col = "purple", cex = 0.9)
legend("bottomright", c("ROC of the model", "Random guessing"), lty=c(1,2),
col = c("blue", "red"), bty = "n", cex = 0.8)
Here we have the ROC curve obtained from the sensitivity and 1 minus specificity values at each candidate cut-off. The curve sits above the 50-50 line, meaning the model still performs better than a coin flip. Now that we have the ROC curve for the neural network model, we can compare it to the other models.
The last model explored is the decision tree. A decision tree is built from a set of conditions, or rules, learned from the existing data, that predict the outcome for a new incoming observation. In this case, the tree uses the features from the bank marketing data set to build conditions that predict which customers will subscribe to the term deposit. As with the logistic regression and neural network models, we split the data into a training set with a random 70% of the observations and a testing set with the remaining 30%.
#decision tree - assignment 6
#use training data set
#create ROC curves
# need specificity and sensitivity - provides matrix
# We use a random split approach
n = dim(BankMarketing)[1] # sample size
# caution: using without replacement
train.id = sample(1:n, round(0.7*n), replace = FALSE)
train = BankMarketing[train.id, ] # training data
test = BankMarketing[-train.id, ] # testing data
tree.builder = function(in.data, fp, fn, purity){
tree = rpart(y_num ~ mar + grp_job + edu + def + hous. + loan + cont + grp_mon + pout + grp_pre + grp_pd + trans_age + day + dur_min + new_bal + grp_cmpn, # including all features
data = in.data, # fit on the data passed to the function
na.action = na.rpart, # by default, rows are deleted if the outcome is missing,
# kept if predictors are missing
method = "class", # classification
model = FALSE,
x = FALSE,
y = TRUE,
parms = list( # loss matrix: penalize false positives (fp) and false negatives (fn)
loss = matrix(c(0, fp, fn, 0), ncol = 2, byrow = TRUE),
split = purity), # "gini" or "information"
## rpart algorithm options (these are defaults)
control = rpart.control(
minsplit = 10, # minimum number of observations required before a split
minbucket= 10, # minimum number of observations in any terminal node, default = minsplit/3
cp = 0.01, # complexity parameter for the stopping rule (larger values give smaller trees)
xval = 10 # number of cross-validations
)
)
tree
}
After that, the training data set is used to create several decision trees to choose from; in this case, four trees were created. The first tree (gini1.1) is based on an impurity measure called the Gini index, without penalizing either kind of false prediction. The second tree (info1.1) likewise applies no penalty, but uses entropy, another impurity measure, instead of the Gini index. The next two trees use the same two impurity measures but do penalize false predictions: both gini1.10 and info1.10 penalize false negative predictions ten times more heavily. These decision trees can be seen below; a short sketch of the two impurity measures follows the figures.
Non-Penalized Decision Trees
Penalized Decision Trees
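For reference, both impurity measures are easy to compute for a node with class proportions p; the proportions below approximate this data set's class balance:
#Gini index and entropy for a node with class proportions p
gini = function(p) 1 - sum(p^2)
entropy = function(p) -sum(p*log2(p))
p = c(0.88, 0.12) #approximate shares of non-subscribers and subscribers
gini(p) #about 0.211
entropy(p) #about 0.529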
After creating these decision trees, we need a way to compare them, so we again use ROC curves built from sensitivity and specificity. The ROC curves below show that none of the decision trees performed very well: none did better than a 50% chance of guessing the customer's subscription outcome. Despite the poor performances, the first decision tree (gini1.1) was still the best, with the highest AUC value at a little over 45%.
# function returning a sensitivity and specificity matrix
SenSpe = function(in.data, fp, fn, purity){
cutoff = seq(0,1, length = 20) # 20 cut-offs including 0 and 1.
model = tree.builder(in.data, fp, fn, purity)
## Caution: decision tree returns both "success" and "failure" probabilities.
## We need only "success" probability to define sensitivity and specificity!!!
pred = predict(model, newdata = in.data, type = "prob") # two-column matrix
senspe.mtx = matrix(0, ncol = length(cutoff), nrow= 2, byrow = FALSE)
for (i in 1:length(cutoff)){
pred.out = ifelse(pred[,2] > cutoff[i], 1, 0) # column 2 holds the "success" (y = 1) probability
TP = sum(pred.out == 1 & in.data$y_num == 1)
TN = sum(pred.out == 0 & in.data$y_num == 0)
FP = sum(pred.out == 1 & in.data$y_num == 0)
FN = sum(pred.out == 0 & in.data$y_num == 1)
senspe.mtx[1,i] = TP/(TP + FN)
senspe.mtx[2,i] = TN/(TN + FP)
}
sen = senspe.mtx[1,]
spe = senspe.mtx[2,]
AUC = mean(c(sum(sen[-20]*(spe[-1]-spe[-20])), sum(sen[-1]*(spe[-1]-spe[-20])))) #average of the two Riemann sums
list(senspe.mtx= senspe.mtx, AUC = round(AUC,3))
}
giniROC11 = SenSpe(in.data = train, fp=1, fn=1, purity="gini")
infoROC11 = SenSpe(in.data = train, fp=1, fn=1, purity="information")
giniROC110 = SenSpe(in.data = train, fp=1, fn=10, purity="gini")
infoROC110 = SenSpe(in.data = train, fp=1, fn=10, purity="information")
Decision Tree ROC Curves
After plotting the ROC curves, the next step is to find the optimal cut-off score for each decision tree. Focusing on the first tree, there are several cut-off points that achieve the same best average accuracy, so the mean of those scores is used. With this cut-off, the test accuracy is around 87%, a good performance for the first decision tree.
#optimal cut-off score determination
Optm.cutoff = function(in.data, fp, fn, purity){
n0 = floor(dim(in.data)[1]/5) # fold size
cutoff = seq(0,1, length = 20) # candidate cut off prob
## accuracy for each candidate cut-off
accuracy.mtx = matrix(0, ncol=20, nrow=5) # 20 candidate cutoffs and gini.11
##
for (k in 1:5){
valid.id = ((k-1)*n0 + 1):(k*n0)
valid.dat = in.data[valid.id,]
train.dat = in.data[-valid.id,]
## tree model fitted on the training folds only
tree.model = tree.builder(train.dat, fp, fn, purity)
## prediction
pred = predict(tree.model, newdata = valid.dat, type = "prob")[,2]
## for-loop
for (i in 1:20){
## predicted probabilities
pc.1 = ifelse(pred > cutoff[i], 1, 0)
## accuracy
a1 = mean(pc.1 == valid.dat$y_num)
accuracy.mtx[k,i] = a1
}
}
avg.acc = apply(accuracy.mtx, 2, mean)
## plots
n = length(avg.acc)
idx = which(avg.acc == max(avg.acc))
tick.label = as.character(round(cutoff,2))
##
plot(1:n, avg.acc, xlab="cut-off score", ylab="average accuracy",
ylim=c(min(avg.acc), 1),
axes = FALSE,
main=paste("5-fold CV optimal cut-off \n ",purity,"(fp, fn) = (", fp, ",", fn,")" , collapse = ""),
cex.main = 0.9,
col.main = "navy")
axis(1, at=1:20, label = tick.label, las = 2)
axis(2)
points(idx, avg.acc[idx], pch=19, col = "red")
segments(idx , min(avg.acc), idx , avg.acc[idx ], col = "red")
text(idx, avg.acc[idx]+0.03, as.character(round(avg.acc[idx],4)), col = "red", cex = 0.8)
}
par(mfrow=c(2,2))
Optm.cutoff(in.data = train, fp=1, fn=1, purity="gini")
Optm.cutoff(in.data = train, fp=1, fn=1, purity="information")
Optm.cutoff(in.data = train, fp=1, fn=10, purity="gini")
Optm.cutoff(in.data = train, fp=1, fn=10, purity="information")
5-fold CV Optimal Cut-Off for Decision Trees
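The tree object gini.tree.1.1 used below is not created in the chunks shown; this is a minimal sketch consistent with the naming convention, with the other three trees included for completeness:
#building the four candidate trees (names assumed from their use below)
gini.tree.1.1 = tree.builder(in.data = train, fp = 1, fn = 1, purity = "gini")
info.tree.1.1 = tree.builder(in.data = train, fp = 1, fn = 1, purity = "information")
gini.tree.1.10 = tree.builder(in.data = train, fp = 1, fn = 10, purity = "gini")
info.tree.1.10 = tree.builder(in.data = train, fp = 1, fn = 10, purity = "information")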
pred = predict(gini.tree.1.1, newdata = test, type = "prob")[,2]
pc.2 = ifelse(pred > .89, 1, 0)
## accuracy
gini_accuracy = sum(pc.2 == test$y_num)/length(pc.2)
kable(as.data.frame(gini_accuracy), align='c', caption = "Gini1.1 Decision Tree Test Accuracy")
| gini_accuracy |
|---|
| 0.8779154 |
#typically the average of the tied optimal cut-off scores is used
Now that there are several models to choose from, we can pick the best one based on the ROC curves and AUC values. In the figure below, the ROC curves for the three models calculated earlier are plotted together. The gini decision tree is the first model eliminated from the competition, as its ROC curve does not even reach the 50-50 chance line. The neural network and logistic model curves are close together, but comparing AUC values, the logistic model is better with an AUC of 0.8974. As a reminder, the AUC measures the area under the ROC curve; an AUC near 0.9 means the model ranks a randomly chosen subscriber above a randomly chosen non-subscriber about 90% of the time, indicating that the logistic model discriminates well between the two subscription outcomes.
AUC = round(sum(sens.vec[-101]*(one.minus.spec[-101]-one.minus.spec[-1])),4)
NNAUC = mean(c(sum(width*height), sum(width*yy[-length(yy)])))
#plotting the ROC curves of the three competing models
par(pty="s") # set up square plot through graphic parameter
plot(1-giniROC11$senspe.mtx[2,], giniROC11$senspe.mtx[1,], type = "l", xlim=c(0,1), ylim=c(0,1),
xlab="1 - specificity", ylab="Sensitivity", col = "blue", lwd = 2,
main="Compared ROC Curves", cex.main = 0.9, col.main = "navy")
abline(0,1, lty = 2, col = "firebrick2", lwd = 2)
lines(one.minus.spec, sens.vec, col = "purple", lwd = 2, lty=2)
lines(xx, yy, col = "olivedrab", lwd = 2)
legend("bottomright", c(paste("Gini.1.1 AUC =",giniROC11$AUC), paste("Logistic AUC = ", AUC),
paste("NN AUC =", round(NNAUC,4))),
col=c("blue","purple","olivedrab"),
lty=c(1,2,1), lwd=rep(2,3), cex = 0.6, bty = "n")
Now that a model has been chosen, it is worth explaining how to use it. The logistic regression model is straightforward to apply: once the needed information is collected, it is plugged into the model, and the output predicts whether or not the customer will subscribe to the term deposit, with 0 indicating the customer will not subscribe and 1 indicating the customer will. The information needed includes age, marital status, job, education, housing loan status, personal loan status, form of contact, month contacted, previous campaign outcome, day of the month, duration of contact, yearly balance, and number of contacts during this campaign. Once this information is gathered and mapped into the engineered groups and transformed scales, it is ready for the model.
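To make that workflow concrete, here is a minimal sketch that scores a new record with the chosen model, using a row of the testing set as a stand-in for a freshly engineered customer:
#scoring a new (already feature-engineered) customer record
new.customer = testing[1, ]
prob = predict(final.model, newdata = new.customer, type = "response")
ifelse(prob > 0.48, 1, 0) #1 = predicted to subscribe, using the optimal cut-off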