Bankruptcy

Load the data

library(readr)
bankf <- read.csv("bank-additional/bank-additional-full.csv",sep = ";",header = T)
bank <- read.csv("bank-additional/bank-additional.csv",sep = ";",header = T)
data<- bankf#Backup
b<- bank

Basic Checks and Manipulation

library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

# Checking Missing Data 
colSums(is.na(data))

##            age            job        marital      education        default 
##              0              0              0              0              0 
##        housing           loan        contact          month    day_of_week 
##              0              0              0              0              0 
##       duration       campaign          pdays       previous       poutcome 
##              0              0              0              0              0 
##   emp.var.rate cons.price.idx  cons.conf.idx      euribor3m    nr.employed 
##              0              0              0              0              0 
##              y 
##              0

# This shows that there are no missing data in our Dataset, Good Start!
# converting all the variables into factor which have less than 4 unique values 

col_names <- sapply(data, function(col) length(unique(col)) < 4)
data[ , col_names] <- lapply(data[ , col_names] , factor)

str(data)

## 'data.frame':    41188 obs. of  21 variables:
##  $ age           : int  56 57 37 40 56 45 59 41 24 25 ...
##  $ job           : Factor w/ 12 levels "admin.","blue-collar",..: 4 8 8 1 8 8 1 2 10 8 ...
##  $ marital       : Factor w/ 4 levels "divorced","married",..: 2 2 2 2 2 2 2 2 3 3 ...
##  $ education     : Factor w/ 8 levels "basic.4y","basic.6y",..: 1 4 4 2 4 3 6 8 6 4 ...
##  $ default       : Factor w/ 3 levels "no","unknown",..: 1 2 1 1 1 2 1 2 1 1 ...
##  $ housing       : Factor w/ 3 levels "no","unknown",..: 1 1 3 1 1 1 1 1 3 3 ...
##  $ loan          : Factor w/ 3 levels "no","unknown",..: 1 1 1 1 3 1 1 1 1 1 ...
##  $ contact       : Factor w/ 2 levels "cellular","telephone": 2 2 2 2 2 2 2 2 2 2 ...
##  $ month         : Factor w/ 10 levels "apr","aug","dec",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ day_of_week   : Factor w/ 5 levels "fri","mon","thu",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ duration      : int  261 149 226 151 307 198 139 217 380 50 ...
##  $ campaign      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ pdays         : int  999 999 999 999 999 999 999 999 999 999 ...
##  $ previous      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ poutcome      : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ emp.var.rate  : num  1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
##  $ cons.price.idx: num  94 94 94 94 94 ...
##  $ cons.conf.idx : num  -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
##  $ euribor3m     : num  4.86 4.86 4.86 4.86 4.86 ...
##  $ nr.employed   : num  5191 5191 5191 5191 5191 ...
##  $ y             : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...

dim(data)

## [1] 41188    21

nearZeroVar(data, saveMetrics= TRUE)

##                freqRatio percentUnique zeroVar   nzv
## age             1.054713   0.189375546   FALSE FALSE
## job             1.126216   0.029134699   FALSE FALSE
## marital         2.154910   0.009711566   FALSE FALSE
## education       1.278823   0.019423133   FALSE FALSE
## default         3.790625   0.007283675   FALSE FALSE
## housing         1.158630   0.007283675   FALSE FALSE
## loan            5.433739   0.007283675   FALSE FALSE
## contact         1.737836   0.004855783   FALSE FALSE
## month           1.919292   0.024278916   FALSE FALSE
## day_of_week     1.012802   0.012139458   FALSE FALSE
## duration        1.000000   3.748664660   FALSE FALSE
## campaign        1.669063   0.101971448   FALSE FALSE
## pdays          90.371298   0.065553074   FALSE  TRUE
## previous        7.797194   0.019423133   FALSE FALSE
## poutcome        8.363829   0.007283675   FALSE FALSE
## emp.var.rate    1.767639   0.024278916   FALSE FALSE
## cons.price.idx  1.161257   0.063125182   FALSE FALSE
## cons.conf.idx   1.161257   0.063125182   FALSE FALSE
## euribor3m       1.097589   0.767213752   FALSE FALSE
## nr.employed     1.902273   0.026706808   FALSE FALSE
## y               7.876724   0.004855783   FALSE FALSE

# Zero and Near Zero Variance features do not explain any variance in the predictor variable.

# Similarly I can check for linearly dependent columns among the continuous variables.

feature_map <- unlist(lapply(data, is.numeric)) 
findLinearCombos((data[,feature_map]))

## $linearCombos
## list()
## 
## $remove
## NULL

# There are no linearly dependent columns.

# Let us see all the numeric variables in the column

num_cols <- unlist(lapply(data, is.numeric))  
only_numeric<- data[, num_cols]
str(only_numeric)

## 'data.frame':    41188 obs. of  10 variables:
##  $ age           : int  56 57 37 40 56 45 59 41 24 25 ...
##  $ duration      : int  261 149 226 151 307 198 139 217 380 50 ...
##  $ campaign      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ pdays         : int  999 999 999 999 999 999 999 999 999 999 ...
##  $ previous      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ emp.var.rate  : num  1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
##  $ cons.price.idx: num  94 94 94 94 94 ...
##  $ cons.conf.idx : num  -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
##  $ euribor3m     : num  4.86 4.86 4.86 4.86 4.86 ...
##  $ nr.employed   : num  5191 5191 5191 5191 5191 ...

# find correlations to exclude from the model 
findCorrelation( cor(only_numeric), cutoff = .75, names = TRUE )

## [1] "euribor3m"    "emp.var.rate"

# euribor3m and emp.var.rate are the 2 high corelated variables

Let us look into the dependent variable - Binary CLassification

data$Target <- data$y
data$y <- NULL

# Checking the proportions of the class 
table(data$Target)

## 
##    no   yes 
## 36548  4640

prop.table(table(data$Target))

## 
##        no       yes 
## 0.8873458 0.1126542

# Class Imbalanced problem has occured as Majority to Minority class ratio is - 88.7 to 11:3

# Let us see the data graphically 

library(ggplot2)

ggplot(data = data, mapping = aes(x =Target)) +
geom_bar(color="black",fill="orange")+
ggtitle("How is the target variable distributed")+
  xlab("No->Client did not subscribe to term deposit \n Yes->Client did subscribe to term deposit")+
  theme_bw()

# Class Imbalanced immensely 

####### We will have to apply some sampling technique like SMOTE or Undersampling for better performance of the model, as the model will then be biased towards the majority class.

Univariate for Factors Variables

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Univariate Analysis 
univ_cat_df <- data %>% select_if(function(col) {is.factor(col) | is.character(col)})

for(column in colnames(univ_cat_df)){
  plot(ggplot(univ_cat_df,aes_string(column)) +
    geom_bar(color="black",fill="orange") + coord_flip() +
    ggtitle(column) +
    theme_minimal())
}

Univariate analysis for Numeric variables

library(ggplot2)
library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

library(dplyr)

cont_univ_df <- data %>% select_if(is.numeric) %>% mutate(row_no = as.numeric(rownames(data)))

for(column in colnames(cont_univ_df[-ncol(cont_univ_df)])){
  p1 <- ggplot(cont_univ_df, aes_string(x='row_no', y= column)) +
    geom_point(show.legend = FALSE) +
    labs(x = 'Univariate plot', y=column) +
    ggtitle(column) +
    theme_minimal()

  # Cumulative plot
  legendcols <- c("Normal distribution"="darkred","Density"="darkBlue","Histogram"="lightBlue")
  p2 <- ggplot(cont_univ_df,aes_string(x = column)) +
    geom_histogram(aes(y=..density.., fill ="Histogram"), bins = 50) +
    stat_function(fun = dnorm, aes(color="Normal distribution"),  size = 1,
                args = list(mean = mean(cont_univ_df[[column]]),
                            sd = sd(cont_univ_df[[column]]))) +
  geom_density(aes(y=..density.., color="Density"),  size = 1)+
  scale_colour_manual(name="Distribution",values=legendcols) +
  scale_fill_manual(name="Bar",values=legendcols) +
  theme_minimal() + theme(legend.position="none")

  p3 <- ggplot(cont_univ_df %>% mutate(constant = column),
    aes_string(x="constant", y= column, group = 123)) +
    geom_boxplot() +
    labs(y=column) +
    theme_minimal()

  p4 <- ggplot(cont_univ_df,aes_string(sample = column)) +
    stat_qq() + stat_qq_line() +
    theme_minimal()
  grid.arrange(p1, p2, p3, p4, nrow=2)
}

## Warning: `mapping` is not used by stat_function()

## Warning: `mapping` is not used by stat_function()

## Warning: `mapping` is not used by stat_function()

## Warning: `mapping` is not used by stat_function()

## Warning: `mapping` is not used by stat_function()

## Warning: `mapping` is not used by stat_function()

## Warning: `mapping` is not used by stat_function()

## Warning: `mapping` is not used by stat_function()

## Warning: `mapping` is not used by stat_function()

## Warning: `mapping` is not used by stat_function()

################

Bi variate analysis for Numeric Variables

I want to understand the relationship of each continuous variable with the \(y\) variable. I will achieve that by doing the following: 1. Plot box plot for each of the variables to do a visual comparison between the groups 2. Plot the explanatory variable distribution for both the variables to understand the variability uniquely explained (The non-intersecting part of the blue and the pink is the variation explained by the variable) 3. Predict using Logistic regression using the variable alone to observe the decrease in deviation/AIC 4. Plot Lorenz curve to compute Gini coefficient if applicable (high gini coefficient means that high inequality is caused by the column, which means more explain-ability)

library(gglorenz)
library(ineq)
library(caret)
plot_bivariate_cont <- function(raw_data, pred_column_name){
  bi_var_df <- raw_df %>% select_if(is.numeric)
  for(column in colnames(bi_var_df)){
    p1 <- ggplot(raw_df,
      aes_string(x = pred_column_name, y= column, group = pred_column_name)) +
      geom_boxplot() +
      labs(y=column) +
      ggtitle(column) +
      theme_minimal()
    
    p2 <- ggplot(raw_df, aes_string(x=column, fill=pred_column_name)) + 
      geom_histogram(alpha=0.5, position="identity") +
      theme_minimal()+ theme(legend.position="bottom")
    
    grid.arrange(p1, p2, nrow=1, widths = c(1,2))
    
    trainList_bi <- createDataPartition(y = unlist(raw_df[pred_column_name]), times = 1,p = 0.8, list = FALSE)
    dfTest_bi <- raw_df[-trainList_bi,]
    dfTrain_bi <- raw_df[trainList_bi,]
    form_2 = as.formula(paste0(pred_column_name,' ~ ',column))
    set.seed(1234)
    objControl <- trainControl(method = "none",
                             summaryFunction = twoClassSummary,
                             # sampling = 'smote',
                             classProbs = TRUE)
    
    cont_loop_caret_model <- train(form_2, data = dfTrain_bi,
                           method = 'glm',
                           trControl = objControl,
                           metric = "ROC"
                           )
    print(summary(cont_loop_caret_model))
    if(sum(raw_df[column]<0) == 0){
      plot(ggplot(raw_df, aes_string(column)) +
        gglorenz::stat_lorenz(color = "red") + 
        geom_abline(intercept = 0, slope = 1, color = 'blue') +
        theme_minimal())
      print(paste0('Gini coefficient = ', Gini(unlist(raw_df[column]))))
    }
    print(strrep("-",100))
  }
}

raw_df<- data
plot_bivariate_cont(raw_df, pred_column_name = 'Target')

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.6016  -0.4998  -0.4806  -0.4694   2.1715  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.399650   0.069011 -34.772  < 2e-16 ***
## age          0.008319   0.001640   5.072 3.94e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 23199  on 32950  degrees of freedom
## Residual deviance: 23174  on 32949  degrees of freedom
## AIC: 23178
## 
## Number of Fisher Scoring iterations: 4

## [1] "Gini coefficient = 0.144564295379368"
## [1] "----------------------------------------------------------------------------------------------------"

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -5.4208  -0.4256  -0.3494  -0.3062   2.5307  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.296e+00  3.165e-02 -104.13   <2e-16 ***
## duration     3.658e-03  6.544e-05   55.89   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 23199  on 32950  degrees of freedom
## Residual deviance: 19274  on 32949  degrees of freedom
## AIC: 19278
## 
## Number of Fisher Scoring iterations: 5

## [1] "Gini coefficient = 0.456840836174438"
## [1] "----------------------------------------------------------------------------------------------------"

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.5278  -0.5278  -0.4974  -0.4411   3.0649  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.77414    0.02768  -64.10   <2e-16 ***
## campaign    -0.12668    0.01032  -12.27   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 23199  on 32950  degrees of freedom
## Residual deviance: 22995  on 32949  degrees of freedom
## AIC: 22999
## 
## Number of Fisher Scoring iterations: 5

## [1] "Gini coefficient = 0.419561162655728"
## [1] "----------------------------------------------------------------------------------------------------"

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## 
## Call:
## NULL
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.416  -0.442  -0.442  -0.442   2.179  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  5.443e-01  6.002e-02    9.07   <2e-16 ***
## pdays       -2.824e-03  6.315e-05  -44.72   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 23199  on 32950  degrees of freedom
## Residual deviance: 21246  on 32949  degrees of freedom
## AIC: 21250
## 
## Number of Fisher Scoring iterations: 5

## [1] "Gini coefficient = 0.036555463874123"
## [1] "----------------------------------------------------------------------------------------------------"

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6092  -0.4345  -0.4345  -0.4345   2.1941  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.31271    0.02009 -115.12   <2e-16 ***
## previous     0.94713    0.02730   34.69   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 23199  on 32950  degrees of freedom
## Residual deviance: 22013  on 32949  degrees of freedom
## AIC: 22017
## 
## Number of Fisher Scoring iterations: 5

## [1] "Gini coefficient = 0.887949359660966"
## [1] "----------------------------------------------------------------------------------------------------"

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.9997  -0.4434  -0.3213  -0.2961   2.5095  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -2.32583    0.02157 -107.81   <2e-16 ***
## emp.var.rate -0.55653    0.01135  -49.02   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 23199  on 32950  degrees of freedom
## Residual deviance: 20531  on 32949  degrees of freedom
## AIC: 20535
## 
## Number of Fisher Scoring iterations: 5
## 
## [1] "----------------------------------------------------------------------------------------------------"

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.7526  -0.5387  -0.4190  -0.4078   2.4760  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    67.18486    2.83399   23.71   <2e-16 ***
## cons.price.idx -0.74079    0.03035  -24.41   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 23199  on 32950  degrees of freedom
## Residual deviance: 22590  on 32949  degrees of freedom
## AIC: 22594
## 
## Number of Fisher Scoring iterations: 5

## [1] "Gini coefficient = 0.00347005253734073"
## [1] "----------------------------------------------------------------------------------------------------"

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.6139  -0.5222  -0.4740  -0.4404   2.2523  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -0.582128   0.149400  -3.896 9.76e-05 ***
## cons.conf.idx  0.036848   0.003718   9.910  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 23199  on 32950  degrees of freedom
## Residual deviance: 23102  on 32949  degrees of freedom
## AIC: 23106
## 
## Number of Fisher Scoring iterations: 4
## 
## [1] "----------------------------------------------------------------------------------------------------"

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.8531  -0.3750  -0.3020  -0.2941   2.5315  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.48707    0.03081  -15.81   <2e-16 ***
## euribor3m   -0.53037    0.01063  -49.88   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 23199  on 32950  degrees of freedom
## Residual deviance: 20335  on 32949  degrees of freedom
## AIC: 20339
## 
## Number of Fisher Scoring iterations: 5

## [1] "Gini coefficient = 0.240119000068784"
## [1] "----------------------------------------------------------------------------------------------------"

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2810  -0.3537  -0.3431  -0.2790   2.5556  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) 65.3003908  1.1880989   54.96   <2e-16 ***
## nr.employed -0.0131075  0.0002323  -56.42   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 23199  on 32950  degrees of freedom
## Residual deviance: 19707  on 32949  degrees of freedom
## AIC: 19711
## 
## Number of Fisher Scoring iterations: 5

## [1] "Gini coefficient = 0.00721907070938384"
## [1] "----------------------------------------------------------------------------------------------------"

Bivariate Analysis with Categorical variables

I want to understand the relationship of each categorical variable with the \(y\) variable. I will achieve that by doing the following: 1. A mosaic plot shows if any column is significantly different from base column 2. Predict using Logistic regression using the variable alone to observe the decrease in deviation/AIC

library(ggmosaic)
plot_bivariate_cat <- function(raw_d, pred_column_name){
  plot_bi_cat_df <- raw_df %>% select_if(function(col) {is.factor(col) | is.character(col)})
  for(column in colnames(plot_bi_cat_df)){
    if(column != pred_column_name){
      plot(ggplot(data = plot_bi_cat_df %>% group_by_(pred_column_name,column) %>% summarise(count = n())) +
        geom_mosaic(aes_string(weight = 'count', 
                               x = paste0("product(", pred_column_name," , ", column, ")"), 
                               fill = pred_column_name), na.rm=TRUE) +
        labs(x = column, y='%',  title = column) +
        theme_minimal()+theme(legend.position="bottom") +
        theme(axis.text.x=element_text(angle = 45, vjust = 0.5, hjust=1)))
      
      trainList_cat <- createDataPartition(y = unlist(raw_df[pred_column_name]), times = 1,p = 0.8, list = FALSE)
      dfTest_bi_cat <- raw_df[-trainList_cat,]
      dfTrain_bi_cat <- raw_df[trainList_cat,]
      form_2 = as.formula(paste0(pred_column_name,' ~ ',column))
      set.seed(1234)
      objControl <- trainControl(method = "none",
                               summaryFunction = twoClassSummary,
                               # sampling = 'smote',
                               classProbs = TRUE)
      
      cat_loop_caret_model <- train(form_2, data = dfTrain_bi_cat,
                             method = 'glm',
                             trControl = objControl,
                             metric = "ROC")
      print(summary(cat_loop_caret_model))
    }
    print(strrep("-",100))
  }
}

plot_bivariate_cat(raw_df, pred_column_name = 'Target')

## Warning: group_by_() is deprecated. 
## Please use group_by() instead
## 
## The 'programming' vignette or the tidyeval book can help you
## to program with group_by() : https://tidyeval.tidyverse.org
## This warning is displayed once per session.

## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.8704  -0.5268  -0.4722  -0.3792   2.3101  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -1.90505    0.03271 -58.248  < 2e-16 ***
## `jobblue-collar`   -0.69140    0.05629 -12.283  < 2e-16 ***
## jobentrepreneur    -0.50508    0.11145  -4.532 5.85e-06 ***
## jobhousemaid       -0.22788    0.11559  -1.971   0.0487 *  
## jobmanagement      -0.16764    0.07348  -2.281   0.0225 *  
## jobretired          0.83137    0.06987  11.899  < 2e-16 ***
## `jobself-employed` -0.18819    0.09992  -1.883   0.0597 .  
## jobservices        -0.52679    0.07271  -7.245 4.31e-13 ***
## jobstudent          1.12978    0.08746  12.918  < 2e-16 ***
## jobtechnician      -0.23255    0.05499  -4.229 2.35e-05 ***
## jobunemployed       0.11896    0.10471   1.136   0.2559    
## jobunknown         -0.08150    0.19430  -0.419   0.6749    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 23199  on 32950  degrees of freedom
## Residual deviance: 22539  on 32939  degrees of freedom
## AIC: 22563
## 
## Number of Fisher Scoring iterations: 5
## 
## [1] "----------------------------------------------------------------------------------------------------"

## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.5984  -0.4675  -0.4628  -0.4628   2.1387  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -2.15888    0.05411 -39.902  < 2e-16 ***
## maritalmarried -0.02111    0.05897  -0.358     0.72    
## maritalsingle   0.34091    0.06182   5.514  3.5e-08 ***
## maritalunknown  0.52964    0.35005   1.513     0.13    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 23199  on 32950  degrees of freedom
## Residual deviance: 23106  on 32947  degrees of freedom
## AIC: 23114
## 
## Number of Fisher Scoring iterations: 4
## 
## [1] "----------------------------------------------------------------------------------------------------"

## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.7244  -0.5443  -0.4763  -0.4118   2.2623  
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                  -2.14583    0.05682 -37.767  < 2e-16 ***
## educationbasic.6y            -0.27897    0.10240  -2.724  0.00644 ** 
## educationbasic.9y            -0.33261    0.07822  -4.252 2.12e-05 ***
## educationhigh.school          0.02674    0.06779   0.394  0.69327    
## educationilliterate           0.94186    0.66073   1.425  0.15402    
## educationprofessional.course  0.08556    0.07488   1.143  0.25319    
## educationuniversity.degree    0.31128    0.06397   4.866 1.14e-06 ***
## educationunknown              0.39942    0.09494   4.207 2.59e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 23199  on 32950  degrees of freedom
## Residual deviance: 23033  on 32943  degrees of freedom
## AIC: 23049
## 
## Number of Fisher Scoring iterations: 5
## 
## [1] "----------------------------------------------------------------------------------------------------"

## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.5245  -0.5245  -0.5245  -0.3284   2.4277  
## 
## Coefficients:
##                Estimate Std. Error  z value Pr(>|z|)    
## (Intercept)    -1.91412    0.01850 -103.439   <2e-16 ***
## defaultunknown -0.97871    0.05715  -17.126   <2e-16 ***
## defaultyes     -8.65191   84.47666   -0.102    0.918    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 23199  on 32950  degrees of freedom
## Residual deviance: 22832  on 32948  degrees of freedom
## AIC: 22838
## 
## Number of Fisher Scoring iterations: 9
## 
## [1] "----------------------------------------------------------------------------------------------------"

## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.4981  -0.4981  -0.4797  -0.4797   2.1430  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -2.10439    0.02629 -80.042   <2e-16 ***
## housingunknown -0.08578    0.12151  -0.706   0.4802    
## housingyes      0.07985    0.03542   2.254   0.0242 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 23199  on 32950  degrees of freedom
## Residual deviance: 23193  on 32948  degrees of freedom
## AIC: 23199
## 
## Number of Fisher Scoring iterations: 4
## 
## [1] "----------------------------------------------------------------------------------------------------"

## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.4897  -0.4896  -0.4896  -0.4896   2.1430  
## 
## Coefficients:
##               Estimate Std. Error  z value Pr(>|z|)    
## (Intercept) -2.0610796  0.0191591 -107.577   <2e-16 ***
## loanunknown -0.1290878  0.1201647   -1.074    0.283    
## loanyes      0.0004981  0.0487135    0.010    0.992    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 23199  on 32950  degrees of freedom
## Residual deviance: 23198  on 32948  degrees of freedom
## AIC: 23204
## 
## Number of Fisher Scoring iterations: 4
## 
## [1] "----------------------------------------------------------------------------------------------------"

## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.5629  -0.5629  -0.5629  -0.3327   2.4174  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -1.76228    0.01955  -90.12   <2e-16 ***
## contacttelephone -1.10423    0.04487  -24.61   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 23199  on 32950  degrees of freedom
## Residual deviance: 22473  on 32949  degrees of freedom
## AIC: 22477
## 
## Number of Fisher Scoring iterations: 5
## 
## [1] "----------------------------------------------------------------------------------------------------"

## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1636  -0.4666  -0.4396  -0.3731   2.3236  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.38214    0.05445 -25.385  < 2e-16 ***
## monthaug    -0.74428    0.07139 -10.426  < 2e-16 ***
## monthdec     1.27986    0.17955   7.128 1.02e-12 ***
## monthjul    -0.90598    0.07104 -12.753  < 2e-16 ***
## monthjun    -0.78082    0.07408 -10.541  < 2e-16 ***
## monthmar     1.34950    0.11086  12.172  < 2e-16 ***
## monthmay    -1.24782    0.06642 -18.787  < 2e-16 ***
## monthnov    -0.80395    0.07953 -10.109  < 2e-16 ***
## monthoct     1.11078    0.10050  11.053  < 2e-16 ***
## monthsep     1.12248    0.10893  10.305  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 23199  on 32950  degrees of freedom
## Residual deviance: 21595  on 32941  degrees of freedom
## AIC: 21615
## 
## Number of Fisher Scoring iterations: 5
## 
## [1] "----------------------------------------------------------------------------------------------------"

## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.5048  -0.5039  -0.4944  -0.4579   2.1482  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -2.09147    0.04029 -51.907   <2e-16 ***
## day_of_weekmon -0.11116    0.05715  -1.945   0.0518 .  
## day_of_weekthu  0.09185    0.05495   1.671   0.0946 .  
## day_of_weektue  0.05132    0.05607   0.915   0.3601    
## day_of_weekwed  0.09553    0.05536   1.726   0.0844 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 23199  on 32950  degrees of freedom
## Residual deviance: 23180  on 32946  degrees of freedom
## AIC: 23190
## 
## Number of Fisher Scoring iterations: 4
## 
## [1] "----------------------------------------------------------------------------------------------------"

## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4377  -0.4311  -0.4311  -0.4311   2.2010  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -1.78937    0.04890  -36.59   <2e-16 ***
## poutcomenonexistent -0.53987    0.05316  -10.15   <2e-16 ***
## poutcomesuccess      2.38318    0.08008   29.76   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 23199  on 32950  degrees of freedom
## Residual deviance: 21263  on 32948  degrees of freedom
## AIC: 21269
## 
## Number of Fisher Scoring iterations: 5
## 
## [1] "----------------------------------------------------------------------------------------------------"
## [1] "----------------------------------------------------------------------------------------------------"

Checking for Corelation

library(GGally)

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

## 
## Attaching package: 'GGally'

## The following object is masked from 'package:ggmosaic':
## 
##     happy

## The following object is masked from 'package:dplyr':
## 
##     nasa

# Plot correlation heatmap
ggcorr(data, label = TRUE, 
       palette = "RdBu", 
       name = "Correlation", 
       hjust = 0.75, 
       label_size = 3, 
       label_round = 2)

## Warning in ggcorr(data, label = TRUE, palette = "RdBu", name = "Correlation", :
## data in column(s) 'job', 'marital', 'education', 'default', 'housing', 'loan',
## 'contact', 'month', 'day_of_week', 'poutcome', 'Target' are not numeric and were
## ignored

###### This shows that there are alot of variables which have corelation among them

Feature selection using FCcaret Package in R

library(fscaret)

## Loading required package: gsubfn

## Loading required package: proto

## Warning in doTryCatch(return(expr), name, parentenv, handler): unable to load shared object '/Library/Frameworks/R.framework/Resources/modules//R_X11.so':
##   dlopen(/Library/Frameworks/R.framework/Resources/modules//R_X11.so, 6): Library not loaded: /opt/X11/lib/libSM.6.dylib
##   Referenced from: /Library/Frameworks/R.framework/Versions/3.6/Resources/modules/R_X11.so
##   Reason: image not found

## Could not load tcltk.  Will use slower R code instead.

## Loading required package: hmeasure

## Loading required package: parallel

## 
## Attaching package: 'fscaret'

## The following object is masked from 'package:caret':
## 
##     RMSE

library(caret)

# Make sure target variable is at the bottom of the class 

set.seed(1234)
splitIndex <- createDataPartition(data$Target, p = .75, list = FALSE, times = 1)
trainDF <- data[ splitIndex,]
testDF  <- data[-splitIndex,]

fsModels <- c("glm", "gbm", "treebag", "ridge", "lasso") 
myFS<-fscaret(trainDF, testDF, myTimeLimit = 40, preprocessData=TRUE,
              Used.funcRegPred = fsModels, with.labels=TRUE,
              supress.output=FALSE, no.cores=2)

## Loading required package: R.utils

## Loading required package: multicore

## Loading required package: MASS

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

## 
## ----Packages loaded successfully----
## 
## 
## -----Warnings have been supressed!----
## 
## ----Processing files:----
## [1] "17in_default_REGControl_glm.RData"
## [1] ""
## [1] "Calculating error for model:"
## [1] "17in_default_REGControl_glm.RData"
## [1] ""
## 
## ----Processing files:----
## [1] "17in_default_REGControl_VarImp_glm.txt"
## 
##  matrycaVarImp.RMSE after 
##              [,1] [,2] [,3] [,4]
##  [1,]  1.30388955    0    0    0
##  [2,]  0.21764769    0    0    0
##  [3,]  0.93453918    0    0    0
##  [4,]  1.85272246    0    0    0
##  [5,]  2.16970585    0    0    0
##  [6,]  0.06917630    0    0    0
##  [7,]  0.02695651    0    0    0
##  [8,]  5.11617310    0    0    0
##  [9,]  4.48215630    0    0    0
## [10,]  1.35181466    0    0    0
## [11,] 31.03849318    0    0    0
## [12,]  0.59067043    0    0    0
## [13,]  7.04887190    0    0    0
## [14,] 12.58578043    0    0    0
## [15,]  4.51356490    0    0    0
## [16,]  6.59184961    0    0    0
## [17,] 20.10598795    0    0    0

myFS$VarImp$matrixVarImp.MSE

##            glm         SUM         SUM%    ImpGrad Input_no
## 11 31.03849318 31.03849318 100.00000000  0.0000000       11
## 17 20.10598795 20.10598795  64.77759031 35.2224097       17
## 14 12.58578043 12.58578043  40.54894145 37.4028252       14
## 13  7.04887190  7.04887190  22.71009698 43.9933666       13
## 16  6.59184961  6.59184961  21.23765989  6.4836231       16
## 8   5.11617310  5.11617310  16.48331662 22.3863801        8
## 15  4.51356490  4.51356490  14.54182996 11.7784952       15
## 9   4.48215630  4.48215630  14.44063753  0.6958713        9
## 5   2.16970585  2.16970585   6.99037107 51.5923653        5
## 4   1.85272246  1.85272246   5.96911214 14.6095095        4
## 10  1.35181466  1.35181466   4.35528444 27.0363106       10
## 1   1.30388955  1.30388955   4.20087903  3.5452429        1
## 3   0.93453918  0.93453918   3.01090383 28.3268143        3
## 12  0.59067043  0.59067043   1.90302545 36.7955417       12
## 2   0.21764769  0.21764769   0.70121860 63.1524317        2
## 6   0.06917630  0.06917630   0.22287262 68.2163858        6
## 7   0.02695651  0.02695651   0.08684864 61.0321601        7

# Getting the Lables 

results <- myFS$VarImp$matrixVarImp.MSE
results$Input_no <- as.numeric(results$Input_no)
results <- results[c("SUM","SUM%","ImpGrad","Input_no")]
myFS$PPlabels$Input_no <-  as.numeric(rownames(myFS$PPlabels))
results <- merge(x=results, y=myFS$PPlabels, by="Input_no", all.x=T)
results <- results[c('Labels', 'SUM')]
results <- subset(results,results$SUM !=0)
results <- results[order(-results$SUM),]
print(results)

##            Labels         SUM
## 11       duration 31.03849318
## 17    nr.employed 20.10598795
## 14       poutcome 12.58578043
## 13       previous  7.04887190
## 16  cons.conf.idx  6.59184961
## 8         contact  5.11617310
## 15 cons.price.idx  4.51356490
## 9           month  4.48215630
## 5         default  2.16970585
## 4       education  1.85272246
## 10    day_of_week  1.35181466
## 1             age  1.30388955
## 3         marital  0.93453918
## 12       campaign  0.59067043
## 2             job  0.21764769
## 6         housing  0.06917630
## 7            loan  0.02695651

##### from this code we can see that all the 5 models present the following variables to be important variables in getting to know the Target Variable 

# Age
# Job
# Marital Status 
# Eduation

T-Test and Chisq Test for Exploratory Data Analysis

str(data)

## 'data.frame':    41188 obs. of  21 variables:
##  $ age           : int  56 57 37 40 56 45 59 41 24 25 ...
##  $ job           : Factor w/ 12 levels "admin.","blue-collar",..: 4 8 8 1 8 8 1 2 10 8 ...
##  $ marital       : Factor w/ 4 levels "divorced","married",..: 2 2 2 2 2 2 2 2 3 3 ...
##  $ education     : Factor w/ 8 levels "basic.4y","basic.6y",..: 1 4 4 2 4 3 6 8 6 4 ...
##  $ default       : Factor w/ 3 levels "no","unknown",..: 1 2 1 1 1 2 1 2 1 1 ...
##  $ housing       : Factor w/ 3 levels "no","unknown",..: 1 1 3 1 1 1 1 1 3 3 ...
##  $ loan          : Factor w/ 3 levels "no","unknown",..: 1 1 1 1 3 1 1 1 1 1 ...
##  $ contact       : Factor w/ 2 levels "cellular","telephone": 2 2 2 2 2 2 2 2 2 2 ...
##  $ month         : Factor w/ 10 levels "apr","aug","dec",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ day_of_week   : Factor w/ 5 levels "fri","mon","thu",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ duration      : int  261 149 226 151 307 198 139 217 380 50 ...
##  $ campaign      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ pdays         : int  999 999 999 999 999 999 999 999 999 999 ...
##  $ previous      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ poutcome      : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ emp.var.rate  : num  1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
##  $ cons.price.idx: num  94 94 94 94 94 ...
##  $ cons.conf.idx : num  -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
##  $ euribor3m     : num  4.86 4.86 4.86 4.86 4.86 ...
##  $ nr.employed   : num  5191 5191 5191 5191 5191 ...
##  $ Target        : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...

# Chisq Test for Target variable with all Factor Independent variables 
myTests <- lapply(data[-length(data)], function(x) chisq.test(table(x, data$Target)))
a <- (round(unlist(sapply(myTests, "[", "p.value")),4))
print(a)

##            age.p.value            job.p.value        marital.p.value 
##                 0.0000                 0.0000                 0.0000 
##      education.p.value        default.p.value        housing.p.value 
##                 0.0000                 0.0000                 0.0583 
##           loan.p.value        contact.p.value          month.p.value 
##                 0.5787                 0.0000                 0.0000 
##    day_of_week.p.value       duration.p.value       campaign.p.value 
##                 0.0000                 0.0000                 0.0000 
##          pdays.p.value       previous.p.value       poutcome.p.value 
##                 0.0000                 0.0000                 0.0000 
##   emp.var.rate.p.value cons.price.idx.p.value  cons.conf.idx.p.value 
##                 0.0000                 0.0000                 0.0000 
##      euribor3m.p.value    nr.employed.p.value 
##                 0.0000                 0.0000

# T-test for All IV(numerics) with Target Variable 
options(scripen=999)
num <- unlist(lapply(data, is.numeric))
t_test <- lapply(data[,num],function(x) t.test(x ~ data$Target, var.equal = TRUE)$p.value)
print(t_test)

## $age
## [1] 6.802136e-10
## 
## $duration
## [1] 0
## 
## $campaign
## [1] 2.00778e-41
## 
## $pdays
## [1] 0
## 
## $previous
## [1] 0
## 
## $emp.var.rate
## [1] 0
## 
## $cons.price.idx
## [1] 9.318965e-170
## 
## $cons.conf.idx
## [1] 7.536665e-29
## 
## $euribor3m
## [1] 0
## 
## $nr.employed
## [1] 0

let us get the final thoughts and put the analysis

Stepwise Logistic Regression with SMOTE Technique

# Let us see if our factor variables have more than 1 level 
a<- which(sapply(bankf, function(x) length(unique(x))<2))

# let us split the data into Training and Testing data 
set.seed(1234)
index1 <- sample(1:2, nrow(bankf), replace = TRUE, prob = c(0.65,0.35))
train <- bankf[index1 == 1, ]
test <- bankf[index1 == 2, ]


full <- glm(y~., data=train,family = "binomial")
null <- glm(y~1, data=train,family = "binomial")


# Forward selection

mod_log<- step(full,scope=list(lower=null,upper=full),
                       direction = "both")

## Start:  AIC=11389.23
## y ~ age + job + marital + education + default + housing + loan + 
##     contact + month + day_of_week + duration + campaign + pdays + 
##     previous + poutcome + emp.var.rate + cons.price.idx + cons.conf.idx + 
##     euribor3m + nr.employed
## 
##                  Df Deviance   AIC
## - education       7    11293 11385
## - marital         3    11285 11385
## - housing         1    11283 11387
## - age             1    11283 11387
## - previous        1    11284 11388
## - job            11    11305 11389
## - nr.employed     1    11285 11389
## <none>                 11283 11389
## - euribor3m       1    11286 11390
## - loan            1    11286 11390
## - campaign        1    11287 11391
## - cons.conf.idx   1    11288 11392
## - day_of_week     4    11296 11394
## - pdays           1    11294 11398
## - default         2    11299 11401
## - poutcome        2    11308 11410
## - contact         1    11326 11430
## - cons.price.idx  1    11328 11432
## - emp.var.rate    1    11375 11479
## - month           9    11615 11703
## - duration        1    14813 14917
## 
## Step:  AIC=11384.69
## y ~ age + job + marital + default + housing + loan + contact + 
##     month + day_of_week + duration + campaign + pdays + previous + 
##     poutcome + emp.var.rate + cons.price.idx + cons.conf.idx + 
##     euribor3m + nr.employed
## 
##                  Df Deviance   AIC
## - marital         3    11295 11381
## - housing         1    11293 11383
## - age             1    11293 11383
## - previous        1    11294 11384
## - nr.employed     1    11294 11384
## <none>                 11293 11385
## - loan            1    11296 11386
## - euribor3m       1    11296 11386
## - campaign        1    11297 11387
## - cons.conf.idx   1    11297 11387
## + education       7    11283 11389
## - day_of_week     4    11305 11389
## - job            11    11323 11393
## - pdays           1    11303 11393
## - default         2    11309 11397
## - poutcome        2    11318 11406
## - contact         1    11336 11426
## - cons.price.idx  1    11337 11427
## - emp.var.rate    1    11385 11475
## - month           9    11632 11706
## - duration        1    14818 14908
## 
## Step:  AIC=11381.2
## y ~ age + job + default + housing + loan + contact + month + 
##     day_of_week + duration + campaign + pdays + previous + poutcome + 
##     emp.var.rate + cons.price.idx + cons.conf.idx + euribor3m + 
##     nr.employed
## 
##                  Df Deviance   AIC
## - housing         1    11295 11379
## - age             1    11295 11379
## - previous        1    11296 11380
## - nr.employed     1    11297 11381
## <none>                 11295 11381
## - loan            1    11298 11382
## - euribor3m       1    11298 11382
## - campaign        1    11299 11383
## - cons.conf.idx   1    11300 11384
## + marital         3    11293 11385
## + education       7    11285 11385
## - day_of_week     4    11308 11386
## - pdays           1    11306 11390
## - job            11    11329 11393
## - default         2    11312 11394
## - poutcome        2    11320 11402
## - contact         1    11339 11423
## - cons.price.idx  1    11340 11424
## - emp.var.rate    1    11388 11472
## - month           9    11638 11706
## - duration        1    14821 14905
## 
## Step:  AIC=11379.23
## y ~ age + job + default + loan + contact + month + day_of_week + 
##     duration + campaign + pdays + previous + poutcome + emp.var.rate + 
##     cons.price.idx + cons.conf.idx + euribor3m + nr.employed
## 
##                  Df Deviance   AIC
## - age             1    11295 11377
## - previous        1    11296 11378
## - loan            2    11298 11378
## - nr.employed     1    11297 11379
## <none>                 11295 11379
## - euribor3m       1    11298 11380
## - campaign        1    11299 11381
## + housing         1    11295 11381
## - cons.conf.idx   1    11300 11382
## + marital         3    11293 11383
## + education       7    11285 11383
## - day_of_week     4    11308 11384
## - pdays           1    11306 11388
## - job            11    11329 11391
## - default         2    11312 11392
## - poutcome        2    11320 11400
## - contact         1    11339 11421
## - cons.price.idx  1    11340 11422
## - emp.var.rate    1    11388 11470
## - month           9    11638 11704
## - duration        1    14822 14904
## 
## Step:  AIC=11377.33
## y ~ job + default + loan + contact + month + day_of_week + duration + 
##     campaign + pdays + previous + poutcome + emp.var.rate + cons.price.idx + 
##     cons.conf.idx + euribor3m + nr.employed
## 
##                  Df Deviance   AIC
## - previous        1    11296 11376
## - loan            2    11298 11376
## - nr.employed     1    11297 11377
## <none>                 11295 11377
## - euribor3m       1    11298 11378
## + age             1    11295 11379
## - campaign        1    11299 11379
## + housing         1    11295 11379
## - cons.conf.idx   1    11300 11380
## + marital         3    11293 11381
## + education       7    11285 11381
## - day_of_week     4    11308 11382
## - pdays           1    11306 11386
## - job            11    11330 11390
## - default         2    11313 11391
## - poutcome        2    11320 11398
## - contact         1    11339 11419
## - cons.price.idx  1    11340 11420
## - emp.var.rate    1    11388 11468
## - month           9    11638 11702
## - duration        1    14822 14902
## 
## Step:  AIC=11376.23
## y ~ job + default + loan + contact + month + day_of_week + duration + 
##     campaign + pdays + poutcome + emp.var.rate + cons.price.idx + 
##     cons.conf.idx + euribor3m + nr.employed
## 
##                  Df Deviance   AIC
## - loan            2    11299 11375
## - nr.employed     1    11298 11376
## <none>                 11296 11376
## + previous        1    11295 11377
## - euribor3m       1    11299 11377
## + age             1    11296 11378
## - campaign        1    11300 11378
## + housing         1    11296 11378
## - cons.conf.idx   1    11300 11378
## + marital         3    11294 11380
## + education       7    11286 11380
## - day_of_week     4    11309 11381
## - pdays           1    11306 11384
## - job            11    11331 11389
## - default         2    11314 11390
## - contact         1    11339 11417
## - cons.price.idx  1    11341 11419
## - poutcome        2    11358 11434
## - emp.var.rate    1    11388 11466
## - month           9    11639 11701
## - duration        1    14823 14901
## 
## Step:  AIC=11375.37
## y ~ job + default + contact + month + day_of_week + duration + 
##     campaign + pdays + poutcome + emp.var.rate + cons.price.idx + 
##     cons.conf.idx + euribor3m + nr.employed
## 
##                  Df Deviance   AIC
## - nr.employed     1    11301 11375
## <none>                 11299 11375
## + loan            2    11296 11376
## + previous        1    11298 11376
## - euribor3m       1    11302 11376
## + age             1    11299 11377
## - campaign        1    11303 11377
## - cons.conf.idx   1    11304 11378
## + marital         3    11297 11379
## + housing         2    11299 11379
## + education       7    11290 11380
## - day_of_week     4    11312 11380
## - pdays           1    11309 11383
## - job            11    11334 11388
## - default         2    11317 11389
## - contact         1    11342 11416
## - cons.price.idx  1    11344 11418
## - poutcome        2    11362 11434
## - emp.var.rate    1    11392 11466
## - month           9    11643 11701
## - duration        1    14825 14899
## 
## Step:  AIC=11375.02
## y ~ job + default + contact + month + day_of_week + duration + 
##     campaign + pdays + poutcome + emp.var.rate + cons.price.idx + 
##     cons.conf.idx + euribor3m
## 
##                  Df Deviance   AIC
## <none>                 11301 11375
## + nr.employed     1    11299 11375
## - cons.conf.idx   1    11304 11376
## + loan            2    11298 11376
## + previous        1    11300 11376
## + age             1    11301 11377
## - campaign        1    11305 11377
## + marital         3    11298 11378
## + housing         2    11301 11379
## - day_of_week     4    11313 11379
## + education       7    11292 11380
## - pdays           1    11311 11383
## - job            11    11335 11387
## - default         2    11318 11388
## - euribor3m       1    11318 11390
## - contact         1    11342 11414
## - poutcome        2    11363 11433
## - emp.var.rate    1    11420 11492
## - cons.price.idx  1    11467 11539
## - month           9    11644 11700
## - duration        1    14828 14900

summary(mod_log)

## 
## Call:
## glm(formula = y ~ job + default + contact + month + day_of_week + 
##     duration + campaign + pdays + poutcome + emp.var.rate + cons.price.idx + 
##     cons.conf.idx + euribor3m, family = "binomial", data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -5.9230  -0.3050  -0.1876  -0.1380   3.2606  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -1.620e+02  1.256e+01 -12.894  < 2e-16 ***
## jobblue-collar      -3.094e-01  8.059e-02  -3.840 0.000123 ***
## jobentrepreneur     -1.841e-01  1.514e-01  -1.216 0.223908    
## jobhousemaid        -2.181e-02  1.700e-01  -0.128 0.897896    
## jobmanagement       -4.143e-02  1.033e-01  -0.401 0.688288    
## jobretired           1.979e-01  1.017e-01   1.945 0.051773 .  
## jobself-employed    -9.133e-02  1.399e-01  -0.653 0.513820    
## jobservices         -2.861e-01  1.017e-01  -2.814 0.004899 ** 
## jobstudent           1.867e-01  1.259e-01   1.483 0.138118    
## jobtechnician       -6.054e-02  7.815e-02  -0.775 0.438562    
## jobunemployed       -2.633e-03  1.542e-01  -0.017 0.986371    
## jobunknown          -3.133e-01  3.165e-01  -0.990 0.322246    
## defaultunknown      -3.348e-01  8.190e-02  -4.088 4.35e-05 ***
## defaultyes          -6.262e+00  1.195e+02  -0.052 0.958201    
## contacttelephone    -5.557e-01  8.871e-02  -6.264 3.75e-10 ***
## monthaug             7.300e-01  1.318e-01   5.539 3.03e-08 ***
## monthdec             1.316e-01  2.343e-01   0.562 0.574248    
## monthjul             3.630e-02  1.178e-01   0.308 0.758041    
## monthjun            -3.694e-01  1.297e-01  -2.849 0.004387 ** 
## monthmar             1.766e+00  1.456e-01  12.125  < 2e-16 ***
## monthmay            -5.285e-01  9.256e-02  -5.710 1.13e-08 ***
## monthnov            -5.396e-01  1.320e-01  -4.089 4.34e-05 ***
## monthoct             1.053e-01  1.508e-01   0.698 0.484879    
## monthsep             1.879e-01  1.590e-01   1.181 0.237499    
## day_of_weekmon      -9.604e-02  8.081e-02  -1.188 0.234651    
## day_of_weekthu       6.249e-02  7.837e-02   0.797 0.425264    
## day_of_weektue       8.495e-02  8.115e-02   1.047 0.295199    
## day_of_weekwed       1.674e-01  8.068e-02   2.075 0.038029 *  
## duration             4.593e-03  9.128e-05  50.320  < 2e-16 ***
## campaign            -2.672e-02  1.373e-02  -1.946 0.051709 .  
## pdays               -7.787e-04  2.470e-04  -3.152 0.001619 ** 
## poutcomenonexistent  5.108e-01  7.871e-02   6.490 8.58e-11 ***
## poutcomesuccess      1.047e+00  2.474e-01   4.231 2.33e-05 ***
## emp.var.rate        -1.524e+00  1.382e-01 -11.022  < 2e-16 ***
## cons.price.idx       1.683e+00  1.303e-01  12.921  < 2e-16 ***
## cons.conf.idx        1.103e-02  6.732e-03   1.638 0.101330    
## euribor3m            4.387e-01  1.047e-01   4.191 2.78e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 18898  on 26733  degrees of freedom
## Residual deviance: 11301  on 26697  degrees of freedom
## AIC: 11375
## 
## Number of Fisher Scoring iterations: 9

# let us tidy the model 
library (devtools)

## Loading required package: usethis

library (broom)
tidy(mod_log)

## # A tibble: 37 x 5
##    term              estimate std.error statistic  p.value
##    <chr>                <dbl>     <dbl>     <dbl>    <dbl>
##  1 (Intercept)      -162.       12.6      -12.9   4.88e-38
##  2 jobblue-collar     -0.309     0.0806    -3.84  1.23e- 4
##  3 jobentrepreneur    -0.184     0.151     -1.22  2.24e- 1
##  4 jobhousemaid       -0.0218    0.170     -0.128 8.98e- 1
##  5 jobmanagement      -0.0414    0.103     -0.401 6.88e- 1
##  6 jobretired          0.198     0.102      1.95  5.18e- 2
##  7 jobself-employed   -0.0913    0.140     -0.653 5.14e- 1
##  8 jobservices        -0.286     0.102     -2.81  4.90e- 3
##  9 jobstudent          0.187     0.126      1.48  1.38e- 1
## 10 jobtechnician      -0.0605    0.0781    -0.775 4.39e- 1
## # … with 27 more rows

# Checking Odds Ratio 
exp(coef(mod_log))

##         (Intercept)      jobblue-collar     jobentrepreneur        jobhousemaid 
##        4.432758e-71        7.338526e-01        8.318114e-01        9.784280e-01 
##       jobmanagement          jobretired    jobself-employed         jobservices 
##        9.594204e-01        1.218796e+00        9.127205e-01        7.511883e-01 
##          jobstudent       jobtechnician       jobunemployed          jobunknown 
##        1.205228e+00        9.412592e-01        9.973700e-01        7.310454e-01 
##      defaultunknown          defaultyes    contacttelephone            monthaug 
##        7.154565e-01        1.908303e-03        5.736702e-01        2.075004e+00 
##            monthdec            monthjul            monthjun            monthmar 
##        1.140701e+00        1.036962e+00        6.911543e-01        5.846638e+00 
##            monthmay            monthnov            monthoct            monthsep 
##        5.894878e-01        5.829971e-01        1.111063e+00        1.206662e+00 
##      day_of_weekmon      day_of_weekthu      day_of_weektue      day_of_weekwed 
##        9.084267e-01        1.064480e+00        1.088659e+00        1.182203e+00 
##            duration            campaign               pdays poutcomenonexistent 
##        1.004604e+00        9.736343e-01        9.992216e-01        1.666644e+00 
##     poutcomesuccess        emp.var.rate      cons.price.idx       cons.conf.idx 
##        2.848763e+00        2.178821e-01        5.382497e+00        1.011092e+00 
##           euribor3m 
##        1.550715e+00

exp(cbind(OR = coef(mod_log), confint(mod_log)))

## Waiting for profiling to be done...

##                               OR        2.5 %       97.5 %
## (Intercept)         4.432758e-71 8.820596e-82 2.176538e-60
## jobblue-collar      7.338526e-01 6.262402e-01 8.589564e-01
## jobentrepreneur     8.318114e-01 6.142734e-01 1.112633e+00
## jobhousemaid        9.784280e-01 6.960815e-01 1.355943e+00
## jobmanagement       9.594204e-01 7.822068e-01 1.172651e+00
## jobretired          1.218796e+00 9.975868e-01 1.486509e+00
## jobself-employed    9.127205e-01 6.905498e-01 1.195323e+00
## jobservices         7.511883e-01 6.142255e-01 9.151668e-01
## jobstudent          1.205228e+00 9.403211e-01 1.540496e+00
## jobtechnician       9.412592e-01 8.072042e-01 1.096606e+00
## jobunemployed       9.973700e-01 7.340426e-01 1.343708e+00
## jobunknown          7.310454e-01 3.817421e-01 1.325868e+00
## defaultunknown      7.154565e-01 6.083242e-01 8.387037e-01
## defaultyes          1.908303e-03           NA 7.156733e+07
## contacttelephone    5.736702e-01 4.814265e-01 6.816876e-01
## monthaug            2.075004e+00 1.602933e+00 2.687120e+00
## monthdec            1.140701e+00 7.196007e-01 1.804984e+00
## monthjul            1.036962e+00 8.230246e-01 1.306254e+00
## monthjun            6.911543e-01 5.361272e-01 8.913336e-01
## monthmar            5.846638e+00 4.391679e+00 7.774960e+00
## monthmay            5.894878e-01 4.918541e-01 7.070293e-01
## monthnov            5.829971e-01 4.498423e-01 7.546865e-01
## monthoct            1.111063e+00 8.266682e-01 1.493077e+00
## monthsep            1.206662e+00 8.831113e-01 1.647494e+00
## day_of_weekmon      9.084267e-01 7.753501e-01 1.064380e+00
## day_of_weekthu      1.064480e+00 9.130428e-01 1.241456e+00
## day_of_weektue      1.088659e+00 9.286100e-01 1.276463e+00
## day_of_weekwed      1.182203e+00 1.009378e+00 1.384948e+00
## duration            1.004604e+00 1.004425e+00 1.004785e+00
## campaign            9.736343e-01 9.472224e-01 9.995974e-01
## pdays               9.992216e-01 9.987382e-01 9.997077e-01
## poutcomenonexistent 1.666644e+00 1.429992e+00 1.946947e+00
## poutcomesuccess     2.848763e+00 1.754804e+00 4.637654e+00
## emp.var.rate        2.178821e-01 1.662128e-01 2.857885e-01
## cons.price.idx      5.382497e+00 4.170129e+00 6.949455e+00
## cons.conf.idx       1.011092e+00 9.978444e-01 1.024534e+00
## euribor3m           1.550715e+00 1.262575e+00 1.903214e+00

# Let us do some prediction
predicted <- predict(mod_log, newdata=test, type="response")

# Model Evaluation

#run anova for checking the model 
anova(mod_log, test = 'Chisq')

## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: y
## 
## Terms added sequentially (first to last)
## 
## 
##                Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                           26733      18898              
## job            11    543.2     26722      18354 < 2.2e-16 ***
## default         2    263.4     26720      18091 < 2.2e-16 ***
## contact         1    443.1     26719      17648 < 2.2e-16 ***
## month           9    961.8     26710      16686 < 2.2e-16 ***
## day_of_week     4     25.5     26706      16661 4.053e-05 ***
## duration        1   3331.8     26705      13329 < 2.2e-16 ***
## campaign        1     36.7     26704      13292 1.401e-09 ***
## pdays           1    827.3     26703      12465 < 2.2e-16 ***
## poutcome        2     12.6     26701      12452 0.0017978 ** 
## emp.var.rate    1    837.6     26700      11615 < 2.2e-16 ***
## cons.price.idx  1    283.7     26699      11331 < 2.2e-16 ***
## cons.conf.idx   1     12.6     26698      11318 0.0003824 ***
## euribor3m       1     17.4     26697      11301 3.004e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Recode Predicted to factors
y_pred <- ifelse(predicted>0.5,"yes","no") # let us recode with cut off of 0.5
table(y_pred,test$y)

##       
## y_pred    no   yes
##    no  12515   937
##    yes   329   673

# Confusion Matrix 
confusionMatrix(factor(y_pred),test$y,positive = "yes")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    no   yes
##        no  12515   937
##        yes   329   673
##                                          
##                Accuracy : 0.9124         
##                  95% CI : (0.9077, 0.917)
##     No Information Rate : 0.8886         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.47           
##                                          
##  Mcnemar's Test P-Value : < 2.2e-16      
##                                          
##             Sensitivity : 0.41801        
##             Specificity : 0.97438        
##          Pos Pred Value : 0.67166        
##          Neg Pred Value : 0.93034        
##              Prevalence : 0.11139        
##          Detection Rate : 0.04656        
##    Detection Prevalence : 0.06932        
##       Balanced Accuracy : 0.69620        
##                                          
##        'Positive' Class : yes            
##

 # ROC Curve with ROCR package 

library(ROCR)

## Loading required package: gplots

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

pred <- predict(mod_log, test, type="response")
pred1 <- prediction(pred,test$y)
eval <- performance(pred1,"tpr","fpr")
plot(eval) # ROC Curve

# wee need to boost our RECALL as well as FNR