library(readr)
bankf <- read.csv("bank-additional/bank-additional-full.csv",sep = ";",header = T)
bank <- read.csv("bank-additional/bank-additional.csv",sep = ";",header = T)
data<- bankf#Backup
b<- bank
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# Checking Missing Data
colSums(is.na(data))
## age job marital education default
## 0 0 0 0 0
## housing loan contact month day_of_week
## 0 0 0 0 0
## duration campaign pdays previous poutcome
## 0 0 0 0 0
## emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed
## 0 0 0 0 0
## y
## 0
# This shows that there are no missing data in our Dataset, Good Start!
# converting all the variables into factor which have less than 4 unique values
col_names <- sapply(data, function(col) length(unique(col)) < 4)
data[ , col_names] <- lapply(data[ , col_names] , factor)
str(data)
## 'data.frame': 41188 obs. of 21 variables:
## $ age : int 56 57 37 40 56 45 59 41 24 25 ...
## $ job : Factor w/ 12 levels "admin.","blue-collar",..: 4 8 8 1 8 8 1 2 10 8 ...
## $ marital : Factor w/ 4 levels "divorced","married",..: 2 2 2 2 2 2 2 2 3 3 ...
## $ education : Factor w/ 8 levels "basic.4y","basic.6y",..: 1 4 4 2 4 3 6 8 6 4 ...
## $ default : Factor w/ 3 levels "no","unknown",..: 1 2 1 1 1 2 1 2 1 1 ...
## $ housing : Factor w/ 3 levels "no","unknown",..: 1 1 3 1 1 1 1 1 3 3 ...
## $ loan : Factor w/ 3 levels "no","unknown",..: 1 1 1 1 3 1 1 1 1 1 ...
## $ contact : Factor w/ 2 levels "cellular","telephone": 2 2 2 2 2 2 2 2 2 2 ...
## $ month : Factor w/ 10 levels "apr","aug","dec",..: 7 7 7 7 7 7 7 7 7 7 ...
## $ day_of_week : Factor w/ 5 levels "fri","mon","thu",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ duration : int 261 149 226 151 307 198 139 217 380 50 ...
## $ campaign : int 1 1 1 1 1 1 1 1 1 1 ...
## $ pdays : int 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ emp.var.rate : num 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
## $ cons.price.idx: num 94 94 94 94 94 ...
## $ cons.conf.idx : num -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
## $ euribor3m : num 4.86 4.86 4.86 4.86 4.86 ...
## $ nr.employed : num 5191 5191 5191 5191 5191 ...
## $ y : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
dim(data)
## [1] 41188 21
nearZeroVar(data, saveMetrics= TRUE)
## freqRatio percentUnique zeroVar nzv
## age 1.054713 0.189375546 FALSE FALSE
## job 1.126216 0.029134699 FALSE FALSE
## marital 2.154910 0.009711566 FALSE FALSE
## education 1.278823 0.019423133 FALSE FALSE
## default 3.790625 0.007283675 FALSE FALSE
## housing 1.158630 0.007283675 FALSE FALSE
## loan 5.433739 0.007283675 FALSE FALSE
## contact 1.737836 0.004855783 FALSE FALSE
## month 1.919292 0.024278916 FALSE FALSE
## day_of_week 1.012802 0.012139458 FALSE FALSE
## duration 1.000000 3.748664660 FALSE FALSE
## campaign 1.669063 0.101971448 FALSE FALSE
## pdays 90.371298 0.065553074 FALSE TRUE
## previous 7.797194 0.019423133 FALSE FALSE
## poutcome 8.363829 0.007283675 FALSE FALSE
## emp.var.rate 1.767639 0.024278916 FALSE FALSE
## cons.price.idx 1.161257 0.063125182 FALSE FALSE
## cons.conf.idx 1.161257 0.063125182 FALSE FALSE
## euribor3m 1.097589 0.767213752 FALSE FALSE
## nr.employed 1.902273 0.026706808 FALSE FALSE
## y 7.876724 0.004855783 FALSE FALSE
# Zero and Near Zero Variance features do not explain any variance in the predictor variable.
# Similarly I can check for linearly dependent columns among the continuous variables.
feature_map <- unlist(lapply(data, is.numeric))
findLinearCombos((data[,feature_map]))
## $linearCombos
## list()
##
## $remove
## NULL
# There are no linearly dependent columns.
# Let us see all the numeric variables in the column
num_cols <- unlist(lapply(data, is.numeric))
only_numeric<- data[, num_cols]
str(only_numeric)
## 'data.frame': 41188 obs. of 10 variables:
## $ age : int 56 57 37 40 56 45 59 41 24 25 ...
## $ duration : int 261 149 226 151 307 198 139 217 380 50 ...
## $ campaign : int 1 1 1 1 1 1 1 1 1 1 ...
## $ pdays : int 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ emp.var.rate : num 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
## $ cons.price.idx: num 94 94 94 94 94 ...
## $ cons.conf.idx : num -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
## $ euribor3m : num 4.86 4.86 4.86 4.86 4.86 ...
## $ nr.employed : num 5191 5191 5191 5191 5191 ...
# find correlations to exclude from the model
findCorrelation( cor(only_numeric), cutoff = .75, names = TRUE )
## [1] "euribor3m" "emp.var.rate"
# euribor3m and emp.var.rate are the 2 high corelated variables
data$Target <- data$y
data$y <- NULL
# Checking the proportions of the class
table(data$Target)
##
## no yes
## 36548 4640
prop.table(table(data$Target))
##
## no yes
## 0.8873458 0.1126542
# Class Imbalanced problem has occured as Majority to Minority class ratio is - 88.7 to 11:3
# Let us see the data graphically
library(ggplot2)
ggplot(data = data, mapping = aes(x =Target)) +
geom_bar(color="black",fill="orange")+
ggtitle("How is the target variable distributed")+
xlab("No->Client did not subscribe to term deposit \n Yes->Client did subscribe to term deposit")+
theme_bw()
# Class Imbalanced immensely
####### We will have to apply some sampling technique like SMOTE or Undersampling for better performance of the model, as the model will then be biased towards the majority class.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Univariate Analysis
univ_cat_df <- data %>% select_if(function(col) {is.factor(col) | is.character(col)})
for(column in colnames(univ_cat_df)){
plot(ggplot(univ_cat_df,aes_string(column)) +
geom_bar(color="black",fill="orange") + coord_flip() +
ggtitle(column) +
theme_minimal())
}
library(ggplot2)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(dplyr)
cont_univ_df <- data %>% select_if(is.numeric) %>% mutate(row_no = as.numeric(rownames(data)))
for(column in colnames(cont_univ_df[-ncol(cont_univ_df)])){
p1 <- ggplot(cont_univ_df, aes_string(x='row_no', y= column)) +
geom_point(show.legend = FALSE) +
labs(x = 'Univariate plot', y=column) +
ggtitle(column) +
theme_minimal()
# Cumulative plot
legendcols <- c("Normal distribution"="darkred","Density"="darkBlue","Histogram"="lightBlue")
p2 <- ggplot(cont_univ_df,aes_string(x = column)) +
geom_histogram(aes(y=..density.., fill ="Histogram"), bins = 50) +
stat_function(fun = dnorm, aes(color="Normal distribution"), size = 1,
args = list(mean = mean(cont_univ_df[[column]]),
sd = sd(cont_univ_df[[column]]))) +
geom_density(aes(y=..density.., color="Density"), size = 1)+
scale_colour_manual(name="Distribution",values=legendcols) +
scale_fill_manual(name="Bar",values=legendcols) +
theme_minimal() + theme(legend.position="none")
p3 <- ggplot(cont_univ_df %>% mutate(constant = column),
aes_string(x="constant", y= column, group = 123)) +
geom_boxplot() +
labs(y=column) +
theme_minimal()
p4 <- ggplot(cont_univ_df,aes_string(sample = column)) +
stat_qq() + stat_qq_line() +
theme_minimal()
grid.arrange(p1, p2, p3, p4, nrow=2)
}
## Warning: `mapping` is not used by stat_function()
## Warning: `mapping` is not used by stat_function()
## Warning: `mapping` is not used by stat_function()
## Warning: `mapping` is not used by stat_function()
## Warning: `mapping` is not used by stat_function()
## Warning: `mapping` is not used by stat_function()
## Warning: `mapping` is not used by stat_function()
## Warning: `mapping` is not used by stat_function()
## Warning: `mapping` is not used by stat_function()
## Warning: `mapping` is not used by stat_function()
################
I want to understand the relationship of each continuous variable with the \(y\) variable. I will achieve that by doing the following: 1. Plot box plot for each of the variables to do a visual comparison between the groups 2. Plot the explanatory variable distribution for both the variables to understand the variability uniquely explained (The non-intersecting part of the blue and the pink is the variation explained by the variable) 3. Predict using Logistic regression using the variable alone to observe the decrease in deviation/AIC 4. Plot Lorenz curve to compute Gini coefficient if applicable (high gini coefficient means that high inequality is caused by the column, which means more explain-ability)
library(gglorenz)
library(ineq)
library(caret)
plot_bivariate_cont <- function(raw_data, pred_column_name){
bi_var_df <- raw_df %>% select_if(is.numeric)
for(column in colnames(bi_var_df)){
p1 <- ggplot(raw_df,
aes_string(x = pred_column_name, y= column, group = pred_column_name)) +
geom_boxplot() +
labs(y=column) +
ggtitle(column) +
theme_minimal()
p2 <- ggplot(raw_df, aes_string(x=column, fill=pred_column_name)) +
geom_histogram(alpha=0.5, position="identity") +
theme_minimal()+ theme(legend.position="bottom")
grid.arrange(p1, p2, nrow=1, widths = c(1,2))
trainList_bi <- createDataPartition(y = unlist(raw_df[pred_column_name]), times = 1,p = 0.8, list = FALSE)
dfTest_bi <- raw_df[-trainList_bi,]
dfTrain_bi <- raw_df[trainList_bi,]
form_2 = as.formula(paste0(pred_column_name,' ~ ',column))
set.seed(1234)
objControl <- trainControl(method = "none",
summaryFunction = twoClassSummary,
# sampling = 'smote',
classProbs = TRUE)
cont_loop_caret_model <- train(form_2, data = dfTrain_bi,
method = 'glm',
trControl = objControl,
metric = "ROC"
)
print(summary(cont_loop_caret_model))
if(sum(raw_df[column]<0) == 0){
plot(ggplot(raw_df, aes_string(column)) +
gglorenz::stat_lorenz(color = "red") +
geom_abline(intercept = 0, slope = 1, color = 'blue') +
theme_minimal())
print(paste0('Gini coefficient = ', Gini(unlist(raw_df[column]))))
}
print(strrep("-",100))
}
}
raw_df<- data
plot_bivariate_cont(raw_df, pred_column_name = 'Target')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.6016 -0.4998 -0.4806 -0.4694 2.1715
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.399650 0.069011 -34.772 < 2e-16 ***
## age 0.008319 0.001640 5.072 3.94e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 23199 on 32950 degrees of freedom
## Residual deviance: 23174 on 32949 degrees of freedom
## AIC: 23178
##
## Number of Fisher Scoring iterations: 4
## [1] "Gini coefficient = 0.144564295379368"
## [1] "----------------------------------------------------------------------------------------------------"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.4208 -0.4256 -0.3494 -0.3062 2.5307
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.296e+00 3.165e-02 -104.13 <2e-16 ***
## duration 3.658e-03 6.544e-05 55.89 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 23199 on 32950 degrees of freedom
## Residual deviance: 19274 on 32949 degrees of freedom
## AIC: 19278
##
## Number of Fisher Scoring iterations: 5
## [1] "Gini coefficient = 0.456840836174438"
## [1] "----------------------------------------------------------------------------------------------------"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.5278 -0.5278 -0.4974 -0.4411 3.0649
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.77414 0.02768 -64.10 <2e-16 ***
## campaign -0.12668 0.01032 -12.27 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 23199 on 32950 degrees of freedom
## Residual deviance: 22995 on 32949 degrees of freedom
## AIC: 22999
##
## Number of Fisher Scoring iterations: 5
## [1] "Gini coefficient = 0.419561162655728"
## [1] "----------------------------------------------------------------------------------------------------"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.416 -0.442 -0.442 -0.442 2.179
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.443e-01 6.002e-02 9.07 <2e-16 ***
## pdays -2.824e-03 6.315e-05 -44.72 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 23199 on 32950 degrees of freedom
## Residual deviance: 21246 on 32949 degrees of freedom
## AIC: 21250
##
## Number of Fisher Scoring iterations: 5
## [1] "Gini coefficient = 0.036555463874123"
## [1] "----------------------------------------------------------------------------------------------------"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6092 -0.4345 -0.4345 -0.4345 2.1941
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.31271 0.02009 -115.12 <2e-16 ***
## previous 0.94713 0.02730 34.69 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 23199 on 32950 degrees of freedom
## Residual deviance: 22013 on 32949 degrees of freedom
## AIC: 22017
##
## Number of Fisher Scoring iterations: 5
## [1] "Gini coefficient = 0.887949359660966"
## [1] "----------------------------------------------------------------------------------------------------"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9997 -0.4434 -0.3213 -0.2961 2.5095
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.32583 0.02157 -107.81 <2e-16 ***
## emp.var.rate -0.55653 0.01135 -49.02 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 23199 on 32950 degrees of freedom
## Residual deviance: 20531 on 32949 degrees of freedom
## AIC: 20535
##
## Number of Fisher Scoring iterations: 5
##
## [1] "----------------------------------------------------------------------------------------------------"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.7526 -0.5387 -0.4190 -0.4078 2.4760
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 67.18486 2.83399 23.71 <2e-16 ***
## cons.price.idx -0.74079 0.03035 -24.41 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 23199 on 32950 degrees of freedom
## Residual deviance: 22590 on 32949 degrees of freedom
## AIC: 22594
##
## Number of Fisher Scoring iterations: 5
## [1] "Gini coefficient = 0.00347005253734073"
## [1] "----------------------------------------------------------------------------------------------------"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.6139 -0.5222 -0.4740 -0.4404 2.2523
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.582128 0.149400 -3.896 9.76e-05 ***
## cons.conf.idx 0.036848 0.003718 9.910 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 23199 on 32950 degrees of freedom
## Residual deviance: 23102 on 32949 degrees of freedom
## AIC: 23106
##
## Number of Fisher Scoring iterations: 4
##
## [1] "----------------------------------------------------------------------------------------------------"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.8531 -0.3750 -0.3020 -0.2941 2.5315
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.48707 0.03081 -15.81 <2e-16 ***
## euribor3m -0.53037 0.01063 -49.88 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 23199 on 32950 degrees of freedom
## Residual deviance: 20335 on 32949 degrees of freedom
## AIC: 20339
##
## Number of Fisher Scoring iterations: 5
## [1] "Gini coefficient = 0.240119000068784"
## [1] "----------------------------------------------------------------------------------------------------"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2810 -0.3537 -0.3431 -0.2790 2.5556
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 65.3003908 1.1880989 54.96 <2e-16 ***
## nr.employed -0.0131075 0.0002323 -56.42 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 23199 on 32950 degrees of freedom
## Residual deviance: 19707 on 32949 degrees of freedom
## AIC: 19711
##
## Number of Fisher Scoring iterations: 5
## [1] "Gini coefficient = 0.00721907070938384"
## [1] "----------------------------------------------------------------------------------------------------"
I want to understand the relationship of each categorical variable with the \(y\) variable. I will achieve that by doing the following: 1. A mosaic plot shows if any column is significantly different from base column 2. Predict using Logistic regression using the variable alone to observe the decrease in deviation/AIC
library(ggmosaic)
plot_bivariate_cat <- function(raw_d, pred_column_name){
plot_bi_cat_df <- raw_df %>% select_if(function(col) {is.factor(col) | is.character(col)})
for(column in colnames(plot_bi_cat_df)){
if(column != pred_column_name){
plot(ggplot(data = plot_bi_cat_df %>% group_by_(pred_column_name,column) %>% summarise(count = n())) +
geom_mosaic(aes_string(weight = 'count',
x = paste0("product(", pred_column_name," , ", column, ")"),
fill = pred_column_name), na.rm=TRUE) +
labs(x = column, y='%', title = column) +
theme_minimal()+theme(legend.position="bottom") +
theme(axis.text.x=element_text(angle = 45, vjust = 0.5, hjust=1)))
trainList_cat <- createDataPartition(y = unlist(raw_df[pred_column_name]), times = 1,p = 0.8, list = FALSE)
dfTest_bi_cat <- raw_df[-trainList_cat,]
dfTrain_bi_cat <- raw_df[trainList_cat,]
form_2 = as.formula(paste0(pred_column_name,' ~ ',column))
set.seed(1234)
objControl <- trainControl(method = "none",
summaryFunction = twoClassSummary,
# sampling = 'smote',
classProbs = TRUE)
cat_loop_caret_model <- train(form_2, data = dfTrain_bi_cat,
method = 'glm',
trControl = objControl,
metric = "ROC")
print(summary(cat_loop_caret_model))
}
print(strrep("-",100))
}
}
plot_bivariate_cat(raw_df, pred_column_name = 'Target')
## Warning: group_by_() is deprecated.
## Please use group_by() instead
##
## The 'programming' vignette or the tidyeval book can help you
## to program with group_by() : https://tidyeval.tidyverse.org
## This warning is displayed once per session.
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.8704 -0.5268 -0.4722 -0.3792 2.3101
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.90505 0.03271 -58.248 < 2e-16 ***
## `jobblue-collar` -0.69140 0.05629 -12.283 < 2e-16 ***
## jobentrepreneur -0.50508 0.11145 -4.532 5.85e-06 ***
## jobhousemaid -0.22788 0.11559 -1.971 0.0487 *
## jobmanagement -0.16764 0.07348 -2.281 0.0225 *
## jobretired 0.83137 0.06987 11.899 < 2e-16 ***
## `jobself-employed` -0.18819 0.09992 -1.883 0.0597 .
## jobservices -0.52679 0.07271 -7.245 4.31e-13 ***
## jobstudent 1.12978 0.08746 12.918 < 2e-16 ***
## jobtechnician -0.23255 0.05499 -4.229 2.35e-05 ***
## jobunemployed 0.11896 0.10471 1.136 0.2559
## jobunknown -0.08150 0.19430 -0.419 0.6749
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 23199 on 32950 degrees of freedom
## Residual deviance: 22539 on 32939 degrees of freedom
## AIC: 22563
##
## Number of Fisher Scoring iterations: 5
##
## [1] "----------------------------------------------------------------------------------------------------"
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.5984 -0.4675 -0.4628 -0.4628 2.1387
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.15888 0.05411 -39.902 < 2e-16 ***
## maritalmarried -0.02111 0.05897 -0.358 0.72
## maritalsingle 0.34091 0.06182 5.514 3.5e-08 ***
## maritalunknown 0.52964 0.35005 1.513 0.13
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 23199 on 32950 degrees of freedom
## Residual deviance: 23106 on 32947 degrees of freedom
## AIC: 23114
##
## Number of Fisher Scoring iterations: 4
##
## [1] "----------------------------------------------------------------------------------------------------"
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.7244 -0.5443 -0.4763 -0.4118 2.2623
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.14583 0.05682 -37.767 < 2e-16 ***
## educationbasic.6y -0.27897 0.10240 -2.724 0.00644 **
## educationbasic.9y -0.33261 0.07822 -4.252 2.12e-05 ***
## educationhigh.school 0.02674 0.06779 0.394 0.69327
## educationilliterate 0.94186 0.66073 1.425 0.15402
## educationprofessional.course 0.08556 0.07488 1.143 0.25319
## educationuniversity.degree 0.31128 0.06397 4.866 1.14e-06 ***
## educationunknown 0.39942 0.09494 4.207 2.59e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 23199 on 32950 degrees of freedom
## Residual deviance: 23033 on 32943 degrees of freedom
## AIC: 23049
##
## Number of Fisher Scoring iterations: 5
##
## [1] "----------------------------------------------------------------------------------------------------"
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.5245 -0.5245 -0.5245 -0.3284 2.4277
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.91412 0.01850 -103.439 <2e-16 ***
## defaultunknown -0.97871 0.05715 -17.126 <2e-16 ***
## defaultyes -8.65191 84.47666 -0.102 0.918
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 23199 on 32950 degrees of freedom
## Residual deviance: 22832 on 32948 degrees of freedom
## AIC: 22838
##
## Number of Fisher Scoring iterations: 9
##
## [1] "----------------------------------------------------------------------------------------------------"
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.4981 -0.4981 -0.4797 -0.4797 2.1430
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.10439 0.02629 -80.042 <2e-16 ***
## housingunknown -0.08578 0.12151 -0.706 0.4802
## housingyes 0.07985 0.03542 2.254 0.0242 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 23199 on 32950 degrees of freedom
## Residual deviance: 23193 on 32948 degrees of freedom
## AIC: 23199
##
## Number of Fisher Scoring iterations: 4
##
## [1] "----------------------------------------------------------------------------------------------------"
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.4897 -0.4896 -0.4896 -0.4896 2.1430
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.0610796 0.0191591 -107.577 <2e-16 ***
## loanunknown -0.1290878 0.1201647 -1.074 0.283
## loanyes 0.0004981 0.0487135 0.010 0.992
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 23199 on 32950 degrees of freedom
## Residual deviance: 23198 on 32948 degrees of freedom
## AIC: 23204
##
## Number of Fisher Scoring iterations: 4
##
## [1] "----------------------------------------------------------------------------------------------------"
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.5629 -0.5629 -0.5629 -0.3327 2.4174
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.76228 0.01955 -90.12 <2e-16 ***
## contacttelephone -1.10423 0.04487 -24.61 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 23199 on 32950 degrees of freedom
## Residual deviance: 22473 on 32949 degrees of freedom
## AIC: 22477
##
## Number of Fisher Scoring iterations: 5
##
## [1] "----------------------------------------------------------------------------------------------------"
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1636 -0.4666 -0.4396 -0.3731 2.3236
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.38214 0.05445 -25.385 < 2e-16 ***
## monthaug -0.74428 0.07139 -10.426 < 2e-16 ***
## monthdec 1.27986 0.17955 7.128 1.02e-12 ***
## monthjul -0.90598 0.07104 -12.753 < 2e-16 ***
## monthjun -0.78082 0.07408 -10.541 < 2e-16 ***
## monthmar 1.34950 0.11086 12.172 < 2e-16 ***
## monthmay -1.24782 0.06642 -18.787 < 2e-16 ***
## monthnov -0.80395 0.07953 -10.109 < 2e-16 ***
## monthoct 1.11078 0.10050 11.053 < 2e-16 ***
## monthsep 1.12248 0.10893 10.305 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 23199 on 32950 degrees of freedom
## Residual deviance: 21595 on 32941 degrees of freedom
## AIC: 21615
##
## Number of Fisher Scoring iterations: 5
##
## [1] "----------------------------------------------------------------------------------------------------"
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.5048 -0.5039 -0.4944 -0.4579 2.1482
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.09147 0.04029 -51.907 <2e-16 ***
## day_of_weekmon -0.11116 0.05715 -1.945 0.0518 .
## day_of_weekthu 0.09185 0.05495 1.671 0.0946 .
## day_of_weektue 0.05132 0.05607 0.915 0.3601
## day_of_weekwed 0.09553 0.05536 1.726 0.0844 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 23199 on 32950 degrees of freedom
## Residual deviance: 23180 on 32946 degrees of freedom
## AIC: 23190
##
## Number of Fisher Scoring iterations: 4
##
## [1] "----------------------------------------------------------------------------------------------------"
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4377 -0.4311 -0.4311 -0.4311 2.2010
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.78937 0.04890 -36.59 <2e-16 ***
## poutcomenonexistent -0.53987 0.05316 -10.15 <2e-16 ***
## poutcomesuccess 2.38318 0.08008 29.76 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 23199 on 32950 degrees of freedom
## Residual deviance: 21263 on 32948 degrees of freedom
## AIC: 21269
##
## Number of Fisher Scoring iterations: 5
##
## [1] "----------------------------------------------------------------------------------------------------"
## [1] "----------------------------------------------------------------------------------------------------"
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
##
## Attaching package: 'GGally'
## The following object is masked from 'package:ggmosaic':
##
## happy
## The following object is masked from 'package:dplyr':
##
## nasa
# Plot correlation heatmap
ggcorr(data, label = TRUE,
palette = "RdBu",
name = "Correlation",
hjust = 0.75,
label_size = 3,
label_round = 2)
## Warning in ggcorr(data, label = TRUE, palette = "RdBu", name = "Correlation", :
## data in column(s) 'job', 'marital', 'education', 'default', 'housing', 'loan',
## 'contact', 'month', 'day_of_week', 'poutcome', 'Target' are not numeric and were
## ignored
###### This shows that there are alot of variables which have corelation among them
library(fscaret)
## Loading required package: gsubfn
## Loading required package: proto
## Warning in doTryCatch(return(expr), name, parentenv, handler): unable to load shared object '/Library/Frameworks/R.framework/Resources/modules//R_X11.so':
## dlopen(/Library/Frameworks/R.framework/Resources/modules//R_X11.so, 6): Library not loaded: /opt/X11/lib/libSM.6.dylib
## Referenced from: /Library/Frameworks/R.framework/Versions/3.6/Resources/modules/R_X11.so
## Reason: image not found
## Could not load tcltk. Will use slower R code instead.
## Loading required package: hmeasure
## Loading required package: parallel
##
## Attaching package: 'fscaret'
## The following object is masked from 'package:caret':
##
## RMSE
library(caret)
# Make sure target variable is at the bottom of the class
set.seed(1234)
splitIndex <- createDataPartition(data$Target, p = .75, list = FALSE, times = 1)
trainDF <- data[ splitIndex,]
testDF <- data[-splitIndex,]
fsModels <- c("glm", "gbm", "treebag", "ridge", "lasso")
myFS<-fscaret(trainDF, testDF, myTimeLimit = 40, preprocessData=TRUE,
Used.funcRegPred = fsModels, with.labels=TRUE,
supress.output=FALSE, no.cores=2)
## Loading required package: R.utils
## Loading required package: multicore
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
##
## ----Packages loaded successfully----
##
##
## -----Warnings have been supressed!----
##
## ----Processing files:----
## [1] "17in_default_REGControl_glm.RData"
## [1] ""
## [1] "Calculating error for model:"
## [1] "17in_default_REGControl_glm.RData"
## [1] ""
##
## ----Processing files:----
## [1] "17in_default_REGControl_VarImp_glm.txt"
##
## matrycaVarImp.RMSE after
## [,1] [,2] [,3] [,4]
## [1,] 1.30388955 0 0 0
## [2,] 0.21764769 0 0 0
## [3,] 0.93453918 0 0 0
## [4,] 1.85272246 0 0 0
## [5,] 2.16970585 0 0 0
## [6,] 0.06917630 0 0 0
## [7,] 0.02695651 0 0 0
## [8,] 5.11617310 0 0 0
## [9,] 4.48215630 0 0 0
## [10,] 1.35181466 0 0 0
## [11,] 31.03849318 0 0 0
## [12,] 0.59067043 0 0 0
## [13,] 7.04887190 0 0 0
## [14,] 12.58578043 0 0 0
## [15,] 4.51356490 0 0 0
## [16,] 6.59184961 0 0 0
## [17,] 20.10598795 0 0 0
myFS$VarImp$matrixVarImp.MSE
## glm SUM SUM% ImpGrad Input_no
## 11 31.03849318 31.03849318 100.00000000 0.0000000 11
## 17 20.10598795 20.10598795 64.77759031 35.2224097 17
## 14 12.58578043 12.58578043 40.54894145 37.4028252 14
## 13 7.04887190 7.04887190 22.71009698 43.9933666 13
## 16 6.59184961 6.59184961 21.23765989 6.4836231 16
## 8 5.11617310 5.11617310 16.48331662 22.3863801 8
## 15 4.51356490 4.51356490 14.54182996 11.7784952 15
## 9 4.48215630 4.48215630 14.44063753 0.6958713 9
## 5 2.16970585 2.16970585 6.99037107 51.5923653 5
## 4 1.85272246 1.85272246 5.96911214 14.6095095 4
## 10 1.35181466 1.35181466 4.35528444 27.0363106 10
## 1 1.30388955 1.30388955 4.20087903 3.5452429 1
## 3 0.93453918 0.93453918 3.01090383 28.3268143 3
## 12 0.59067043 0.59067043 1.90302545 36.7955417 12
## 2 0.21764769 0.21764769 0.70121860 63.1524317 2
## 6 0.06917630 0.06917630 0.22287262 68.2163858 6
## 7 0.02695651 0.02695651 0.08684864 61.0321601 7
# Getting the Lables
results <- myFS$VarImp$matrixVarImp.MSE
results$Input_no <- as.numeric(results$Input_no)
results <- results[c("SUM","SUM%","ImpGrad","Input_no")]
myFS$PPlabels$Input_no <- as.numeric(rownames(myFS$PPlabels))
results <- merge(x=results, y=myFS$PPlabels, by="Input_no", all.x=T)
results <- results[c('Labels', 'SUM')]
results <- subset(results,results$SUM !=0)
results <- results[order(-results$SUM),]
print(results)
## Labels SUM
## 11 duration 31.03849318
## 17 nr.employed 20.10598795
## 14 poutcome 12.58578043
## 13 previous 7.04887190
## 16 cons.conf.idx 6.59184961
## 8 contact 5.11617310
## 15 cons.price.idx 4.51356490
## 9 month 4.48215630
## 5 default 2.16970585
## 4 education 1.85272246
## 10 day_of_week 1.35181466
## 1 age 1.30388955
## 3 marital 0.93453918
## 12 campaign 0.59067043
## 2 job 0.21764769
## 6 housing 0.06917630
## 7 loan 0.02695651
##### from this code we can see that all the 5 models present the following variables to be important variables in getting to know the Target Variable
# Age
# Job
# Marital Status
# Eduation
str(data)
## 'data.frame': 41188 obs. of 21 variables:
## $ age : int 56 57 37 40 56 45 59 41 24 25 ...
## $ job : Factor w/ 12 levels "admin.","blue-collar",..: 4 8 8 1 8 8 1 2 10 8 ...
## $ marital : Factor w/ 4 levels "divorced","married",..: 2 2 2 2 2 2 2 2 3 3 ...
## $ education : Factor w/ 8 levels "basic.4y","basic.6y",..: 1 4 4 2 4 3 6 8 6 4 ...
## $ default : Factor w/ 3 levels "no","unknown",..: 1 2 1 1 1 2 1 2 1 1 ...
## $ housing : Factor w/ 3 levels "no","unknown",..: 1 1 3 1 1 1 1 1 3 3 ...
## $ loan : Factor w/ 3 levels "no","unknown",..: 1 1 1 1 3 1 1 1 1 1 ...
## $ contact : Factor w/ 2 levels "cellular","telephone": 2 2 2 2 2 2 2 2 2 2 ...
## $ month : Factor w/ 10 levels "apr","aug","dec",..: 7 7 7 7 7 7 7 7 7 7 ...
## $ day_of_week : Factor w/ 5 levels "fri","mon","thu",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ duration : int 261 149 226 151 307 198 139 217 380 50 ...
## $ campaign : int 1 1 1 1 1 1 1 1 1 1 ...
## $ pdays : int 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ emp.var.rate : num 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
## $ cons.price.idx: num 94 94 94 94 94 ...
## $ cons.conf.idx : num -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
## $ euribor3m : num 4.86 4.86 4.86 4.86 4.86 ...
## $ nr.employed : num 5191 5191 5191 5191 5191 ...
## $ Target : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
# Chisq Test for Target variable with all Factor Independent variables
myTests <- lapply(data[-length(data)], function(x) chisq.test(table(x, data$Target)))
a <- (round(unlist(sapply(myTests, "[", "p.value")),4))
print(a)
## age.p.value job.p.value marital.p.value
## 0.0000 0.0000 0.0000
## education.p.value default.p.value housing.p.value
## 0.0000 0.0000 0.0583
## loan.p.value contact.p.value month.p.value
## 0.5787 0.0000 0.0000
## day_of_week.p.value duration.p.value campaign.p.value
## 0.0000 0.0000 0.0000
## pdays.p.value previous.p.value poutcome.p.value
## 0.0000 0.0000 0.0000
## emp.var.rate.p.value cons.price.idx.p.value cons.conf.idx.p.value
## 0.0000 0.0000 0.0000
## euribor3m.p.value nr.employed.p.value
## 0.0000 0.0000
# T-test for All IV(numerics) with Target Variable
options(scripen=999)
num <- unlist(lapply(data, is.numeric))
t_test <- lapply(data[,num],function(x) t.test(x ~ data$Target, var.equal = TRUE)$p.value)
print(t_test)
## $age
## [1] 6.802136e-10
##
## $duration
## [1] 0
##
## $campaign
## [1] 2.00778e-41
##
## $pdays
## [1] 0
##
## $previous
## [1] 0
##
## $emp.var.rate
## [1] 0
##
## $cons.price.idx
## [1] 9.318965e-170
##
## $cons.conf.idx
## [1] 7.536665e-29
##
## $euribor3m
## [1] 0
##
## $nr.employed
## [1] 0
# Let us see if our factor variables have more than 1 level
a<- which(sapply(bankf, function(x) length(unique(x))<2))
# let us split the data into Training and Testing data
set.seed(1234)
index1 <- sample(1:2, nrow(bankf), replace = TRUE, prob = c(0.65,0.35))
train <- bankf[index1 == 1, ]
test <- bankf[index1 == 2, ]
full <- glm(y~., data=train,family = "binomial")
null <- glm(y~1, data=train,family = "binomial")
# Forward selection
mod_log<- step(full,scope=list(lower=null,upper=full),
direction = "both")
## Start: AIC=11389.23
## y ~ age + job + marital + education + default + housing + loan +
## contact + month + day_of_week + duration + campaign + pdays +
## previous + poutcome + emp.var.rate + cons.price.idx + cons.conf.idx +
## euribor3m + nr.employed
##
## Df Deviance AIC
## - education 7 11293 11385
## - marital 3 11285 11385
## - housing 1 11283 11387
## - age 1 11283 11387
## - previous 1 11284 11388
## - job 11 11305 11389
## - nr.employed 1 11285 11389
## <none> 11283 11389
## - euribor3m 1 11286 11390
## - loan 1 11286 11390
## - campaign 1 11287 11391
## - cons.conf.idx 1 11288 11392
## - day_of_week 4 11296 11394
## - pdays 1 11294 11398
## - default 2 11299 11401
## - poutcome 2 11308 11410
## - contact 1 11326 11430
## - cons.price.idx 1 11328 11432
## - emp.var.rate 1 11375 11479
## - month 9 11615 11703
## - duration 1 14813 14917
##
## Step: AIC=11384.69
## y ~ age + job + marital + default + housing + loan + contact +
## month + day_of_week + duration + campaign + pdays + previous +
## poutcome + emp.var.rate + cons.price.idx + cons.conf.idx +
## euribor3m + nr.employed
##
## Df Deviance AIC
## - marital 3 11295 11381
## - housing 1 11293 11383
## - age 1 11293 11383
## - previous 1 11294 11384
## - nr.employed 1 11294 11384
## <none> 11293 11385
## - loan 1 11296 11386
## - euribor3m 1 11296 11386
## - campaign 1 11297 11387
## - cons.conf.idx 1 11297 11387
## + education 7 11283 11389
## - day_of_week 4 11305 11389
## - job 11 11323 11393
## - pdays 1 11303 11393
## - default 2 11309 11397
## - poutcome 2 11318 11406
## - contact 1 11336 11426
## - cons.price.idx 1 11337 11427
## - emp.var.rate 1 11385 11475
## - month 9 11632 11706
## - duration 1 14818 14908
##
## Step: AIC=11381.2
## y ~ age + job + default + housing + loan + contact + month +
## day_of_week + duration + campaign + pdays + previous + poutcome +
## emp.var.rate + cons.price.idx + cons.conf.idx + euribor3m +
## nr.employed
##
## Df Deviance AIC
## - housing 1 11295 11379
## - age 1 11295 11379
## - previous 1 11296 11380
## - nr.employed 1 11297 11381
## <none> 11295 11381
## - loan 1 11298 11382
## - euribor3m 1 11298 11382
## - campaign 1 11299 11383
## - cons.conf.idx 1 11300 11384
## + marital 3 11293 11385
## + education 7 11285 11385
## - day_of_week 4 11308 11386
## - pdays 1 11306 11390
## - job 11 11329 11393
## - default 2 11312 11394
## - poutcome 2 11320 11402
## - contact 1 11339 11423
## - cons.price.idx 1 11340 11424
## - emp.var.rate 1 11388 11472
## - month 9 11638 11706
## - duration 1 14821 14905
##
## Step: AIC=11379.23
## y ~ age + job + default + loan + contact + month + day_of_week +
## duration + campaign + pdays + previous + poutcome + emp.var.rate +
## cons.price.idx + cons.conf.idx + euribor3m + nr.employed
##
## Df Deviance AIC
## - age 1 11295 11377
## - previous 1 11296 11378
## - loan 2 11298 11378
## - nr.employed 1 11297 11379
## <none> 11295 11379
## - euribor3m 1 11298 11380
## - campaign 1 11299 11381
## + housing 1 11295 11381
## - cons.conf.idx 1 11300 11382
## + marital 3 11293 11383
## + education 7 11285 11383
## - day_of_week 4 11308 11384
## - pdays 1 11306 11388
## - job 11 11329 11391
## - default 2 11312 11392
## - poutcome 2 11320 11400
## - contact 1 11339 11421
## - cons.price.idx 1 11340 11422
## - emp.var.rate 1 11388 11470
## - month 9 11638 11704
## - duration 1 14822 14904
##
## Step: AIC=11377.33
## y ~ job + default + loan + contact + month + day_of_week + duration +
## campaign + pdays + previous + poutcome + emp.var.rate + cons.price.idx +
## cons.conf.idx + euribor3m + nr.employed
##
## Df Deviance AIC
## - previous 1 11296 11376
## - loan 2 11298 11376
## - nr.employed 1 11297 11377
## <none> 11295 11377
## - euribor3m 1 11298 11378
## + age 1 11295 11379
## - campaign 1 11299 11379
## + housing 1 11295 11379
## - cons.conf.idx 1 11300 11380
## + marital 3 11293 11381
## + education 7 11285 11381
## - day_of_week 4 11308 11382
## - pdays 1 11306 11386
## - job 11 11330 11390
## - default 2 11313 11391
## - poutcome 2 11320 11398
## - contact 1 11339 11419
## - cons.price.idx 1 11340 11420
## - emp.var.rate 1 11388 11468
## - month 9 11638 11702
## - duration 1 14822 14902
##
## Step: AIC=11376.23
## y ~ job + default + loan + contact + month + day_of_week + duration +
## campaign + pdays + poutcome + emp.var.rate + cons.price.idx +
## cons.conf.idx + euribor3m + nr.employed
##
## Df Deviance AIC
## - loan 2 11299 11375
## - nr.employed 1 11298 11376
## <none> 11296 11376
## + previous 1 11295 11377
## - euribor3m 1 11299 11377
## + age 1 11296 11378
## - campaign 1 11300 11378
## + housing 1 11296 11378
## - cons.conf.idx 1 11300 11378
## + marital 3 11294 11380
## + education 7 11286 11380
## - day_of_week 4 11309 11381
## - pdays 1 11306 11384
## - job 11 11331 11389
## - default 2 11314 11390
## - contact 1 11339 11417
## - cons.price.idx 1 11341 11419
## - poutcome 2 11358 11434
## - emp.var.rate 1 11388 11466
## - month 9 11639 11701
## - duration 1 14823 14901
##
## Step: AIC=11375.37
## y ~ job + default + contact + month + day_of_week + duration +
## campaign + pdays + poutcome + emp.var.rate + cons.price.idx +
## cons.conf.idx + euribor3m + nr.employed
##
## Df Deviance AIC
## - nr.employed 1 11301 11375
## <none> 11299 11375
## + loan 2 11296 11376
## + previous 1 11298 11376
## - euribor3m 1 11302 11376
## + age 1 11299 11377
## - campaign 1 11303 11377
## - cons.conf.idx 1 11304 11378
## + marital 3 11297 11379
## + housing 2 11299 11379
## + education 7 11290 11380
## - day_of_week 4 11312 11380
## - pdays 1 11309 11383
## - job 11 11334 11388
## - default 2 11317 11389
## - contact 1 11342 11416
## - cons.price.idx 1 11344 11418
## - poutcome 2 11362 11434
## - emp.var.rate 1 11392 11466
## - month 9 11643 11701
## - duration 1 14825 14899
##
## Step: AIC=11375.02
## y ~ job + default + contact + month + day_of_week + duration +
## campaign + pdays + poutcome + emp.var.rate + cons.price.idx +
## cons.conf.idx + euribor3m
##
## Df Deviance AIC
## <none> 11301 11375
## + nr.employed 1 11299 11375
## - cons.conf.idx 1 11304 11376
## + loan 2 11298 11376
## + previous 1 11300 11376
## + age 1 11301 11377
## - campaign 1 11305 11377
## + marital 3 11298 11378
## + housing 2 11301 11379
## - day_of_week 4 11313 11379
## + education 7 11292 11380
## - pdays 1 11311 11383
## - job 11 11335 11387
## - default 2 11318 11388
## - euribor3m 1 11318 11390
## - contact 1 11342 11414
## - poutcome 2 11363 11433
## - emp.var.rate 1 11420 11492
## - cons.price.idx 1 11467 11539
## - month 9 11644 11700
## - duration 1 14828 14900
summary(mod_log)
##
## Call:
## glm(formula = y ~ job + default + contact + month + day_of_week +
## duration + campaign + pdays + poutcome + emp.var.rate + cons.price.idx +
## cons.conf.idx + euribor3m, family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.9230 -0.3050 -0.1876 -0.1380 3.2606
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.620e+02 1.256e+01 -12.894 < 2e-16 ***
## jobblue-collar -3.094e-01 8.059e-02 -3.840 0.000123 ***
## jobentrepreneur -1.841e-01 1.514e-01 -1.216 0.223908
## jobhousemaid -2.181e-02 1.700e-01 -0.128 0.897896
## jobmanagement -4.143e-02 1.033e-01 -0.401 0.688288
## jobretired 1.979e-01 1.017e-01 1.945 0.051773 .
## jobself-employed -9.133e-02 1.399e-01 -0.653 0.513820
## jobservices -2.861e-01 1.017e-01 -2.814 0.004899 **
## jobstudent 1.867e-01 1.259e-01 1.483 0.138118
## jobtechnician -6.054e-02 7.815e-02 -0.775 0.438562
## jobunemployed -2.633e-03 1.542e-01 -0.017 0.986371
## jobunknown -3.133e-01 3.165e-01 -0.990 0.322246
## defaultunknown -3.348e-01 8.190e-02 -4.088 4.35e-05 ***
## defaultyes -6.262e+00 1.195e+02 -0.052 0.958201
## contacttelephone -5.557e-01 8.871e-02 -6.264 3.75e-10 ***
## monthaug 7.300e-01 1.318e-01 5.539 3.03e-08 ***
## monthdec 1.316e-01 2.343e-01 0.562 0.574248
## monthjul 3.630e-02 1.178e-01 0.308 0.758041
## monthjun -3.694e-01 1.297e-01 -2.849 0.004387 **
## monthmar 1.766e+00 1.456e-01 12.125 < 2e-16 ***
## monthmay -5.285e-01 9.256e-02 -5.710 1.13e-08 ***
## monthnov -5.396e-01 1.320e-01 -4.089 4.34e-05 ***
## monthoct 1.053e-01 1.508e-01 0.698 0.484879
## monthsep 1.879e-01 1.590e-01 1.181 0.237499
## day_of_weekmon -9.604e-02 8.081e-02 -1.188 0.234651
## day_of_weekthu 6.249e-02 7.837e-02 0.797 0.425264
## day_of_weektue 8.495e-02 8.115e-02 1.047 0.295199
## day_of_weekwed 1.674e-01 8.068e-02 2.075 0.038029 *
## duration 4.593e-03 9.128e-05 50.320 < 2e-16 ***
## campaign -2.672e-02 1.373e-02 -1.946 0.051709 .
## pdays -7.787e-04 2.470e-04 -3.152 0.001619 **
## poutcomenonexistent 5.108e-01 7.871e-02 6.490 8.58e-11 ***
## poutcomesuccess 1.047e+00 2.474e-01 4.231 2.33e-05 ***
## emp.var.rate -1.524e+00 1.382e-01 -11.022 < 2e-16 ***
## cons.price.idx 1.683e+00 1.303e-01 12.921 < 2e-16 ***
## cons.conf.idx 1.103e-02 6.732e-03 1.638 0.101330
## euribor3m 4.387e-01 1.047e-01 4.191 2.78e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 18898 on 26733 degrees of freedom
## Residual deviance: 11301 on 26697 degrees of freedom
## AIC: 11375
##
## Number of Fisher Scoring iterations: 9
# let us tidy the model
library (devtools)
## Loading required package: usethis
library (broom)
tidy(mod_log)
## # A tibble: 37 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -162. 12.6 -12.9 4.88e-38
## 2 jobblue-collar -0.309 0.0806 -3.84 1.23e- 4
## 3 jobentrepreneur -0.184 0.151 -1.22 2.24e- 1
## 4 jobhousemaid -0.0218 0.170 -0.128 8.98e- 1
## 5 jobmanagement -0.0414 0.103 -0.401 6.88e- 1
## 6 jobretired 0.198 0.102 1.95 5.18e- 2
## 7 jobself-employed -0.0913 0.140 -0.653 5.14e- 1
## 8 jobservices -0.286 0.102 -2.81 4.90e- 3
## 9 jobstudent 0.187 0.126 1.48 1.38e- 1
## 10 jobtechnician -0.0605 0.0781 -0.775 4.39e- 1
## # … with 27 more rows
# Checking Odds Ratio
exp(coef(mod_log))
## (Intercept) jobblue-collar jobentrepreneur jobhousemaid
## 4.432758e-71 7.338526e-01 8.318114e-01 9.784280e-01
## jobmanagement jobretired jobself-employed jobservices
## 9.594204e-01 1.218796e+00 9.127205e-01 7.511883e-01
## jobstudent jobtechnician jobunemployed jobunknown
## 1.205228e+00 9.412592e-01 9.973700e-01 7.310454e-01
## defaultunknown defaultyes contacttelephone monthaug
## 7.154565e-01 1.908303e-03 5.736702e-01 2.075004e+00
## monthdec monthjul monthjun monthmar
## 1.140701e+00 1.036962e+00 6.911543e-01 5.846638e+00
## monthmay monthnov monthoct monthsep
## 5.894878e-01 5.829971e-01 1.111063e+00 1.206662e+00
## day_of_weekmon day_of_weekthu day_of_weektue day_of_weekwed
## 9.084267e-01 1.064480e+00 1.088659e+00 1.182203e+00
## duration campaign pdays poutcomenonexistent
## 1.004604e+00 9.736343e-01 9.992216e-01 1.666644e+00
## poutcomesuccess emp.var.rate cons.price.idx cons.conf.idx
## 2.848763e+00 2.178821e-01 5.382497e+00 1.011092e+00
## euribor3m
## 1.550715e+00
exp(cbind(OR = coef(mod_log), confint(mod_log)))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 4.432758e-71 8.820596e-82 2.176538e-60
## jobblue-collar 7.338526e-01 6.262402e-01 8.589564e-01
## jobentrepreneur 8.318114e-01 6.142734e-01 1.112633e+00
## jobhousemaid 9.784280e-01 6.960815e-01 1.355943e+00
## jobmanagement 9.594204e-01 7.822068e-01 1.172651e+00
## jobretired 1.218796e+00 9.975868e-01 1.486509e+00
## jobself-employed 9.127205e-01 6.905498e-01 1.195323e+00
## jobservices 7.511883e-01 6.142255e-01 9.151668e-01
## jobstudent 1.205228e+00 9.403211e-01 1.540496e+00
## jobtechnician 9.412592e-01 8.072042e-01 1.096606e+00
## jobunemployed 9.973700e-01 7.340426e-01 1.343708e+00
## jobunknown 7.310454e-01 3.817421e-01 1.325868e+00
## defaultunknown 7.154565e-01 6.083242e-01 8.387037e-01
## defaultyes 1.908303e-03 NA 7.156733e+07
## contacttelephone 5.736702e-01 4.814265e-01 6.816876e-01
## monthaug 2.075004e+00 1.602933e+00 2.687120e+00
## monthdec 1.140701e+00 7.196007e-01 1.804984e+00
## monthjul 1.036962e+00 8.230246e-01 1.306254e+00
## monthjun 6.911543e-01 5.361272e-01 8.913336e-01
## monthmar 5.846638e+00 4.391679e+00 7.774960e+00
## monthmay 5.894878e-01 4.918541e-01 7.070293e-01
## monthnov 5.829971e-01 4.498423e-01 7.546865e-01
## monthoct 1.111063e+00 8.266682e-01 1.493077e+00
## monthsep 1.206662e+00 8.831113e-01 1.647494e+00
## day_of_weekmon 9.084267e-01 7.753501e-01 1.064380e+00
## day_of_weekthu 1.064480e+00 9.130428e-01 1.241456e+00
## day_of_weektue 1.088659e+00 9.286100e-01 1.276463e+00
## day_of_weekwed 1.182203e+00 1.009378e+00 1.384948e+00
## duration 1.004604e+00 1.004425e+00 1.004785e+00
## campaign 9.736343e-01 9.472224e-01 9.995974e-01
## pdays 9.992216e-01 9.987382e-01 9.997077e-01
## poutcomenonexistent 1.666644e+00 1.429992e+00 1.946947e+00
## poutcomesuccess 2.848763e+00 1.754804e+00 4.637654e+00
## emp.var.rate 2.178821e-01 1.662128e-01 2.857885e-01
## cons.price.idx 5.382497e+00 4.170129e+00 6.949455e+00
## cons.conf.idx 1.011092e+00 9.978444e-01 1.024534e+00
## euribor3m 1.550715e+00 1.262575e+00 1.903214e+00
# Let us do some prediction
predicted <- predict(mod_log, newdata=test, type="response")
# Model Evaluation
#run anova for checking the model
anova(mod_log, test = 'Chisq')
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: y
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 26733 18898
## job 11 543.2 26722 18354 < 2.2e-16 ***
## default 2 263.4 26720 18091 < 2.2e-16 ***
## contact 1 443.1 26719 17648 < 2.2e-16 ***
## month 9 961.8 26710 16686 < 2.2e-16 ***
## day_of_week 4 25.5 26706 16661 4.053e-05 ***
## duration 1 3331.8 26705 13329 < 2.2e-16 ***
## campaign 1 36.7 26704 13292 1.401e-09 ***
## pdays 1 827.3 26703 12465 < 2.2e-16 ***
## poutcome 2 12.6 26701 12452 0.0017978 **
## emp.var.rate 1 837.6 26700 11615 < 2.2e-16 ***
## cons.price.idx 1 283.7 26699 11331 < 2.2e-16 ***
## cons.conf.idx 1 12.6 26698 11318 0.0003824 ***
## euribor3m 1 17.4 26697 11301 3.004e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Recode Predicted to factors
y_pred <- ifelse(predicted>0.5,"yes","no") # let us recode with cut off of 0.5
table(y_pred,test$y)
##
## y_pred no yes
## no 12515 937
## yes 329 673
# Confusion Matrix
confusionMatrix(factor(y_pred),test$y,positive = "yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 12515 937
## yes 329 673
##
## Accuracy : 0.9124
## 95% CI : (0.9077, 0.917)
## No Information Rate : 0.8886
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.47
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.41801
## Specificity : 0.97438
## Pos Pred Value : 0.67166
## Neg Pred Value : 0.93034
## Prevalence : 0.11139
## Detection Rate : 0.04656
## Detection Prevalence : 0.06932
## Balanced Accuracy : 0.69620
##
## 'Positive' Class : yes
##
# ROC Curve with ROCR package
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
pred <- predict(mod_log, test, type="response")
pred1 <- prediction(pred,test$y)
eval <- performance(pred1,"tpr","fpr")
plot(eval) # ROC Curve
# wee need to boost our RECALL as well as FNR