Reading The Data into R
## Read the raw bank-marketing data into R ##
# header = TRUE is spelled out (T can be reassigned). stringsAsFactors = TRUE
# keeps text columns as factors on every R version (the default became FALSE in
# R 4.0.0), which is what the str()/summary() output recorded below shows.
Bank_marketing <- read.csv(
  "/Users/ravishankar/Downloads/Banking.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = TRUE
)
Data Preparation
# Column names of the data set
colnames(Bank_marketing)
## [1] "Age" "Job" "Marital" "Education"
## [5] "Credit.default" "Housing.Loan" "Persona.Loan" "Contact.Type"
## [9] "Month" "Day" "Duration" "Campaign"
## [13] "pdays" "previous" "poutcome" "emp.var.rate"
## [17] "cons.price.idx" "cons.conf.idx" "euribor3m" "nr.employed"
## [21] "Term.Deposit"
# First 10 observations
head(Bank_marketing, n = 10)
## Age Job Marital Education Credit.default Housing.Loan
## 1 56 housemaid married Basic no no
## 2 57 services married High School yes no
## 3 37 services married High School no yes
## 4 40 Admin married Basic no no
## 5 56 services married High School no no
## 6 45 services married Basic yes no
## 7 59 Admin married Professional Course no no
## 8 41 blue collar married High School yes no
## 9 24 technician single Professional Course no yes
## 10 25 services single High School no yes
## Persona.Loan Contact.Type Month Day Duration Campaign pdays previous
## 1 no telephone may Monday 261 1 999 0
## 2 no telephone may Monday 149 1 999 0
## 3 no telephone may Monday 226 1 999 0
## 4 no telephone may Monday 151 1 999 0
## 5 yes telephone may Monday 307 1 999 0
## 6 no telephone may Monday 198 1 999 0
## 7 no telephone may Monday 139 1 999 0
## 8 no telephone may Monday 217 1 999 0
## 9 no telephone may Monday 380 1 999 0
## 10 no telephone may Monday 50 1 999 0
## poutcome emp.var.rate cons.price.idx cons.conf.idx euribor3m
## 1 nonexistent 1.1 93.994 -36.4 4.857
## 2 nonexistent 1.1 93.994 -36.4 4.857
## 3 nonexistent 1.1 93.994 -36.4 4.857
## 4 nonexistent 1.1 93.994 -36.4 4.857
## 5 nonexistent 1.1 93.994 -36.4 4.857
## 6 nonexistent 1.1 93.994 -36.4 4.857
## 7 nonexistent 1.1 93.994 -36.4 4.857
## 8 nonexistent 1.1 93.994 -36.4 4.857
## 9 nonexistent 1.1 93.994 -36.4 4.857
## 10 nonexistent 1.1 93.994 -36.4 4.857
## nr.employed Term.Deposit
## 1 5191 no
## 2 5191 no
## 3 5191 no
## 4 5191 no
## 5 5191 no
## 6 5191 no
## 7 5191 no
## 8 5191 no
## 9 5191 no
## 10 5191 no
# Compact overview of every column: its type and first few values
str(object = Bank_marketing)
## 'data.frame': 41188 obs. of 21 variables:
## $ Age : int 56 57 37 40 56 45 59 41 24 25 ...
## $ Job : Factor w/ 11 levels "Admin","blue collar",..: 4 8 8 1 8 8 1 2 10 8 ...
## $ Marital : Factor w/ 2 levels "married","single": 1 1 1 1 1 1 1 1 2 2 ...
## $ Education : Factor w/ 5 levels "Basic","High School",..: 1 2 2 1 2 1 3 2 3 2 ...
## $ Credit.default: Factor w/ 2 levels "no","yes": 1 2 1 1 1 2 1 2 1 1 ...
## $ Housing.Loan : Factor w/ 2 levels "no","yes": 1 1 2 1 1 1 1 1 2 2 ...
## $ Persona.Loan : Factor w/ 2 levels "no","yes": 1 1 1 1 2 1 1 1 1 1 ...
## $ Contact.Type : Factor w/ 2 levels "cellular","telephone": 2 2 2 2 2 2 2 2 2 2 ...
## $ Month : Factor w/ 10 levels "apr","aug","dec",..: 7 7 7 7 7 7 7 7 7 7 ...
## $ Day : Factor w/ 5 levels "Friday","Monday",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ Duration : int 261 149 226 151 307 198 139 217 380 50 ...
## $ Campaign : int 1 1 1 1 1 1 1 1 1 1 ...
## $ pdays : int 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ emp.var.rate : num 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
## $ cons.price.idx: num 94 94 94 94 94 ...
## $ cons.conf.idx : num -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
## $ euribor3m : num 4.86 4.86 4.86 4.86 4.86 ...
## $ nr.employed : num 5191 5191 5191 5191 5191 ...
## $ Term.Deposit : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
# Per-column summary: quartiles for numeric columns, counts for factors
summary(object = Bank_marketing)
## Age Job Marital
## Min. :17.00 Admin :10422 married:29540
## 1st Qu.:32.00 blue collar: 9254 single :11648
## Median :38.00 technician : 6743
## Mean :40.02 services : 3969
## 3rd Qu.:47.00 management : 2924
## Max. :98.00 retired : 1720
## (Other) : 6156
## Education Credit.default Housing.Loan Persona.Loan
## Basic :12513 no :32588 no :18622 no :34940
## High School :11246 yes: 8600 yes:22566 yes: 6248
## Professional Course: 5243
## University degree :12168
## illiterate : 18
##
##
## Contact.Type Month Day Duration
## cellular :26144 may :13769 Friday :7827 Min. : 0.0
## telephone:15044 jul : 7174 Monday :8514 1st Qu.: 102.0
## aug : 6178 Thursday :8623 Median : 180.0
## jun : 5318 Tuesday :8090 Mean : 258.3
## nov : 4101 Wednesday:8134 3rd Qu.: 319.0
## apr : 2632 Max. :4918.0
## (Other): 2016
## Campaign pdays previous poutcome
## Min. : 1.000 Min. : 0.0 Min. :0.000 failure : 4252
## 1st Qu.: 1.000 1st Qu.:999.0 1st Qu.:0.000 nonexistent:35563
## Median : 2.000 Median :999.0 Median :0.000 success : 1373
## Mean : 2.568 Mean :962.5 Mean :0.173
## 3rd Qu.: 3.000 3rd Qu.:999.0 3rd Qu.:0.000
## Max. :56.000 Max. :999.0 Max. :7.000
##
## emp.var.rate cons.price.idx cons.conf.idx euribor3m
## Min. :-3.40000 Min. :92.20 Min. :-50.8 Min. :0.634
## 1st Qu.:-1.80000 1st Qu.:93.08 1st Qu.:-42.7 1st Qu.:1.344
## Median : 1.10000 Median :93.75 Median :-41.8 Median :4.857
## Mean : 0.08189 Mean :93.58 Mean :-40.5 Mean :3.621
## 3rd Qu.: 1.40000 3rd Qu.:93.99 3rd Qu.:-36.4 3rd Qu.:4.961
## Max. : 1.40000 Max. :94.77 Max. :-26.9 Max. :5.045
##
## nr.employed Term.Deposit
## Min. :4964 no :36548
## 1st Qu.:5099 yes: 4640
## Median :5191
## Mean :5167
## 3rd Qu.:5228
## Max. :5228
##
##Outlier Treatment
# Outliers can be detected with the help of box plots.
# They can be treated with the capping and flooring method.
##Missing Value Treatment
#No missing value treatment is needed for this dataset. In general, if more than 30% of a variable's
#values are missing, the variable can be removed; otherwise the missing values can be replaced with the
#mean if it is a representative value, or with the median if the variable is influenced by outliers.
Conversion of Dummy Variables
##Conversion of Dummy Variables - all the factor variables will be converted into dummy variables
# Example - Job (names are case-sensitive: the column is "Job" and the factor level is "blue collar")
#Bank_marketing$Admin <- ifelse(Bank_marketing$Job=="Admin",1,0)
#Bank_marketing$blue.collar <- ifelse(Bank_marketing$Job=="blue collar",1,0)
#Bank_marketing$technician <- ifelse(Bank_marketing$Job=="technician",1,0)
#Bank_marketing$services <- ifelse(Bank_marketing$Job=="services",1,0)
#Bank_marketing$management <- ifelse(Bank_marketing$Job=="management",1,0)
#Bank_marketing$retired <- ifelse(Bank_marketing$Job=="retired",1,0)
# The dummy-variable conversion for this data set was done in Excel; load the
# converted file here.
# header = TRUE is spelled out (T can be reassigned). stringsAsFactors = TRUE
# preserves the pre-4.0 default for any text columns left in the file.
# NOTE(review): confirm whether the converted file still has text columns.
Bank_data <- read.csv(
  "/Users/ravishankar/Desktop/Bank_Marketing.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = TRUE
)
Splitting the data into Train data and Test data
## Row count of the full data set, before splitting into train and test data
dim(Bank_data)[1L]
## [1] 41188
# Random 80/20 split into train and test data.
# The test rows are drawn at random without replacement and the training set
# is every remaining row, so no observation is used for both fitting and
# evaluation. (Taking rows 1:40000 as training data overlapped the random test
# sample.)
# NOTE: the model output recorded further down was produced with the earlier
# 40000-row training set; re-running gives different figures.
set.seed(123)  # fixed seed so the split is reproducible
population_Size <- nrow(Bank_data)
sample_pct <- 20 / 100
sample_size <- as.integer(sample_pct * population_Size)
test_rows <- sample(seq_len(population_Size), sample_size, replace = FALSE)
# setdiff() rather than negative indexing: stays correct if test_rows is empty
train_rows <- setdiff(seq_len(population_Size), test_rows)
test_data <- Bank_data[test_rows, ]
train_data <- Bank_data[train_rows, ]
Choosing the Desired Model
## The preferred model is logistic regression, since the target variable is binomial (categorical)
## Assumptions made for logistic regression
#1. The log-odds (logit) of the target is linear in the continuous independent variables
#2. Observations are independent of each other
#3. Unlike linear regression, the errors need not be normally distributed or have constant variance (homoscedasticity)
#4. Multicollinearity among the independent variables should be removed
Iterations
# Iteration 1: logistic regression of Term.Deposit on Age alone
iteration1 <- glm(
  formula = Term.Deposit ~ Age,
  family = binomial(logit),
  data = train_data
)
summary(iteration1)
##
## Call:
## glm(formula = Term.Deposit ~ Age, family = binomial(logit), data = train_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.5544 -0.4704 -0.4562 -0.4455 2.2109
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.474887 0.066788 -37.056 < 2e-16 ***
## Age 0.007155 0.001594 4.488 7.19e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 26173 on 39999 degrees of freedom
## Residual deviance: 26153 on 39998 degrees of freedom
## AIC: 26157
##
## Number of Fisher Scoring iterations: 4
library(car)
## Warning: package 'car' was built under R version 3.1.3
# Sequential analysis-of-deviance table (chi-square test) for iteration1
anova(iteration1, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Term.Deposit
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 39999 26173
## Age 1 19.873 39998 26153 8.275e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Iteration 2: Age plus a first set of job dummies
iteration2 <- glm(
  formula = Term.Deposit ~
    Age +
    housemaid + services + Admin + blue.collar + technician + management +
    unemployed,
  family = binomial(logit),
  data = train_data
)
summary(iteration2)
##
## Call:
## glm(formula = Term.Deposit ~ Age + housemaid + services + Admin +
## blue.collar + technician + management + unemployed, family = binomial(logit),
## data = train_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.6252 -0.4939 -0.4452 -0.3719 2.3553
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.833224 0.081801 -22.411 < 2e-16 ***
## Age 0.003158 0.001605 1.968 0.0491 *
## housemaid -0.632239 0.115459 -5.476 4.35e-08 ***
## services -0.800268 0.072314 -11.067 < 2e-16 ***
## Admin -0.322876 0.049929 -6.467 1.00e-10 ***
## blue.collar -0.942336 0.056929 -16.553 < 2e-16 ***
## technician -0.542072 0.057272 -9.465 < 2e-16 ***
## management -0.473796 0.072310 -6.552 5.67e-11 ***
## unemployed -0.231534 0.104141 -2.223 0.0262 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 26173 on 39999 degrees of freedom
## Residual deviance: 25808 on 39991 degrees of freedom
## AIC: 25826
##
## Number of Fisher Scoring iterations: 5
# Variance inflation factors for iteration2 (multicollinearity check)
car::vif(iteration2)
## Age housemaid services Admin blue.collar technician
## 1.072936 1.090764 1.306380 1.828918 1.545681 1.559250
## management unemployed
## 1.270735 1.122316
# Sequential analysis-of-deviance table (chi-square test) for iteration2
anova(iteration2, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Term.Deposit
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 39999 26173
## Age 1 19.873 39998 26153 8.275e-06 ***
## housemaid 1 2.733 39997 26151 0.09830 .
## services 1 32.525 39996 26118 1.177e-08 ***
## Admin 1 23.922 39995 26094 1.003e-06 ***
## blue.collar 1 185.018 39994 25909 < 2.2e-16 ***
## technician 1 55.138 39993 25854 1.124e-13 ***
## management 1 40.845 39992 25813 1.648e-10 ***
## unemployed 1 5.152 39991 25808 0.02322 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Iteration 3: add the remaining job dummies, marital status and education.
# "Prodessional.Course" is kept exactly as written; presumably it matches the
# column name in Bank_data. The recorded summary reports its coefficient as not
# defined because of singularities.
iteration3 <- glm(
  formula = Term.Deposit ~
    Age +
    # job dummies
    housemaid + services + Admin + blue.collar + technician + management +
    unemployed + retired + student + entrepreneur +
    # marital status
    Marital +
    # education dummies
    Basic + High.School + Prodessional.Course + University.degree,
  family = binomial(logit),
  data = train_data
)
summary(iteration3)
##
## Call:
## glm(formula = Term.Deposit ~ Age + housemaid + services + Admin +
## blue.collar + technician + management + unemployed + retired +
## student + entrepreneur + Marital + Basic + High.School +
## Prodessional.Course + University.degree, family = binomial(logit),
## data = train_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9147 -0.4815 -0.4258 -0.3665 2.4259
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.264897 0.126484 -17.907 < 2e-16 ***
## Age 0.007040 0.001981 3.554 0.00038 ***
## housemaid 0.002795 0.142885 0.020 0.98439
## services -0.233488 0.111182 -2.100 0.03572 *
## Admin 0.098883 0.094710 1.044 0.29646
## blue.collar -0.263594 0.103439 -2.548 0.01083 *
## technician -0.071569 0.100765 -0.710 0.47755
## management -0.019926 0.108634 -0.183 0.85447
## unemployed 0.312838 0.132037 2.369 0.01782 *
## retired 0.949020 0.115840 8.193 2.56e-16 ***
## student 0.946443 0.118331 7.998 1.26e-15 ***
## entrepreneur -0.187610 0.131370 -1.428 0.15326
## Marital -0.337813 0.040786 -8.283 < 2e-16 ***
## Basic -0.169902 0.068493 -2.481 0.01312 *
## High.School -0.034860 0.064485 -0.541 0.58879
## Prodessional.Course NA NA NA NA
## University.degree 0.153875 0.062524 2.461 0.01385 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 26173 on 39999 degrees of freedom
## Residual deviance: 25507 on 39984 degrees of freedom
## AIC: 25539
##
## Number of Fisher Scoring iterations: 5
# Sequential analysis-of-deviance table (chi-square test) for iteration3
anova(iteration3, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Term.Deposit
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 39999 26173
## Age 1 19.873 39998 26153 8.275e-06 ***
## housemaid 1 2.733 39997 26151 0.0982980 .
## services 1 32.525 39996 26118 1.177e-08 ***
## Admin 1 23.922 39995 26094 1.003e-06 ***
## blue.collar 1 185.018 39994 25909 < 2.2e-16 ***
## technician 1 55.138 39993 25854 1.124e-13 ***
## management 1 40.845 39992 25813 1.648e-10 ***
## unemployed 1 5.152 39991 25808 0.0232188 *
## retired 1 68.119 39990 25740 < 2.2e-16 ***
## student 1 117.363 39989 25622 < 2.2e-16 ***
## entrepreneur 1 3.586 39988 25619 0.0582785 .
## Marital 1 76.647 39987 25542 < 2.2e-16 ***
## Basic 1 18.011 39986 25524 2.196e-05 ***
## High.School 1 11.558 39985 25513 0.0006746 ***
## Prodessional.Course 0 0.000 39985 25513
## University.degree 1 6.119 39984 25507 0.0133740 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Iteration 4: add credit default, loan indicators and contact type
iteration4 <- glm(
  formula = Term.Deposit ~
    Age +
    # job dummies
    housemaid + services + Admin + blue.collar + technician + management +
    unemployed + retired + student + entrepreneur +
    # marital status
    Marital +
    # education dummies
    Basic + High.School + Prodessional.Course + University.degree +
    # credit / loan indicators and contact type
    Credit.default + Housing.Loan + Persona.Loan + Contact.Type,
  family = binomial(logit),
  data = train_data
)
summary(iteration4)
##
## Call:
## glm(formula = Term.Deposit ~ Age + housemaid + services + Admin +
## blue.collar + technician + management + unemployed + retired +
## student + entrepreneur + Marital + Basic + High.School +
## Prodessional.Course + University.degree + Credit.default +
## Housing.Loan + Persona.Loan + Contact.Type, family = binomial(logit),
## data = train_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0211 -0.5245 -0.3912 -0.3009 2.8411
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.058821 0.129604 -15.885 < 2e-16 ***
## Age 0.010304 0.002005 5.139 2.76e-07 ***
## housemaid 0.005501 0.144636 0.038 0.9697
## services -0.208812 0.112280 -1.860 0.0629 .
## Admin 0.045197 0.095658 0.472 0.6366
## blue.collar -0.201120 0.104766 -1.920 0.0549 .
## technician -0.131658 0.101903 -1.292 0.1964
## management -0.057458 0.109710 -0.524 0.6005
## unemployed 0.326672 0.133713 2.443 0.0146 *
## retired 0.803506 0.118384 6.787 1.14e-11 ***
## student 0.973841 0.120488 8.082 6.35e-16 ***
## entrepreneur -0.178927 0.132650 -1.349 0.1774
## Marital -0.273281 0.041424 -6.597 4.19e-11 ***
## Basic -0.070578 0.069520 -1.015 0.3100
## High.School 0.001859 0.065054 0.029 0.9772
## Prodessional.Course NA NA NA NA
## University.degree 0.112087 0.063169 1.774 0.0760 .
## Credit.default -0.733060 0.054404 -13.474 < 2e-16 ***
## Housing.Loan 0.001822 0.034241 0.053 0.9576
## Persona.Loan -0.069010 0.047762 -1.445 0.1485
## Contact.Type -0.933406 0.042877 -21.769 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 26173 on 39999 degrees of freedom
## Residual deviance: 24670 on 39980 degrees of freedom
## AIC: 24710
##
## Number of Fisher Scoring iterations: 5
# Sequential analysis-of-deviance table (chi-square test) for iteration4
anova(iteration4, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Term.Deposit
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 39999 26173
## Age 1 19.87 39998 26153 8.275e-06 ***
## housemaid 1 2.73 39997 26151 0.0982980 .
## services 1 32.52 39996 26118 1.177e-08 ***
## Admin 1 23.92 39995 26094 1.003e-06 ***
## blue.collar 1 185.02 39994 25909 < 2.2e-16 ***
## technician 1 55.14 39993 25854 1.124e-13 ***
## management 1 40.85 39992 25813 1.648e-10 ***
## unemployed 1 5.15 39991 25808 0.0232188 *
## retired 1 68.12 39990 25740 < 2.2e-16 ***
## student 1 117.36 39989 25622 < 2.2e-16 ***
## entrepreneur 1 3.59 39988 25619 0.0582785 .
## Marital 1 76.65 39987 25542 < 2.2e-16 ***
## Basic 1 18.01 39986 25524 2.196e-05 ***
## High.School 1 11.56 39985 25513 0.0006746 ***
## Prodessional.Course 0 0.00 39985 25513
## University.degree 1 6.12 39984 25507 0.0133740 *
## Credit.default 1 286.48 39983 25220 < 2.2e-16 ***
## Housing.Loan 1 2.85 39982 25217 0.0913077 .
## Persona.Loan 1 1.57 39981 25216 0.2105271
## Contact.Type 1 545.71 39980 24670 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Iteration 5: add the month dummies
iteration5 <- glm(
  formula = Term.Deposit ~
    Age +
    # job dummies
    housemaid + services + Admin + blue.collar + technician + management +
    unemployed + retired + student + entrepreneur +
    # marital status
    Marital +
    # education dummies
    Basic + High.School + Prodessional.Course + University.degree +
    # credit / loan indicators and contact type
    Credit.default + Housing.Loan + Persona.Loan + Contact.Type +
    # month dummies
    mar + apr + may + jun + jul + aug + sep + oct,
  family = binomial(logit),
  data = train_data
)
summary(iteration5)
##
## Call:
## glm(formula = Term.Deposit ~ Age + housemaid + services + Admin +
## blue.collar + technician + management + unemployed + retired +
## student + entrepreneur + Marital + Basic + High.School +
## Prodessional.Course + University.degree + Credit.default +
## Housing.Loan + Persona.Loan + Contact.Type + mar + apr +
## may + jun + jul + aug + sep + oct, family = binomial(logit),
## data = train_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6346 -0.4667 -0.3889 -0.2705 2.9910
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.142703 0.141860 -15.104 < 2e-16 ***
## Age 0.005704 0.002070 2.755 0.005861 **
## housemaid 0.127620 0.148535 0.859 0.390237
## services -0.147812 0.115263 -1.282 0.199705
## Admin 0.088628 0.098429 0.900 0.367889
## blue.collar -0.122526 0.107820 -1.136 0.255795
## technician -0.046105 0.104964 -0.439 0.660483
## management -0.042545 0.112828 -0.377 0.706116
## unemployed 0.256152 0.138485 1.850 0.064360 .
## retired 0.687717 0.124261 5.534 3.12e-08 ***
## student 0.804466 0.125572 6.406 1.49e-10 ***
## entrepreneur -0.110168 0.135433 -0.813 0.415961
## Marital -0.190226 0.042812 -4.443 8.86e-06 ***
## Basic -0.082812 0.071410 -1.160 0.246181
## High.School 0.013901 0.066997 0.207 0.835627
## Prodessional.Course NA NA NA NA
## University.degree 0.110845 0.065110 1.702 0.088676 .
## Credit.default -0.529607 0.055558 -9.532 < 2e-16 ***
## Housing.Loan -0.013099 0.035257 -0.372 0.710255
## Persona.Loan -0.046403 0.049055 -0.946 0.344182
## Contact.Type -1.176047 0.052141 -22.555 < 2e-16 ***
## mar 2.000945 0.102139 19.590 < 2e-16 ***
## apr 0.733357 0.070747 10.366 < 2e-16 ***
## may 0.019292 0.064446 0.299 0.764668
## jun 0.852862 0.076203 11.192 < 2e-16 ***
## jul -0.353693 0.070527 -5.015 5.30e-07 ***
## aug -0.257409 0.069127 -3.724 0.000196 ***
## sep 1.488062 0.138787 10.722 < 2e-16 ***
## oct 1.807341 0.106734 16.933 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 26173 on 39999 degrees of freedom
## Residual deviance: 23421 on 39972 degrees of freedom
## AIC: 23477
##
## Number of Fisher Scoring iterations: 5
# Sequential analysis-of-deviance table (chi-square test) for iteration5
anova(iteration5, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Term.Deposit
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 39999 26173
## Age 1 19.87 39998 26153 8.275e-06 ***
## housemaid 1 2.73 39997 26151 0.0982980 .
## services 1 32.52 39996 26118 1.177e-08 ***
## Admin 1 23.92 39995 26094 1.003e-06 ***
## blue.collar 1 185.02 39994 25909 < 2.2e-16 ***
## technician 1 55.14 39993 25854 1.124e-13 ***
## management 1 40.85 39992 25813 1.648e-10 ***
## unemployed 1 5.15 39991 25808 0.0232188 *
## retired 1 68.12 39990 25740 < 2.2e-16 ***
## student 1 117.36 39989 25622 < 2.2e-16 ***
## entrepreneur 1 3.59 39988 25619 0.0582785 .
## Marital 1 76.65 39987 25542 < 2.2e-16 ***
## Basic 1 18.01 39986 25524 2.196e-05 ***
## High.School 1 11.56 39985 25513 0.0006746 ***
## Prodessional.Course 0 0.00 39985 25513
## University.degree 1 6.12 39984 25507 0.0133740 *
## Credit.default 1 286.48 39983 25220 < 2.2e-16 ***
## Housing.Loan 1 2.85 39982 25217 0.0913077 .
## Persona.Loan 1 1.57 39981 25216 0.2105271
## Contact.Type 1 545.71 39980 24670 < 2.2e-16 ***
## mar 1 378.77 39979 24291 < 2.2e-16 ***
## apr 1 154.26 39978 24137 < 2.2e-16 ***
## may 1 16.32 39977 24121 5.361e-05 ***
## jun 1 188.40 39976 23932 < 2.2e-16 ***
## jul 1 58.83 39975 23873 1.720e-14 ***
## aug 1 120.18 39974 23753 < 2.2e-16 ***
## sep 1 65.42 39973 23688 6.059e-16 ***
## oct 1 266.98 39972 23421 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Iteration 6: add the day-of-week dummies
iteration6 <- glm(
  formula = Term.Deposit ~
    Age +
    # job dummies
    housemaid + services + Admin + blue.collar + technician + management +
    unemployed + retired + student + entrepreneur +
    # marital status
    Marital +
    # education dummies
    Basic + High.School + Prodessional.Course + University.degree +
    # credit / loan indicators and contact type
    Credit.default + Housing.Loan + Persona.Loan + Contact.Type +
    # month dummies
    mar + apr + may + jun + jul + aug + sep + oct +
    # day-of-week dummies
    Monday + Tuesday + Wednesday + Thursday,
  family = binomial(logit),
  data = train_data
)
summary(iteration6)
##
## Call:
## glm(formula = Term.Deposit ~ Age + housemaid + services + Admin +
## blue.collar + technician + management + unemployed + retired +
## student + entrepreneur + Marital + Basic + High.School +
## Prodessional.Course + University.degree + Credit.default +
## Housing.Loan + Persona.Loan + Contact.Type + mar + apr +
## may + jun + jul + aug + sep + oct + Monday + Tuesday + Wednesday +
## Thursday, family = binomial(logit), data = train_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6225 -0.4689 -0.3870 -0.2687 2.9795
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.205993 0.146631 -15.045 < 2e-16 ***
## Age 0.005938 0.002072 2.865 0.004170 **
## housemaid 0.121899 0.148667 0.820 0.412249
## services -0.151408 0.115312 -1.313 0.189172
## Admin 0.083308 0.098518 0.846 0.397770
## blue.collar -0.127912 0.107874 -1.186 0.235719
## technician -0.048897 0.105031 -0.466 0.641537
## management -0.046633 0.112909 -0.413 0.679595
## unemployed 0.248079 0.138605 1.790 0.073483 .
## retired 0.674576 0.124399 5.423 5.87e-08 ***
## student 0.799025 0.125691 6.357 2.06e-10 ***
## entrepreneur -0.114204 0.135531 -0.843 0.399431
## Marital -0.189174 0.042827 -4.417 1.00e-05 ***
## Basic -0.083258 0.071494 -1.165 0.244208
## High.School 0.017570 0.067080 0.262 0.793373
## Prodessional.Course NA NA NA NA
## University.degree 0.114120 0.065155 1.752 0.079858 .
## Credit.default -0.525133 0.055589 -9.447 < 2e-16 ***
## Housing.Loan -0.014067 0.035281 -0.399 0.690106
## Persona.Loan -0.044905 0.049095 -0.915 0.360372
## Contact.Type -1.179932 0.052184 -22.611 < 2e-16 ***
## mar 2.022773 0.102465 19.741 < 2e-16 ***
## apr 0.760729 0.071171 10.689 < 2e-16 ***
## may 0.026833 0.064521 0.416 0.677495
## jun 0.869040 0.076363 11.380 < 2e-16 ***
## jul -0.352986 0.070591 -5.000 5.72e-07 ***
## aug -0.256959 0.069170 -3.715 0.000203 ***
## sep 1.486721 0.139067 10.691 < 2e-16 ***
## oct 1.803758 0.106860 16.880 < 2e-16 ***
## Monday -0.109008 0.057084 -1.910 0.056183 .
## Tuesday 0.113770 0.056356 2.019 0.043510 *
## Wednesday 0.149380 0.056398 2.649 0.008081 **
## Thursday 0.085833 0.055213 1.555 0.120045
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 26173 on 39999 degrees of freedom
## Residual deviance: 23393 on 39968 degrees of freedom
## AIC: 23457
##
## Number of Fisher Scoring iterations: 5
# Sequential analysis-of-deviance table (chi-square test) for iteration6
anova(iteration6, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Term.Deposit
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 39999 26173
## Age 1 19.87 39998 26153 8.275e-06 ***
## housemaid 1 2.73 39997 26151 0.0982980 .
## services 1 32.52 39996 26118 1.177e-08 ***
## Admin 1 23.92 39995 26094 1.003e-06 ***
## blue.collar 1 185.02 39994 25909 < 2.2e-16 ***
## technician 1 55.14 39993 25854 1.124e-13 ***
## management 1 40.85 39992 25813 1.648e-10 ***
## unemployed 1 5.15 39991 25808 0.0232188 *
## retired 1 68.12 39990 25740 < 2.2e-16 ***
## student 1 117.36 39989 25622 < 2.2e-16 ***
## entrepreneur 1 3.59 39988 25619 0.0582785 .
## Marital 1 76.65 39987 25542 < 2.2e-16 ***
## Basic 1 18.01 39986 25524 2.196e-05 ***
## High.School 1 11.56 39985 25513 0.0006746 ***
## Prodessional.Course 0 0.00 39985 25513
## University.degree 1 6.12 39984 25507 0.0133740 *
## Credit.default 1 286.48 39983 25220 < 2.2e-16 ***
## Housing.Loan 1 2.85 39982 25217 0.0913077 .
## Persona.Loan 1 1.57 39981 25216 0.2105271
## Contact.Type 1 545.71 39980 24670 < 2.2e-16 ***
## mar 1 378.77 39979 24291 < 2.2e-16 ***
## apr 1 154.26 39978 24137 < 2.2e-16 ***
## may 1 16.32 39977 24121 5.361e-05 ***
## jun 1 188.40 39976 23932 < 2.2e-16 ***
## jul 1 58.83 39975 23873 1.720e-14 ***
## aug 1 120.18 39974 23753 < 2.2e-16 ***
## sep 1 65.42 39973 23688 6.059e-16 ***
## oct 1 266.98 39972 23421 < 2.2e-16 ***
## Monday 1 20.06 39971 23401 7.510e-06 ***
## Tuesday 1 0.56 39970 23400 0.4550487
## Wednesday 1 4.65 39969 23396 0.0310500 *
## Thursday 1 2.42 39968 23393 0.1197521
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Iteration 7: drop Prodessional.Course (its coefficient was not estimable in
# iterations 3-6) and add call duration, campaign and economic indicators
iteration7 <- glm(
  formula = Term.Deposit ~
    Age +
    # job dummies
    housemaid + services + Admin + blue.collar + technician + management +
    unemployed + retired + student + entrepreneur +
    # marital status
    Marital +
    # education dummies
    Basic + High.School + University.degree +
    # credit / loan indicators and contact type
    Credit.default + Housing.Loan + Persona.Loan + Contact.Type +
    # month dummies
    mar + apr + may + jun + jul + aug + sep + oct +
    # day-of-week dummies
    Monday + Tuesday + Wednesday + Thursday +
    # numeric predictors
    Duration + Campaign + emp.var.rate + cons.price.idx + cons.conf.idx +
    euribor3m + nr.employed + previous,
  family = binomial(logit),
  data = train_data
)
summary(iteration7)
##
## Call:
## glm(formula = Term.Deposit ~ Age + housemaid + services + Admin +
## blue.collar + technician + management + unemployed + retired +
## student + entrepreneur + Marital + Basic + High.School +
## University.degree + Credit.default + Housing.Loan + Persona.Loan +
## Contact.Type + mar + apr + may + jun + jul + aug + sep +
## oct + Monday + Tuesday + Wednesday + Thursday + Duration +
## Campaign + emp.var.rate + cons.price.idx + cons.conf.idx +
## euribor3m + nr.employed + previous, family = binomial(logit),
## data = train_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6.1024 -0.2917 -0.1843 -0.1338 3.7829
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.632e+02 3.897e+01 -9.320 < 2e-16 ***
## Age 3.907e-04 2.450e-03 0.159 0.873298
## housemaid 1.773e-01 1.810e-01 0.979 0.327458
## services -3.457e-04 1.385e-01 -0.002 0.998009
## Admin 1.472e-01 1.191e-01 1.236 0.216577
## blue.collar -7.140e-02 1.306e-01 -0.547 0.584530
## technician 1.097e-01 1.267e-01 0.866 0.386719
## management 6.099e-02 1.359e-01 0.449 0.653674
## unemployed 1.628e-01 1.687e-01 0.965 0.334367
## retired 4.356e-01 1.509e-01 2.886 0.003896 **
## student 3.626e-01 1.519e-01 2.387 0.016996 *
## entrepreneur -2.906e-02 1.641e-01 -0.177 0.859497
## Marital -1.186e-01 5.196e-02 -2.283 0.022442 *
## Basic -8.457e-02 8.555e-02 -0.988 0.322918
## High.School -6.556e-02 8.063e-02 -0.813 0.416182
## University.degree 7.731e-02 7.834e-02 0.987 0.323713
## Credit.default -2.593e-01 6.749e-02 -3.843 0.000122 ***
## Housing.Loan -1.502e-02 4.227e-02 -0.355 0.722277
## Persona.Loan -6.013e-02 5.939e-02 -1.013 0.311273
## Contact.Type -4.345e-01 7.944e-02 -5.469 4.53e-08 ***
## mar 1.876e+00 1.493e-01 12.567 < 2e-16 ***
## apr -4.106e-01 1.372e-01 -2.993 0.002766 **
## may -4.902e-01 1.121e-01 -4.372 1.23e-05 ***
## jun -1.328e+00 1.751e-01 -7.587 3.26e-14 ***
## jul 5.470e-01 1.144e-01 4.783 1.73e-06 ***
## aug 2.580e+00 1.511e-01 17.076 < 2e-16 ***
## sep 5.779e-01 1.719e-01 3.363 0.000772 ***
## oct 7.649e-01 1.292e-01 5.921 3.19e-09 ***
## Monday -6.700e-02 6.822e-02 -0.982 0.326059
## Tuesday 1.041e-01 6.779e-02 1.535 0.124714
## Wednesday 1.619e-01 6.811e-02 2.377 0.017444 *
## Thursday 8.582e-02 6.629e-02 1.295 0.195442
## Duration 4.753e-03 7.608e-05 62.467 < 2e-16 ***
## Campaign -3.451e-02 1.172e-02 -2.945 0.003229 **
## emp.var.rate -2.771e+00 1.567e-01 -17.678 < 2e-16 ***
## cons.price.idx 3.854e+00 2.766e-01 13.932 < 2e-16 ***
## cons.conf.idx -5.707e-02 1.054e-02 -5.415 6.11e-08 ***
## euribor3m 9.922e-01 1.408e-01 7.045 1.85e-12 ***
## nr.employed -1.412e-03 3.159e-03 -0.447 0.654973
## previous 1.224e-01 3.830e-02 3.197 0.001390 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 26173 on 39999 degrees of freedom
## Residual deviance: 16065 on 39960 degrees of freedom
## AIC: 16145
##
## Number of Fisher Scoring iterations: 6
# Variance inflation factors for iteration7 (multicollinearity check)
car::vif(iteration7)
## Age housemaid services Admin
## 2.107506 1.694202 3.270716 6.584982
## blue.collar technician management unemployed
## 5.388995 4.909448 2.932368 1.781019
## retired student entrepreneur Marital
## 3.534897 2.585975 1.833974 1.386768
## Basic High.School University.degree Credit.default
## 3.184347 2.979485 3.189946 1.134886
## Housing.Loan Persona.Loan Contact.Type mar
## 1.011142 1.003787 2.325672 2.502671
## apr may jun jul
## 5.095600 5.278746 8.125101 3.342961
## aug sep oct Monday
## 6.182014 1.495800 1.649216 1.680410
## Tuesday Wednesday Thursday Duration
## 1.714373 1.690382 1.730033 1.255727
## Campaign emp.var.rate cons.price.idx cons.conf.idx
## 1.055949 174.940272 65.724224 9.528612
## euribor3m nr.employed previous
## 151.121193 139.380641 1.312375
# Sequential analysis-of-deviance table (chi-square test) for iteration7
anova(iteration7, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Term.Deposit
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 39999 26173
## Age 1 19.9 39998 26153 8.275e-06 ***
## housemaid 1 2.7 39997 26151 0.0982980 .
## services 1 32.5 39996 26118 1.177e-08 ***
## Admin 1 23.9 39995 26094 1.003e-06 ***
## blue.collar 1 185.0 39994 25909 < 2.2e-16 ***
## technician 1 55.1 39993 25854 1.124e-13 ***
## management 1 40.8 39992 25813 1.648e-10 ***
## unemployed 1 5.2 39991 25808 0.0232188 *
## retired 1 68.1 39990 25740 < 2.2e-16 ***
## student 1 117.4 39989 25622 < 2.2e-16 ***
## entrepreneur 1 3.6 39988 25619 0.0582785 .
## Marital 1 76.6 39987 25542 < 2.2e-16 ***
## Basic 1 18.0 39986 25524 2.196e-05 ***
## High.School 1 11.6 39985 25513 0.0006746 ***
## University.degree 1 6.1 39984 25507 0.0133740 *
## Credit.default 1 286.5 39983 25220 < 2.2e-16 ***
## Housing.Loan 1 2.9 39982 25217 0.0913077 .
## Persona.Loan 1 1.6 39981 25216 0.2105271
## Contact.Type 1 545.7 39980 24670 < 2.2e-16 ***
## mar 1 378.8 39979 24291 < 2.2e-16 ***
## apr 1 154.3 39978 24137 < 2.2e-16 ***
## may 1 16.3 39977 24121 5.361e-05 ***
## jun 1 188.4 39976 23932 < 2.2e-16 ***
## jul 1 58.8 39975 23873 1.720e-14 ***
## aug 1 120.2 39974 23753 < 2.2e-16 ***
## sep 1 65.4 39973 23688 6.059e-16 ***
## oct 1 267.0 39972 23421 < 2.2e-16 ***
## Monday 1 20.1 39971 23401 7.510e-06 ***
## Tuesday 1 0.6 39970 23400 0.4550487
## Wednesday 1 4.7 39969 23396 0.0310500 *
## Thursday 1 2.4 39968 23393 0.1197521
## Duration 1 5226.5 39967 18167 < 2.2e-16 ***
## Campaign 1 45.7 39966 18121 1.411e-11 ***
## emp.var.rate 1 1438.2 39965 16683 < 2.2e-16 ***
## cons.price.idx 1 496.9 39964 16186 < 2.2e-16 ***
## cons.conf.idx 1 10.3 39963 16176 0.0013171 **
## euribor3m 1 99.8 39962 16076 < 2.2e-16 ***
## nr.employed 1 0.1 39961 16076 0.7008176
## previous 1 10.2 39960 16065 0.0014018 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Iteration 8: drop the variables judged to have insignificant p-values in
# iteration7 and refit the logistic regression on the remaining predictors.
iteration8 <- glm(
  Term.Deposit ~ retired + student + Marital + Credit.default + Housing.Loan +
    Contact.Type + mar + apr + may + jun + jul + aug + sep + oct + Wednesday +
    Duration + Campaign + emp.var.rate + cons.price.idx + cons.conf.idx +
    euribor3m + previous,
  data = train_data,
  family = binomial(logit)
)
summary(iteration8)
##
## Call:
## glm(formula = Term.Deposit ~ retired + student + Marital + Credit.default +
## Housing.Loan + Contact.Type + mar + apr + may + jun + jul +
## aug + sep + oct + Wednesday + Duration + Campaign + emp.var.rate +
## cons.price.idx + cons.conf.idx + euribor3m + previous, family = binomial(logit),
## data = train_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6.1001 -0.2923 -0.1842 -0.1350 3.7128
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.842e+02 1.796e+01 -21.387 < 2e-16 ***
## retired 3.394e-01 8.410e-02 4.036 5.44e-05 ***
## student 2.260e-01 9.955e-02 2.270 0.023213 *
## Marital -1.557e-01 4.676e-02 -3.330 0.000869 ***
## Credit.default -3.151e-01 6.598e-02 -4.775 1.80e-06 ***
## Housing.Loan -8.175e-03 4.213e-02 -0.194 0.846154
## Contact.Type -4.608e-01 7.646e-02 -6.026 1.68e-09 ***
## mar 1.917e+00 1.451e-01 13.215 < 2e-16 ***
## apr -4.506e-01 1.299e-01 -3.468 0.000525 ***
## may -5.304e-01 1.103e-01 -4.809 1.52e-06 ***
## jun -1.402e+00 1.426e-01 -9.836 < 2e-16 ***
## jul 5.147e-01 1.035e-01 4.975 6.53e-07 ***
## aug 2.636e+00 1.489e-01 17.702 < 2e-16 ***
## sep 6.165e-01 1.620e-01 3.806 0.000141 ***
## oct 7.703e-01 1.278e-01 6.026 1.69e-09 ***
## Wednesday 1.252e-01 5.268e-02 2.377 0.017445 *
## Duration 4.740e-03 7.579e-05 62.539 < 2e-16 ***
## Campaign -3.517e-02 1.172e-02 -3.000 0.002697 **
## emp.var.rate -2.844e+00 1.394e-01 -20.399 < 2e-16 ***
## cons.price.idx 4.005e+00 1.876e-01 21.348 < 2e-16 ***
## cons.conf.idx -5.267e-02 7.756e-03 -6.791 1.11e-11 ***
## euribor3m 9.659e-01 9.147e-02 10.559 < 2e-16 ***
## previous 1.209e-01 3.816e-02 3.169 0.001528 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 26173 on 39999 degrees of freedom
## Residual deviance: 16108 on 39977 degrees of freedom
## AIC: 16154
##
## Number of Fisher Scoring iterations: 6
# Variance inflation factor of each predictor in iteration8
# (multicollinearity check).
vif(iteration8)
## retired student Marital Credit.default Housing.Loan
## 1.093873 1.101369 1.125613 1.086046 1.007248
## Contact.Type mar apr may jun
## 2.159618 2.364508 4.587985 5.123953 5.438998
## jul aug sep oct Wednesday
## 2.739731 5.988754 1.333275 1.622380 1.012711
## Duration Campaign emp.var.rate cons.price.idx cons.conf.idx
## 1.249779 1.051470 138.753046 30.478336 5.181194
## euribor3m previous
## 63.901680 1.308782
# Sequential analysis-of-deviance table (chi-square tests) for iteration8.
anova(iteration8, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Term.Deposit
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 39999 26173
## retired 1 203.2 39998 25970 < 2.2e-16 ***
## student 1 172.6 39997 25797 < 2.2e-16 ***
## Marital 1 91.8 39996 25706 < 2.2e-16 ***
## Credit.default 1 333.9 39995 25372 < 2.2e-16 ***
## Housing.Loan 1 3.5 39994 25368 0.061660 .
## Contact.Type 1 583.3 39993 24785 < 2.2e-16 ***
## mar 1 395.6 39992 24389 < 2.2e-16 ***
## apr 1 151.5 39991 24238 < 2.2e-16 ***
## may 1 28.8 39990 24209 7.876e-08 ***
## jun 1 179.9 39989 24029 < 2.2e-16 ***
## jul 1 80.0 39988 23949 < 2.2e-16 ***
## aug 1 113.5 39987 23835 < 2.2e-16 ***
## sep 1 72.0 39986 23763 < 2.2e-16 ***
## oct 1 273.6 39985 23490 < 2.2e-16 ***
## Wednesday 1 7.5 39984 23482 0.006180 **
## Duration 1 5198.6 39983 18284 < 2.2e-16 ***
## Campaign 1 45.6 39982 18238 1.436e-11 ***
## emp.var.rate 1 1482.9 39981 16755 < 2.2e-16 ***
## cons.price.idx 1 523.3 39980 16232 < 2.2e-16 ***
## cons.conf.idx 1 8.8 39979 16223 0.002972 **
## euribor3m 1 105.4 39978 16118 < 2.2e-16 ***
## previous 1 10.0 39977 16108 0.001542 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Iteration 9: same predictors as iteration8 except Housing.Loan, whose
# coefficient was not significant in the iteration8 summary (p = 0.846).
iteration9 <- glm(
  Term.Deposit ~ retired + student + Marital + Credit.default + Contact.Type +
    mar + apr + may + jun + jul + aug + sep + oct + Wednesday + Duration +
    Campaign + emp.var.rate + cons.price.idx + cons.conf.idx + euribor3m +
    previous,
  data = train_data,
  family = binomial(logit)
)
summary(iteration9)
##
## Call:
## glm(formula = Term.Deposit ~ retired + student + Marital + Credit.default +
## Contact.Type + mar + apr + may + jun + jul + aug + sep +
## oct + Wednesday + Duration + Campaign + emp.var.rate + cons.price.idx +
## cons.conf.idx + euribor3m + previous, family = binomial(logit),
## data = train_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6.1008 -0.2922 -0.1842 -0.1350 3.7118
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.842e+02 1.796e+01 -21.385 < 2e-16 ***
## retired 3.396e-01 8.410e-02 4.038 5.39e-05 ***
## student 2.262e-01 9.955e-02 2.272 0.023087 *
## Marital -1.556e-01 4.676e-02 -3.329 0.000873 ***
## Credit.default -3.150e-01 6.598e-02 -4.774 1.80e-06 ***
## Contact.Type -4.604e-01 7.644e-02 -6.024 1.70e-09 ***
## mar 1.917e+00 1.451e-01 13.216 < 2e-16 ***
## apr -4.503e-01 1.299e-01 -3.466 0.000528 ***
## may -5.301e-01 1.103e-01 -4.807 1.53e-06 ***
## jun -1.402e+00 1.425e-01 -9.834 < 2e-16 ***
## jul 5.150e-01 1.034e-01 4.978 6.41e-07 ***
## aug 2.636e+00 1.489e-01 17.701 < 2e-16 ***
## sep 6.161e-01 1.620e-01 3.804 0.000142 ***
## oct 7.705e-01 1.278e-01 6.027 1.67e-09 ***
## Wednesday 1.252e-01 5.268e-02 2.377 0.017464 *
## Duration 4.740e-03 7.578e-05 62.544 < 2e-16 ***
## Campaign -3.517e-02 1.172e-02 -3.001 0.002694 **
## emp.var.rate -2.844e+00 1.394e-01 -20.398 < 2e-16 ***
## cons.price.idx 4.005e+00 1.876e-01 21.346 < 2e-16 ***
## cons.conf.idx -5.264e-02 7.754e-03 -6.788 1.13e-11 ***
## euribor3m 9.658e-01 9.147e-02 10.559 < 2e-16 ***
## previous 1.211e-01 3.815e-02 3.173 0.001509 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 26173 on 39999 degrees of freedom
## Residual deviance: 16108 on 39978 degrees of freedom
## AIC: 16152
##
## Number of Fisher Scoring iterations: 6
# Variance inflation factor of each predictor in iteration9
# (multicollinearity check).
vif(iteration9)
## retired student Marital Credit.default Contact.Type
## 1.093752 1.101253 1.125567 1.086047 2.158255
## mar apr may jun jul
## 2.364445 4.587293 5.122649 5.436576 2.739101
## aug sep oct Wednesday Duration
## 5.988899 1.333151 1.622316 1.012701 1.249654
## Campaign emp.var.rate cons.price.idx cons.conf.idx euribor3m
## 1.051460 138.746810 30.476720 5.178905 63.899553
## previous
## 1.308420
# Sequential analysis-of-deviance table (chi-square tests) for iteration9.
anova(iteration9, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Term.Deposit
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 39999 26173
## retired 1 203.2 39998 25970 < 2.2e-16 ***
## student 1 172.6 39997 25797 < 2.2e-16 ***
## Marital 1 91.8 39996 25706 < 2.2e-16 ***
## Credit.default 1 333.9 39995 25372 < 2.2e-16 ***
## Contact.Type 1 586.8 39994 24785 < 2.2e-16 ***
## mar 1 395.6 39993 24389 < 2.2e-16 ***
## apr 1 151.4 39992 24238 < 2.2e-16 ***
## may 1 28.9 39991 24209 7.712e-08 ***
## jun 1 180.0 39990 24029 < 2.2e-16 ***
## jul 1 79.9 39989 23949 < 2.2e-16 ***
## aug 1 113.4 39988 23836 < 2.2e-16 ***
## sep 1 71.9 39987 23764 < 2.2e-16 ***
## oct 1 273.8 39986 23490 < 2.2e-16 ***
## Wednesday 1 7.5 39985 23482 0.006215 **
## Duration 1 5198.6 39984 18284 < 2.2e-16 ***
## Campaign 1 45.6 39983 18238 1.439e-11 ***
## emp.var.rate 1 1483.0 39982 16755 < 2.2e-16 ***
## cons.price.idx 1 523.3 39981 16232 < 2.2e-16 ***
## cons.conf.idx 1 8.8 39980 16223 0.003016 **
## euribor3m 1 105.4 39979 16118 < 2.2e-16 ***
## previous 1 10.0 39978 16108 0.001524 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Iteration 10: iteration9 without apr, may, jun, emp.var.rate,
# cons.price.idx, cons.conf.idx and euribor3m.
# NOTE(review): presumably these are dropped to bring down the large VIFs
# reported for iteration9 -- confirm the selection rule.
iteration10 <- glm(
  Term.Deposit ~ retired + student + Marital + Credit.default + Contact.Type +
    mar + jul + aug + sep + oct + Wednesday + Duration + Campaign + previous,
  data = train_data,
  family = binomial(logit)
)
summary(iteration10)
##
## Call:
## glm(formula = Term.Deposit ~ retired + student + Marital + Credit.default +
## Contact.Type + mar + jul + aug + sep + oct + Wednesday +
## Duration + Campaign + previous, family = binomial(logit),
## data = train_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.7912 -0.3683 -0.2505 -0.1677 3.0608
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.241e+00 5.674e-02 -57.118 < 2e-16 ***
## retired 9.510e-01 7.889e-02 12.056 < 2e-16 ***
## student 8.502e-01 9.404e-02 9.041 < 2e-16 ***
## Marital -2.371e-01 4.403e-02 -5.386 7.2e-08 ***
## Credit.default -6.887e-01 6.243e-02 -11.032 < 2e-16 ***
## Contact.Type -1.029e+00 5.424e-02 -18.974 < 2e-16 ***
## mar 2.205e+00 9.935e-02 22.189 < 2e-16 ***
## jul -6.843e-01 6.573e-02 -10.412 < 2e-16 ***
## aug -1.485e-01 6.047e-02 -2.456 0.014 *
## sep 1.461e+00 1.445e-01 10.109 < 2e-16 ***
## oct 1.790e+00 1.066e-01 16.794 < 2e-16 ***
## Wednesday 3.271e-02 4.945e-02 0.662 0.508
## Duration 4.292e-03 6.879e-05 62.385 < 2e-16 ***
## Campaign -6.705e-02 1.147e-02 -5.847 5.0e-09 ***
## previous 6.557e-01 3.508e-02 18.690 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 26173 on 39999 degrees of freedom
## Residual deviance: 18249 on 39985 degrees of freedom
## AIC: 18279
##
## Number of Fisher Scoring iterations: 6
# Variance inflation factor of each predictor in iteration10
# (multicollinearity check).
vif(iteration10)
## retired student Marital Credit.default Contact.Type
## 1.054572 1.071277 1.106137 1.045623 1.207102
## mar jul aug sep oct
## 1.054348 1.184041 1.161304 1.027613 1.048630
## Wednesday Duration Campaign previous
## 1.003463 1.116754 1.034729 1.144899
# Sequential analysis-of-deviance table (chi-square tests) for iteration10.
anova(iteration10, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Term.Deposit
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 39999 26173
## retired 1 203.2 39998 25970 < 2.2e-16 ***
## student 1 172.6 39997 25797 < 2.2e-16 ***
## Marital 1 91.8 39996 25706 < 2.2e-16 ***
## Credit.default 1 333.9 39995 25372 < 2.2e-16 ***
## Contact.Type 1 586.8 39994 24785 < 2.2e-16 ***
## mar 1 395.6 39993 24389 < 2.2e-16 ***
## jul 1 144.8 39992 24244 < 2.2e-16 ***
## aug 1 129.7 39991 24115 < 2.2e-16 ***
## sep 1 75.6 39990 24039 < 2.2e-16 ***
## oct 1 239.6 39989 23800 < 2.2e-16 ***
## Wednesday 1 3.2 39988 23796 0.07395 .
## Duration 1 5161.8 39987 18634 < 2.2e-16 ***
## Campaign 1 44.7 39986 18590 2.269e-11 ***
## previous 1 340.3 39985 18250 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Iteration 11: iteration10 with apr, may, jun and cons.conf.idx added back.
iteration11 <- glm(
  Term.Deposit ~ retired + student + Marital + Credit.default + Contact.Type +
    mar + apr + may + jun + jul + aug + sep + oct + Wednesday + Duration +
    Campaign + cons.conf.idx + previous,
  data = train_data,
  family = binomial(logit)
)
summary(iteration11)
##
## Call:
## glm(formula = Term.Deposit ~ retired + student + Marital + Credit.default +
## Contact.Type + mar + apr + may + jun + jul + aug + sep +
## oct + Wednesday + Duration + Campaign + cons.conf.idx + previous,
## family = binomial(logit), data = train_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.6387 -0.3363 -0.2341 -0.1640 3.4295
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.764e+00 2.574e-01 6.853 7.24e-12 ***
## retired 7.058e-01 8.165e-02 8.645 < 2e-16 ***
## student 6.325e-01 9.669e-02 6.542 6.08e-11 ***
## Marital -2.155e-01 4.524e-02 -4.764 1.90e-06 ***
## Credit.default -6.039e-01 6.346e-02 -9.517 < 2e-16 ***
## Contact.Type -1.843e+00 7.233e-02 -25.474 < 2e-16 ***
## mar 2.988e+00 1.281e-01 23.326 < 2e-16 ***
## apr 1.575e+00 9.394e-02 16.770 < 2e-16 ***
## may 4.579e-01 8.261e-02 5.543 2.97e-08 ***
## jun 1.604e+00 9.352e-02 17.152 < 2e-16 ***
## jul -1.831e-01 8.668e-02 -2.112 0.034647 *
## aug -5.646e-01 8.473e-02 -6.663 2.68e-11 ***
## sep 5.385e-01 1.616e-01 3.332 0.000862 ***
## oct 5.948e-01 1.395e-01 4.263 2.01e-05 ***
## Wednesday 9.707e-02 5.072e-02 1.914 0.055669 .
## Duration 4.385e-03 7.046e-05 62.243 < 2e-16 ***
## Campaign -5.596e-02 1.152e-02 -4.858 1.19e-06 ***
## cons.conf.idx 1.318e-01 6.292e-03 20.947 < 2e-16 ***
## previous 4.663e-01 3.735e-02 12.485 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 26173 on 39999 degrees of freedom
## Residual deviance: 17454 on 39981 degrees of freedom
## AIC: 17492
##
## Number of Fisher Scoring iterations: 6
# Variance inflation factor of each predictor in iteration11
# (multicollinearity check).
vif(iteration11)
## retired student Marital Credit.default Contact.Type
## 1.070323 1.080880 1.112779 1.058664 2.099328
## mar apr may jun jul
## 1.374010 2.399577 3.156431 2.530315 2.011731
## aug sep oct Wednesday Duration
## 2.234660 1.296207 1.688622 1.011577 1.134212
## Campaign cons.conf.idx previous
## 1.039015 3.154452 1.218036
# Sequential analysis-of-deviance table (chi-square tests) for iteration11,
# the model fitted and summarised in this section. The call used to pass
# iteration9, which is a different model.
# NOTE(review): the table rendered below is still the iteration9 output (it
# lists emp.var.rate, cons.price.idx and euribor3m, which iteration11 does
# not contain); re-run this statement to regenerate it.
anova(object = iteration11, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Term.Deposit
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 39999 26173
## retired 1 203.2 39998 25970 < 2.2e-16 ***
## student 1 172.6 39997 25797 < 2.2e-16 ***
## Marital 1 91.8 39996 25706 < 2.2e-16 ***
## Credit.default 1 333.9 39995 25372 < 2.2e-16 ***
## Contact.Type 1 586.8 39994 24785 < 2.2e-16 ***
## mar 1 395.6 39993 24389 < 2.2e-16 ***
## apr 1 151.4 39992 24238 < 2.2e-16 ***
## may 1 28.9 39991 24209 7.712e-08 ***
## jun 1 180.0 39990 24029 < 2.2e-16 ***
## jul 1 79.9 39989 23949 < 2.2e-16 ***
## aug 1 113.4 39988 23836 < 2.2e-16 ***
## sep 1 71.9 39987 23764 < 2.2e-16 ***
## oct 1 273.8 39986 23490 < 2.2e-16 ***
## Wednesday 1 7.5 39985 23482 0.006215 **
## Duration 1 5198.6 39984 18284 < 2.2e-16 ***
## Campaign 1 45.6 39983 18238 1.439e-11 ***
## emp.var.rate 1 1483.0 39982 16755 < 2.2e-16 ***
## cons.price.idx 1 523.3 39981 16232 < 2.2e-16 ***
## cons.conf.idx 1 8.8 39980 16223 0.003016 **
## euribor3m 1 105.4 39979 16118 < 2.2e-16 ***
## previous 1 10.0 39978 16108 0.001524 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
CONCORDANCE
# Concordance summary for a fitted binomial GLM.
#
# Args:
#   GLM.binomial: a stored binomial glm object, or any list-like object that
#     exposes a 0/1 response vector `y` and a same-length `fitted.values`
#     vector.
#
# Returns:
#   A list with the rates `concordance`, `discordance` and `tie_rate`, each
#   followed by its count (`num_concordant`, `num_discordant`, `num_tied`).
#   When either outcome class is empty there are no pairs to compare: the
#   rates are NaN and the counts are 0.
#
# Method: the rows with y == 1 (events) and y == 0 (non-events) are taken in
# their original order, the longer group is truncated to the length of the
# shorter one, and the i-th event is compared with the i-th non-event. A pair
# is concordant when the event has the larger fitted value, discordant when it
# has the smaller one, and tied when both are exactly equal.
# NOTE(review): this compares one positional pairing only, not every
# event/non-event combination, so the result depends on row order.
Concordance <- function(GLM.binomial) {
  outcome_and_fitted <- cbind(GLM.binomial$y, GLM.binomial$fitted.values)
  outcome <- outcome_and_fitted[, 1]

  # drop = FALSE keeps a two-column matrix even when a class has exactly one
  # observation; without it the subset collapses to a plain vector and the
  # column indexing below fails.
  ones <- outcome_and_fitted[which(outcome == 1), , drop = FALSE]
  zeros <- outcome_and_fitted[which(outcome == 0), , drop = FALSE]

  # Equate the length of the event and non-event tables. seq_len() yields an
  # empty index (rather than c(1, 0)) when one of the classes is empty.
  n_pairs <- min(nrow(ones), nrow(zeros))
  pair_index <- seq_len(n_pairs)
  event_fitted <- ones[pair_index, 2]
  non_event_fitted <- zeros[pair_index, 2]

  # One 0/1 indicator per pair; exactly one of the three is 1 for each pair.
  conc <- as.numeric(event_fitted > non_event_fitted)
  ties <- as.numeric(event_fitted == non_event_fitted)
  disc <- as.numeric(event_fitted < non_event_fitted)

  list(
    concordance = mean(conc, na.rm = TRUE),
    num_concordant = sum(conc, na.rm = TRUE),
    discordance = mean(disc, na.rm = TRUE),
    num_discordant = sum(disc, na.rm = TRUE),
    tie_rate = mean(ties, na.rm = TRUE),
    num_tied = sum(ties, na.rm = TRUE)
  )
}
# Concordance, discordance and tie summary for iteration7, computed with the
# Concordance() helper defined above.
Concordance(iteration7)
## $concordance
## [1] 0.9809312
##
## $num_concordant
## [1] 3961
##
## $discordance
## [1] 0.01906885
##
## $num_discordant
## [1] 77
##
## $tie_rate
## [1] 0
##
## $num_tied
## [1] 0
# Concordance, discordance and tie summary for iteration8, computed with the
# Concordance() helper defined above.
Concordance(iteration8)
## $concordance
## [1] 0.9809312
##
## $num_concordant
## [1] 3961
##
## $discordance
## [1] 0.01906885
##
## $num_discordant
## [1] 77
##
## $tie_rate
## [1] 0
##
## $num_tied
## [1] 0
# Concordance, discordance and tie summary for iteration9, computed with the
# Concordance() helper defined above.
Concordance(iteration9)
## $concordance
## [1] 0.9806835
##
## $num_concordant
## [1] 3960
##
## $discordance
## [1] 0.01931649
##
## $num_discordant
## [1] 78
##
## $tie_rate
## [1] 0
##
## $num_tied
## [1] 0
# Concordance, discordance and tie summary for iteration10, computed with the
# Concordance() helper defined above.
Concordance(iteration10)
## $concordance
## [1] 0.9551758
##
## $num_concordant
## [1] 3857
##
## $discordance
## [1] 0.04482417
##
## $num_discordant
## [1] 181
##
## $tie_rate
## [1] 0
##
## $num_tied
## [1] 0
# Concordance, discordance and tie summary for iteration11, computed with the
# Concordance() helper defined above.
Concordance(iteration11)
## $concordance
## [1] 0.957157
##
## $num_concordant
## [1] 3865
##
## $discordance
## [1] 0.04284299
##
## $num_discordant
## [1] 173
##
## $tie_rate
## [1] 0
##
## $num_tied
## [1] 0
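The best model is iteration 11: it shows no multicollinearity (all VIFs are below 5), it has the lower AIC of the two models without multicollinearity (17492 versus 18279 for iteration 10), it achieves very good concordance, and the errors are normally distributed.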
# Selected model: refit of iteration11. The formula is identical to the
# iteration11 fit earlier in this file.
iteration11 <- glm(
  Term.Deposit ~ retired + student + Marital + Credit.default + Contact.Type +
    mar + apr + may + jun + jul + aug + sep + oct + Wednesday + Duration +
    Campaign + cons.conf.idx + previous,
  data = train_data,
  family = binomial(logit)
)
summary(iteration11)
##
## Call:
## glm(formula = Term.Deposit ~ retired + student + Marital + Credit.default +
## Contact.Type + mar + apr + may + jun + jul + aug + sep +
## oct + Wednesday + Duration + Campaign + cons.conf.idx + previous,
## family = binomial(logit), data = train_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.6387 -0.3363 -0.2341 -0.1640 3.4295
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.764e+00 2.574e-01 6.853 7.24e-12 ***
## retired 7.058e-01 8.165e-02 8.645 < 2e-16 ***
## student 6.325e-01 9.669e-02 6.542 6.08e-11 ***
## Marital -2.155e-01 4.524e-02 -4.764 1.90e-06 ***
## Credit.default -6.039e-01 6.346e-02 -9.517 < 2e-16 ***
## Contact.Type -1.843e+00 7.233e-02 -25.474 < 2e-16 ***
## mar 2.988e+00 1.281e-01 23.326 < 2e-16 ***
## apr 1.575e+00 9.394e-02 16.770 < 2e-16 ***
## may 4.579e-01 8.261e-02 5.543 2.97e-08 ***
## jun 1.604e+00 9.352e-02 17.152 < 2e-16 ***
## jul -1.831e-01 8.668e-02 -2.112 0.034647 *
## aug -5.646e-01 8.473e-02 -6.663 2.68e-11 ***
## sep 5.385e-01 1.616e-01 3.332 0.000862 ***
## oct 5.948e-01 1.395e-01 4.263 2.01e-05 ***
## Wednesday 9.707e-02 5.072e-02 1.914 0.055669 .
## Duration 4.385e-03 7.046e-05 62.243 < 2e-16 ***
## Campaign -5.596e-02 1.152e-02 -4.858 1.19e-06 ***
## cons.conf.idx 1.318e-01 6.292e-03 20.947 < 2e-16 ***
## previous 4.663e-01 3.735e-02 12.485 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 26173 on 39999 degrees of freedom
## Residual deviance: 17454 on 39981 degrees of freedom
## AIC: 17492
##
## Number of Fisher Scoring iterations: 6
# Variance inflation factor of each predictor in the selected model
# (multicollinearity check).
vif(iteration11)
## retired student Marital Credit.default Contact.Type
## 1.070323 1.080880 1.112779 1.058664 2.099328
## mar apr may jun jul
## 1.374010 2.399577 3.156431 2.530315 2.011731
## aug sep oct Wednesday Duration
## 2.234660 1.296207 1.688622 1.011577 1.134212
## Campaign cons.conf.idx previous
## 1.039015 3.154452 1.218036
# Sequential analysis-of-deviance table (chi-square tests) for iteration11,
# the selected model this section reports on. The call used to pass
# iteration9, which is a different model.
# NOTE(review): the table rendered below is still the iteration9 output (it
# lists emp.var.rate, cons.price.idx and euribor3m, which iteration11 does
# not contain); re-run this statement to regenerate it.
anova(object = iteration11, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Term.Deposit
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 39999 26173
## retired 1 203.2 39998 25970 < 2.2e-16 ***
## student 1 172.6 39997 25797 < 2.2e-16 ***
## Marital 1 91.8 39996 25706 < 2.2e-16 ***
## Credit.default 1 333.9 39995 25372 < 2.2e-16 ***
## Contact.Type 1 586.8 39994 24785 < 2.2e-16 ***
## mar 1 395.6 39993 24389 < 2.2e-16 ***
## apr 1 151.4 39992 24238 < 2.2e-16 ***
## may 1 28.9 39991 24209 7.712e-08 ***
## jun 1 180.0 39990 24029 < 2.2e-16 ***
## jul 1 79.9 39989 23949 < 2.2e-16 ***
## aug 1 113.4 39988 23836 < 2.2e-16 ***
## sep 1 71.9 39987 23764 < 2.2e-16 ***
## oct 1 273.8 39986 23490 < 2.2e-16 ***
## Wednesday 1 7.5 39985 23482 0.006215 **
## Duration 1 5198.6 39984 18284 < 2.2e-16 ***
## Campaign 1 45.6 39983 18238 1.439e-11 ***
## emp.var.rate 1 1483.0 39982 16755 < 2.2e-16 ***
## cons.price.idx 1 523.3 39981 16232 < 2.2e-16 ***
## cons.conf.idx 1 8.8 39980 16223 0.003016 **
## euribor3m 1 105.4 39979 16118 < 2.2e-16 ***
## previous 1 10.0 39978 16108 0.001524 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Concordance, discordance and tie summary for the selected model, computed
# with the Concordance() helper defined above.
Concordance(iteration11)
## $concordance
## [1] 0.957157
##
## $num_concordant
## [1] 3865
##
## $discordance
## [1] 0.04284299
##
## $num_discordant
## [1] 173
##
## $tie_rate
## [1] 0
##
## $num_tied
## [1] 0