Logistic Regression

Bank Dataset

Assignment 15

mydata <- read.csv("C:\\Users\\RISHI RAHUL\\Desktop\\DS\\4 Logistic\\Assignment\\bank_data.csv")

colnames(mydata)
##  [1] "age"             "default"         "balance"        
##  [4] "housing"         "loan"            "duration"       
##  [7] "campaign"        "pdays"           "previous"       
## [10] "poutfailure"     "poutother"       "poutsuccess"    
## [13] "poutunknown"     "con_cellular"    "con_telephone"  
## [16] "con_unknown"     "divorced"        "married"        
## [19] "single"          "joadmin."        "joblue.collar"  
## [22] "joentrepreneur"  "johousemaid"     "jomanagement"   
## [25] "joretired"       "joself.employed" "joservices"     
## [28] "jostudent"       "jotechnician"    "jounemployed"   
## [31] "jounknown"       "y"
summary(mydata)
##       age           default           balance          housing      
##  Min.   :18.00   Min.   :0.00000   Min.   : -8019   Min.   :0.0000  
##  1st Qu.:33.00   1st Qu.:0.00000   1st Qu.:    72   1st Qu.:0.0000  
##  Median :39.00   Median :0.00000   Median :   448   Median :1.0000  
##  Mean   :40.94   Mean   :0.01803   Mean   :  1362   Mean   :0.5558  
##  3rd Qu.:48.00   3rd Qu.:0.00000   3rd Qu.:  1428   3rd Qu.:1.0000  
##  Max.   :95.00   Max.   :1.00000   Max.   :102127   Max.   :1.0000  
##       loan           duration         campaign          pdays      
##  Min.   :0.0000   Min.   :   0.0   Min.   : 1.000   Min.   : -1.0  
##  1st Qu.:0.0000   1st Qu.: 103.0   1st Qu.: 1.000   1st Qu.: -1.0  
##  Median :0.0000   Median : 180.0   Median : 2.000   Median : -1.0  
##  Mean   :0.1602   Mean   : 258.2   Mean   : 2.764   Mean   : 40.2  
##  3rd Qu.:0.0000   3rd Qu.: 319.0   3rd Qu.: 3.000   3rd Qu.: -1.0  
##  Max.   :1.0000   Max.   :4918.0   Max.   :63.000   Max.   :871.0  
##     previous         poutfailure       poutother       poutsuccess     
##  Min.   :  0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:  0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :  0.0000   Median :0.0000   Median :0.0000   Median :0.00000  
##  Mean   :  0.5803   Mean   :0.1084   Mean   :0.0407   Mean   :0.03342  
##  3rd Qu.:  0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.00000  
##  Max.   :275.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000  
##   poutunknown      con_cellular    con_telephone      con_unknown   
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.00000   Min.   :0.000  
##  1st Qu.:1.0000   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.000  
##  Median :1.0000   Median :1.0000   Median :0.00000   Median :0.000  
##  Mean   :0.8175   Mean   :0.6477   Mean   :0.06428   Mean   :0.288  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:1.000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.00000   Max.   :1.000  
##     divorced         married           single          joadmin.     
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :1.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.1152   Mean   :0.6019   Mean   :0.2829   Mean   :0.1144  
##  3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  joblue.collar    joentrepreneur     johousemaid       jomanagement   
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.00000   Median :0.00000   Median :0.0000  
##  Mean   :0.2153   Mean   :0.03289   Mean   :0.02743   Mean   :0.2092  
##  3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.00000   Max.   :1.0000  
##    joretired       joself.employed     joservices        jostudent      
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.00000   Median :0.00000   Median :0.00000   Median :0.00000  
##  Mean   :0.05008   Mean   :0.03493   Mean   :0.09188   Mean   :0.02075  
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.00000   Max.   :1.00000  
##   jotechnician    jounemployed       jounknown             y        
##  Min.   :0.000   Min.   :0.00000   Min.   :0.00000   Min.   :0.000  
##  1st Qu.:0.000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.000  
##  Median :0.000   Median :0.00000   Median :0.00000   Median :0.000  
##  Mean   :0.168   Mean   :0.02882   Mean   :0.00637   Mean   :0.117  
##  3rd Qu.:0.000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.000  
##  Max.   :1.000   Max.   :1.00000   Max.   :1.00000   Max.   :1.000
dim(mydata)
## [1] 45211    32
model <- glm(y~.,data = mydata,family = binomial)
summary(model)
## 
## Call:
## glm(formula = y ~ ., family = binomial, data = mydata)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -5.6748  -0.4060  -0.2731  -0.1625   3.4400  
## 
## Coefficients: (4 not defined because of singularities)
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -3.855e+00  2.447e-01 -15.754  < 2e-16 ***
## age              3.567e-04  2.123e-03   0.168 0.866617    
## default         -1.822e-01  1.613e-01  -1.130 0.258582    
## balance          1.872e-05  4.848e-06   3.860 0.000113 ***
## housing         -7.756e-01  3.953e-02 -19.618  < 2e-16 ***
## loan            -5.720e-01  5.811e-02  -9.843  < 2e-16 ***
## duration         4.048e-03  6.264e-05  64.619  < 2e-16 ***
## campaign        -1.093e-01  9.905e-03 -11.037  < 2e-16 ***
## pdays            1.441e-04  3.006e-04   0.479 0.631590    
## previous         1.042e-02  6.435e-03   1.620 0.105281    
## poutfailure      2.563e-01  9.038e-02   2.835 0.004576 ** 
## poutother        4.985e-01  1.028e-01   4.849 1.24e-06 ***
## poutsuccess      2.565e+00  8.318e-02  30.836  < 2e-16 ***
## poutunknown             NA         NA      NA       NA    
## con_cellular     1.166e+00  5.762e-02  20.232  < 2e-16 ***
## con_telephone    1.067e+00  8.825e-02  12.093  < 2e-16 ***
## con_unknown             NA         NA      NA       NA    
## divorced        -1.762e-01  6.530e-02  -2.697 0.006986 ** 
## married         -3.394e-01  4.448e-02  -7.631 2.32e-14 ***
## single                  NA         NA      NA       NA    
## joadmin.         2.965e-01  2.263e-01   1.310 0.190041    
## joblue.collar   -1.671e-01  2.256e-01  -0.741 0.458886    
## joentrepreneur  -1.073e-01  2.463e-01  -0.436 0.663056    
## johousemaid     -2.776e-01  2.495e-01  -1.113 0.265815    
## jomanagement     2.380e-01  2.232e-01   1.067 0.286143    
## joretired        6.315e-01  2.301e-01   2.745 0.006052 ** 
## joself.employed  1.962e-02  2.392e-01   0.082 0.934620    
## joservices      -3.688e-02  2.301e-01  -0.160 0.872676    
## jostudent        8.087e-01  2.395e-01   3.377 0.000734 ***
## jotechnician     6.383e-02  2.247e-01   0.284 0.776342    
## jounemployed     8.167e-02  2.397e-01   0.341 0.733322    
## jounknown               NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 32631  on 45210  degrees of freedom
## Residual deviance: 22640  on 45183  degrees of freedom
## AIC: 22696
## 
## Number of Fisher Scoring iterations: 6
mydata <- mydata[,-c(13,16,19,31)]


model2 <- glm(y~.,data = mydata)
summary(model2)
## 
## Call:
## glm(formula = y ~ ., data = mydata)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.29345  -0.11582  -0.04883   0.01842   1.06743  
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -2.755e-02  1.776e-02  -1.552 0.120766    
## age              1.741e-04  1.570e-04   1.109 0.267383    
## default         -1.037e-02  9.753e-03  -1.063 0.287572    
## balance          1.959e-06  4.303e-07   4.552 5.33e-06 ***
## housing         -5.666e-02  2.844e-03 -19.925  < 2e-16 ***
## loan            -3.314e-02  3.565e-03  -9.296  < 2e-16 ***
## duration         4.733e-04  5.038e-06  93.953  < 2e-16 ***
## campaign        -2.083e-03  4.219e-04  -4.936 8.02e-07 ***
## pdays           -2.589e-05  2.726e-05  -0.950 0.342170    
## previous         1.213e-03  6.651e-04   1.824 0.068120 .  
## poutfailure      2.984e-02  8.145e-03   3.664 0.000248 ***
## poutother        5.788e-02  9.544e-03   6.064 1.34e-09 ***
## poutsuccess      4.753e-01  8.896e-03  53.426  < 2e-16 ***
## con_cellular     5.555e-02  3.137e-03  17.705  < 2e-16 ***
## con_telephone    4.941e-02  5.830e-03   8.474  < 2e-16 ***
## divorced        -1.531e-02  4.850e-03  -3.156 0.001601 ** 
## married         -2.668e-02  3.305e-03  -8.074 7.00e-16 ***
## joadmin.         2.297e-02  1.672e-02   1.374 0.169376    
## joblue.collar   -4.980e-03  1.652e-02  -0.301 0.763053    
## joentrepreneur  -3.181e-03  1.774e-02  -0.179 0.857741    
## johousemaid     -1.559e-02  1.796e-02  -0.868 0.385371    
## jomanagement     2.133e-02  1.649e-02   1.294 0.195809    
## joretired        6.599e-02  1.730e-02   3.814 0.000137 ***
## joself.employed  2.451e-03  1.764e-02   0.139 0.889468    
## joservices       1.654e-03  1.683e-02   0.098 0.921715    
## jostudent        1.132e-01  1.876e-02   6.034 1.61e-09 ***
## jotechnician     5.941e-03  1.656e-02   0.359 0.719819    
## jounemployed     1.554e-02  1.791e-02   0.868 0.385450    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.07511589)
## 
##     Null deviance: 4670.3  on 45210  degrees of freedom
## Residual deviance: 3394.0  on 45183  degrees of freedom
## AIC: 11294
## 
## Number of Fisher Scoring iterations: 2
library(MASS)
#stepAIC(model)

FinalModel <- glm(y ~ balance + housing + loan + duration + campaign + (1/sqrt(previous)) + poutfailure + poutother + poutsuccess + con_cellular + con_telephone + divorced + married + joadmin. + joblue.collar + johousemaid + jomanagement + joretired + jostudent, family = binomial,data = mydata)

summary(FinalModel)
## 
## Call:
## glm(formula = y ~ balance + housing + loan + duration + campaign + 
##     (1/sqrt(previous)) + poutfailure + poutother + poutsuccess + 
##     con_cellular + con_telephone + divorced + married + joadmin. + 
##     joblue.collar + johousemaid + jomanagement + joretired + 
##     jostudent, family = binomial, data = mydata)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -5.6659  -0.4061  -0.2734  -0.1628   3.4004  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -3.823e+00  7.525e-02 -50.801  < 2e-16 ***
## balance        1.903e-05  4.811e-06   3.954 7.67e-05 ***
## housing       -7.763e-01  3.862e-02 -20.101  < 2e-16 ***
## loan          -5.816e-01  5.782e-02 -10.058  < 2e-16 ***
## duration       4.047e-03  6.259e-05  64.649  < 2e-16 ***
## campaign      -1.088e-01  9.881e-03 -11.011  < 2e-16 ***
## poutfailure    3.206e-01  5.432e-02   5.901 3.60e-09 ***
## poutother      5.715e-01  7.554e-02   7.565 3.87e-14 ***
## poutsuccess    2.625e+00  6.370e-02  41.219  < 2e-16 ***
## con_cellular   1.170e+00  5.753e-02  20.330  < 2e-16 ***
## con_telephone  1.074e+00  8.756e-02  12.267  < 2e-16 ***
## divorced      -1.770e-01  6.116e-02  -2.895 0.003792 ** 
## married       -3.403e-01  4.053e-02  -8.396  < 2e-16 ***
## joadmin.       2.743e-01  5.870e-02   4.674 2.96e-06 ***
## joblue.collar -1.897e-01  5.562e-02  -3.410 0.000649 ***
## johousemaid   -3.006e-01  1.228e-01  -2.447 0.014395 *  
## jomanagement   2.149e-01  4.694e-02   4.579 4.68e-06 ***
## joretired      6.150e-01  7.148e-02   8.604  < 2e-16 ***
## jostudent      7.789e-01  9.494e-02   8.204 2.32e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 32631  on 45210  degrees of freedom
## Residual deviance: 22648  on 45192  degrees of freedom
## AIC: 22686
## 
## Number of Fisher Scoring iterations: 6
#vif(FinalModel)

exp(coef(FinalModel))
##   (Intercept)       balance       housing          loan      duration 
##    0.02186255    1.00001903    0.46012374    0.55901143    1.00405472 
##      campaign   poutfailure     poutother   poutsuccess  con_cellular 
##    0.89690644    1.37794412    1.77086670   13.81123294    3.22091301 
## con_telephone      divorced       married      joadmin. joblue.collar 
##    2.92712022    0.83774286    0.71154805    1.31565109    0.82721637 
##   johousemaid  jomanagement     joretired     jostudent 
##    0.74038523    1.23975840    1.84967292    2.17908489
prob <- predict(FinalModel,type=c("response"),mydata)
prob<-as.data.frame(prob)
final <- cbind(prob,mydata)
confusion <- table(prob > 0.5,mydata$y)
table(prob > 0.5)
## 
## FALSE  TRUE 
## 42598  2613
confusion
##        
##             0     1
##   FALSE 39017  3581
##   TRUE    905  1708
Accuracy <- sum(diag(confusion)/sum(confusion))
Accuracy
## [1] 0.9007764
#install.packages("ROCR")
library(ROCR)
## Warning: package 'ROCR' was built under R version 3.5.1
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.5.1
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
rocrpred<-prediction(prob,mydata$y)
rocrperf<-performance(rocrpred,'tpr','fpr')
plot(rocrperf,colorize=T,text.adj=c(-0.2,1.0))