Suppose we are interested in the factors that influence whether a political candidate wins an election. The outcome (response) variable is binary (0/1); win or lose. The predictor variables of interest are the amount of money spent on the campaign, the amount of time spent campaigning negatively and whether or not the candidate is an incumbent.

library(data.table)

election_data <- fread("C:/Users/Pawan Srivastav/Desktop/Data Science/Data Sets/Data Sets/Logistic Regression/election_data.csv")

#View(election_data)
setkey(election_data,`Election-id`)
summary(election_data)
##   Election-id        Result         Year        Amount Spent  
##  Min.   :122.0   Min.   :0.0   Min.   :32.00   Min.   :2.930  
##  1st Qu.:202.2   1st Qu.:0.0   1st Qu.:39.25   1st Qu.:3.618  
##  Median :362.5   Median :1.0   Median :43.00   Median :4.005  
##  Mean   :451.6   Mean   :0.6   Mean   :43.30   Mean   :4.229  
##  3rd Qu.:710.2   3rd Qu.:1.0   3rd Qu.:49.50   3rd Qu.:4.470  
##  Max.   :965.0   Max.   :1.0   Max.   :52.00   Max.   :6.320  
##  NA's   :1       NA's   :1     NA's   :1       NA's   :1      
##  Popularity Rank
##  Min.   :1.00   
##  1st Qu.:2.00   
##  Median :3.00   
##  Mean   :2.70   
##  3rd Qu.:3.75   
##  Max.   :4.00   
##  NA's   :1
colnames(election_data)
## [1] "Election-id"     "Result"          "Year"            "Amount Spent"   
## [5] "Popularity Rank"
plot(election_data)

attach(election_data)
election_response <- glm(Result ~ Year+`Amount Spent`+`Popularity Rank`, data = election_data)
summary(election_response)
## 
## Call:
## glm(formula = Result ~ Year + `Amount Spent` + `Popularity Rank`, 
##     data = election_data)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.36265  -0.15265  -0.09902   0.08992   0.55615  
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)  
## (Intercept)        0.65329    1.31682   0.496   0.6375  
## Year               0.01021    0.02151   0.475   0.6517  
## `Amount Spent`     0.07523    0.12208   0.616   0.5604  
## `Popularity Rank` -0.30137    0.13057  -2.308   0.0604 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.1432053)
## 
##     Null deviance: 2.40000  on 9  degrees of freedom
## Residual deviance: 0.85923  on 6  degrees of freedom
##   (1 observation deleted due to missingness)
## AIC: 13.836
## 
## Number of Fisher Scoring iterations: 2
# Residual Deviance is less than Null Deviance that's mean input variable are significance.

library(MASS)
stepAIC(election_response) # Checking best fit model
## Start:  AIC=13.84
## Result ~ Year + `Amount Spent` + `Popularity Rank`
## 
##                     Df Deviance    AIC
## - Year               1  0.89152 12.205
## - `Amount Spent`     1  0.91361 12.449
## <none>                  0.85923 13.836
## - `Popularity Rank`  1  1.62217 18.191
## 
## Step:  AIC=12.2
## Result ~ `Amount Spent` + `Popularity Rank`
## 
##                     Df Deviance    AIC
## - `Amount Spent`     1  0.94215 10.757
## <none>                  0.89152 12.205
## - `Popularity Rank`  1  2.18851 19.185
## 
## Step:  AIC=10.76
## Result ~ `Popularity Rank`
## 
##                     Df Deviance    AIC
## <none>                  0.94215 10.757
## - `Popularity Rank`  1  2.40000 18.108
## 
## Call:  glm(formula = Result ~ `Popularity Rank`, data = election_data)
## 
## Coefficients:
##       (Intercept)  `Popularity Rank`  
##            1.5372            -0.3471  
## 
## Degrees of Freedom: 9 Total (i.e. Null);  8 Residual
##   (1 observation deleted due to missingness)
## Null Deviance:       2.4 
## Residual Deviance: 0.9421    AIC: 10.76
library(car)
## Loading required package: carData
vif(election_response)
##              Year    `Amount Spent` `Popularity Rank` 
##          1.389879          1.043188          1.440479
exp(coef(election_response))
##       (Intercept)              Year    `Amount Spent` `Popularity Rank` 
##         1.9218592         1.0102668         1.0781268         0.7398019
# Creating COnfusion matrix to check the accuracy

prob <- as.data.frame(predict(election_response, type = c("response"), election_data))

final <- cbind(election_data,prob)

confusion <- table(prob>0.5, election_data$Result)
table(prob>0.5)
## 
## FALSE  TRUE 
##     5     5
confusion
##        
##         0 1
##   FALSE 4 1
##   TRUE  0 5
Accuracy <- sum(diag(confusion)/sum(confusion))

Accuracy
## [1] 0.9
# So My model accuracy is 90% so I can consider this model

Output variable -> y

y -> Whether the client has subscribed a term deposit or not

Binomial (“yes” or “no”)

library(data.table)

bank_data <- fread("C:/Users/Pawan Srivastav/Desktop/Data Science/Data Sets/Data Sets/Logistic Regression/bank_data.csv")
#View(bank_data)
summary(bank_data)
##       age           default           balance          housing      
##  Min.   :18.00   Min.   :0.00000   Min.   : -8019   Min.   :0.0000  
##  1st Qu.:33.00   1st Qu.:0.00000   1st Qu.:    72   1st Qu.:0.0000  
##  Median :39.00   Median :0.00000   Median :   448   Median :1.0000  
##  Mean   :40.94   Mean   :0.01803   Mean   :  1362   Mean   :0.5558  
##  3rd Qu.:48.00   3rd Qu.:0.00000   3rd Qu.:  1428   3rd Qu.:1.0000  
##  Max.   :95.00   Max.   :1.00000   Max.   :102127   Max.   :1.0000  
##       loan           duration         campaign          pdays      
##  Min.   :0.0000   Min.   :   0.0   Min.   : 1.000   Min.   : -1.0  
##  1st Qu.:0.0000   1st Qu.: 103.0   1st Qu.: 1.000   1st Qu.: -1.0  
##  Median :0.0000   Median : 180.0   Median : 2.000   Median : -1.0  
##  Mean   :0.1602   Mean   : 258.2   Mean   : 2.764   Mean   : 40.2  
##  3rd Qu.:0.0000   3rd Qu.: 319.0   3rd Qu.: 3.000   3rd Qu.: -1.0  
##  Max.   :1.0000   Max.   :4918.0   Max.   :63.000   Max.   :871.0  
##     previous         poutfailure       poutother       poutsuccess     
##  Min.   :  0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:  0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :  0.0000   Median :0.0000   Median :0.0000   Median :0.00000  
##  Mean   :  0.5803   Mean   :0.1084   Mean   :0.0407   Mean   :0.03342  
##  3rd Qu.:  0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.00000  
##  Max.   :275.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000  
##   poutunknown      con_cellular    con_telephone      con_unknown   
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.00000   Min.   :0.000  
##  1st Qu.:1.0000   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.000  
##  Median :1.0000   Median :1.0000   Median :0.00000   Median :0.000  
##  Mean   :0.8175   Mean   :0.6477   Mean   :0.06428   Mean   :0.288  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:1.000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.00000   Max.   :1.000  
##     divorced         married           single          joadmin.     
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :1.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.1152   Mean   :0.6019   Mean   :0.2829   Mean   :0.1144  
##  3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  joblue.collar    joentrepreneur     johousemaid       jomanagement   
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.00000   Median :0.00000   Median :0.0000  
##  Mean   :0.2153   Mean   :0.03289   Mean   :0.02743   Mean   :0.2092  
##  3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.00000   Max.   :1.0000  
##    joretired       joself.employed     joservices        jostudent      
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.00000   Median :0.00000   Median :0.00000   Median :0.00000  
##  Mean   :0.05008   Mean   :0.03493   Mean   :0.09188   Mean   :0.02075  
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.00000   Max.   :1.00000  
##   jotechnician    jounemployed       jounknown             y        
##  Min.   :0.000   Min.   :0.00000   Min.   :0.00000   Min.   :0.000  
##  1st Qu.:0.000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.000  
##  Median :0.000   Median :0.00000   Median :0.00000   Median :0.000  
##  Mean   :0.168   Mean   :0.02882   Mean   :0.00637   Mean   :0.117  
##  3rd Qu.:0.000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.000  
##  Max.   :1.000   Max.   :1.00000   Max.   :1.00000   Max.   :1.000
str(bank_data)
## Classes 'data.table' and 'data.frame':   45211 obs. of  32 variables:
##  $ age            : int  58 44 33 47 33 35 28 42 58 43 ...
##  $ default        : int  0 0 0 0 0 0 0 1 0 0 ...
##  $ balance        : int  2143 29 2 1506 1 231 447 2 121 593 ...
##  $ housing        : int  1 1 1 1 0 1 1 1 1 1 ...
##  $ loan           : int  0 0 1 0 0 0 1 0 0 0 ...
##  $ duration       : int  261 151 76 92 198 139 217 380 50 55 ...
##  $ campaign       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ pdays          : int  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ previous       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ poutfailure    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ poutother      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ poutsuccess    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ poutunknown    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ con_cellular   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ con_telephone  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ con_unknown    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ divorced       : int  0 0 0 0 0 0 0 1 0 0 ...
##  $ married        : int  1 0 1 1 0 1 0 0 1 0 ...
##  $ single         : int  0 1 0 0 1 0 1 0 0 1 ...
##  $ joadmin.       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ joblue.collar  : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ joentrepreneur : int  0 0 1 0 0 0 0 1 0 0 ...
##  $ johousemaid    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ jomanagement   : int  1 0 0 0 0 1 1 0 0 0 ...
##  $ joretired      : int  0 0 0 0 0 0 0 0 1 0 ...
##  $ joself.employed: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ joservices     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ jostudent      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ jotechnician   : int  0 1 0 0 0 0 0 0 0 1 ...
##  $ jounemployed   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ jounknown      : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ y              : int  0 0 0 0 0 0 0 0 0 0 ...
##  - attr(*, ".internal.selfref")=<externalptr>
attach(bank_data)
## The following object is masked from package:MASS:
## 
##     housing
y_model <- glm(y ~ age +balance + duration + campaign + pdays + previous + factor(default) + factor(housing) + factor(loan)
               + factor(poutfailure) + factor(poutother) + factor(poutsuccess) + factor(poutunknown)
               + factor(con_cellular) + factor(con_telephone) + factor(con_unknown) + factor(divorced)
               + factor(married) + factor(single) + factor(joadmin.) + factor(joblue.collar) + factor(joentrepreneur)
               + factor(johousemaid) + factor(jomanagement) + factor(joretired) + factor(joself.employed) + factor(joservices)
               + factor(jostudent) + factor(jotechnician) + factor(jounemployed) + factor(jounknown), data = bank_data)
summary(y_model)
## 
## Call:
## glm(formula = y ~ age + balance + duration + campaign + pdays + 
##     previous + factor(default) + factor(housing) + factor(loan) + 
##     factor(poutfailure) + factor(poutother) + factor(poutsuccess) + 
##     factor(poutunknown) + factor(con_cellular) + factor(con_telephone) + 
##     factor(con_unknown) + factor(divorced) + factor(married) + 
##     factor(single) + factor(joadmin.) + factor(joblue.collar) + 
##     factor(joentrepreneur) + factor(johousemaid) + factor(jomanagement) + 
##     factor(joretired) + factor(joself.employed) + factor(joservices) + 
##     factor(jostudent) + factor(jotechnician) + factor(jounemployed) + 
##     factor(jounknown), data = bank_data)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.29345  -0.11582  -0.04883   0.01842   1.06743  
## 
## Coefficients: (4 not defined because of singularities)
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -2.755e-02  1.776e-02  -1.552 0.120766    
## age                       1.741e-04  1.570e-04   1.109 0.267383    
## balance                   1.959e-06  4.303e-07   4.552 5.33e-06 ***
## duration                  4.733e-04  5.038e-06  93.953  < 2e-16 ***
## campaign                 -2.083e-03  4.219e-04  -4.936 8.02e-07 ***
## pdays                    -2.589e-05  2.726e-05  -0.950 0.342170    
## previous                  1.213e-03  6.651e-04   1.824 0.068120 .  
## factor(default)1         -1.037e-02  9.753e-03  -1.063 0.287572    
## factor(housing)1         -5.666e-02  2.844e-03 -19.925  < 2e-16 ***
## factor(loan)1            -3.314e-02  3.565e-03  -9.296  < 2e-16 ***
## factor(poutfailure)1      2.984e-02  8.145e-03   3.664 0.000248 ***
## factor(poutother)1        5.788e-02  9.544e-03   6.064 1.34e-09 ***
## factor(poutsuccess)1      4.753e-01  8.896e-03  53.426  < 2e-16 ***
## factor(poutunknown)1             NA         NA      NA       NA    
## factor(con_cellular)1     5.555e-02  3.137e-03  17.705  < 2e-16 ***
## factor(con_telephone)1    4.941e-02  5.830e-03   8.474  < 2e-16 ***
## factor(con_unknown)1             NA         NA      NA       NA    
## factor(divorced)1        -1.531e-02  4.850e-03  -3.156 0.001601 ** 
## factor(married)1         -2.668e-02  3.305e-03  -8.074 7.00e-16 ***
## factor(single)1                  NA         NA      NA       NA    
## factor(joadmin.)1         2.297e-02  1.672e-02   1.374 0.169376    
## factor(joblue.collar)1   -4.980e-03  1.652e-02  -0.301 0.763053    
## factor(joentrepreneur)1  -3.181e-03  1.774e-02  -0.179 0.857741    
## factor(johousemaid)1     -1.559e-02  1.796e-02  -0.868 0.385371    
## factor(jomanagement)1     2.133e-02  1.649e-02   1.294 0.195809    
## factor(joretired)1        6.599e-02  1.730e-02   3.814 0.000137 ***
## factor(joself.employed)1  2.451e-03  1.764e-02   0.139 0.889468    
## factor(joservices)1       1.654e-03  1.683e-02   0.098 0.921715    
## factor(jostudent)1        1.132e-01  1.876e-02   6.034 1.61e-09 ***
## factor(jotechnician)1     5.941e-03  1.656e-02   0.359 0.719819    
## factor(jounemployed)1     1.554e-02  1.791e-02   0.868 0.385450    
## factor(jounknown)1               NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.07511589)
## 
##     Null deviance: 4670.3  on 45210  degrees of freedom
## Residual deviance: 3394.0  on 45183  degrees of freedom
## AIC: 11294
## 
## Number of Fisher Scoring iterations: 2
library(MASS)
library(car)

stepAIC(y_model)
## Start:  AIC=11294.49
## y ~ age + balance + duration + campaign + pdays + previous + 
##     factor(default) + factor(housing) + factor(loan) + factor(poutfailure) + 
##     factor(poutother) + factor(poutsuccess) + factor(poutunknown) + 
##     factor(con_cellular) + factor(con_telephone) + factor(con_unknown) + 
##     factor(divorced) + factor(married) + factor(single) + factor(joadmin.) + 
##     factor(joblue.collar) + factor(joentrepreneur) + factor(johousemaid) + 
##     factor(jomanagement) + factor(joretired) + factor(joself.employed) + 
##     factor(joservices) + factor(jostudent) + factor(jotechnician) + 
##     factor(jounemployed) + factor(jounknown)
## 
## 
## Step:  AIC=11294.49
## y ~ age + balance + duration + campaign + pdays + previous + 
##     factor(default) + factor(housing) + factor(loan) + factor(poutfailure) + 
##     factor(poutother) + factor(poutsuccess) + factor(poutunknown) + 
##     factor(con_cellular) + factor(con_telephone) + factor(con_unknown) + 
##     factor(divorced) + factor(married) + factor(single) + factor(joadmin.) + 
##     factor(joblue.collar) + factor(joentrepreneur) + factor(johousemaid) + 
##     factor(jomanagement) + factor(joretired) + factor(joself.employed) + 
##     factor(joservices) + factor(jostudent) + factor(jotechnician) + 
##     factor(jounemployed)
## 
## 
## Step:  AIC=11294.49
## y ~ age + balance + duration + campaign + pdays + previous + 
##     factor(default) + factor(housing) + factor(loan) + factor(poutfailure) + 
##     factor(poutother) + factor(poutsuccess) + factor(poutunknown) + 
##     factor(con_cellular) + factor(con_telephone) + factor(con_unknown) + 
##     factor(divorced) + factor(married) + factor(joadmin.) + factor(joblue.collar) + 
##     factor(joentrepreneur) + factor(johousemaid) + factor(jomanagement) + 
##     factor(joretired) + factor(joself.employed) + factor(joservices) + 
##     factor(jostudent) + factor(jotechnician) + factor(jounemployed)
## 
## 
## Step:  AIC=11294.49
## y ~ age + balance + duration + campaign + pdays + previous + 
##     factor(default) + factor(housing) + factor(loan) + factor(poutfailure) + 
##     factor(poutother) + factor(poutsuccess) + factor(poutunknown) + 
##     factor(con_cellular) + factor(con_telephone) + factor(divorced) + 
##     factor(married) + factor(joadmin.) + factor(joblue.collar) + 
##     factor(joentrepreneur) + factor(johousemaid) + factor(jomanagement) + 
##     factor(joretired) + factor(joself.employed) + factor(joservices) + 
##     factor(jostudent) + factor(jotechnician) + factor(jounemployed)
## 
## 
## Step:  AIC=11294.49
## y ~ age + balance + duration + campaign + pdays + previous + 
##     factor(default) + factor(housing) + factor(loan) + factor(poutfailure) + 
##     factor(poutother) + factor(poutsuccess) + factor(con_cellular) + 
##     factor(con_telephone) + factor(divorced) + factor(married) + 
##     factor(joadmin.) + factor(joblue.collar) + factor(joentrepreneur) + 
##     factor(johousemaid) + factor(jomanagement) + factor(joretired) + 
##     factor(joself.employed) + factor(joservices) + factor(jostudent) + 
##     factor(jotechnician) + factor(jounemployed)
## 
##                           Df Deviance   AIC
## - factor(joservices)       1   3394.0 11292
## - factor(joself.employed)  1   3394.0 11292
## - factor(joentrepreneur)   1   3394.0 11292
## - factor(joblue.collar)    1   3394.0 11293
## - factor(jotechnician)     1   3394.0 11293
## - factor(jounemployed)     1   3394.0 11293
## - factor(johousemaid)      1   3394.0 11293
## - pdays                    1   3394.0 11293
## - factor(default)          1   3394.0 11294
## - age                      1   3394.1 11294
## - factor(jomanagement)     1   3394.1 11294
## - factor(joadmin.)         1   3394.1 11294
## <none>                         3394.0 11294
## - previous                 1   3394.2 11296
## - factor(divorced)         1   3394.7 11302
## - factor(poutfailure)      1   3395.0 11306
## - factor(joretired)        1   3395.1 11307
## - balance                  1   3395.5 11313
## - campaign                 1   3395.8 11317
## - factor(jostudent)        1   3396.7 11329
## - factor(poutother)        1   3396.7 11329
## - factor(married)          1   3398.9 11358
## - factor(con_telephone)    1   3399.4 11364
## - factor(loan)             1   3400.5 11379
## - factor(con_cellular)     1   3417.5 11605
## - factor(housing)          1   3423.8 11688
## - factor(poutsuccess)      1   3608.4 14062
## - duration                 1   4057.0 19361
## 
## Step:  AIC=11292.5
## y ~ age + balance + duration + campaign + pdays + previous + 
##     factor(default) + factor(housing) + factor(loan) + factor(poutfailure) + 
##     factor(poutother) + factor(poutsuccess) + factor(con_cellular) + 
##     factor(con_telephone) + factor(divorced) + factor(married) + 
##     factor(joadmin.) + factor(joblue.collar) + factor(joentrepreneur) + 
##     factor(johousemaid) + factor(jomanagement) + factor(joretired) + 
##     factor(joself.employed) + factor(jostudent) + factor(jotechnician) + 
##     factor(jounemployed)
## 
##                           Df Deviance   AIC
## - factor(joself.employed)  1   3394.0 11290
## - factor(joentrepreneur)   1   3394.0 11291
## - factor(jotechnician)     1   3394.0 11291
## - pdays                    1   3394.0 11291
## - factor(default)          1   3394.0 11292
## - age                      1   3394.1 11292
## - factor(joblue.collar)    1   3394.1 11292
## <none>                         3394.0 11292
## - factor(jounemployed)     1   3394.2 11293
## - previous                 1   3394.2 11294
## - factor(johousemaid)      1   3394.2 11294
## - factor(divorced)         1   3394.7 11300
## - factor(poutfailure)      1   3395.0 11304
## - factor(joadmin.)         1   3395.1 11305
## - factor(jomanagement)     1   3395.1 11306
## - balance                  1   3395.5 11311
## - campaign                 1   3395.8 11315
## - factor(poutother)        1   3396.7 11327
## - factor(married)          1   3398.9 11356
## - factor(joretired)        1   3399.1 11359
## - factor(con_telephone)    1   3399.4 11362
## - factor(loan)             1   3400.5 11377
## - factor(jostudent)        1   3402.9 11410
## - factor(con_cellular)     1   3417.5 11603
## - factor(housing)          1   3424.0 11688
## - factor(poutsuccess)      1   3608.4 14060
## - duration                 1   4057.0 19359
## 
## Step:  AIC=11290.51
## y ~ age + balance + duration + campaign + pdays + previous + 
##     factor(default) + factor(housing) + factor(loan) + factor(poutfailure) + 
##     factor(poutother) + factor(poutsuccess) + factor(con_cellular) + 
##     factor(con_telephone) + factor(divorced) + factor(married) + 
##     factor(joadmin.) + factor(joblue.collar) + factor(joentrepreneur) + 
##     factor(johousemaid) + factor(jomanagement) + factor(joretired) + 
##     factor(jostudent) + factor(jotechnician) + factor(jounemployed)
## 
##                          Df Deviance   AIC
## - factor(joentrepreneur)  1   3394.0 11289
## - factor(jotechnician)    1   3394.0 11289
## - pdays                   1   3394.0 11289
## - factor(default)         1   3394.0 11290
## - age                     1   3394.1 11290
## <none>                        3394.0 11290
## - factor(joblue.collar)   1   3394.1 11291
## - factor(jounemployed)    1   3394.2 11291
## - previous                1   3394.2 11292
## - factor(johousemaid)     1   3394.3 11293
## - factor(divorced)        1   3394.7 11298
## - factor(poutfailure)     1   3395.0 11302
## - factor(joadmin.)        1   3395.2 11305
## - factor(jomanagement)    1   3395.3 11307
## - balance                 1   3395.5 11309
## - campaign                1   3395.8 11313
## - factor(poutother)       1   3396.7 11325
## - factor(married)         1   3398.9 11354
## - factor(con_telephone)   1   3399.4 11360
## - factor(joretired)       1   3399.5 11362
## - factor(loan)            1   3400.5 11375
## - factor(jostudent)       1   3403.3 11413
## - factor(con_cellular)    1   3417.6 11602
## - factor(housing)         1   3424.0 11687
## - factor(poutsuccess)     1   3608.4 14059
## - duration                1   4057.1 19357
## 
## Step:  AIC=11288.9
## y ~ age + balance + duration + campaign + pdays + previous + 
##     factor(default) + factor(housing) + factor(loan) + factor(poutfailure) + 
##     factor(poutother) + factor(poutsuccess) + factor(con_cellular) + 
##     factor(con_telephone) + factor(divorced) + factor(married) + 
##     factor(joadmin.) + factor(joblue.collar) + factor(johousemaid) + 
##     factor(jomanagement) + factor(joretired) + factor(jostudent) + 
##     factor(jotechnician) + factor(jounemployed)
## 
##                         Df Deviance   AIC
## - pdays                  1   3394.1 11288
## - factor(default)        1   3394.1 11288
## - age                    1   3394.1 11288
## - factor(jotechnician)   1   3394.1 11288
## - factor(joblue.collar)  1   3394.1 11289
## <none>                       3394.0 11289
## - factor(jounemployed)   1   3394.2 11290
## - previous               1   3394.2 11290
## - factor(johousemaid)    1   3394.3 11291
## - factor(divorced)       1   3394.7 11297
## - factor(poutfailure)    1   3395.0 11300
## - factor(joadmin.)       1   3395.5 11307
## - balance                1   3395.5 11308
## - factor(jomanagement)   1   3395.7 11310
## - campaign               1   3395.8 11311
## - factor(poutother)      1   3396.8 11324
## - factor(married)        1   3398.9 11352
## - factor(con_telephone)  1   3399.4 11359
## - factor(joretired)      1   3400.0 11367
## - factor(loan)           1   3400.5 11374
## - factor(jostudent)      1   3403.7 11416
## - factor(con_cellular)   1   3417.6 11600
## - factor(housing)        1   3424.0 11686
## - factor(poutsuccess)    1   3608.5 14057
## - duration               1   4057.1 19355
## 
## Step:  AIC=11287.8
## y ~ age + balance + duration + campaign + previous + factor(default) + 
##     factor(housing) + factor(loan) + factor(poutfailure) + factor(poutother) + 
##     factor(poutsuccess) + factor(con_cellular) + factor(con_telephone) + 
##     factor(divorced) + factor(married) + factor(joadmin.) + factor(joblue.collar) + 
##     factor(johousemaid) + factor(jomanagement) + factor(joretired) + 
##     factor(jostudent) + factor(jotechnician) + factor(jounemployed)
## 
##                         Df Deviance   AIC
## - factor(default)        1   3394.1 11287
## - age                    1   3394.2 11287
## - factor(jotechnician)   1   3394.2 11287
## - factor(joblue.collar)  1   3394.2 11288
## <none>                       3394.1 11288
## - factor(jounemployed)   1   3394.3 11289
## - previous               1   3394.3 11289
## - factor(johousemaid)    1   3394.3 11290
## - factor(divorced)       1   3394.8 11296
## - factor(joadmin.)       1   3395.6 11306
## - balance                1   3395.6 11307
## - factor(jomanagement)   1   3395.8 11309
## - factor(poutfailure)    1   3395.9 11310
## - campaign               1   3395.9 11310
## - factor(poutother)      1   3398.0 11338
## - factor(married)        1   3399.0 11351
## - factor(con_telephone)  1   3399.4 11357
## - factor(joretired)      1   3400.1 11366
## - factor(loan)           1   3400.6 11372
## - factor(jostudent)      1   3403.8 11415
## - factor(con_cellular)   1   3417.6 11598
## - factor(housing)        1   3424.6 11691
## - factor(poutsuccess)    1   3681.4 14960
## - duration               1   4057.2 19354
## 
## Step:  AIC=11286.97
## y ~ age + balance + duration + campaign + previous + factor(housing) + 
##     factor(loan) + factor(poutfailure) + factor(poutother) + 
##     factor(poutsuccess) + factor(con_cellular) + factor(con_telephone) + 
##     factor(divorced) + factor(married) + factor(joadmin.) + factor(joblue.collar) + 
##     factor(johousemaid) + factor(jomanagement) + factor(joretired) + 
##     factor(jostudent) + factor(jotechnician) + factor(jounemployed)
## 
##                         Df Deviance   AIC
## - age                    1   3394.2 11286
## - factor(jotechnician)   1   3394.3 11286
## - factor(joblue.collar)  1   3394.3 11287
## <none>                       3394.1 11287
## - factor(jounemployed)   1   3394.4 11288
## - previous               1   3394.4 11288
## - factor(johousemaid)    1   3394.4 11289
## - factor(divorced)       1   3394.9 11295
## - factor(joadmin.)       1   3395.7 11305
## - balance                1   3395.8 11307
## - factor(jomanagement)   1   3395.9 11308
## - factor(poutfailure)    1   3396.0 11310
## - campaign               1   3396.0 11310
## - factor(poutother)      1   3398.1 11338
## - factor(married)        1   3399.0 11350
## - factor(con_telephone)  1   3399.6 11357
## - factor(joretired)      1   3400.2 11366
## - factor(loan)           1   3400.8 11374
## - factor(jostudent)      1   3403.9 11415
## - factor(con_cellular)   1   3417.7 11598
## - factor(housing)        1   3424.7 11690
## - factor(poutsuccess)    1   3681.8 14963
## - duration               1   4057.4 19355
## 
## Step:  AIC=11286.21
## y ~ balance + duration + campaign + previous + factor(housing) + 
##     factor(loan) + factor(poutfailure) + factor(poutother) + 
##     factor(poutsuccess) + factor(con_cellular) + factor(con_telephone) + 
##     factor(divorced) + factor(married) + factor(joadmin.) + factor(joblue.collar) + 
##     factor(johousemaid) + factor(jomanagement) + factor(joretired) + 
##     factor(jostudent) + factor(jotechnician) + factor(jounemployed)
## 
##                         Df Deviance   AIC
## - factor(jotechnician)   1   3394.3 11286
## - factor(joblue.collar)  1   3394.4 11286
## <none>                       3394.2 11286
## - factor(jounemployed)   1   3394.5 11287
## - factor(johousemaid)    1   3394.5 11288
## - previous               1   3394.5 11288
## - factor(divorced)       1   3394.9 11293
## - factor(joadmin.)       1   3395.7 11304
## - balance                1   3395.9 11307
## - factor(jomanagement)   1   3396.0 11308
## - campaign               1   3396.1 11309
## - factor(poutfailure)    1   3396.1 11309
## - factor(poutother)      1   3398.2 11337
## - factor(married)        1   3399.4 11353
## - factor(con_telephone)  1   3399.9 11359
## - factor(loan)           1   3400.9 11373
## - factor(joretired)      1   3402.0 11388
## - factor(jostudent)      1   3403.9 11413
## - factor(con_cellular)   1   3417.7 11596
## - factor(housing)        1   3426.1 11706
## - factor(poutsuccess)    1   3682.3 14967
## - duration               1   4057.4 19353
## 
## Step:  AIC=11285.57
## y ~ balance + duration + campaign + previous + factor(housing) + 
##     factor(loan) + factor(poutfailure) + factor(poutother) + 
##     factor(poutsuccess) + factor(con_cellular) + factor(con_telephone) + 
##     factor(divorced) + factor(married) + factor(joadmin.) + factor(joblue.collar) + 
##     factor(johousemaid) + factor(jomanagement) + factor(joretired) + 
##     factor(jostudent) + factor(jounemployed)
## 
##                         Df Deviance   AIC
## <none>                       3394.3 11286
## - factor(jounemployed)   1   3394.5 11286
## - previous               1   3394.6 11287
## - factor(johousemaid)    1   3394.7 11288
## - factor(joblue.collar)  1   3394.8 11289
## - factor(divorced)       1   3395.0 11293
## - factor(joadmin.)       1   3395.8 11303
## - balance                1   3396.0 11306
## - campaign               1   3396.2 11308
## - factor(poutfailure)    1   3396.2 11308
## - factor(jomanagement)   1   3396.2 11309
## - factor(poutother)      1   3398.3 11336
## - factor(married)        1   3399.6 11353
## - factor(con_telephone)  1   3400.0 11358
## - factor(loan)           1   3401.1 11373
## - factor(joretired)      1   3402.4 11391
## - factor(jostudent)      1   3404.1 11414
## - factor(con_cellular)   1   3418.0 11598
## - factor(housing)        1   3426.3 11707
## - factor(poutsuccess)    1   3682.4 14967
## - duration               1   4057.4 19351
## 
## Call:  glm(formula = y ~ balance + duration + campaign + previous + 
##     factor(housing) + factor(loan) + factor(poutfailure) + factor(poutother) + 
##     factor(poutsuccess) + factor(con_cellular) + factor(con_telephone) + 
##     factor(divorced) + factor(married) + factor(joadmin.) + factor(joblue.collar) + 
##     factor(johousemaid) + factor(jomanagement) + factor(joretired) + 
##     factor(jostudent) + factor(jounemployed), data = bank_data)
## 
## Coefficients:
##            (Intercept)                 balance                duration  
##             -1.793e-02               2.030e-06               4.733e-04  
##               campaign                previous        factor(housing)1  
##             -2.085e-03               1.231e-03              -5.741e-02  
##          factor(loan)1    factor(poutfailure)1      factor(poutother)1  
##             -3.358e-02               2.377e-02               5.201e-02  
##   factor(poutsuccess)1   factor(con_cellular)1  factor(con_telephone)1  
##              4.713e-01               5.554e-02               5.011e-02  
##      factor(divorced)1        factor(married)1       factor(joadmin.)1  
##             -1.379e-02              -2.550e-02               1.958e-02  
## factor(joblue.collar)1    factor(johousemaid)1   factor(jomanagement)1  
##             -8.475e-03              -1.816e-02               1.807e-02  
##     factor(joretired)1      factor(jostudent)1   factor(jounemployed)1  
##              6.591e-02               1.083e-01               1.215e-02  
## 
## Degrees of Freedom: 45210 Total (i.e. Null);  45190 Residual
## Null Deviance:       4670 
## Residual Deviance: 3394  AIC: 11290
prob_y <- as.data.frame(predict(y_model, type = c("response"), bank_data))
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
final_y <- cbind(bank_data, prob_y)

confusion_y <- table(prob_y>0.5, bank_data$y)

table(prob_y>0.5)
## 
## FALSE  TRUE 
## 42986  2225
confusion_y
##        
##             0     1
##   FALSE 39150  3836
##   TRUE    772  1453
accuracy_y <- sum(diag(confusion_y)/sum(confusion_y))

accuracy_y
## [1] 0.8980779
# So My model accuracy is 89.8% so I can consider this model

I have a dataset containing family information of married couples, which have around 10 variables & 600+ observations. Independent variables are ~ gender, age, years married, children, religion etc. I have one response variable which is number of extra marital affairs. Now, I want to know what all factor influence the chances of extra marital affair.

Since extra marital affair is a binary variable (either a person will have or not), so we can fit logistic regression model here to predict the probability of extra marital affair.

library(data.table)

# install.packages('AER')
data(Affairs,package="AER")
summary(Affairs)
##     affairs          gender         age         yearsmarried    children 
##  Min.   : 0.000   female:315   Min.   :17.50   Min.   : 0.125   no :171  
##  1st Qu.: 0.000   male  :286   1st Qu.:27.00   1st Qu.: 4.000   yes:430  
##  Median : 0.000                Median :32.00   Median : 7.000            
##  Mean   : 1.456                Mean   :32.49   Mean   : 8.178            
##  3rd Qu.: 0.000                3rd Qu.:37.00   3rd Qu.:15.000            
##  Max.   :12.000                Max.   :57.00   Max.   :15.000            
##  religiousness     education       occupation        rating     
##  Min.   :1.000   Min.   : 9.00   Min.   :1.000   Min.   :1.000  
##  1st Qu.:2.000   1st Qu.:14.00   1st Qu.:3.000   1st Qu.:3.000  
##  Median :3.000   Median :16.00   Median :5.000   Median :4.000  
##  Mean   :3.116   Mean   :16.17   Mean   :4.195   Mean   :3.932  
##  3rd Qu.:4.000   3rd Qu.:18.00   3rd Qu.:6.000   3rd Qu.:5.000  
##  Max.   :5.000   Max.   :20.00   Max.   :7.000   Max.   :5.000
str(Affairs)
## 'data.frame':    601 obs. of  9 variables:
##  $ affairs      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ gender       : Factor w/ 2 levels "female","male": 2 1 1 2 2 1 1 2 1 2 ...
##  $ age          : num  37 27 32 57 22 32 22 57 32 22 ...
##  $ yearsmarried : num  10 4 15 15 0.75 1.5 0.75 15 15 1.5 ...
##  $ children     : Factor w/ 2 levels "no","yes": 1 1 2 2 1 1 1 2 2 1 ...
##  $ religiousness: int  3 4 1 5 2 2 2 2 4 4 ...
##  $ education    : num  18 14 12 18 17 17 12 14 16 14 ...
##  $ occupation   : int  7 6 1 6 6 5 1 4 1 4 ...
##  $ rating       : int  4 4 4 5 3 5 3 4 2 5 ...
gender_F <- ifelse(Affairs$gender=="male",1,0)
Children_F <- ifelse(Affairs$children=="yes",1,0)

Affairs <- cbind(Affairs,gender_F,Children_F)
attach(Affairs)
## The following objects are masked _by_ .GlobalEnv:
## 
##     Children_F, gender_F
## The following object is masked from bank_data:
## 
##     age
colnames(Affairs)
##  [1] "affairs"       "gender"        "age"           "yearsmarried" 
##  [5] "children"      "religiousness" "education"     "occupation"   
##  [9] "rating"        "gender_F"      "Children_F"
y_model <- glm(affairs ~ age + yearsmarried + religiousness + education +
                 occupation + rating + factor(gender_F) + factor(Children_F), data= Affairs)
summary(y_model)
## 
## Call:
## glm(formula = affairs ~ age + yearsmarried + religiousness + 
##     education + occupation + rating + factor(gender_F) + factor(Children_F), 
##     data = Affairs)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -5.0503  -1.7226  -0.7947   0.2101  12.7036  
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          5.87201    1.13750   5.162 3.34e-07 ***
## age                 -0.05098    0.02262  -2.254   0.0246 *  
## yearsmarried         0.16947    0.04122   4.111 4.50e-05 ***
## religiousness       -0.47761    0.11173  -4.275 2.23e-05 ***
## education           -0.01375    0.06414  -0.214   0.8303    
## occupation           0.10492    0.08888   1.180   0.2383    
## rating              -0.71188    0.12001  -5.932 5.09e-09 ***
## factor(gender_F)1    0.05409    0.30049   0.180   0.8572    
## factor(Children_F)1 -0.14262    0.35020  -0.407   0.6840    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 9.575934)
## 
##     Null deviance: 6529.1  on 600  degrees of freedom
## Residual deviance: 5669.0  on 592  degrees of freedom
## AIC: 3074.3
## 
## Number of Fisher Scoring iterations: 2
library(MASS)
library(car)

stepAIC(y_model)
## Start:  AIC=3074.31
## affairs ~ age + yearsmarried + religiousness + education + occupation + 
##     rating + factor(gender_F) + factor(Children_F)
## 
##                      Df Deviance    AIC
## - factor(gender_F)    1   5669.3 3072.3
## - education           1   5669.4 3072.3
## - factor(Children_F)  1   5670.5 3072.5
## - occupation          1   5682.3 3073.7
## <none>                    5669.0 3074.3
## - age                 1   5717.6 3077.4
## - yearsmarried        1   5830.8 3089.2
## - religiousness       1   5843.9 3090.6
## - rating              1   6005.9 3107.0
## 
## Step:  AIC=3072.34
## affairs ~ age + yearsmarried + religiousness + education + occupation + 
##     rating + factor(Children_F)
## 
##                      Df Deviance    AIC
## - education           1   5669.6 3070.4
## - factor(Children_F)  1   5670.7 3070.5
## - occupation          1   5685.7 3072.1
## <none>                    5669.3 3072.3
## - age                 1   5718.2 3075.5
## - yearsmarried        1   5834.6 3087.6
## - religiousness       1   5844.0 3088.6
## - rating              1   6007.1 3105.1
## 
## Step:  AIC=3070.37
## affairs ~ age + yearsmarried + religiousness + occupation + rating + 
##     factor(Children_F)
## 
##                      Df Deviance    AIC
## - factor(Children_F)  1   5671.1 3068.5
## <none>                    5669.6 3070.4
## - occupation          1   5688.9 3070.4
## - age                 1   5719.3 3073.6
## - yearsmarried        1   5835.7 3085.7
## - religiousness       1   5844.0 3086.6
## - rating              1   6016.6 3104.1
## 
## Step:  AIC=3068.53
## affairs ~ age + yearsmarried + religiousness + occupation + rating
## 
##                 Df Deviance    AIC
## <none>               5671.1 3068.5
## - occupation     1   5692.3 3068.8
## - age            1   5720.5 3071.8
## - religiousness  1   5845.6 3084.8
## - yearsmarried   1   5854.5 3085.7
## - rating         1   6016.6 3102.1
## 
## Call:  glm(formula = affairs ~ age + yearsmarried + religiousness + 
##     occupation + rating, data = Affairs)
## 
## Coefficients:
##   (Intercept)            age   yearsmarried  religiousness     occupation  
##       5.60816       -0.05035        0.16185       -0.47632        0.10601  
##        rating  
##      -0.71224  
## 
## Degrees of Freedom: 600 Total (i.e. Null);  595 Residual
## Null Deviance:       6529 
## Residual Deviance: 5671  AIC: 3069
prob_y <- as.data.frame(predict(y_model, type = c("response"), Affairs))

final_y <- cbind(Affairs, prob_y)

confusion_y <- table(prob_y>0.5, Affairs$affairs)

table(prob_y>0.5)
## 
## FALSE  TRUE 
##   137   464
confusion_y
##        
##           0   1   2   3   7  12
##   FALSE 119  10   2   1   4   1
##   TRUE  332  24  15  18  38  37
accuracy_y <- sum(diag(confusion_y)/sum(confusion_y))

accuracy_y
## [1] 0.2379368