library(data.table)
election_data <- fread("C:/Users/Pawan Srivastav/Desktop/Data Science/Data Sets/Data Sets/Logistic Regression/election_data.csv")
#View(election_data)
setkey(election_data,`Election-id`)
summary(election_data)
## Election-id Result Year Amount Spent
## Min. :122.0 Min. :0.0 Min. :32.00 Min. :2.930
## 1st Qu.:202.2 1st Qu.:0.0 1st Qu.:39.25 1st Qu.:3.618
## Median :362.5 Median :1.0 Median :43.00 Median :4.005
## Mean :451.6 Mean :0.6 Mean :43.30 Mean :4.229
## 3rd Qu.:710.2 3rd Qu.:1.0 3rd Qu.:49.50 3rd Qu.:4.470
## Max. :965.0 Max. :1.0 Max. :52.00 Max. :6.320
## NA's :1 NA's :1 NA's :1 NA's :1
## Popularity Rank
## Min. :1.00
## 1st Qu.:2.00
## Median :3.00
## Mean :2.70
## 3rd Qu.:3.75
## Max. :4.00
## NA's :1
colnames(election_data)
## [1] "Election-id" "Result" "Year" "Amount Spent"
## [5] "Popularity Rank"
plot(election_data)
attach(election_data)
election_response <- glm(Result ~ Year+`Amount Spent`+`Popularity Rank`, data = election_data)
summary(election_response)
##
## Call:
## glm(formula = Result ~ Year + `Amount Spent` + `Popularity Rank`,
## data = election_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.36265 -0.15265 -0.09902 0.08992 0.55615
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.65329 1.31682 0.496 0.6375
## Year 0.01021 0.02151 0.475 0.6517
## `Amount Spent` 0.07523 0.12208 0.616 0.5604
## `Popularity Rank` -0.30137 0.13057 -2.308 0.0604 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.1432053)
##
## Null deviance: 2.40000 on 9 degrees of freedom
## Residual deviance: 0.85923 on 6 degrees of freedom
## (1 observation deleted due to missingness)
## AIC: 13.836
##
## Number of Fisher Scoring iterations: 2
# Residual Deviance is less than Null Deviance that's mean input variable are significance.
library(MASS)
stepAIC(election_response) # Checking best fit model
## Start: AIC=13.84
## Result ~ Year + `Amount Spent` + `Popularity Rank`
##
## Df Deviance AIC
## - Year 1 0.89152 12.205
## - `Amount Spent` 1 0.91361 12.449
## <none> 0.85923 13.836
## - `Popularity Rank` 1 1.62217 18.191
##
## Step: AIC=12.2
## Result ~ `Amount Spent` + `Popularity Rank`
##
## Df Deviance AIC
## - `Amount Spent` 1 0.94215 10.757
## <none> 0.89152 12.205
## - `Popularity Rank` 1 2.18851 19.185
##
## Step: AIC=10.76
## Result ~ `Popularity Rank`
##
## Df Deviance AIC
## <none> 0.94215 10.757
## - `Popularity Rank` 1 2.40000 18.108
##
## Call: glm(formula = Result ~ `Popularity Rank`, data = election_data)
##
## Coefficients:
## (Intercept) `Popularity Rank`
## 1.5372 -0.3471
##
## Degrees of Freedom: 9 Total (i.e. Null); 8 Residual
## (1 observation deleted due to missingness)
## Null Deviance: 2.4
## Residual Deviance: 0.9421 AIC: 10.76
library(car)
## Loading required package: carData
vif(election_response)
## Year `Amount Spent` `Popularity Rank`
## 1.389879 1.043188 1.440479
exp(coef(election_response))
## (Intercept) Year `Amount Spent` `Popularity Rank`
## 1.9218592 1.0102668 1.0781268 0.7398019
# Creating COnfusion matrix to check the accuracy
prob <- as.data.frame(predict(election_response, type = c("response"), election_data))
final <- cbind(election_data,prob)
confusion <- table(prob>0.5, election_data$Result)
table(prob>0.5)
##
## FALSE TRUE
## 5 5
confusion
##
## 0 1
## FALSE 4 1
## TRUE 0 5
Accuracy <- sum(diag(confusion)/sum(confusion))
Accuracy
## [1] 0.9
# So My model accuracy is 90% so I can consider this model
library(data.table)
bank_data <- fread("C:/Users/Pawan Srivastav/Desktop/Data Science/Data Sets/Data Sets/Logistic Regression/bank_data.csv")
#View(bank_data)
summary(bank_data)
## age default balance housing
## Min. :18.00 Min. :0.00000 Min. : -8019 Min. :0.0000
## 1st Qu.:33.00 1st Qu.:0.00000 1st Qu.: 72 1st Qu.:0.0000
## Median :39.00 Median :0.00000 Median : 448 Median :1.0000
## Mean :40.94 Mean :0.01803 Mean : 1362 Mean :0.5558
## 3rd Qu.:48.00 3rd Qu.:0.00000 3rd Qu.: 1428 3rd Qu.:1.0000
## Max. :95.00 Max. :1.00000 Max. :102127 Max. :1.0000
## loan duration campaign pdays
## Min. :0.0000 Min. : 0.0 Min. : 1.000 Min. : -1.0
## 1st Qu.:0.0000 1st Qu.: 103.0 1st Qu.: 1.000 1st Qu.: -1.0
## Median :0.0000 Median : 180.0 Median : 2.000 Median : -1.0
## Mean :0.1602 Mean : 258.2 Mean : 2.764 Mean : 40.2
## 3rd Qu.:0.0000 3rd Qu.: 319.0 3rd Qu.: 3.000 3rd Qu.: -1.0
## Max. :1.0000 Max. :4918.0 Max. :63.000 Max. :871.0
## previous poutfailure poutother poutsuccess
## Min. : 0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.: 0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median : 0.0000 Median :0.0000 Median :0.0000 Median :0.00000
## Mean : 0.5803 Mean :0.1084 Mean :0.0407 Mean :0.03342
## 3rd Qu.: 0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :275.0000 Max. :1.0000 Max. :1.0000 Max. :1.00000
## poutunknown con_cellular con_telephone con_unknown
## Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.000
## 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.000
## Median :1.0000 Median :1.0000 Median :0.00000 Median :0.000
## Mean :0.8175 Mean :0.6477 Mean :0.06428 Mean :0.288
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:1.000
## Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.000
## divorced married single joadmin.
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :1.0000 Median :0.0000 Median :0.0000
## Mean :0.1152 Mean :0.6019 Mean :0.2829 Mean :0.1144
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## joblue.collar joentrepreneur johousemaid jomanagement
## Min. :0.0000 Min. :0.00000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.0000 Median :0.00000 Median :0.00000 Median :0.0000
## Mean :0.2153 Mean :0.03289 Mean :0.02743 Mean :0.2092
## 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.00000 Max. :1.00000 Max. :1.0000
## joretired joself.employed joservices jostudent
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000 Median :0.00000 Median :0.00000
## Mean :0.05008 Mean :0.03493 Mean :0.09188 Mean :0.02075
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :1.00000
## jotechnician jounemployed jounknown y
## Min. :0.000 Min. :0.00000 Min. :0.00000 Min. :0.000
## 1st Qu.:0.000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.000
## Median :0.000 Median :0.00000 Median :0.00000 Median :0.000
## Mean :0.168 Mean :0.02882 Mean :0.00637 Mean :0.117
## 3rd Qu.:0.000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.000
## Max. :1.000 Max. :1.00000 Max. :1.00000 Max. :1.000
str(bank_data)
## Classes 'data.table' and 'data.frame': 45211 obs. of 32 variables:
## $ age : int 58 44 33 47 33 35 28 42 58 43 ...
## $ default : int 0 0 0 0 0 0 0 1 0 0 ...
## $ balance : int 2143 29 2 1506 1 231 447 2 121 593 ...
## $ housing : int 1 1 1 1 0 1 1 1 1 1 ...
## $ loan : int 0 0 1 0 0 0 1 0 0 0 ...
## $ duration : int 261 151 76 92 198 139 217 380 50 55 ...
## $ campaign : int 1 1 1 1 1 1 1 1 1 1 ...
## $ pdays : int -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutfailure : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutother : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutsuccess : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutunknown : int 1 1 1 1 1 1 1 1 1 1 ...
## $ con_cellular : int 0 0 0 0 0 0 0 0 0 0 ...
## $ con_telephone : int 0 0 0 0 0 0 0 0 0 0 ...
## $ con_unknown : int 1 1 1 1 1 1 1 1 1 1 ...
## $ divorced : int 0 0 0 0 0 0 0 1 0 0 ...
## $ married : int 1 0 1 1 0 1 0 0 1 0 ...
## $ single : int 0 1 0 0 1 0 1 0 0 1 ...
## $ joadmin. : int 0 0 0 0 0 0 0 0 0 0 ...
## $ joblue.collar : int 0 0 0 1 0 0 0 0 0 0 ...
## $ joentrepreneur : int 0 0 1 0 0 0 0 1 0 0 ...
## $ johousemaid : int 0 0 0 0 0 0 0 0 0 0 ...
## $ jomanagement : int 1 0 0 0 0 1 1 0 0 0 ...
## $ joretired : int 0 0 0 0 0 0 0 0 1 0 ...
## $ joself.employed: int 0 0 0 0 0 0 0 0 0 0 ...
## $ joservices : int 0 0 0 0 0 0 0 0 0 0 ...
## $ jostudent : int 0 0 0 0 0 0 0 0 0 0 ...
## $ jotechnician : int 0 1 0 0 0 0 0 0 0 1 ...
## $ jounemployed : int 0 0 0 0 0 0 0 0 0 0 ...
## $ jounknown : int 0 0 0 0 1 0 0 0 0 0 ...
## $ y : int 0 0 0 0 0 0 0 0 0 0 ...
## - attr(*, ".internal.selfref")=<externalptr>
attach(bank_data)
## The following object is masked from package:MASS:
##
## housing
y_model <- glm(y ~ age +balance + duration + campaign + pdays + previous + factor(default) + factor(housing) + factor(loan)
+ factor(poutfailure) + factor(poutother) + factor(poutsuccess) + factor(poutunknown)
+ factor(con_cellular) + factor(con_telephone) + factor(con_unknown) + factor(divorced)
+ factor(married) + factor(single) + factor(joadmin.) + factor(joblue.collar) + factor(joentrepreneur)
+ factor(johousemaid) + factor(jomanagement) + factor(joretired) + factor(joself.employed) + factor(joservices)
+ factor(jostudent) + factor(jotechnician) + factor(jounemployed) + factor(jounknown), data = bank_data)
summary(y_model)
##
## Call:
## glm(formula = y ~ age + balance + duration + campaign + pdays +
## previous + factor(default) + factor(housing) + factor(loan) +
## factor(poutfailure) + factor(poutother) + factor(poutsuccess) +
## factor(poutunknown) + factor(con_cellular) + factor(con_telephone) +
## factor(con_unknown) + factor(divorced) + factor(married) +
## factor(single) + factor(joadmin.) + factor(joblue.collar) +
## factor(joentrepreneur) + factor(johousemaid) + factor(jomanagement) +
## factor(joretired) + factor(joself.employed) + factor(joservices) +
## factor(jostudent) + factor(jotechnician) + factor(jounemployed) +
## factor(jounknown), data = bank_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.29345 -0.11582 -0.04883 0.01842 1.06743
##
## Coefficients: (4 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.755e-02 1.776e-02 -1.552 0.120766
## age 1.741e-04 1.570e-04 1.109 0.267383
## balance 1.959e-06 4.303e-07 4.552 5.33e-06 ***
## duration 4.733e-04 5.038e-06 93.953 < 2e-16 ***
## campaign -2.083e-03 4.219e-04 -4.936 8.02e-07 ***
## pdays -2.589e-05 2.726e-05 -0.950 0.342170
## previous 1.213e-03 6.651e-04 1.824 0.068120 .
## factor(default)1 -1.037e-02 9.753e-03 -1.063 0.287572
## factor(housing)1 -5.666e-02 2.844e-03 -19.925 < 2e-16 ***
## factor(loan)1 -3.314e-02 3.565e-03 -9.296 < 2e-16 ***
## factor(poutfailure)1 2.984e-02 8.145e-03 3.664 0.000248 ***
## factor(poutother)1 5.788e-02 9.544e-03 6.064 1.34e-09 ***
## factor(poutsuccess)1 4.753e-01 8.896e-03 53.426 < 2e-16 ***
## factor(poutunknown)1 NA NA NA NA
## factor(con_cellular)1 5.555e-02 3.137e-03 17.705 < 2e-16 ***
## factor(con_telephone)1 4.941e-02 5.830e-03 8.474 < 2e-16 ***
## factor(con_unknown)1 NA NA NA NA
## factor(divorced)1 -1.531e-02 4.850e-03 -3.156 0.001601 **
## factor(married)1 -2.668e-02 3.305e-03 -8.074 7.00e-16 ***
## factor(single)1 NA NA NA NA
## factor(joadmin.)1 2.297e-02 1.672e-02 1.374 0.169376
## factor(joblue.collar)1 -4.980e-03 1.652e-02 -0.301 0.763053
## factor(joentrepreneur)1 -3.181e-03 1.774e-02 -0.179 0.857741
## factor(johousemaid)1 -1.559e-02 1.796e-02 -0.868 0.385371
## factor(jomanagement)1 2.133e-02 1.649e-02 1.294 0.195809
## factor(joretired)1 6.599e-02 1.730e-02 3.814 0.000137 ***
## factor(joself.employed)1 2.451e-03 1.764e-02 0.139 0.889468
## factor(joservices)1 1.654e-03 1.683e-02 0.098 0.921715
## factor(jostudent)1 1.132e-01 1.876e-02 6.034 1.61e-09 ***
## factor(jotechnician)1 5.941e-03 1.656e-02 0.359 0.719819
## factor(jounemployed)1 1.554e-02 1.791e-02 0.868 0.385450
## factor(jounknown)1 NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.07511589)
##
## Null deviance: 4670.3 on 45210 degrees of freedom
## Residual deviance: 3394.0 on 45183 degrees of freedom
## AIC: 11294
##
## Number of Fisher Scoring iterations: 2
library(MASS)
library(car)
stepAIC(y_model)
## Start: AIC=11294.49
## y ~ age + balance + duration + campaign + pdays + previous +
## factor(default) + factor(housing) + factor(loan) + factor(poutfailure) +
## factor(poutother) + factor(poutsuccess) + factor(poutunknown) +
## factor(con_cellular) + factor(con_telephone) + factor(con_unknown) +
## factor(divorced) + factor(married) + factor(single) + factor(joadmin.) +
## factor(joblue.collar) + factor(joentrepreneur) + factor(johousemaid) +
## factor(jomanagement) + factor(joretired) + factor(joself.employed) +
## factor(joservices) + factor(jostudent) + factor(jotechnician) +
## factor(jounemployed) + factor(jounknown)
##
##
## Step: AIC=11294.49
## y ~ age + balance + duration + campaign + pdays + previous +
## factor(default) + factor(housing) + factor(loan) + factor(poutfailure) +
## factor(poutother) + factor(poutsuccess) + factor(poutunknown) +
## factor(con_cellular) + factor(con_telephone) + factor(con_unknown) +
## factor(divorced) + factor(married) + factor(single) + factor(joadmin.) +
## factor(joblue.collar) + factor(joentrepreneur) + factor(johousemaid) +
## factor(jomanagement) + factor(joretired) + factor(joself.employed) +
## factor(joservices) + factor(jostudent) + factor(jotechnician) +
## factor(jounemployed)
##
##
## Step: AIC=11294.49
## y ~ age + balance + duration + campaign + pdays + previous +
## factor(default) + factor(housing) + factor(loan) + factor(poutfailure) +
## factor(poutother) + factor(poutsuccess) + factor(poutunknown) +
## factor(con_cellular) + factor(con_telephone) + factor(con_unknown) +
## factor(divorced) + factor(married) + factor(joadmin.) + factor(joblue.collar) +
## factor(joentrepreneur) + factor(johousemaid) + factor(jomanagement) +
## factor(joretired) + factor(joself.employed) + factor(joservices) +
## factor(jostudent) + factor(jotechnician) + factor(jounemployed)
##
##
## Step: AIC=11294.49
## y ~ age + balance + duration + campaign + pdays + previous +
## factor(default) + factor(housing) + factor(loan) + factor(poutfailure) +
## factor(poutother) + factor(poutsuccess) + factor(poutunknown) +
## factor(con_cellular) + factor(con_telephone) + factor(divorced) +
## factor(married) + factor(joadmin.) + factor(joblue.collar) +
## factor(joentrepreneur) + factor(johousemaid) + factor(jomanagement) +
## factor(joretired) + factor(joself.employed) + factor(joservices) +
## factor(jostudent) + factor(jotechnician) + factor(jounemployed)
##
##
## Step: AIC=11294.49
## y ~ age + balance + duration + campaign + pdays + previous +
## factor(default) + factor(housing) + factor(loan) + factor(poutfailure) +
## factor(poutother) + factor(poutsuccess) + factor(con_cellular) +
## factor(con_telephone) + factor(divorced) + factor(married) +
## factor(joadmin.) + factor(joblue.collar) + factor(joentrepreneur) +
## factor(johousemaid) + factor(jomanagement) + factor(joretired) +
## factor(joself.employed) + factor(joservices) + factor(jostudent) +
## factor(jotechnician) + factor(jounemployed)
##
## Df Deviance AIC
## - factor(joservices) 1 3394.0 11292
## - factor(joself.employed) 1 3394.0 11292
## - factor(joentrepreneur) 1 3394.0 11292
## - factor(joblue.collar) 1 3394.0 11293
## - factor(jotechnician) 1 3394.0 11293
## - factor(jounemployed) 1 3394.0 11293
## - factor(johousemaid) 1 3394.0 11293
## - pdays 1 3394.0 11293
## - factor(default) 1 3394.0 11294
## - age 1 3394.1 11294
## - factor(jomanagement) 1 3394.1 11294
## - factor(joadmin.) 1 3394.1 11294
## <none> 3394.0 11294
## - previous 1 3394.2 11296
## - factor(divorced) 1 3394.7 11302
## - factor(poutfailure) 1 3395.0 11306
## - factor(joretired) 1 3395.1 11307
## - balance 1 3395.5 11313
## - campaign 1 3395.8 11317
## - factor(jostudent) 1 3396.7 11329
## - factor(poutother) 1 3396.7 11329
## - factor(married) 1 3398.9 11358
## - factor(con_telephone) 1 3399.4 11364
## - factor(loan) 1 3400.5 11379
## - factor(con_cellular) 1 3417.5 11605
## - factor(housing) 1 3423.8 11688
## - factor(poutsuccess) 1 3608.4 14062
## - duration 1 4057.0 19361
##
## Step: AIC=11292.5
## y ~ age + balance + duration + campaign + pdays + previous +
## factor(default) + factor(housing) + factor(loan) + factor(poutfailure) +
## factor(poutother) + factor(poutsuccess) + factor(con_cellular) +
## factor(con_telephone) + factor(divorced) + factor(married) +
## factor(joadmin.) + factor(joblue.collar) + factor(joentrepreneur) +
## factor(johousemaid) + factor(jomanagement) + factor(joretired) +
## factor(joself.employed) + factor(jostudent) + factor(jotechnician) +
## factor(jounemployed)
##
## Df Deviance AIC
## - factor(joself.employed) 1 3394.0 11290
## - factor(joentrepreneur) 1 3394.0 11291
## - factor(jotechnician) 1 3394.0 11291
## - pdays 1 3394.0 11291
## - factor(default) 1 3394.0 11292
## - age 1 3394.1 11292
## - factor(joblue.collar) 1 3394.1 11292
## <none> 3394.0 11292
## - factor(jounemployed) 1 3394.2 11293
## - previous 1 3394.2 11294
## - factor(johousemaid) 1 3394.2 11294
## - factor(divorced) 1 3394.7 11300
## - factor(poutfailure) 1 3395.0 11304
## - factor(joadmin.) 1 3395.1 11305
## - factor(jomanagement) 1 3395.1 11306
## - balance 1 3395.5 11311
## - campaign 1 3395.8 11315
## - factor(poutother) 1 3396.7 11327
## - factor(married) 1 3398.9 11356
## - factor(joretired) 1 3399.1 11359
## - factor(con_telephone) 1 3399.4 11362
## - factor(loan) 1 3400.5 11377
## - factor(jostudent) 1 3402.9 11410
## - factor(con_cellular) 1 3417.5 11603
## - factor(housing) 1 3424.0 11688
## - factor(poutsuccess) 1 3608.4 14060
## - duration 1 4057.0 19359
##
## Step: AIC=11290.51
## y ~ age + balance + duration + campaign + pdays + previous +
## factor(default) + factor(housing) + factor(loan) + factor(poutfailure) +
## factor(poutother) + factor(poutsuccess) + factor(con_cellular) +
## factor(con_telephone) + factor(divorced) + factor(married) +
## factor(joadmin.) + factor(joblue.collar) + factor(joentrepreneur) +
## factor(johousemaid) + factor(jomanagement) + factor(joretired) +
## factor(jostudent) + factor(jotechnician) + factor(jounemployed)
##
## Df Deviance AIC
## - factor(joentrepreneur) 1 3394.0 11289
## - factor(jotechnician) 1 3394.0 11289
## - pdays 1 3394.0 11289
## - factor(default) 1 3394.0 11290
## - age 1 3394.1 11290
## <none> 3394.0 11290
## - factor(joblue.collar) 1 3394.1 11291
## - factor(jounemployed) 1 3394.2 11291
## - previous 1 3394.2 11292
## - factor(johousemaid) 1 3394.3 11293
## - factor(divorced) 1 3394.7 11298
## - factor(poutfailure) 1 3395.0 11302
## - factor(joadmin.) 1 3395.2 11305
## - factor(jomanagement) 1 3395.3 11307
## - balance 1 3395.5 11309
## - campaign 1 3395.8 11313
## - factor(poutother) 1 3396.7 11325
## - factor(married) 1 3398.9 11354
## - factor(con_telephone) 1 3399.4 11360
## - factor(joretired) 1 3399.5 11362
## - factor(loan) 1 3400.5 11375
## - factor(jostudent) 1 3403.3 11413
## - factor(con_cellular) 1 3417.6 11602
## - factor(housing) 1 3424.0 11687
## - factor(poutsuccess) 1 3608.4 14059
## - duration 1 4057.1 19357
##
## Step: AIC=11288.9
## y ~ age + balance + duration + campaign + pdays + previous +
## factor(default) + factor(housing) + factor(loan) + factor(poutfailure) +
## factor(poutother) + factor(poutsuccess) + factor(con_cellular) +
## factor(con_telephone) + factor(divorced) + factor(married) +
## factor(joadmin.) + factor(joblue.collar) + factor(johousemaid) +
## factor(jomanagement) + factor(joretired) + factor(jostudent) +
## factor(jotechnician) + factor(jounemployed)
##
## Df Deviance AIC
## - pdays 1 3394.1 11288
## - factor(default) 1 3394.1 11288
## - age 1 3394.1 11288
## - factor(jotechnician) 1 3394.1 11288
## - factor(joblue.collar) 1 3394.1 11289
## <none> 3394.0 11289
## - factor(jounemployed) 1 3394.2 11290
## - previous 1 3394.2 11290
## - factor(johousemaid) 1 3394.3 11291
## - factor(divorced) 1 3394.7 11297
## - factor(poutfailure) 1 3395.0 11300
## - factor(joadmin.) 1 3395.5 11307
## - balance 1 3395.5 11308
## - factor(jomanagement) 1 3395.7 11310
## - campaign 1 3395.8 11311
## - factor(poutother) 1 3396.8 11324
## - factor(married) 1 3398.9 11352
## - factor(con_telephone) 1 3399.4 11359
## - factor(joretired) 1 3400.0 11367
## - factor(loan) 1 3400.5 11374
## - factor(jostudent) 1 3403.7 11416
## - factor(con_cellular) 1 3417.6 11600
## - factor(housing) 1 3424.0 11686
## - factor(poutsuccess) 1 3608.5 14057
## - duration 1 4057.1 19355
##
## Step: AIC=11287.8
## y ~ age + balance + duration + campaign + previous + factor(default) +
## factor(housing) + factor(loan) + factor(poutfailure) + factor(poutother) +
## factor(poutsuccess) + factor(con_cellular) + factor(con_telephone) +
## factor(divorced) + factor(married) + factor(joadmin.) + factor(joblue.collar) +
## factor(johousemaid) + factor(jomanagement) + factor(joretired) +
## factor(jostudent) + factor(jotechnician) + factor(jounemployed)
##
## Df Deviance AIC
## - factor(default) 1 3394.1 11287
## - age 1 3394.2 11287
## - factor(jotechnician) 1 3394.2 11287
## - factor(joblue.collar) 1 3394.2 11288
## <none> 3394.1 11288
## - factor(jounemployed) 1 3394.3 11289
## - previous 1 3394.3 11289
## - factor(johousemaid) 1 3394.3 11290
## - factor(divorced) 1 3394.8 11296
## - factor(joadmin.) 1 3395.6 11306
## - balance 1 3395.6 11307
## - factor(jomanagement) 1 3395.8 11309
## - factor(poutfailure) 1 3395.9 11310
## - campaign 1 3395.9 11310
## - factor(poutother) 1 3398.0 11338
## - factor(married) 1 3399.0 11351
## - factor(con_telephone) 1 3399.4 11357
## - factor(joretired) 1 3400.1 11366
## - factor(loan) 1 3400.6 11372
## - factor(jostudent) 1 3403.8 11415
## - factor(con_cellular) 1 3417.6 11598
## - factor(housing) 1 3424.6 11691
## - factor(poutsuccess) 1 3681.4 14960
## - duration 1 4057.2 19354
##
## Step: AIC=11286.97
## y ~ age + balance + duration + campaign + previous + factor(housing) +
## factor(loan) + factor(poutfailure) + factor(poutother) +
## factor(poutsuccess) + factor(con_cellular) + factor(con_telephone) +
## factor(divorced) + factor(married) + factor(joadmin.) + factor(joblue.collar) +
## factor(johousemaid) + factor(jomanagement) + factor(joretired) +
## factor(jostudent) + factor(jotechnician) + factor(jounemployed)
##
## Df Deviance AIC
## - age 1 3394.2 11286
## - factor(jotechnician) 1 3394.3 11286
## - factor(joblue.collar) 1 3394.3 11287
## <none> 3394.1 11287
## - factor(jounemployed) 1 3394.4 11288
## - previous 1 3394.4 11288
## - factor(johousemaid) 1 3394.4 11289
## - factor(divorced) 1 3394.9 11295
## - factor(joadmin.) 1 3395.7 11305
## - balance 1 3395.8 11307
## - factor(jomanagement) 1 3395.9 11308
## - factor(poutfailure) 1 3396.0 11310
## - campaign 1 3396.0 11310
## - factor(poutother) 1 3398.1 11338
## - factor(married) 1 3399.0 11350
## - factor(con_telephone) 1 3399.6 11357
## - factor(joretired) 1 3400.2 11366
## - factor(loan) 1 3400.8 11374
## - factor(jostudent) 1 3403.9 11415
## - factor(con_cellular) 1 3417.7 11598
## - factor(housing) 1 3424.7 11690
## - factor(poutsuccess) 1 3681.8 14963
## - duration 1 4057.4 19355
##
## Step: AIC=11286.21
## y ~ balance + duration + campaign + previous + factor(housing) +
## factor(loan) + factor(poutfailure) + factor(poutother) +
## factor(poutsuccess) + factor(con_cellular) + factor(con_telephone) +
## factor(divorced) + factor(married) + factor(joadmin.) + factor(joblue.collar) +
## factor(johousemaid) + factor(jomanagement) + factor(joretired) +
## factor(jostudent) + factor(jotechnician) + factor(jounemployed)
##
## Df Deviance AIC
## - factor(jotechnician) 1 3394.3 11286
## - factor(joblue.collar) 1 3394.4 11286
## <none> 3394.2 11286
## - factor(jounemployed) 1 3394.5 11287
## - factor(johousemaid) 1 3394.5 11288
## - previous 1 3394.5 11288
## - factor(divorced) 1 3394.9 11293
## - factor(joadmin.) 1 3395.7 11304
## - balance 1 3395.9 11307
## - factor(jomanagement) 1 3396.0 11308
## - campaign 1 3396.1 11309
## - factor(poutfailure) 1 3396.1 11309
## - factor(poutother) 1 3398.2 11337
## - factor(married) 1 3399.4 11353
## - factor(con_telephone) 1 3399.9 11359
## - factor(loan) 1 3400.9 11373
## - factor(joretired) 1 3402.0 11388
## - factor(jostudent) 1 3403.9 11413
## - factor(con_cellular) 1 3417.7 11596
## - factor(housing) 1 3426.1 11706
## - factor(poutsuccess) 1 3682.3 14967
## - duration 1 4057.4 19353
##
## Step: AIC=11285.57
## y ~ balance + duration + campaign + previous + factor(housing) +
## factor(loan) + factor(poutfailure) + factor(poutother) +
## factor(poutsuccess) + factor(con_cellular) + factor(con_telephone) +
## factor(divorced) + factor(married) + factor(joadmin.) + factor(joblue.collar) +
## factor(johousemaid) + factor(jomanagement) + factor(joretired) +
## factor(jostudent) + factor(jounemployed)
##
## Df Deviance AIC
## <none> 3394.3 11286
## - factor(jounemployed) 1 3394.5 11286
## - previous 1 3394.6 11287
## - factor(johousemaid) 1 3394.7 11288
## - factor(joblue.collar) 1 3394.8 11289
## - factor(divorced) 1 3395.0 11293
## - factor(joadmin.) 1 3395.8 11303
## - balance 1 3396.0 11306
## - campaign 1 3396.2 11308
## - factor(poutfailure) 1 3396.2 11308
## - factor(jomanagement) 1 3396.2 11309
## - factor(poutother) 1 3398.3 11336
## - factor(married) 1 3399.6 11353
## - factor(con_telephone) 1 3400.0 11358
## - factor(loan) 1 3401.1 11373
## - factor(joretired) 1 3402.4 11391
## - factor(jostudent) 1 3404.1 11414
## - factor(con_cellular) 1 3418.0 11598
## - factor(housing) 1 3426.3 11707
## - factor(poutsuccess) 1 3682.4 14967
## - duration 1 4057.4 19351
##
## Call: glm(formula = y ~ balance + duration + campaign + previous +
## factor(housing) + factor(loan) + factor(poutfailure) + factor(poutother) +
## factor(poutsuccess) + factor(con_cellular) + factor(con_telephone) +
## factor(divorced) + factor(married) + factor(joadmin.) + factor(joblue.collar) +
## factor(johousemaid) + factor(jomanagement) + factor(joretired) +
## factor(jostudent) + factor(jounemployed), data = bank_data)
##
## Coefficients:
## (Intercept) balance duration
## -1.793e-02 2.030e-06 4.733e-04
## campaign previous factor(housing)1
## -2.085e-03 1.231e-03 -5.741e-02
## factor(loan)1 factor(poutfailure)1 factor(poutother)1
## -3.358e-02 2.377e-02 5.201e-02
## factor(poutsuccess)1 factor(con_cellular)1 factor(con_telephone)1
## 4.713e-01 5.554e-02 5.011e-02
## factor(divorced)1 factor(married)1 factor(joadmin.)1
## -1.379e-02 -2.550e-02 1.958e-02
## factor(joblue.collar)1 factor(johousemaid)1 factor(jomanagement)1
## -8.475e-03 -1.816e-02 1.807e-02
## factor(joretired)1 factor(jostudent)1 factor(jounemployed)1
## 6.591e-02 1.083e-01 1.215e-02
##
## Degrees of Freedom: 45210 Total (i.e. Null); 45190 Residual
## Null Deviance: 4670
## Residual Deviance: 3394 AIC: 11290
prob_y <- as.data.frame(predict(y_model, type = c("response"), bank_data))
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
final_y <- cbind(bank_data, prob_y)
confusion_y <- table(prob_y>0.5, bank_data$y)
table(prob_y>0.5)
##
## FALSE TRUE
## 42986 2225
confusion_y
##
## 0 1
## FALSE 39150 3836
## TRUE 772 1453
accuracy_y <- sum(diag(confusion_y)/sum(confusion_y))
accuracy_y
## [1] 0.8980779
# So My model accuracy is 89.8% so I can consider this model
Since extra marital affair is a binary variable (either a person will have or not), so we can fit logistic regression model here to predict the probability of extra marital affair.
library(data.table)
# install.packages('AER')
data(Affairs,package="AER")
summary(Affairs)
## affairs gender age yearsmarried children
## Min. : 0.000 female:315 Min. :17.50 Min. : 0.125 no :171
## 1st Qu.: 0.000 male :286 1st Qu.:27.00 1st Qu.: 4.000 yes:430
## Median : 0.000 Median :32.00 Median : 7.000
## Mean : 1.456 Mean :32.49 Mean : 8.178
## 3rd Qu.: 0.000 3rd Qu.:37.00 3rd Qu.:15.000
## Max. :12.000 Max. :57.00 Max. :15.000
## religiousness education occupation rating
## Min. :1.000 Min. : 9.00 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:14.00 1st Qu.:3.000 1st Qu.:3.000
## Median :3.000 Median :16.00 Median :5.000 Median :4.000
## Mean :3.116 Mean :16.17 Mean :4.195 Mean :3.932
## 3rd Qu.:4.000 3rd Qu.:18.00 3rd Qu.:6.000 3rd Qu.:5.000
## Max. :5.000 Max. :20.00 Max. :7.000 Max. :5.000
str(Affairs)
## 'data.frame': 601 obs. of 9 variables:
## $ affairs : num 0 0 0 0 0 0 0 0 0 0 ...
## $ gender : Factor w/ 2 levels "female","male": 2 1 1 2 2 1 1 2 1 2 ...
## $ age : num 37 27 32 57 22 32 22 57 32 22 ...
## $ yearsmarried : num 10 4 15 15 0.75 1.5 0.75 15 15 1.5 ...
## $ children : Factor w/ 2 levels "no","yes": 1 1 2 2 1 1 1 2 2 1 ...
## $ religiousness: int 3 4 1 5 2 2 2 2 4 4 ...
## $ education : num 18 14 12 18 17 17 12 14 16 14 ...
## $ occupation : int 7 6 1 6 6 5 1 4 1 4 ...
## $ rating : int 4 4 4 5 3 5 3 4 2 5 ...
gender_F <- ifelse(Affairs$gender=="male",1,0)
Children_F <- ifelse(Affairs$children=="yes",1,0)
Affairs <- cbind(Affairs,gender_F,Children_F)
attach(Affairs)
## The following objects are masked _by_ .GlobalEnv:
##
## Children_F, gender_F
## The following object is masked from bank_data:
##
## age
colnames(Affairs)
## [1] "affairs" "gender" "age" "yearsmarried"
## [5] "children" "religiousness" "education" "occupation"
## [9] "rating" "gender_F" "Children_F"
y_model <- glm(affairs ~ age + yearsmarried + religiousness + education +
occupation + rating + factor(gender_F) + factor(Children_F), data= Affairs)
summary(y_model)
##
## Call:
## glm(formula = affairs ~ age + yearsmarried + religiousness +
## education + occupation + rating + factor(gender_F) + factor(Children_F),
## data = Affairs)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.0503 -1.7226 -0.7947 0.2101 12.7036
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.87201 1.13750 5.162 3.34e-07 ***
## age -0.05098 0.02262 -2.254 0.0246 *
## yearsmarried 0.16947 0.04122 4.111 4.50e-05 ***
## religiousness -0.47761 0.11173 -4.275 2.23e-05 ***
## education -0.01375 0.06414 -0.214 0.8303
## occupation 0.10492 0.08888 1.180 0.2383
## rating -0.71188 0.12001 -5.932 5.09e-09 ***
## factor(gender_F)1 0.05409 0.30049 0.180 0.8572
## factor(Children_F)1 -0.14262 0.35020 -0.407 0.6840
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 9.575934)
##
## Null deviance: 6529.1 on 600 degrees of freedom
## Residual deviance: 5669.0 on 592 degrees of freedom
## AIC: 3074.3
##
## Number of Fisher Scoring iterations: 2
library(MASS)
library(car)
stepAIC(y_model)
## Start: AIC=3074.31
## affairs ~ age + yearsmarried + religiousness + education + occupation +
## rating + factor(gender_F) + factor(Children_F)
##
## Df Deviance AIC
## - factor(gender_F) 1 5669.3 3072.3
## - education 1 5669.4 3072.3
## - factor(Children_F) 1 5670.5 3072.5
## - occupation 1 5682.3 3073.7
## <none> 5669.0 3074.3
## - age 1 5717.6 3077.4
## - yearsmarried 1 5830.8 3089.2
## - religiousness 1 5843.9 3090.6
## - rating 1 6005.9 3107.0
##
## Step: AIC=3072.34
## affairs ~ age + yearsmarried + religiousness + education + occupation +
## rating + factor(Children_F)
##
## Df Deviance AIC
## - education 1 5669.6 3070.4
## - factor(Children_F) 1 5670.7 3070.5
## - occupation 1 5685.7 3072.1
## <none> 5669.3 3072.3
## - age 1 5718.2 3075.5
## - yearsmarried 1 5834.6 3087.6
## - religiousness 1 5844.0 3088.6
## - rating 1 6007.1 3105.1
##
## Step: AIC=3070.37
## affairs ~ age + yearsmarried + religiousness + occupation + rating +
## factor(Children_F)
##
## Df Deviance AIC
## - factor(Children_F) 1 5671.1 3068.5
## <none> 5669.6 3070.4
## - occupation 1 5688.9 3070.4
## - age 1 5719.3 3073.6
## - yearsmarried 1 5835.7 3085.7
## - religiousness 1 5844.0 3086.6
## - rating 1 6016.6 3104.1
##
## Step: AIC=3068.53
## affairs ~ age + yearsmarried + religiousness + occupation + rating
##
## Df Deviance AIC
## <none> 5671.1 3068.5
## - occupation 1 5692.3 3068.8
## - age 1 5720.5 3071.8
## - religiousness 1 5845.6 3084.8
## - yearsmarried 1 5854.5 3085.7
## - rating 1 6016.6 3102.1
##
## Call: glm(formula = affairs ~ age + yearsmarried + religiousness +
## occupation + rating, data = Affairs)
##
## Coefficients:
## (Intercept) age yearsmarried religiousness occupation
## 5.60816 -0.05035 0.16185 -0.47632 0.10601
## rating
## -0.71224
##
## Degrees of Freedom: 600 Total (i.e. Null); 595 Residual
## Null Deviance: 6529
## Residual Deviance: 5671 AIC: 3069
prob_y <- as.data.frame(predict(y_model, type = c("response"), Affairs))
final_y <- cbind(Affairs, prob_y)
confusion_y <- table(prob_y>0.5, Affairs$affairs)
table(prob_y>0.5)
##
## FALSE TRUE
## 137 464
confusion_y
##
## 0 1 2 3 7 12
## FALSE 119 10 2 1 4 1
## TRUE 332 24 15 18 38 37
accuracy_y <- sum(diag(confusion_y)/sum(confusion_y))
accuracy_y
## [1] 0.2379368