Logistic Regression

Affairs dataset

Assignment 18

#install.packages('AER')
library(AER)
## Warning: package 'AER' was built under R version 3.5.1
## Loading required package: car
## Warning: package 'car' was built under R version 3.5.1
## Loading required package: carData
## Loading required package: lmtest
## Warning: package 'lmtest' was built under R version 3.5.1
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.5.1
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 3.5.1
## Loading required package: survival
data(Affairs,package="AER")

mydata <- Affairs
attach(mydata)
summary(mydata)
##     affairs          gender         age         yearsmarried    children 
##  Min.   : 0.000   female:315   Min.   :17.50   Min.   : 0.125   no :171  
##  1st Qu.: 0.000   male  :286   1st Qu.:27.00   1st Qu.: 4.000   yes:430  
##  Median : 0.000                Median :32.00   Median : 7.000            
##  Mean   : 1.456                Mean   :32.49   Mean   : 8.178            
##  3rd Qu.: 0.000                3rd Qu.:37.00   3rd Qu.:15.000            
##  Max.   :12.000                Max.   :57.00   Max.   :15.000            
##  religiousness     education       occupation        rating     
##  Min.   :1.000   Min.   : 9.00   Min.   :1.000   Min.   :1.000  
##  1st Qu.:2.000   1st Qu.:14.00   1st Qu.:3.000   1st Qu.:3.000  
##  Median :3.000   Median :16.00   Median :5.000   Median :4.000  
##  Mean   :3.116   Mean   :16.17   Mean   :4.195   Mean   :3.932  
##  3rd Qu.:4.000   3rd Qu.:18.00   3rd Qu.:6.000   3rd Qu.:5.000  
##  Max.   :5.000   Max.   :20.00   Max.   :7.000   Max.   :5.000
colnames(mydata)
## [1] "affairs"       "gender"        "age"           "yearsmarried" 
## [5] "children"      "religiousness" "education"     "occupation"   
## [9] "rating"
CPaffairs <- NULL
CPaffairs <- ifelse(affairs>0,1,0)
mydata[,"CPaffairs"] <- CPaffairs

CPgender <- NULL
CPgender <- ifelse(gender=="male",1,0)
mydata[,"CPgender"] <- CPgender

CPchildren <- NULL
CPchildren <- ifelse(children=="yes",1,0)
mydata[,"CPchildren"] <- CPchildren

mydata <- mydata[,-c(1,2,5)]

plot(mydata)

cor(mydata)
##                       age yearsmarried religiousness    education
## age            1.00000000   0.77754585   0.193776931  0.134596015
## yearsmarried   0.77754585   1.00000000   0.218260672  0.040002716
## religiousness  0.19377693   0.21826067   1.000000000 -0.042571079
## education      0.13459601   0.04000272  -0.042571079  1.000000000
## occupation     0.16641254   0.04459201  -0.039722324  0.533605242
## rating        -0.19899990  -0.24311883   0.024295777  0.109303473
## CPaffairs      0.05732152   0.14030120  -0.130090072  0.019285840
## CPgender       0.19064108   0.03028252   0.007679445  0.397504680
## CPchildren     0.42193082   0.57285736   0.129351259 -0.006985882
##                occupation       rating   CPaffairs     CPgender
## age            0.16641254 -0.198999899  0.05732152  0.190641080
## yearsmarried   0.04459201 -0.243118827  0.14030120  0.030282521
## religiousness -0.03972232  0.024295777 -0.13009007  0.007679445
## education      0.53360524  0.109303473  0.01928584  0.397504680
## occupation     1.00000000  0.017422273  0.03764236  0.467923152
## rating         0.01742227  1.000000000 -0.25381444 -0.007523748
## CPaffairs      0.03764236 -0.253814441  1.00000000  0.050955678
## CPgender       0.46792315 -0.007523748  0.05095568  1.000000000
## CPchildren    -0.09272712 -0.196275616  0.13360509  0.069222338
##                 CPchildren
## age            0.421930815
## yearsmarried   0.572857364
## religiousness  0.129351259
## education     -0.006985882
## occupation    -0.092727118
## rating        -0.196275616
## CPaffairs      0.133605091
## CPgender       0.069222338
## CPchildren     1.000000000
library(corpcor)

cor2pcor(cor(mydata))
##              [,1]         [,2]        [,3]         [,4]        [,5]
##  [1,]  1.00000000  0.716325696  0.03443158  0.034786185  0.07179874
##  [2,]  0.71632570  1.000000000  0.11742940 -0.001835041  0.03380241
##  [3,]  0.03443158  0.117429399  1.00000000 -0.047130873 -0.03372505
##  [4,]  0.03478618 -0.001835041 -0.04713087  1.000000000  0.41877583
##  [5,]  0.07179874  0.033802414 -0.03372505  0.418775832  1.00000000
##  [6,] -0.05300696 -0.073531639  0.05543208  0.138532776 -0.02880954
##  [7,] -0.10071835  0.118777157 -0.14670237  0.014809502  0.02052325
##  [8,]  0.19121521 -0.198155079  0.03450469  0.185846871  0.32457915
##  [9,] -0.04354499  0.411627444  0.01016725  0.029326573 -0.18545642
##              [,6]        [,7]        [,8]        [,9]
##  [1,] -0.05300696 -0.10071835  0.19121521 -0.04354499
##  [2,] -0.07353164  0.11877716 -0.19815508  0.41162744
##  [3,]  0.05543208 -0.14670237  0.03450469  0.01016725
##  [4,]  0.13853278  0.01480950  0.18584687  0.02932657
##  [5,] -0.02880954  0.02052325  0.32457915 -0.18545642
##  [6,]  1.00000000 -0.21940016 -0.01726631 -0.05872978
##  [7,] -0.21940016  1.00000000  0.04636879  0.04795731
##  [8,] -0.01726631  0.04636879  1.00000000  0.14413766
##  [9,] -0.05872978  0.04795731  0.14413766  1.00000000
model <- glm(CPaffairs ~ CPgender + age + yearsmarried + CPchildren + religiousness + education + occupation + rating, family = "binomial")
summary(model)
## 
## Call:
## glm(formula = CPaffairs ~ CPgender + age + yearsmarried + CPchildren + 
##     religiousness + education + occupation + rating, family = "binomial")
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5713  -0.7499  -0.5690  -0.2539   2.5191  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    1.37726    0.88776   1.551 0.120807    
## CPgender       0.28029    0.23909   1.172 0.241083    
## age           -0.04426    0.01825  -2.425 0.015301 *  
## yearsmarried   0.09477    0.03221   2.942 0.003262 ** 
## CPchildren     0.39767    0.29151   1.364 0.172508    
## religiousness -0.32472    0.08975  -3.618 0.000297 ***
## education      0.02105    0.05051   0.417 0.676851    
## occupation     0.03092    0.07178   0.431 0.666630    
## rating        -0.46845    0.09091  -5.153 2.56e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 675.38  on 600  degrees of freedom
## Residual deviance: 609.51  on 592  degrees of freedom
## AIC: 627.51
## 
## Number of Fisher Scoring iterations: 4
influenceIndexPlot(model)

influencePlot(model)

##        StudRes         Hat       CookD
## 170 -0.7935995 0.051429098 0.002266777
## 405 -0.8358598 0.076371364 0.003930464
## 483  2.4447086 0.006704172 0.013364040
## 552  2.2931376 0.013542106 0.018224209
## 568  2.5610182 0.009231835 0.023903607
model2 <- glm(CPaffairs ~ CPgender + age + yearsmarried + CPchildren + religiousness + education + occupation + rating, family = "binomial", data = mydata[-c(170,405,552,568),])
summary(model2)
## 
## Call:
## glm(formula = CPaffairs ~ CPgender + age + yearsmarried + CPchildren + 
##     religiousness + education + occupation + rating, family = "binomial", 
##     data = mydata[-c(170, 405, 552, 568), ])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6370  -0.7386  -0.5496  -0.2402   2.4368  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    1.853750   0.912329   2.032 0.042165 *  
## CPgender       0.283377   0.243250   1.165 0.244034    
## age           -0.053385   0.019154  -2.787 0.005318 ** 
## yearsmarried   0.104626   0.032791   3.191 0.001419 ** 
## CPchildren     0.432033   0.298408   1.448 0.147676    
## religiousness -0.345729   0.091123  -3.794 0.000148 ***
## education      0.009618   0.052102   0.185 0.853541    
## occupation     0.042177   0.072621   0.581 0.561381    
## rating        -0.497776   0.092329  -5.391 6.99e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 668.67  on 596  degrees of freedom
## Residual deviance: 596.30  on 588  degrees of freedom
## AIC: 614.3
## 
## Number of Fisher Scoring iterations: 4
#Identification for Not performed Column
vif(model)
##      CPgender           age  yearsmarried    CPchildren religiousness 
##      1.429705      2.648541      3.052111      1.411823      1.076183 
##     education    occupation        rating 
##      1.535033      1.698276      1.079448
avPlots(model)

FinalModel <- glm(CPaffairs ~ age + yearsmarried + religiousness + rating, family = "binomial", data= mydata[-c(170,405,552,568),])
summary(FinalModel)
## 
## Call:
## glm(formula = CPaffairs ~ age + yearsmarried + religiousness + 
##     rating, family = "binomial", data = mydata[-c(170, 405, 552, 
##     568), ])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6850  -0.7482  -0.5469  -0.2877   2.3187  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    2.28086    0.62989   3.621 0.000293 ***
## age           -0.04341    0.01818  -2.388 0.016953 *  
## yearsmarried   0.11070    0.02994   3.698 0.000218 ***
## religiousness -0.34786    0.09080  -3.831 0.000128 ***
## rating        -0.49651    0.09053  -5.485 4.14e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 668.67  on 596  degrees of freedom
## Residual deviance: 602.21  on 592  degrees of freedom
## AIC: 612.21
## 
## Number of Fisher Scoring iterations: 4
prob <- predict(FinalModel, mydata, type = "response")
prob <- as.data.frame(prob)

final <- cbind(mydata, prob)

confusion <- table(prob>0.5, CPaffairs)
confusion
##        CPaffairs
##           0   1
##   FALSE 430 125
##   TRUE   21  25
accuracy <- sum(diag(confusion)/sum(confusion))
accuracy
## [1] 0.7570715
library(ROCR)
## Warning: package 'ROCR' was built under R version 3.5.1
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.5.1
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
rocrpred <- prediction(prob, CPaffairs)
rocrperf <- performance(rocrpred, 'tpr', 'fpr')

str(rocrperf)
## Formal class 'performance' [package "ROCR"] with 6 slots
##   ..@ x.name      : chr "False positive rate"
##   ..@ y.name      : chr "True positive rate"
##   ..@ alpha.name  : chr "Cutoff"
##   ..@ x.values    :List of 1
##   .. ..$ : num [1:277] 0 0 0.00443 0.00443 0.00443 ...
##   ..@ y.values    :List of 1
##   .. ..$ : num [1:277] 0 0.00667 0.00667 0.02 0.02667 ...
##   ..@ alpha.values:List of 1
##   .. ..$ : num [1:277] Inf 0.771 0.758 0.733 0.67 ...
plot(rocrperf,colorize=T, text.adj=c(0.2,-1.7))