DEM 7283 - Logistic Regression

Analysis
Nested model comparison
Stratified models
Logistic Regression as a Predictive Model
Classification methods and models
- using caret to create training and test sets.
Logistic regression for classification

In this example, we continue the discussion of the logistic regression model from last week. This week we examine model nesting and stratification.

We also look at using the logistic regression model for a binary classification model, commonly used in machine learning.

#load libraries
library(car)

## Loading required package: carData

library(stargazer)

## 
## Please cite as:

##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.

##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer

library(survey)

## Loading required package: grid

## Loading required package: Matrix

## Loading required package: survival

## 
## Attaching package: 'survey'

## The following object is masked from 'package:graphics':
## 
##     dotchart

library(questionr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:car':
## 
##     recode

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)

#load brfss
load(url("https://github.com/coreysparks/data/blob/master/brfss_2017.Rdata?raw=true"))


#The names in the data are very ugly, so I make them less ugly
nams<-names(brfss_17)
head(nams, n=10)

##  [1] "dispcode" "statere1" "safetime" "hhadult"  "genhlth"  "physhlth"
##  [7] "menthlth" "poorhlth" "hlthpln1" "persdoc2"

#we see some names are lower case, some are upper and some have a little _ in the first position. This is a nightmare.
newnames<-tolower(gsub(pattern = "_",replacement =  "",x =  nams))
names(brfss_17)<-newnames

#Poor or fair self rated health
brfss_17$badhealth<-Recode(brfss_17$genhlth, recodes="4:5=1; 1:3=0; else=NA")

#sex
brfss_17$male<-as.factor(ifelse(brfss_17$sex==1, "Male", "Female"))

#Age cut into intervals
brfss_17$agec<-cut(brfss_17$age80, breaks=c(0,24,39,59,79,99))

#race/ethnicity
brfss_17$black<-Recode(brfss_17$racegr3, recodes="2=1; 9=NA; else=0")
brfss_17$white<-Recode(brfss_17$racegr3, recodes="1=1; 9=NA; else=0")
brfss_17$other<-Recode(brfss_17$racegr3, recodes="3:4=1; 9=NA; else=0")
brfss_17$hispanic<-Recode(brfss_17$racegr3, recodes="5=1; 9=NA; else=0")

brfss_17$race_eth<-Recode(brfss_17$racegr3, recodes="1='nhwhite'; 2='nh black'; 3='nh other';4='nh multirace'; 5='hispanic'; else=NA", as.factor = T)

#insurance
brfss_17$ins<-Recode(brfss_17$hlthpln1, recodes ="7:9=NA; 1=1;2=0")

#income grouping
brfss_17$inc<-ifelse(brfss_17$incomg==9, NA, brfss_17$incomg)

#education level
brfss_17$educ<-Recode(brfss_17$educa, recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA", as.factor=T)
brfss_17$educ<-relevel(brfss_17$educ, ref='2hsgrad')

#employment
brfss_17$employ<-Recode(brfss_17$employ1, recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA", as.factor=T)
brfss_17$employ<-relevel(brfss_17$employ, ref='Employed')

#marital status
brfss_17$marst<-Recode(brfss_17$marital, recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA", as.factor=T)
brfss_17$marst<-relevel(brfss_17$marst, ref='married')

#Age cut into intervals
brfss_17$agec<-cut(brfss_17$age80, breaks=c(0,24,39,59,79,99))

#BMI, in the brfss_17a the bmi variable has 2 implied decimal places, so we must divide by 100 to get real bmi's

brfss_17$bmi<-brfss_17$bmi5/100
brfss_17$obese<-ifelse(brfss_17$bmi>=30, 1, 0)
#smoking currently
brfss_17$smoke<-Recode(brfss_17$smoker3, recodes="1:2='Current'; 3='Former';4='NeverSmoked'; else=NA", as.factor=T)
brfss_17$smoke<-relevel(brfss_17$smoke, ref = "NeverSmoked")

Analysis

First make our analytical dataset and our survey design information

library(dplyr)
sub<-brfss_17%>%
  select(badhealth,mmsaname, bmi, agec,race_eth, marst, educ,white, black, hispanic, other, smoke, ins, mmsawt, ststr) %>%
  filter( complete.cases(.))

#First we tell R our survey design
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~1, strata=~ststr, weights=~mmsawt, data =sub )

#Logit model
fit.logit<-svyglm(badhealth~race_eth+educ+agec,
                  design= des,
                  family=binomial)

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

summary(fit.logit)

## 
## Call:
## svyglm(formula = badhealth ~ race_eth + educ + agec, design = des, 
##     family = binomial)
## 
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~mmsawt, data = sub)
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -2.00287    0.06770 -29.585  < 2e-16 ***
## race_ethnh black     -0.03109    0.05058  -0.615   0.5387    
## race_ethnh multirace  0.06804    0.08741   0.778   0.4364    
## race_ethnh other     -0.17043    0.08313  -2.050   0.0404 *  
## race_ethnhwhite      -0.37482    0.04189  -8.949  < 2e-16 ***
## educ0Prim             0.88869    0.07067  12.575  < 2e-16 ***
## educ1somehs           0.67014    0.05069  13.220  < 2e-16 ***
## educ3somecol         -0.32651    0.03302  -9.889  < 2e-16 ***
## educ4colgrad         -1.16599    0.03459 -33.713  < 2e-16 ***
## agec(24,39]           0.49634    0.06571   7.554 4.24e-14 ***
## agec(39,59]           1.03889    0.06181  16.809  < 2e-16 ***
## agec(59,79]           1.34546    0.06167  21.817  < 2e-16 ***
## agec(79,99]           1.50892    0.07744  19.485  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 0.9953381)
## 
## Number of Fisher Scoring iterations: 5

Nested model comparison

Often in a research setting we are interested in comparing several models, and almost never are we satisfied with a single solitary model. This is because the literature on our subjects often has multiple facets to it. So, for instance, we may be interested in how SES mediates the effects of race/ethnicity on health (see Shuey and Wilson 2008 and Sudano and Baker 2005 or Hummer 1993 for a few examples of these types of analysis).

Typically in these types of analysis, predictor variables are entered into the model in “blocks”. For example, let’s look at the self-rated health outcome (0=good or excellent health, 1= fair/poor health) from last week. But instead of entering all variables in the model simultaneously, we begin with the effect of race/ethnicity, then add the effect of SES then the effects of health behavior variables.

fit.logit1<-svyglm(badhealth~race_eth,design= des, family=binomial) #race only

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

fit.logit2<-svyglm(badhealth~race_eth+educ,design= des, family=binomial) #race+education

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

fit.logit3<-svyglm(badhealth~race_eth+educ+ins+smoke,design= des, family=binomial)#race+education+health behaviors

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

summary(fit.logit1)

## 
## Call:
## svyglm(formula = badhealth ~ race_eth, design = des, family = binomial)
## 
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~mmsawt, data = sub)
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -1.10678    0.03280 -33.742  < 2e-16 ***
## race_ethnh black     -0.22286    0.04652  -4.790 1.67e-06 ***
## race_ethnh multirace -0.34678    0.08110  -4.276 1.90e-05 ***
## race_ethnh other     -0.82381    0.07938 -10.378  < 2e-16 ***
## race_ethnhwhite      -0.64963    0.03597 -18.059  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1.000005)
## 
## Number of Fisher Scoring iterations: 4

In model 1 we see that Hispanics and blacks have a higher odds of reporting poor self rated health, compared to non-Hispanic whites, while the “other” group shows lower odds of reporting poor health.

Now, let’s see if, by controlling for education, some of these differences go away, or are reduced. The fancy word for when an effect is reduced is “attenuated”. We will also do a test to see if the model with the education term significantly improves the model fit. Traditionally this would be done using a likelihood ratio test, but in survey models, that’s not kosher

summary(fit.logit2)

## 
## Call:
## svyglm(formula = badhealth ~ race_eth + educ, design = des, family = binomial)
## 
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~mmsawt, data = sub)
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -1.25928    0.04081 -30.858  < 2e-16 ***
## race_ethnh black      0.11161    0.04935   2.261 0.023731 *  
## race_ethnh multirace  0.09546    0.08542   1.118 0.263778    
## race_ethnh other     -0.18534    0.08571  -2.162 0.030592 *  
## race_ethnhwhite      -0.14924    0.03967  -3.762 0.000169 ***
## educ0Prim             1.12712    0.07035  16.023  < 2e-16 ***
## educ1somehs           0.71026    0.04937  14.386  < 2e-16 ***
## educ3somecol         -0.33133    0.03270 -10.132  < 2e-16 ***
## educ4colgrad         -1.10746    0.03392 -32.649  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 0.9984191)
## 
## Number of Fisher Scoring iterations: 5

regTermTest(fit.logit2, test.terms = ~educ, method="Wald", df = NULL)

## Wald test for educ
##  in svyglm(formula = badhealth ~ race_eth + educ, design = des, family = binomial)
## F =  513.1387  on  4  and  199207  df: p= < 2.22e-16

so, we see the race effects in all groups attenuate (reduce in size) somewhat after considering education, so the differences in health are smaller once you control for education.

The F test also suggest that the second model fits the data better than the first. It is another of these omnibus tests that asks whether there is any variation in our outcome by education in the second model.

Next we consider the third model, which contains health behaviors of current smoking and insurance coverage:

summary(fit.logit3)

## 
## Call:
## svyglm(formula = badhealth ~ race_eth + educ + ins + smoke, design = des, 
##     family = binomial)
## 
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~mmsawt, data = sub)
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -1.57045    0.05817 -26.996  < 2e-16 ***
## race_ethnh black      0.04697    0.04956   0.948  0.34324    
## race_ethnh multirace -0.06338    0.08678  -0.730  0.46515    
## race_ethnh other     -0.21570    0.08552  -2.522  0.01166 *  
## race_ethnhwhite      -0.30536    0.04029  -7.579  3.5e-14 ***
## educ0Prim             1.13629    0.07059  16.096  < 2e-16 ***
## educ1somehs           0.63809    0.05022  12.706  < 2e-16 ***
## educ3somecol         -0.31346    0.03301  -9.495  < 2e-16 ***
## educ4colgrad         -1.00270    0.03491 -28.721  < 2e-16 ***
## ins                   0.15316    0.04674   3.277  0.00105 ** 
## smokeCurrent          0.67681    0.03581  18.898  < 2e-16 ***
## smokeFormer           0.50403    0.03106  16.229  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 0.994982)
## 
## Number of Fisher Scoring iterations: 5

regTermTest(fit.logit3, test.terms=~ins+smoke, method="Wald", df = NULL)

## Wald test for ins smoke
##  in svyglm(formula = badhealth ~ race_eth + educ + ins + smoke, design = des, 
##     family = binomial)
## F =  155.2489  on  3  and  199204  df: p= < 2.22e-16

In this model, we see the effects for Hispanics and blacks go back up. This is somewhat confusing, but is most likely related to the differential rates of smoking among those groups, compared to whites. Both current and former smokers are more likely to report poor/fair health, while insurance coverage does not affect the odds at all. Finally, we see that the model is significantly better fitting than the previous model.

myexp<-function(x){exp(x)}
stargazer(fit.logit1, fit.logit2, fit.logit3,type = "html", style="demography", covariate.labels =c("Black","MultRace" ,"Other","NHWhite", "PrimarySchool", "SomeHS","SomeColl", "CollGrad",  "Insurance", "Current Smoker", "Former Smoker"), ci = T, apply.coef = myexp )


	badhealth
	Model 1	Model 2	Model 3

Black	0.800^***	1.118^***	1.048^***
	(0.709, 0.891)	(1.021, 1.215)	(0.951, 1.145)
MultRace	0.707^***	1.100^***	0.939^***
	(0.548, 0.866)	(0.933, 1.268)	(0.769, 1.109)
Other	0.439^***	0.831^***	0.806^***
	(0.283, 0.594)	(0.663, 0.999)	(0.638, 0.974)
NHWhite	0.522^***	0.861^***	0.737^***
	(0.452, 0.593)	(0.784, 0.939)	(0.658, 0.816)
PrimarySchool		3.087^***	3.115^***
		(2.949, 3.225)	(2.977, 3.254)
SomeHS		2.035^***	1.893^***
		(1.938, 2.131)	(1.794, 1.991)
SomeColl		0.718^***	0.731^***
		(0.654, 0.782)	(0.666, 0.796)
CollGrad		0.330^***	0.367^***
		(0.264, 0.397)	(0.298, 0.435)
Insurance			1.166^***
			(1.074, 1.257)
Current Smoker			1.968^***
			(1.897, 2.038)
Former Smoker			1.655^***
			(1.595, 1.716)
Constant	0.331^***	0.284^***	0.208^***
	(0.266, 0.395)	(0.204, 0.364)	(0.094, 0.322)
N	200,568	200,568	200,568
Log Likelihood	-84,591.230	-79,950.350	-78,938.390
AIC	169,192.500	159,918.700	157,900.800

p < .05; p < .01; p < .001

Stratified models

Often in the literature, we will see models stratified by some predictor. This is usually because a specific hypothesis is stated regarding how the effect of a particular predictor varies by some categorical variable. In this case, we may be interested in considering if education or smoking universally affects the poor health outcome. We get at this by stratifying our analysis by race (in this example).

The easiest way to do this is to subset the data by race and run the models separately.

The first thing we do is test for the interaction of education and race. If this interaction is not significant, we have no justification for proceeding, because the effect of education does not vary by race group. This is the test of parallel slopes, a’ la the ANCOVA model

fit.logitint<-svyglm(badhealth~race_eth*educ+ins+smoke,design= des, family=binomial)#race*education interaction+health behaviors

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

regTermTest(fit.logitint, test.terms = ~race_eth:educ, method = "Wald", df=NULL)

## Wald test for race_eth:educ
##  in svyglm(formula = badhealth ~ race_eth * educ + ins + smoke, design = des, 
##     family = binomial)
## F =  2.664228  on  16  and  199188  df: p= 0.00031822

Here, the F-test does indicate that the interaction term in the model is significant, so the effects of education are not constant by race.

Now we stratify our models:

fit.unrestricted<-svyglm(badhealth~educ+ins+smoke,design= des, family=binomial)

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

fit.logit.white<-svyglm(badhealth~(educ+ins+smoke),design= subset(des, white==1), family=binomial)

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

fit.logit.black<-svyglm(badhealth~(educ+ins+smoke),design= subset(des, black==1), family=binomial)

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

fit.logit.other1<-svyglm(badhealth~(educ+ins+smoke),design= subset(des, race_eth=="nh other"), family=binomial)

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

fit.logit.other2<-svyglm(badhealth~(educ+ins+smoke),design= subset(des, race_eth=="nh multirace"), family=binomial)

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

fit.logit.hisp<-svyglm(badhealth~(educ+ins+smoke),design= subset(des, hispanic==1), family=binomial)

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

Here we examine the model results next to one another

stargazer(fit.logit.white, fit.logit.black, fit.logit.hisp, fit.logit.other1, fit.logit.other2,
          type = "html", style="demography",
          covariate.labels =c(  "Some HS","HS Graduate", "Some College", "College Grad", "Insurance", "Current Smoker", "Former Smoker","Intercept"),
          column.labels = c("NH White", "NH Black", "Hispanic", "NH Other 1", "NH Other 2"),
          ci=T , apply.coef = myexp)


	badhealth
	NH White	NH Black	Hispanic	NH Other 1	NH Other 2
	Model 1	Model 2	Model 3	Model 4	Model 5

Some HS	3.163^***	1.860^***	3.376^***	1.172^**	2.177^***
	(2.918, 3.408)	(1.475, 2.245)	(3.179, 3.573)	(0.368, 1.975)	(1.257, 3.098)
HS Graduate	2.132^***	1.701^***	1.662^***	2.700^***	1.661^***
	(2.005, 2.259)	(1.484, 1.918)	(1.456, 1.868)	(1.978, 3.423)	(1.020, 2.301)
Some College	0.723^***	0.764^***	0.688^***	0.896^***	0.795^***
	(0.649, 0.798)	(0.601, 0.927)	(0.501, 0.874)	(0.500, 1.292)	(0.426, 1.164)
College Grad	0.354^***	0.417^***	0.407^***	0.402^*	0.426^*
	(0.277, 0.431)	(0.233, 0.601)	(0.209, 0.606)	(0.041, 0.763)	(0.003, 0.848)
Insurance	1.020^***	1.243^***	1.244^***	1.425^***	1.594^***
	(0.900, 1.141)	(1.036, 1.451)	(1.075, 1.413)	(0.990, 1.859)	(1.157, 2.032)
Current Smoker	2.175^***	2.155^***	1.351^***	1.723^***	1.956^***
	(2.092, 2.259)	(1.994, 2.317)	(1.154, 1.549)	(1.346, 2.100)	(1.590, 2.323)
Former Smoker	1.624^***	2.203^***	1.389^***	2.439^***	2.046^***
	(1.556, 1.692)	(2.024, 2.383)	(1.224, 1.554)	(2.099, 2.780)	(1.681, 2.411)
Intercept	0.170^**	0.191	0.220^*	0.124	0.134
	(0.041, 0.298)	(-0.025, 0.407)	(0.024, 0.416)	(-0.360, 0.608)	(-0.382, 0.651)
N	150,655	20,285	17,979	7,964	3,685
Log Likelihood	-54,302.950	-9,109.037	-8,590.519	-2,611.371	-1,582.710
AIC	108,621.900	18,234.070	17,197.040	5,238.742	3,181.420

p < .05; p < .01; p < .001

beta.test<-function(model1, model2, betaname){
s1<-summary(model1)$coef
s2<-summary(model2)$coef
db <- ((s2[rownames(s2)==betaname,1]-s1[rownames(s1)==betaname,1]))^2
sd <-s2[rownames(s2)==betaname,2]^2+s1[rownames(s1)==betaname,2]^2
td <- db/sd
beta1=s1[rownames(s1)==betaname,1]
beta2=s2[rownames(s2)==betaname,1]
pv<-1- pchisq(td, df = 1)
print(list(beta=betaname,beta1=beta1, beta2=beta2, x2=td, pvalue=pv))
}

Here is an example of testing if the “Current Smoking” effect is the same among whites and blacks. This follows the logic set forth in Allison 2010, p 219

Test for \(\beta_{1j} = \beta_{1k}\) in two models \(j \text{ and } k\) \[z= \frac{\beta_{1j} - \beta_{1k}}{\left[ s.e.(\beta_{1j}) \right]^2+\left[ s.e.(\beta_{1k}) \right]^2}\]

Logistic Regression as a Predictive Model

Classification methods and models

In classification methods, we are typically interested in using some observed characteristics of a case to predict a binary categorical outcome. This can be extended to a multi-category outcome, but the largest number of applications involve a 1/0 outcome.

Below, we look at using Logistic regression

There are other methods for doing this that we will not examine but these are probably the easiest to understand.

In these examples, we will use the Demographic and Health Survey Model Data. These are based on the DHS survey, but are publicly available and are used to practice using the DHS data sets, but don’t represent a real country.

In this example, we will use the outcome of contraceptive choice (modern vs other/none) as our outcome.

library(haven)
dat<-url("https://github.com/coreysparks/data/blob/master/ZZIR62FL.DTA?raw=true")
model.dat<-read_dta(dat)

Here we recode some of our variables and limit our data to those women who are not currently pregnant and who are sexually active.

library(dplyr)

model.dat2<-model.dat%>%
  mutate(region = v024, 
         modcontra= as.factor(ifelse(v364 ==1,1, 0)),
         age = v012, 
         livchildren=v218,
         educ = v106,
         currpreg=v213,
         knowmodern=ifelse(v301==3, 1, 0),
         age2=v012^2)%>%
  filter(currpreg==0, v536>0)%>% #notpreg, sex active
  dplyr::select(caseid, region, modcontra,age, age2,livchildren, educ, knowmodern)

knitr::kable(head(model.dat2))

caseid	region	age	age2	livchildren	educ	knowmodern
1 1 2	2	30	900	4	0	1
1 4 2	2	42	1764	2	0	1
1 4 3	2	25	625	3	1	1
1 5 1	2	25	625	2	2	1
1 6 2	2	37	1369	2	0	1
1 6 3	2	17	289	0	2	0

using caret to create training and test sets.

In predictive models, we split the data into two sets, a training set and a test set. The training set will be used to estimate the model parameters, and the test set will be used to validate the model’s predictive ability.

We use an 80% training fraction, which is standard.

library(caret)

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:survival':
## 
##     cluster

set.seed(1115)
train<- createDataPartition(y = model.dat2$modcontra , p = .80, list=F)

model.dat2train<-model.dat2[train,]
model.dat2test<-model.dat2[-train,]

table(model.dat2train$modcontra)

## 
##    0    1 
## 4036 1409

prop.table(table(model.dat2train$modcontra))

## 
##         0         1 
## 0.7412305 0.2587695

summary(model.dat2train)

##     caseid              region      modcontra      age             age2       
##  Length:5445        Min.   :1.000   0:4036    Min.   :15.00   Min.   : 225.0  
##  Class :character   1st Qu.:1.000   1:1409    1st Qu.:21.00   1st Qu.: 441.0  
##  Mode  :character   Median :2.000             Median :29.00   Median : 841.0  
##                     Mean   :2.164             Mean   :29.78   Mean   : 976.8  
##                     3rd Qu.:3.000             3rd Qu.:37.00   3rd Qu.:1369.0  
##                     Max.   :4.000             Max.   :49.00   Max.   :2401.0  
##   livchildren          educ          knowmodern    
##  Min.   : 0.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.: 1.000   1st Qu.:0.0000   1st Qu.:1.0000  
##  Median : 2.000   Median :0.0000   Median :1.0000  
##  Mean   : 2.546   Mean   :0.7381   Mean   :0.9442  
##  3rd Qu.: 4.000   3rd Qu.:2.0000   3rd Qu.:1.0000  
##  Max.   :10.000   Max.   :3.0000   Max.   :1.0000

Logistic regression for classification

Here we use a basic binomial GLM to estimate the probability of a woman using modern contraception. We use information on their region of residence, age, number of living children and level of education.

This model can be written: \[ln \left ( \frac{Pr(\text{Modern Contraception})}{1-Pr(\text{Modern Contraception})} \right ) = X' \beta\]

Which can be converted to the probability scale via the inverse logit transform:

\[Pr(\text{Modern Contraception}) = \frac{1}{1+exp (-X' \beta)}\]

glm1<-glm(modcontra~factor(region)+scale(age)+scale(age2)+scale(livchildren)+factor(educ), data=model.dat2train[,-1], family = binomial)
summary(glm1)

## 
## Call:
## glm(formula = modcontra ~ factor(region) + scale(age) + scale(age2) + 
##     scale(livchildren) + factor(educ), family = binomial, data = model.dat2train[, 
##     -1])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4073  -0.7103  -0.5734   1.0669   2.3413  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -1.91240    0.06807 -28.095  < 2e-16 ***
## factor(region)2     0.38755    0.08534   4.541 5.60e-06 ***
## factor(region)3     0.62565    0.09531   6.564 5.23e-11 ***
## factor(region)4     0.30066    0.09454   3.180 0.001471 ** 
## scale(age)          0.63678    0.26540   2.399 0.016425 *  
## scale(age2)        -0.98328    0.26194  -3.754 0.000174 ***
## scale(livchildren)  0.17004    0.05408   3.144 0.001665 ** 
## factor(educ)1       0.43835    0.10580   4.143 3.43e-05 ***
## factor(educ)2       1.38923    0.08646  16.068  < 2e-16 ***
## factor(educ)3       1.54061    0.16086   9.577  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6226.5  on 5444  degrees of freedom
## Residual deviance: 5629.0  on 5435  degrees of freedom
## AIC: 5649
## 
## Number of Fisher Scoring iterations: 4

We see that all the predictors are significantly related to our outcome

Next we see how the model performs in terms of accuracy of prediction. This is new comparison to how we typically use logistic regression.

We use the predict() function to get the estimated class probabilities for each case

tr_pred<- predict(glm1, newdata = model.dat2train, type = "response")
head(tr_pred)

##          1          2          3          4          5          6 
## 0.22002790 0.31137928 0.15091505 0.20389088 0.08726724 0.18808481

These are the estimated probability that each of these women used modern contraception, based on the model.

In order to create classes (uses modern vs doesn’t use modern contraception) we have to use a decision rule. A decision rule is when we choose a cut off point, or threshold value of the probability to classify each observation as belonging to one class or the other.

A basic decision rule is if \(Pr(y=\text{Modern Contraception} |X) >.5\) Then classify the observation as a modern contraception user, and otherwise not. This is what we will use here.

tr_predcl<-factor(ifelse(tr_pred>.5, 1, 0))

library(ggplot2)

pred1<-data.frame(pr=tr_pred, gr=tr_predcl, modcon=model.dat2train$modcontra)

pred1%>%
  ggplot()+geom_density(aes(x=pr, color=gr, group=gr))+ggtitle(label = "Probability of Modern Contraception", subtitle = "Threshold = .5")

pred1%>%
  ggplot()+geom_density(aes(x=pr, color=modcon, group=modcon))+ggtitle(label = "Probability of Modern Contraception", subtitle = "Truth")

Next we need to see how we did. A simple cross tab of the observed classes versus the predicted classes is called the confusion matrix.

table( tr_predcl,model.dat2train$modcontra)

##          
## tr_predcl    0    1
##         0 3761 1142
##         1  275  267

This is great, but typically it’s easier to understand the model’s predictive ability by converting these to proportions. The confusionMatrix() function in caret can do this, plus other stuff.

This provides lots of output summarizing the classification results. At its core is the matrix of observed classes versus predicted classes. I got one depiction of this here and from the Wikipedia page

Confusion matrix

Lots of information on the predictive accuracy can be found from this 2x2 table:

Confusion matrix

Generally, we are interested in overall accuracy, sensitivity and specificity.

cm1<-confusionMatrix(data = tr_predcl,reference = model.dat2train$modcontra )
cm1

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 3761 1142
##          1  275  267
##                                           
##                Accuracy : 0.7398          
##                  95% CI : (0.7279, 0.7514)
##     No Information Rate : 0.7412          
##     P-Value [Acc > NIR] : 0.6046          
##                                           
##                   Kappa : 0.1517          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.9319          
##             Specificity : 0.1895          
##          Pos Pred Value : 0.7671          
##          Neg Pred Value : 0.4926          
##              Prevalence : 0.7412          
##          Detection Rate : 0.6907          
##    Detection Prevalence : 0.9005          
##       Balanced Accuracy : 0.5607          
##                                           
##        'Positive' Class : 0               
##

Overall the model has a 73.976% accuracy, which isn’t bad! What is bad is some of the other measures. The sensitivity 93%, and the Specificity is 19 In other word the model is pretty good at predicting if you don’t use modern contraception, but not at predicting if you do.

We could try a different decision rule, in this case, I use the mean of the response as the cutoff value.

tr_predcl<-factor(ifelse(tr_pred>mean(I(model.dat2train$modcontra==1)), 1, 0)) #mean of response

pred2<-data.frame(pr=tr_pred, gr=tr_predcl, modcon=model.dat2train$modcontra)
pred2%>%
  ggplot()+geom_density(aes(x=pr, color=gr, group=gr))+ggtitle(label = "Probability of Modern Contraception", subtitle = "Threshold = Mean")

pred2%>%
  ggplot()+geom_density(aes(x=pr, color=modcon, group=modcon))+ggtitle(label = "Probability of Modern Contraception", subtitle = "Truth")

confusionMatrix(data = tr_predcl,model.dat2train$modcontra, positive = "1" )

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2959  581
##          1 1077  828
##                                           
##                Accuracy : 0.6955          
##                  95% CI : (0.6831, 0.7077)
##     No Information Rate : 0.7412          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2878          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.5877          
##             Specificity : 0.7332          
##          Pos Pred Value : 0.4346          
##          Neg Pred Value : 0.8359          
##              Prevalence : 0.2588          
##          Detection Rate : 0.1521          
##    Detection Prevalence : 0.3499          
##       Balanced Accuracy : 0.6604          
##                                           
##        'Positive' Class : 1               
##

Which produces the same accuracy, but decreases the sensitivity at the cost of increased specificity.

Next we do this on the test set to evaluate model performance outside of the training data

pred_test<-predict(glm1, newdata=model.dat2test, type="response")
pred_cl<-factor(ifelse(pred_test>mean(I(model.dat2test$modcontra==1)), 1, 0))

table(model.dat2test$modcontra,pred_cl)

##    pred_cl
##       0   1
##   0 720 288
##   1 154 198

confusionMatrix(data = pred_cl,model.dat2test$modcontra )

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 720 154
##          1 288 198
##                                           
##                Accuracy : 0.675           
##                  95% CI : (0.6494, 0.6999)
##     No Information Rate : 0.7412          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2463          
##                                           
##  Mcnemar's Test P-Value : 2.513e-10       
##                                           
##             Sensitivity : 0.7143          
##             Specificity : 0.5625          
##          Pos Pred Value : 0.8238          
##          Neg Pred Value : 0.4074          
##              Prevalence : 0.7412          
##          Detection Rate : 0.5294          
##    Detection Prevalence : 0.6426          
##       Balanced Accuracy : 0.6384          
##                                           
##        'Positive' Class : 0               
##

In the test data, the model does about as well as it did on the training data, which is ideal.

DEM 7283 - Logistic Regression - Other Topics

Corey Sparks, PhD

2/24/2020