In this example, we continue the discussion of the logistic regression model from last week. This week we examine model nesting and stratification.
We also look at using the logistic regression model for a binary classification model, commonly used in machine learning.
#load libraries
library(car)
## Loading required package: carData
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
library(questionr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
#load brfss
load(url("https://github.com/coreysparks/data/blob/master/brfss_2017.Rdata?raw=true"))
#The names in the data are very ugly, so I make them less ugly
nams<-names(brfss_17)
head(nams, n=10)
## [1] "dispcode" "statere1" "safetime" "hhadult" "genhlth" "physhlth"
## [7] "menthlth" "poorhlth" "hlthpln1" "persdoc2"
#we see some names are lower case, some are upper and some have a little _ in the first position. This is a nightmare.
newnames<-tolower(gsub(pattern = "_",replacement = "",x = nams))
names(brfss_17)<-newnames
#Poor or fair self rated health
brfss_17$badhealth<-Recode(brfss_17$genhlth, recodes="4:5=1; 1:3=0; else=NA")
#sex
brfss_17$male<-as.factor(ifelse(brfss_17$sex==1, "Male", "Female"))
#Age cut into intervals
brfss_17$agec<-cut(brfss_17$age80, breaks=c(0,24,39,59,79,99))
#race/ethnicity
brfss_17$black<-Recode(brfss_17$racegr3, recodes="2=1; 9=NA; else=0")
brfss_17$white<-Recode(brfss_17$racegr3, recodes="1=1; 9=NA; else=0")
brfss_17$other<-Recode(brfss_17$racegr3, recodes="3:4=1; 9=NA; else=0")
brfss_17$hispanic<-Recode(brfss_17$racegr3, recodes="5=1; 9=NA; else=0")
brfss_17$race_eth<-Recode(brfss_17$racegr3, recodes="1='nhwhite'; 2='nh black'; 3='nh other';4='nh multirace'; 5='hispanic'; else=NA", as.factor = T)
#insurance
brfss_17$ins<-Recode(brfss_17$hlthpln1, recodes ="7:9=NA; 1=1;2=0")
#income grouping
brfss_17$inc<-ifelse(brfss_17$incomg==9, NA, brfss_17$incomg)
#education level
brfss_17$educ<-Recode(brfss_17$educa, recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA", as.factor=T)
brfss_17$educ<-relevel(brfss_17$educ, ref='2hsgrad')
#employment
brfss_17$employ<-Recode(brfss_17$employ1, recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA", as.factor=T)
brfss_17$employ<-relevel(brfss_17$employ, ref='Employed')
#marital status
brfss_17$marst<-Recode(brfss_17$marital, recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA", as.factor=T)
brfss_17$marst<-relevel(brfss_17$marst, ref='married')
#Age cut into intervals
brfss_17$agec<-cut(brfss_17$age80, breaks=c(0,24,39,59,79,99))
#BMI, in the brfss_17a the bmi variable has 2 implied decimal places, so we must divide by 100 to get real bmi's
brfss_17$bmi<-brfss_17$bmi5/100
brfss_17$obese<-ifelse(brfss_17$bmi>=30, 1, 0)
#smoking currently
brfss_17$smoke<-Recode(brfss_17$smoker3, recodes="1:2='Current'; 3='Former';4='NeverSmoked'; else=NA", as.factor=T)
brfss_17$smoke<-relevel(brfss_17$smoke, ref = "NeverSmoked")
First make our analytical dataset and our survey design information
library(dplyr)
sub<-brfss_17%>%
select(badhealth,mmsaname, bmi, agec,race_eth, marst, educ,white, black, hispanic, other, smoke, ins, mmsawt, ststr) %>%
filter( complete.cases(.))
#First we tell R our survey design
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~1, strata=~ststr, weights=~mmsawt, data =sub )
#Logit model
fit.logit<-svyglm(badhealth~race_eth+educ+agec,
design= des,
family=binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.logit)
##
## Call:
## svyglm(formula = badhealth ~ race_eth + educ + agec, design = des,
## family = binomial)
##
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~mmsawt, data = sub)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.00287 0.06770 -29.585 < 2e-16 ***
## race_ethnh black -0.03109 0.05058 -0.615 0.5387
## race_ethnh multirace 0.06804 0.08741 0.778 0.4364
## race_ethnh other -0.17043 0.08313 -2.050 0.0404 *
## race_ethnhwhite -0.37482 0.04189 -8.949 < 2e-16 ***
## educ0Prim 0.88869 0.07067 12.575 < 2e-16 ***
## educ1somehs 0.67014 0.05069 13.220 < 2e-16 ***
## educ3somecol -0.32651 0.03302 -9.889 < 2e-16 ***
## educ4colgrad -1.16599 0.03459 -33.713 < 2e-16 ***
## agec(24,39] 0.49634 0.06571 7.554 4.24e-14 ***
## agec(39,59] 1.03889 0.06181 16.809 < 2e-16 ***
## agec(59,79] 1.34546 0.06167 21.817 < 2e-16 ***
## agec(79,99] 1.50892 0.07744 19.485 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 0.9953381)
##
## Number of Fisher Scoring iterations: 5
Often in a research setting we are interested in comparing several models, and almost never are we satisfied with a single solitary model. This is because the literature on our subjects often has multiple facets to it. So, for instance, we may be interested in how SES mediates the effects of race/ethnicity on health (see Shuey and Wilson 2008 and Sudano and Baker 2005 or Hummer 1993 for a few examples of these types of analysis).
Typically in these types of analysis, predictor variables are entered into the model in “blocks”. For example, let’s look at the self-rated health outcome (0=good or excellent health, 1= fair/poor health) from last week. But instead of entering all variables in the model simultaneously, we begin with the effect of race/ethnicity, then add the effect of SES then the effects of health behavior variables.
fit.logit1<-svyglm(badhealth~race_eth,design= des, family=binomial) #race only
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
fit.logit2<-svyglm(badhealth~race_eth+educ,design= des, family=binomial) #race+education
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
fit.logit3<-svyglm(badhealth~race_eth+educ+ins+smoke,design= des, family=binomial)#race+education+health behaviors
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.logit1)
##
## Call:
## svyglm(formula = badhealth ~ race_eth, design = des, family = binomial)
##
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~mmsawt, data = sub)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.10678 0.03280 -33.742 < 2e-16 ***
## race_ethnh black -0.22286 0.04652 -4.790 1.67e-06 ***
## race_ethnh multirace -0.34678 0.08110 -4.276 1.90e-05 ***
## race_ethnh other -0.82381 0.07938 -10.378 < 2e-16 ***
## race_ethnhwhite -0.64963 0.03597 -18.059 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1.000005)
##
## Number of Fisher Scoring iterations: 4
In model 1 we see that Hispanics and blacks have a higher odds of reporting poor self rated health, compared to non-Hispanic whites, while the “other” group shows lower odds of reporting poor health.
Now, let’s see if, by controlling for education, some of these differences go away, or are reduced. The fancy word for when an effect is reduced is “attenuated”. We will also do a test to see if the model with the education term significantly improves the model fit. Traditionally this would be done using a likelihood ratio test, but in survey models, that’s not kosher
summary(fit.logit2)
##
## Call:
## svyglm(formula = badhealth ~ race_eth + educ, design = des, family = binomial)
##
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~mmsawt, data = sub)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.25928 0.04081 -30.858 < 2e-16 ***
## race_ethnh black 0.11161 0.04935 2.261 0.023731 *
## race_ethnh multirace 0.09546 0.08542 1.118 0.263778
## race_ethnh other -0.18534 0.08571 -2.162 0.030592 *
## race_ethnhwhite -0.14924 0.03967 -3.762 0.000169 ***
## educ0Prim 1.12712 0.07035 16.023 < 2e-16 ***
## educ1somehs 0.71026 0.04937 14.386 < 2e-16 ***
## educ3somecol -0.33133 0.03270 -10.132 < 2e-16 ***
## educ4colgrad -1.10746 0.03392 -32.649 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 0.9984191)
##
## Number of Fisher Scoring iterations: 5
regTermTest(fit.logit2, test.terms = ~educ, method="Wald", df = NULL)
## Wald test for educ
## in svyglm(formula = badhealth ~ race_eth + educ, design = des, family = binomial)
## F = 513.1387 on 4 and 199207 df: p= < 2.22e-16
so, we see the race effects in all groups attenuate (reduce in size) somewhat after considering education, so the differences in health are smaller once you control for education.
The F test also suggest that the second model fits the data better than the first. It is another of these omnibus tests that asks whether there is any variation in our outcome by education in the second model.
Next we consider the third model, which contains health behaviors of current smoking and insurance coverage:
summary(fit.logit3)
##
## Call:
## svyglm(formula = badhealth ~ race_eth + educ + ins + smoke, design = des,
## family = binomial)
##
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~mmsawt, data = sub)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.57045 0.05817 -26.996 < 2e-16 ***
## race_ethnh black 0.04697 0.04956 0.948 0.34324
## race_ethnh multirace -0.06338 0.08678 -0.730 0.46515
## race_ethnh other -0.21570 0.08552 -2.522 0.01166 *
## race_ethnhwhite -0.30536 0.04029 -7.579 3.5e-14 ***
## educ0Prim 1.13629 0.07059 16.096 < 2e-16 ***
## educ1somehs 0.63809 0.05022 12.706 < 2e-16 ***
## educ3somecol -0.31346 0.03301 -9.495 < 2e-16 ***
## educ4colgrad -1.00270 0.03491 -28.721 < 2e-16 ***
## ins 0.15316 0.04674 3.277 0.00105 **
## smokeCurrent 0.67681 0.03581 18.898 < 2e-16 ***
## smokeFormer 0.50403 0.03106 16.229 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 0.994982)
##
## Number of Fisher Scoring iterations: 5
regTermTest(fit.logit3, test.terms=~ins+smoke, method="Wald", df = NULL)
## Wald test for ins smoke
## in svyglm(formula = badhealth ~ race_eth + educ + ins + smoke, design = des,
## family = binomial)
## F = 155.2489 on 3 and 199204 df: p= < 2.22e-16
In this model, we see the effects for Hispanics and blacks go back up. This is somewhat confusing, but is most likely related to the differential rates of smoking among those groups, compared to whites. Both current and former smokers are more likely to report poor/fair health, while insurance coverage does not affect the odds at all. Finally, we see that the model is significantly better fitting than the previous model.
myexp<-function(x){exp(x)}
stargazer(fit.logit1, fit.logit2, fit.logit3,type = "html", style="demography", covariate.labels =c("Black","MultRace" ,"Other","NHWhite", "PrimarySchool", "SomeHS","SomeColl", "CollGrad", "Insurance", "Current Smoker", "Former Smoker"), ci = T, apply.coef = myexp )
| badhealth | |||
| Model 1 | Model 2 | Model 3 | |
| Black | 0.800*** | 1.118*** | 1.048*** |
| (0.709, 0.891) | (1.021, 1.215) | (0.951, 1.145) | |
| MultRace | 0.707*** | 1.100*** | 0.939*** |
| (0.548, 0.866) | (0.933, 1.268) | (0.769, 1.109) | |
| Other | 0.439*** | 0.831*** | 0.806*** |
| (0.283, 0.594) | (0.663, 0.999) | (0.638, 0.974) | |
| NHWhite | 0.522*** | 0.861*** | 0.737*** |
| (0.452, 0.593) | (0.784, 0.939) | (0.658, 0.816) | |
| PrimarySchool | 3.087*** | 3.115*** | |
| (2.949, 3.225) | (2.977, 3.254) | ||
| SomeHS | 2.035*** | 1.893*** | |
| (1.938, 2.131) | (1.794, 1.991) | ||
| SomeColl | 0.718*** | 0.731*** | |
| (0.654, 0.782) | (0.666, 0.796) | ||
| CollGrad | 0.330*** | 0.367*** | |
| (0.264, 0.397) | (0.298, 0.435) | ||
| Insurance | 1.166*** | ||
| (1.074, 1.257) | |||
| Current Smoker | 1.968*** | ||
| (1.897, 2.038) | |||
| Former Smoker | 1.655*** | ||
| (1.595, 1.716) | |||
| Constant | 0.331*** | 0.284*** | 0.208*** |
| (0.266, 0.395) | (0.204, 0.364) | (0.094, 0.322) | |
| N | 200,568 | 200,568 | 200,568 |
| Log Likelihood | -84,591.230 | -79,950.350 | -78,938.390 |
| AIC | 169,192.500 | 159,918.700 | 157,900.800 |
| p < .05; p < .01; p < .001 | |||
Often in the literature, we will see models stratified by some predictor. This is usually because a specific hypothesis is stated regarding how the effect of a particular predictor varies by some categorical variable. In this case, we may be interested in considering if education or smoking universally affects the poor health outcome. We get at this by stratifying our analysis by race (in this example).
The easiest way to do this is to subset the data by race and run the models separately.
The first thing we do is test for the interaction of education and race. If this interaction is not significant, we have no justification for proceeding, because the effect of education does not vary by race group. This is the test of parallel slopes, a’ la the ANCOVA model
fit.logitint<-svyglm(badhealth~race_eth*educ+ins+smoke,design= des, family=binomial)#race*education interaction+health behaviors
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
regTermTest(fit.logitint, test.terms = ~race_eth:educ, method = "Wald", df=NULL)
## Wald test for race_eth:educ
## in svyglm(formula = badhealth ~ race_eth * educ + ins + smoke, design = des,
## family = binomial)
## F = 2.664228 on 16 and 199188 df: p= 0.00031822
Here, the F-test does indicate that the interaction term in the model is significant, so the effects of education are not constant by race.
Now we stratify our models:
fit.unrestricted<-svyglm(badhealth~educ+ins+smoke,design= des, family=binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
fit.logit.white<-svyglm(badhealth~(educ+ins+smoke),design= subset(des, white==1), family=binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
fit.logit.black<-svyglm(badhealth~(educ+ins+smoke),design= subset(des, black==1), family=binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
fit.logit.other1<-svyglm(badhealth~(educ+ins+smoke),design= subset(des, race_eth=="nh other"), family=binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
fit.logit.other2<-svyglm(badhealth~(educ+ins+smoke),design= subset(des, race_eth=="nh multirace"), family=binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
fit.logit.hisp<-svyglm(badhealth~(educ+ins+smoke),design= subset(des, hispanic==1), family=binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
Here we examine the model results next to one another
stargazer(fit.logit.white, fit.logit.black, fit.logit.hisp, fit.logit.other1, fit.logit.other2,
type = "html", style="demography",
covariate.labels =c( "Some HS","HS Graduate", "Some College", "College Grad", "Insurance", "Current Smoker", "Former Smoker","Intercept"),
column.labels = c("NH White", "NH Black", "Hispanic", "NH Other 1", "NH Other 2"),
ci=T , apply.coef = myexp)
| badhealth | |||||
| NH White | NH Black | Hispanic | NH Other 1 | NH Other 2 | |
| Model 1 | Model 2 | Model 3 | Model 4 | Model 5 | |
| Some HS | 3.163*** | 1.860*** | 3.376*** | 1.172** | 2.177*** |
| (2.918, 3.408) | (1.475, 2.245) | (3.179, 3.573) | (0.368, 1.975) | (1.257, 3.098) | |
| HS Graduate | 2.132*** | 1.701*** | 1.662*** | 2.700*** | 1.661*** |
| (2.005, 2.259) | (1.484, 1.918) | (1.456, 1.868) | (1.978, 3.423) | (1.020, 2.301) | |
| Some College | 0.723*** | 0.764*** | 0.688*** | 0.896*** | 0.795*** |
| (0.649, 0.798) | (0.601, 0.927) | (0.501, 0.874) | (0.500, 1.292) | (0.426, 1.164) | |
| College Grad | 0.354*** | 0.417*** | 0.407*** | 0.402* | 0.426* |
| (0.277, 0.431) | (0.233, 0.601) | (0.209, 0.606) | (0.041, 0.763) | (0.003, 0.848) | |
| Insurance | 1.020*** | 1.243*** | 1.244*** | 1.425*** | 1.594*** |
| (0.900, 1.141) | (1.036, 1.451) | (1.075, 1.413) | (0.990, 1.859) | (1.157, 2.032) | |
| Current Smoker | 2.175*** | 2.155*** | 1.351*** | 1.723*** | 1.956*** |
| (2.092, 2.259) | (1.994, 2.317) | (1.154, 1.549) | (1.346, 2.100) | (1.590, 2.323) | |
| Former Smoker | 1.624*** | 2.203*** | 1.389*** | 2.439*** | 2.046*** |
| (1.556, 1.692) | (2.024, 2.383) | (1.224, 1.554) | (2.099, 2.780) | (1.681, 2.411) | |
| Intercept | 0.170** | 0.191 | 0.220* | 0.124 | 0.134 |
| (0.041, 0.298) | (-0.025, 0.407) | (0.024, 0.416) | (-0.360, 0.608) | (-0.382, 0.651) | |
| N | 150,655 | 20,285 | 17,979 | 7,964 | 3,685 |
| Log Likelihood | -54,302.950 | -9,109.037 | -8,590.519 | -2,611.371 | -1,582.710 |
| AIC | 108,621.900 | 18,234.070 | 17,197.040 | 5,238.742 | 3,181.420 |
| p < .05; p < .01; p < .001 | |||||
beta.test<-function(model1, model2, betaname){
s1<-summary(model1)$coef
s2<-summary(model2)$coef
db <- ((s2[rownames(s2)==betaname,1]-s1[rownames(s1)==betaname,1]))^2
sd <-s2[rownames(s2)==betaname,2]^2+s1[rownames(s1)==betaname,2]^2
td <- db/sd
beta1=s1[rownames(s1)==betaname,1]
beta2=s2[rownames(s2)==betaname,1]
pv<-1- pchisq(td, df = 1)
print(list(beta=betaname,beta1=beta1, beta2=beta2, x2=td, pvalue=pv))
}
Here is an example of testing if the “Current Smoking” effect is the same among whites and blacks. This follows the logic set forth in Allison 2010, p 219
Test for \(\beta_{1j} = \beta_{1k}\) in two models \(j \text{ and } k\) \[z= \frac{\beta_{1j} - \beta_{1k}}{\left[ s.e.(\beta_{1j}) \right]^2+\left[ s.e.(\beta_{1k}) \right]^2}\]
In classification methods, we are typically interested in using some observed characteristics of a case to predict a binary categorical outcome. This can be extended to a multi-category outcome, but the largest number of applications involve a 1/0 outcome.
Below, we look at using Logistic regression
There are other methods for doing this that we will not examine but these are probably the easiest to understand.
In these examples, we will use the Demographic and Health Survey Model Data. These are based on the DHS survey, but are publicly available and are used to practice using the DHS data sets, but don’t represent a real country.
In this example, we will use the outcome of contraceptive choice (modern vs other/none) as our outcome.
library(haven)
dat<-url("https://github.com/coreysparks/data/blob/master/ZZIR62FL.DTA?raw=true")
model.dat<-read_dta(dat)
Here we recode some of our variables and limit our data to those women who are not currently pregnant and who are sexually active.
library(dplyr)
model.dat2<-model.dat%>%
mutate(region = v024,
modcontra= as.factor(ifelse(v364 ==1,1, 0)),
age = v012,
livchildren=v218,
educ = v106,
currpreg=v213,
knowmodern=ifelse(v301==3, 1, 0),
age2=v012^2)%>%
filter(currpreg==0, v536>0)%>% #notpreg, sex active
dplyr::select(caseid, region, modcontra,age, age2,livchildren, educ, knowmodern)
knitr::kable(head(model.dat2))
| caseid | region | modcontra | age | age2 | livchildren | educ | knowmodern |
|---|---|---|---|---|---|---|---|
| 1 1 2 | 2 | 0 | 30 | 900 | 4 | 0 | 1 |
| 1 4 2 | 2 | 0 | 42 | 1764 | 2 | 0 | 1 |
| 1 4 3 | 2 | 0 | 25 | 625 | 3 | 1 | 1 |
| 1 5 1 | 2 | 0 | 25 | 625 | 2 | 2 | 1 |
| 1 6 2 | 2 | 0 | 37 | 1369 | 2 | 0 | 1 |
| 1 6 3 | 2 | 0 | 17 | 289 | 0 | 2 | 0 |
In predictive models, we split the data into two sets, a training set and a test set. The training set will be used to estimate the model parameters, and the test set will be used to validate the model’s predictive ability.
We use an 80% training fraction, which is standard.
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
set.seed(1115)
train<- createDataPartition(y = model.dat2$modcontra , p = .80, list=F)
model.dat2train<-model.dat2[train,]
model.dat2test<-model.dat2[-train,]
table(model.dat2train$modcontra)
##
## 0 1
## 4036 1409
prop.table(table(model.dat2train$modcontra))
##
## 0 1
## 0.7412305 0.2587695
summary(model.dat2train)
## caseid region modcontra age age2
## Length:5445 Min. :1.000 0:4036 Min. :15.00 Min. : 225.0
## Class :character 1st Qu.:1.000 1:1409 1st Qu.:21.00 1st Qu.: 441.0
## Mode :character Median :2.000 Median :29.00 Median : 841.0
## Mean :2.164 Mean :29.78 Mean : 976.8
## 3rd Qu.:3.000 3rd Qu.:37.00 3rd Qu.:1369.0
## Max. :4.000 Max. :49.00 Max. :2401.0
## livchildren educ knowmodern
## Min. : 0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.: 1.000 1st Qu.:0.0000 1st Qu.:1.0000
## Median : 2.000 Median :0.0000 Median :1.0000
## Mean : 2.546 Mean :0.7381 Mean :0.9442
## 3rd Qu.: 4.000 3rd Qu.:2.0000 3rd Qu.:1.0000
## Max. :10.000 Max. :3.0000 Max. :1.0000
Here we use a basic binomial GLM to estimate the probability of a woman using modern contraception. We use information on their region of residence, age, number of living children and level of education.
This model can be written: \[ln \left ( \frac{Pr(\text{Modern Contraception})}{1-Pr(\text{Modern Contraception})} \right ) = X' \beta\]
Which can be converted to the probability scale via the inverse logit transform:
\[Pr(\text{Modern Contraception}) = \frac{1}{1+exp (-X' \beta)}\]
glm1<-glm(modcontra~factor(region)+scale(age)+scale(age2)+scale(livchildren)+factor(educ), data=model.dat2train[,-1], family = binomial)
summary(glm1)
##
## Call:
## glm(formula = modcontra ~ factor(region) + scale(age) + scale(age2) +
## scale(livchildren) + factor(educ), family = binomial, data = model.dat2train[,
## -1])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4073 -0.7103 -0.5734 1.0669 2.3413
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.91240 0.06807 -28.095 < 2e-16 ***
## factor(region)2 0.38755 0.08534 4.541 5.60e-06 ***
## factor(region)3 0.62565 0.09531 6.564 5.23e-11 ***
## factor(region)4 0.30066 0.09454 3.180 0.001471 **
## scale(age) 0.63678 0.26540 2.399 0.016425 *
## scale(age2) -0.98328 0.26194 -3.754 0.000174 ***
## scale(livchildren) 0.17004 0.05408 3.144 0.001665 **
## factor(educ)1 0.43835 0.10580 4.143 3.43e-05 ***
## factor(educ)2 1.38923 0.08646 16.068 < 2e-16 ***
## factor(educ)3 1.54061 0.16086 9.577 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6226.5 on 5444 degrees of freedom
## Residual deviance: 5629.0 on 5435 degrees of freedom
## AIC: 5649
##
## Number of Fisher Scoring iterations: 4
We see that all the predictors are significantly related to our outcome
Next we see how the model performs in terms of accuracy of prediction. This is new comparison to how we typically use logistic regression.
We use the predict() function to get the estimated class probabilities for each case
tr_pred<- predict(glm1, newdata = model.dat2train, type = "response")
head(tr_pred)
## 1 2 3 4 5 6
## 0.22002790 0.31137928 0.15091505 0.20389088 0.08726724 0.18808481
These are the estimated probability that each of these women used modern contraception, based on the model.
In order to create classes (uses modern vs doesn’t use modern contraception) we have to use a decision rule. A decision rule is when we choose a cut off point, or threshold value of the probability to classify each observation as belonging to one class or the other.
A basic decision rule is if \(Pr(y=\text{Modern Contraception} |X) >.5\) Then classify the observation as a modern contraception user, and otherwise not. This is what we will use here.
tr_predcl<-factor(ifelse(tr_pred>.5, 1, 0))
library(ggplot2)
pred1<-data.frame(pr=tr_pred, gr=tr_predcl, modcon=model.dat2train$modcontra)
pred1%>%
ggplot()+geom_density(aes(x=pr, color=gr, group=gr))+ggtitle(label = "Probability of Modern Contraception", subtitle = "Threshold = .5")
pred1%>%
ggplot()+geom_density(aes(x=pr, color=modcon, group=modcon))+ggtitle(label = "Probability of Modern Contraception", subtitle = "Truth")
Next we need to see how we did. A simple cross tab of the observed classes versus the predicted classes is called the confusion matrix.
table( tr_predcl,model.dat2train$modcontra)
##
## tr_predcl 0 1
## 0 3761 1142
## 1 275 267
This is great, but typically it’s easier to understand the model’s predictive ability by converting these to proportions. The confusionMatrix() function in caret can do this, plus other stuff.
This provides lots of output summarizing the classification results. At its core is the matrix of observed classes versus predicted classes. I got one depiction of this here and from the Wikipedia page
Confusion matrix
Lots of information on the predictive accuracy can be found from this 2x2 table:
Confusion matrix
Generally, we are interested in overall accuracy, sensitivity and specificity.
cm1<-confusionMatrix(data = tr_predcl,reference = model.dat2train$modcontra )
cm1
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 3761 1142
## 1 275 267
##
## Accuracy : 0.7398
## 95% CI : (0.7279, 0.7514)
## No Information Rate : 0.7412
## P-Value [Acc > NIR] : 0.6046
##
## Kappa : 0.1517
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9319
## Specificity : 0.1895
## Pos Pred Value : 0.7671
## Neg Pred Value : 0.4926
## Prevalence : 0.7412
## Detection Rate : 0.6907
## Detection Prevalence : 0.9005
## Balanced Accuracy : 0.5607
##
## 'Positive' Class : 0
##
Overall the model has a 73.976% accuracy, which isn’t bad! What is bad is some of the other measures. The sensitivity 93%, and the Specificity is 19 In other word the model is pretty good at predicting if you don’t use modern contraception, but not at predicting if you do.
We could try a different decision rule, in this case, I use the mean of the response as the cutoff value.
tr_predcl<-factor(ifelse(tr_pred>mean(I(model.dat2train$modcontra==1)), 1, 0)) #mean of response
pred2<-data.frame(pr=tr_pred, gr=tr_predcl, modcon=model.dat2train$modcontra)
pred2%>%
ggplot()+geom_density(aes(x=pr, color=gr, group=gr))+ggtitle(label = "Probability of Modern Contraception", subtitle = "Threshold = Mean")
pred2%>%
ggplot()+geom_density(aes(x=pr, color=modcon, group=modcon))+ggtitle(label = "Probability of Modern Contraception", subtitle = "Truth")
confusionMatrix(data = tr_predcl,model.dat2train$modcontra, positive = "1" )
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2959 581
## 1 1077 828
##
## Accuracy : 0.6955
## 95% CI : (0.6831, 0.7077)
## No Information Rate : 0.7412
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2878
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.5877
## Specificity : 0.7332
## Pos Pred Value : 0.4346
## Neg Pred Value : 0.8359
## Prevalence : 0.2588
## Detection Rate : 0.1521
## Detection Prevalence : 0.3499
## Balanced Accuracy : 0.6604
##
## 'Positive' Class : 1
##
Which produces the same accuracy, but decreases the sensitivity at the cost of increased specificity.
Next we do this on the test set to evaluate model performance outside of the training data
pred_test<-predict(glm1, newdata=model.dat2test, type="response")
pred_cl<-factor(ifelse(pred_test>mean(I(model.dat2test$modcontra==1)), 1, 0))
table(model.dat2test$modcontra,pred_cl)
## pred_cl
## 0 1
## 0 720 288
## 1 154 198
confusionMatrix(data = pred_cl,model.dat2test$modcontra )
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 720 154
## 1 288 198
##
## Accuracy : 0.675
## 95% CI : (0.6494, 0.6999)
## No Information Rate : 0.7412
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2463
##
## Mcnemar's Test P-Value : 2.513e-10
##
## Sensitivity : 0.7143
## Specificity : 0.5625
## Pos Pred Value : 0.8238
## Neg Pred Value : 0.4074
## Prevalence : 0.7412
## Detection Rate : 0.5294
## Detection Prevalence : 0.6426
## Balanced Accuracy : 0.6384
##
## 'Positive' Class : 0
##
In the test data, the model does about as well as it did on the training data, which is ideal.