#load libraries
library(car, quietly = T)
library(stargazer, quietly = T)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(survey, quietly = T)
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
library(questionr, quietly = T)
library(dplyr, quietly = T)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2, quietly = T)
library(haven)
dat<-url("https://github.com/coreysparks/data/blob/master/ZZIR62FL.DTA?raw=true")
model.dat<-read_dta(dat)
model.dat<-zap_labels(model.dat)
Here we recode some of our variables and limit our data to those women who are not currently pregnant and who are sexually active.
library(dplyr)
model.dat2<-model.dat%>%
mutate(region = v024,
usecondom= as.factor(ifelse(v761 ==1,1, 0)),
age = v012,
heardofaids=v750,
educ = v106,
sexuallyactive=v536,
knowwheretogetcondom=ifelse(v769==8, 1, 0),
age2=v012^2)%>%
filter(v536>0)%>% #not sexually active
dplyr::select(caseid, region, usecondom,age, age2,heardofaids, educ, knowwheretogetcondom, sexuallyactive)
The binary variable that I selected was v761, which is a yes or no answer to whether they used a condom was used during last sexual encounter. It was recoded using an ifelse statement to change yes to 1 and no to 0.
knitr::kable(head(model.dat2))
| caseid | region | usecondom | age | age2 | heardofaids | educ | knowwheretogetcondom | sexuallyactive |
|---|---|---|---|---|---|---|---|---|
| 1 1 2 | 2 | 0 | 30 | 900 | 0 | 0 | 0 | 2 |
| 1 3 2 | 2 | 0 | 22 | 484 | 1 | 2 | NA | 1 |
| 1 4 2 | 2 | NA | 42 | 1764 | 1 | 0 | 0 | 3 |
| 1 4 3 | 2 | 1 | 25 | 625 | 1 | 1 | NA | 2 |
| 1 5 1 | 2 | 1 | 25 | 625 | 1 | 2 | NA | 2 |
| 1 6 2 | 2 | 0 | 37 | 1369 | 1 | 0 | 0 | 3 |
We use an 80% training fraction, which is standard.
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
set.seed(1115)
train<- createDataPartition(y = model.dat2$usecondom,
p = .80,
list=F)
model.dat2train<-model.dat2[train,]
model.dat2test<-model.dat2[-train,]
table(model.dat2train$usecondom)
##
## 0 1
## 4985 157
prop.table(table(model.dat2$usecondom))
##
## 0 1
## 0.96950366 0.03049634
summary(model.dat2train)
## caseid region usecondom age
## Length:6053 Min. :1.000 0 :4985 Min. :15.00
## Class :character 1st Qu.:1.000 1 : 157 1st Qu.:21.00
## Mode :character Median :2.000 NA's: 911 Median :28.00
## Mean :2.167 Mean :29.45
## 3rd Qu.:3.000 3rd Qu.:36.00
## Max. :4.000 Max. :49.00
##
## age2 heardofaids educ knowwheretogetcondom
## Min. : 225.0 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.: 441.0 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median : 784.0 Median :1.0000 Median :0.0000 Median :0.0000
## Mean : 953.6 Mean :0.9532 Mean :0.7274 Mean :0.0217
## 3rd Qu.:1296.0 3rd Qu.:1.0000 3rd Qu.:2.0000 3rd Qu.:0.0000
## Max. :2401.0 Max. :1.0000 Max. :3.0000 Max. :1.0000
## NA's :9 NA's :1957
## sexuallyactive
## Min. :1.000
## 1st Qu.:1.000
## Median :1.000
## Mean :1.661
## 3rd Qu.:2.000
## Max. :3.000
##
The average age of the sample is 29 and a half years old.
glm1<-glm(usecondom~factor(region)+scale(age)+scale(age2)+scale(heardofaids)+factor(educ),
data=model.dat2train[,-1],
family = binomial)
summary(glm1)
##
## Call:
## glm(formula = usecondom ~ factor(region) + scale(age) + scale(age2) +
## scale(heardofaids) + factor(educ), family = binomial, data = model.dat2train[,
## -1])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.5162 -0.2944 -0.2158 -0.1864 2.9790
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.86631 0.17239 -22.428 < 2e-16 ***
## factor(region)2 0.01615 0.22285 0.072 0.94222
## factor(region)3 0.36669 0.22640 1.620 0.10531
## factor(region)4 0.06107 0.24013 0.254 0.79926
## scale(age) 0.02706 0.63697 0.042 0.96612
## scale(age2) -0.30577 0.65912 -0.464 0.64272
## scale(heardofaids) -0.15106 0.07691 -1.964 0.04953 *
## factor(educ)1 -0.30973 0.33617 -0.921 0.35686
## factor(educ)2 0.60739 0.21164 2.870 0.00411 **
## factor(educ)3 1.27343 0.31048 4.101 4.11e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1404.2 on 5133 degrees of freedom
## Residual deviance: 1352.5 on 5124 degrees of freedom
## (919 observations deleted due to missingness)
## AIC: 1372.5
##
## Number of Fisher Scoring iterations: 7
tr_pred<- predict(glm1,
newdata = model.dat2train,
type = "response")
head(tr_pred)
## 1 2 3 4 5 6
## 0.04152918 0.01376714 0.01739274 0.01663881 0.09540707 0.02558860
tr_predcl<-factor(ifelse(tr_pred>.2, 1, 0))
library(ggplot2)
pred1<-data.frame(pr=tr_pred,
gr=tr_predcl,
modcon=model.dat2train$usecondom)
pred1%>%
ggplot()+
geom_histogram(aes(x=pr, color=gr, fill=gr))+
ggtitle(label = "Probability of condom use",
subtitle = "Threshold = .2")+
geom_vline(xintercept=.2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 9 rows containing non-finite values (stat_bin).
pred1%>%
ggplot()+
geom_histogram(aes(x=pr, color=modcon, fill=modcon))+
ggtitle(label = "Probability of condom use",
subtitle = "Truth")+
geom_vline(xintercept=.2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 9 rows containing non-finite values (stat_bin).
table( tr_predcl,
model.dat2train$usecondom)
##
## tr_predcl 0 1
## 0 4977 157
cm1<-confusionMatrix(data = tr_predcl,
reference = model.dat2train$usecondom )
## Warning in confusionMatrix.default(data = tr_predcl, reference =
## model.dat2train$usecondom): Levels are not in the same order for reference and
## data. Refactoring data to match.
cm1
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4977 157
## 1 0 0
##
## Accuracy : 0.9694
## 95% CI : (0.9643, 0.974)
## No Information Rate : 0.9694
## P-Value [Acc > NIR] : 0.5212
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.9694
## Neg Pred Value : NaN
## Prevalence : 0.9694
## Detection Rate : 0.9694
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : 0
##
tr_predcl<-factor(ifelse(tr_pred>mean(I(model.dat2train$usecondom==1)), 1, 0)) #mean of response
pred2<-data.frame(pr=tr_pred,
gr=tr_predcl,
usecon=model.dat2train$usecondom)
pred2%>%
ggplot(aes(x=pr, fill=usecon))+
geom_histogram(position="identity",
alpha=.2)+
ggtitle(label = "Probability of condom use",
subtitle = "Truth")+
geom_vline(xintercept=mean(I(model.dat2train$usecondom==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 9 rows containing non-finite values (stat_bin).
## Warning: Removed 1 rows containing missing values (geom_vline).
pred2%>%
ggplot(aes(x=pr, fill=usecon))+
geom_histogram(position="identity",
alpha=.2)+
ggtitle(label = "Probability of Modern Contraception",
subtitle = "Truth")+
geom_vline(xintercept=mean(I(model.dat2train$usecondom==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 9 rows containing non-finite values (stat_bin).
## Warning: Removed 1 rows containing missing values (geom_vline).
Which produces the same accuracy, but decreases the sensitivity at the cost of increased specificity.
Next we do this on the test set to evaluate model performance outside of the training data
tr_predcl<-factor(ifelse(tr_pred> .5, 1, 0))
library(ggplot2)
predl<-data.frame(pr=tr_pred,
gr=tr_predcl,
usecon = model.dat2train$usecondom)
pred_test<-predict(glm1,
newdata=model.dat2test,
type="response")
pred_cl<-factor(ifelse(pred_test > mean( I(model.dat2test$usecondom==1)), 1, 0))
table(model.dat2test$usecondom,pred_cl)
## < table of extent 2 x 0 >
table( tr_predcl,
model.dat2train$usecondom)
##
## tr_predcl 0 1
## 0 4977 157
cml<-confusionMatrix(data = tr_predcl,
reference = model.dat2train$usecondom )
## Warning in confusionMatrix.default(data = tr_predcl, reference =
## model.dat2train$usecondom): Levels are not in the same order for reference and
## data. Refactoring data to match.
confusionMatrix(data = tr_predcl,
model.dat2train$usecondom,
positive = "1" )
## Warning in confusionMatrix.default(data = tr_predcl,
## model.dat2train$usecondom, : Levels are not in the same order for reference and
## data. Refactoring data to match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4977 157
## 1 0 0
##
## Accuracy : 0.9694
## 95% CI : (0.9643, 0.974)
## No Information Rate : 0.9694
## P-Value [Acc > NIR] : 0.5212
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.00000
## Specificity : 1.00000
## Pos Pred Value : NaN
## Neg Pred Value : 0.96942
## Prevalence : 0.03058
## Detection Rate : 0.00000
## Detection Prevalence : 0.00000
## Balanced Accuracy : 0.50000
##
## 'Positive' Class : 1
##
Overall the model has a 96.9% accuracy, which makes me think that there is an error in the data. The sensitivity is showing 0 and the specificity is 1.
cml<-confusionMatrix(data = tr_predcl,
reference = model.dat2train$usecondom)
## Warning in confusionMatrix.default(data = tr_predcl, reference =
## model.dat2train$usecondom): Levels are not in the same order for reference and
## data. Refactoring data to match.
cml
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4977 157
## 1 0 0
##
## Accuracy : 0.9694
## 95% CI : (0.9643, 0.974)
## No Information Rate : 0.9694
## P-Value [Acc > NIR] : 0.5212
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.9694
## Neg Pred Value : NaN
## Prevalence : 0.9694
## Detection Rate : 0.9694
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : 0
##
tr_predcl<-factor(ifelse(tr_pred>mean(I(model.dat2train$usecondom==1)), 1, 0))
pred2<-data.frame(pr=tr_pred,
gr=tr_predcl,
usecon = model.dat2train$usecondom)
pred_test<-predict(glm1,
newdata=model.dat2test,
type="response")
pred_cl<-factor(ifelse(pred_test > mean( I(model.dat2test$usecondom==1)), 1, 0))
table(model.dat2test$usecondom,pred_cl)
## < table of extent 2 x 0 >
pred2%>%
ggplot(aes(x=pr, fill= usecon))+
geom_histogram(position="identity",
alpha=.2)+
ggtitle(label = "probability of condom use",
subtitle = "truth")+
geom_vline(xintercept = mean(I(model.dat2train$usecondom==1)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 9 rows containing non-finite values (stat_bin).
## Warning: Removed 1 rows containing missing values (geom_vline).