library(car)
## Loading required package: carData
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(questionr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
library(haven)
#read in data
uganda16 <- read_dta("C:/Users/rlutt/Downloads/UGIR7BFL.DTA")
uganda16<-zap_labels(uganda16)
#recodes
#bank account
uganda16$bank<-car::Recode(uganda16$v170, recodes= "0= 'No'; 1= 'Yes'")
#age groups
uganda16$agegroup <- car::Recode(uganda16$v013, recodes = "1='15-19'; 2='20-24';3='25-29'; 4 = '30-34';5='35-39';6='40-44';7='45+' ", as.factor=T)
#fertility preferences
uganda16$wantanotherchild<-ifelse(uganda16$v602!=9&uganda16$v602==1,1,0)
#education level
uganda16$educationlevel <- car::recode(uganda16$v106,
recodes = "0 = 'none'; 1 = 'primEW'; 2:3='secondary and above' ",
as.factor=T)
#internet
uganda16$internet<-as.factor(uganda16$v171a)
uganda16$internet<-car::Recode(uganda16$v171a, recodes= "0='never'; 1= 'in the past year'; 2='over a year ago'; 3= 'yes, but unsure when'", as.factor=T)
#contraception
uganda16$contraception<-as.factor(uganda16$v313)
uganda16$contraception<-car::Recode(uganda16$v313, recodes= "0='none'; 1='folkloric method'; 2='traditional method' ;3= 'modern method'", as.factor=T)
#has another kid
uganda16$currentchildren<-uganda16$v202+uganda16$v203
# survey design variables
uganda16$psu <- uganda16$v021
uganda16$strata <- uganda16$v022
uganda16$pwt <- uganda16$v005/1000000
##unsure if I should code amount of children as continuous or as a factor
##1) Define a binary outcome of your choosing
##Likelihood of wanting another child
##2) Fit a predictive logistic regression model using as many predictor variables as you think you need
##contraception, has children, education, bank, internet, contraception
library(caret)
set.seed(1014)
train<- createDataPartition(y = uganda16$wantanotherchild, p = .80, list=F)
fptrain<-uganda16[train,]
fptest<-uganda16[-train,]
table(fptrain$fertilitypreference)
## Warning: Unknown or uninitialised column: `fertilitypreference`.
## < table of extent 0 >
prop.table(table(fptrain$wantanotherchild))
##
## 0 1
## 0.3779804 0.6220196
glm<-glm(fptrain$wantanotherchild~factor(fptrain$agegroup)+factor(fptrain$educationlevel)+factor(fptrain$internet)+factor(fptrain$contraception) +factor(fptrain$currentchildren),
data=fptrain[,-1],
family = binomial)
summary(glm)
##
## Call:
## glm(formula = fptrain$wantanotherchild ~ factor(fptrain$agegroup) +
## factor(fptrain$educationlevel) + factor(fptrain$internet) +
## factor(fptrain$contraception) + factor(fptrain$currentchildren),
## family = binomial, data = fptrain[, -1])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6002 -0.5646 0.4424 0.5915 2.8888
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) 2.17007 0.43906 4.943
## factor(fptrain$agegroup)20-24 -0.04694 0.08917 -0.526
## factor(fptrain$agegroup)25-29 -0.74060 0.08929 -8.294
## factor(fptrain$agegroup)30-34 -1.57173 0.09013 -17.439
## factor(fptrain$agegroup)35-39 -2.49377 0.09673 -25.780
## factor(fptrain$agegroup)40-44 -3.64784 0.11549 -31.587
## factor(fptrain$agegroup)45+ -4.54366 0.15730 -28.885
## factor(fptrain$educationlevel)primEW -0.17693 0.07272 -2.433
## factor(fptrain$educationlevel)secondary and above 0.14248 0.08373 1.702
## factor(fptrain$internet)never -0.35270 0.10235 -3.446
## factor(fptrain$internet)over a year ago -0.26134 0.23668 -1.104
## factor(fptrain$contraception)modern method 0.47262 0.41939 1.127
## factor(fptrain$contraception)none 0.53610 0.41832 1.282
## factor(fptrain$contraception)traditional method 1.03331 0.43996 2.349
## factor(fptrain$currentchildren)1 -0.22083 0.07512 -2.940
## factor(fptrain$currentchildren)2 -0.47503 0.07915 -6.002
## factor(fptrain$currentchildren)3 -0.67025 0.08420 -7.960
## factor(fptrain$currentchildren)4 -1.04665 0.09157 -11.430
## factor(fptrain$currentchildren)5 -1.30457 0.10906 -11.962
## factor(fptrain$currentchildren)6 -1.43064 0.14174 -10.093
## factor(fptrain$currentchildren)7 -1.75376 0.20987 -8.357
## factor(fptrain$currentchildren)8 -1.82377 0.34939 -5.220
## factor(fptrain$currentchildren)9 -2.16498 0.74990 -2.887
## factor(fptrain$currentchildren)10 -1.93558 1.08352 -1.786
## factor(fptrain$currentchildren)11 -8.19891 119.46814 -0.069
## Pr(>|z|)
## (Intercept) 7.71e-07 ***
## factor(fptrain$agegroup)20-24 0.598600
## factor(fptrain$agegroup)25-29 < 2e-16 ***
## factor(fptrain$agegroup)30-34 < 2e-16 ***
## factor(fptrain$agegroup)35-39 < 2e-16 ***
## factor(fptrain$agegroup)40-44 < 2e-16 ***
## factor(fptrain$agegroup)45+ < 2e-16 ***
## factor(fptrain$educationlevel)primEW 0.014982 *
## factor(fptrain$educationlevel)secondary and above 0.088814 .
## factor(fptrain$internet)never 0.000569 ***
## factor(fptrain$internet)over a year ago 0.269502
## factor(fptrain$contraception)modern method 0.259775
## factor(fptrain$contraception)none 0.199996
## factor(fptrain$contraception)traditional method 0.018841 *
## factor(fptrain$currentchildren)1 0.003287 **
## factor(fptrain$currentchildren)2 1.95e-09 ***
## factor(fptrain$currentchildren)3 1.72e-15 ***
## factor(fptrain$currentchildren)4 < 2e-16 ***
## factor(fptrain$currentchildren)5 < 2e-16 ***
## factor(fptrain$currentchildren)6 < 2e-16 ***
## factor(fptrain$currentchildren)7 < 2e-16 ***
## factor(fptrain$currentchildren)8 1.79e-07 ***
## factor(fptrain$currentchildren)9 0.003889 **
## factor(fptrain$currentchildren)10 0.074036 .
## factor(fptrain$currentchildren)11 0.945285
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 19633 on 14804 degrees of freedom
## Residual deviance: 12805 on 14780 degrees of freedom
## AIC: 12855
##
## Number of Fisher Scoring iterations: 9
3a) Does changing the decision rule threshold affect your classification accuracy?
fppred<- predict(glm,
newdata = fptrain,
type = "response")
head(fppred)
## 1 2 3 4 5 6
## 0.8226670 0.1249549 0.8937480 0.9238575 0.9238575 0.8937480
fppredict2<-factor(ifelse(fppred>.5, 1, 0))
length(fptrain$wantanotherchild)
## [1] 14805
length(fppredict2)
## [1] 14805
table(fppredict2, fptrain$wantanotherchild)
##
## fppredict2 0 1
## 0 3812 1029
## 1 1784 8180
confusionMatrix(data = fppredict2, reference = factor(fptrain$wantanotherchild))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 3812 1029
## 1 1784 8180
##
## Accuracy : 0.81
## 95% CI : (0.8036, 0.8163)
## No Information Rate : 0.622
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5849
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.6812
## Specificity : 0.8883
## Pos Pred Value : 0.7874
## Neg Pred Value : 0.8210
## Prevalence : 0.3780
## Detection Rate : 0.2575
## Detection Prevalence : 0.3270
## Balanced Accuracy : 0.7847
##
## 'Positive' Class : 0
##
#test with remaining
fppred2<- predict(glm,
newdata = fptest,
type = "response")
## Warning: 'newdata' had 3701 rows but variables found have 14805 rows
head(fppred2)
## 1 2 3 4 5 6
## 0.8226670 0.1249549 0.8937480 0.9238575 0.9238575 0.8937480
fptest2<-factor(ifelse(fppred2>.5, 1, 0))
confusionMatrix(data = fptest2, reference = factor(fptrain$wantanotherchild))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 3812 1029
## 1 1784 8180
##
## Accuracy : 0.81
## 95% CI : (0.8036, 0.8163)
## No Information Rate : 0.622
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5849
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.6812
## Specificity : 0.8883
## Pos Pred Value : 0.7874
## Neg Pred Value : 0.8210
## Prevalence : 0.3780
## Detection Rate : 0.2575
## Detection Prevalence : 0.3270
## Balanced Accuracy : 0.7847
##
## 'Positive' Class : 0
##