library(car)
## Loading required package: carData
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(questionr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
library(haven)
#read in data
uganda16 <- read_dta("C:/Users/rlutt/Downloads/UGIR7BFL.DTA")
uganda16<-zap_labels(uganda16)
#recodes
#bank account
uganda16$bank<-car::Recode(uganda16$v170, recodes= "0= 'No'; 1= 'Yes'")
#age groups
uganda16$agegroup <- car::Recode(uganda16$v013, recodes = "1='15-19'; 2='20-24';3='25-29'; 4 = '30-34';5='35-39';6='40-44';7='45+' ", as.factor=T)
#fertility preferences: 1 = wants another child, 0 = all other responses
uganda16$wantanotherchild <- ifelse(uganda16$v602 == 1, 1, 0)
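#quick cross-tab to confirm the recode against the raw v602 codes (a sketch; just a check, not used later)
table(uganda16$v602, uganda16$wantanotherchild, useNA = "ifany")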

#education level
uganda16$educationlevel <- car::Recode(uganda16$v106,
                            recodes = "0 = 'none'; 1 = 'primary'; 2:3 = 'secondary and above'",
                            as.factor=T)
#internet
uganda16$internet<-car::Recode(uganda16$v171a, recodes= "0='never'; 1='in the past year'; 2='over a year ago'; 3='yes, but unsure when'", as.factor=T)

#contraception
uganda16$contraception<-car::Recode(uganda16$v313, recodes= "0='none'; 1='folkloric method'; 2='traditional method'; 3='modern method'", as.factor=T)

#current number of children at home (sons + daughters)
uganda16$currentchildren<-uganda16$v202+uganda16$v203

# survey design variables
uganda16$psu <- uganda16$v021
uganda16$strata <- uganda16$v022
uganda16$pwt <- uganda16$v005/1000000
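#psu, strata, and pwt are not used below; a sketch of the survey design object they would
#feed if design-corrected estimates are wanted later (the object name `des` is illustrative)
des <- svydesign(ids = ~psu, strata = ~strata, weights = ~pwt,
                 data = uganda16, nest = TRUE)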

##unsure if the number of children should be coded as continuous or as a factor; a rough check is sketched below
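#sketch: fit the outcome on each coding and compare AIC (model names m_cont / m_fact are illustrative)
m_cont <- glm(wantanotherchild ~ currentchildren, data = uganda16, family = binomial)
m_fact <- glm(wantanotherchild ~ factor(currentchildren), data = uganda16, family = binomial)
AIC(m_cont, m_fact)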

##1) Define a binary outcome of your choosing

##whether the respondent wants another child (wantanotherchild: 1 = wants another, 0 = otherwise)

##2) Fit a predictive logistic regression model using as many predictor variables as you think you need

##age group, education level, internet use, contraception, and current number of children

  1. Use an 80% training/20% test split for your data.
library(caret)
set.seed(1014)
train<- createDataPartition(y = uganda16$wantanotherchild, p = .80, list=F)

fptrain<-uganda16[train,]
fptest<-uganda16[-train,]

table(fptrain$wantanotherchild)
## 
##    0    1 
## 5596 9209
prop.table(table(fptrain$wantanotherchild))
## 
##         0         1 
## 0.3779804 0.6220196
  1. Report the % correct classification from the training data using the .5 decision rule and again using the mean of the outcome decision rule
fpglm <- glm(wantanotherchild ~ factor(agegroup) + factor(educationlevel) + factor(internet) + factor(contraception) + factor(currentchildren),
          data = fptrain,
          family = binomial)

summary(fpglm)
## 
## Call:
## glm(formula = wantanotherchild ~ factor(agegroup) + factor(educationlevel) + 
##     factor(internet) + factor(contraception) + factor(currentchildren), 
##     family = binomial, data = fptrain)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6002  -0.5646   0.4424   0.5915   2.8888  
## 
## Coefficients:
##                                             Estimate Std. Error z value
## (Intercept)                                  2.17007    0.43906   4.943
## factor(agegroup)20-24                       -0.04694    0.08917  -0.526
## factor(agegroup)25-29                       -0.74060    0.08929  -8.294
## factor(agegroup)30-34                       -1.57173    0.09013 -17.439
## factor(agegroup)35-39                       -2.49377    0.09673 -25.780
## factor(agegroup)40-44                       -3.64784    0.11549 -31.587
## factor(agegroup)45+                         -4.54366    0.15730 -28.885
## factor(educationlevel)primary               -0.17693    0.07272  -2.433
## factor(educationlevel)secondary and above    0.14248    0.08373   1.702
## factor(internet)never                       -0.35270    0.10235  -3.446
## factor(internet)over a year ago             -0.26134    0.23668  -1.104
## factor(contraception)modern method           0.47262    0.41939   1.127
## factor(contraception)none                    0.53610    0.41832   1.282
## factor(contraception)traditional method      1.03331    0.43996   2.349
## factor(currentchildren)1                    -0.22083    0.07512  -2.940
## factor(currentchildren)2                    -0.47503    0.07915  -6.002
## factor(currentchildren)3                    -0.67025    0.08420  -7.960
## factor(currentchildren)4                    -1.04665    0.09157 -11.430
## factor(currentchildren)5                    -1.30457    0.10906 -11.962
## factor(currentchildren)6                    -1.43064    0.14174 -10.093
## factor(currentchildren)7                    -1.75376    0.20987  -8.357
## factor(currentchildren)8                    -1.82377    0.34939  -5.220
## factor(currentchildren)9                    -2.16498    0.74990  -2.887
## factor(currentchildren)10                   -1.93558    1.08352  -1.786
## factor(currentchildren)11                   -8.19891  119.46814  -0.069
##                                            Pr(>|z|)    
## (Intercept)                                7.71e-07 ***
## factor(agegroup)20-24                      0.598600    
## factor(agegroup)25-29                       < 2e-16 ***
## factor(agegroup)30-34                       < 2e-16 ***
## factor(agegroup)35-39                       < 2e-16 ***
## factor(agegroup)40-44                       < 2e-16 ***
## factor(agegroup)45+                         < 2e-16 ***
## factor(educationlevel)primary              0.014982 *  
## factor(educationlevel)secondary and above  0.088814 .  
## factor(internet)never                      0.000569 ***
## factor(internet)over a year ago            0.269502    
## factor(contraception)modern method         0.259775    
## factor(contraception)none                  0.199996    
## factor(contraception)traditional method    0.018841 *  
## factor(currentchildren)1                   0.003287 ** 
## factor(currentchildren)2                   1.95e-09 ***
## factor(currentchildren)3                   1.72e-15 ***
## factor(currentchildren)4                    < 2e-16 ***
## factor(currentchildren)5                    < 2e-16 ***
## factor(currentchildren)6                    < 2e-16 ***
## factor(currentchildren)7                    < 2e-16 ***
## factor(currentchildren)8                   1.79e-07 ***
## factor(currentchildren)9                   0.003889 ** 
## factor(currentchildren)10                  0.074036 .  
## factor(currentchildren)11                  0.945285    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 19633  on 14804  degrees of freedom
## Residual deviance: 12805  on 14780  degrees of freedom
## AIC: 12855
## 
## Number of Fisher Scoring iterations: 9
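
The coefficients above are on the log-odds scale; a sketch of how they could be read as odds ratios with Wald confidence intervals:

exp(cbind(OR = coef(fpglm), confint.default(fpglm)))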

3a) Does changing the decision rule threshold affect your classification accuracy?

fppred <- predict(fpglm,
                  newdata = fptrain,
                  type = "response")

head(fppred)
##         1         2         3         4         5         6 
## 0.8226670 0.1249549 0.8937480 0.9238575 0.9238575 0.8937480
  1. Report the % correct classification from the test data using the .5 decision rule and again using the mean of the outcome decision rule
fppredict2<-factor(ifelse(fppred>.5, 1, 0))

length(fptrain$wantanotherchild)
## [1] 14805
length(fppredict2)
## [1] 14805
table(fppredict2, fptrain$wantanotherchild)
##           
## fppredict2    0    1
##          0 3812 1029
##          1 1784 8180
confusionMatrix(data = fppredict2, reference = factor(fptrain$wantanotherchild))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 3812 1029
##          1 1784 8180
##                                           
##                Accuracy : 0.81            
##                  95% CI : (0.8036, 0.8163)
##     No Information Rate : 0.622           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5849          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.6812          
##             Specificity : 0.8883          
##          Pos Pred Value : 0.7874          
##          Neg Pred Value : 0.8210          
##              Prevalence : 0.3780          
##          Detection Rate : 0.2575          
##    Detection Prevalence : 0.3270          
##       Balanced Accuracy : 0.7847          
##                                           
##        'Positive' Class : 0               
## 
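
The mean-of-outcome decision rule asked for above is not applied in the code; a minimal sketch of how the two cutoffs could be compared on the training predictions (cut_mean and the rule_half/rule_mean names are illustrative). Because the mean of the outcome (about 0.62) is a stricter cutoff than .5 for predicting a 1, the two rules will generally classify some cases differently.

cut_mean <- mean(fptrain$wantanotherchild)  #observed share of 1s, about 0.62
c(rule_half = mean((fppred > .5) == fptrain$wantanotherchild),
  rule_mean = mean((fppred > cut_mean) == fptrain$wantanotherchild))
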
#test with the remaining 20%
fppred2 <- predict(fpglm,
                   newdata = fptest,
                   type = "response")
fptest2 <- factor(ifelse(fppred2 > .5, 1, 0))
confusionMatrix(data = fptest2, reference = factor(fptest$wantanotherchild))
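
The same comparison could be repeated on the held-out 20% for the test-data version of the question (again a sketch, reusing cut_mean from above):

c(rule_half = mean((fppred2 > .5) == fptest$wantanotherchild),
  rule_mean = mean((fppred2 > cut_mean) == fptest$wantanotherchild))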