library(haven)
library(foreign) 
library(readr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(broom)
library(car)

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

library(readr) 
library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

library(car) 
library(lmtest)

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

library(alr3)

## 
## Attaching package: 'alr3'

## The following object is masked from 'package:MASS':
## 
##     forbes

library(zoo)
library(nortest)
library(plotrix)
library(scales)

## 
## Attaching package: 'scales'

## The following object is masked from 'package:plotrix':
## 
##     rescale

## The following object is masked from 'package:readr':
## 
##     col_factor

library(tableone)
library(Weighted.Desc.Stat)
library(mitools)
library(survey)

## Loading required package: grid

## Loading required package: Matrix

## Loading required package: survival

## 
## Attaching package: 'survey'

## The following object is masked from 'package:graphics':
## 
##     dotchart

library(VGAM)

## Loading required package: stats4

## Loading required package: splines

## 
## Attaching package: 'VGAM'

## The following object is masked from 'package:survey':
## 
##     calibrate

## The following object is masked from 'package:lmtest':
## 
##     lrtest

## The following object is masked from 'package:car':
## 
##     logit

library(stargazer)

## 
## Please cite as:

##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.

##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer

library(sandwich)
library(mice)

## Warning: package 'mice' was built under R version 3.6.3

## 
## Attaching package: 'mice'

## The following objects are masked from 'package:base':
## 
##     cbind, rbind

anes2016<-read_dta("C:\\Users\\Jaire\\OneDrive\\Desktop\\stats for dem data 2\\Homework 5\\ANES2016.dta")

Data

The American National Election Studies. These materials are based on work supported by the National Science Foundation under grant numbers SES 1444721, 2014-2017, the University of Michigan, and Stanford University. N = 4271

Outcome:

# Self-reated health

anes2016$SelfratedHlth<-recode(anes2016$V161115, recodes = "1='5'; 2='4'; ; 4='2';5='1';-9=NA", as.factor = T)
anes2016$SelfratedHlth<-recode(anes2016$SelfratedHlth, recodes = "1:2='Negative'; 3:5='Positive'", as.factor = T)
table(anes2016$SelfratedHlth)

## 
## Negative Positive 
##      750     3513

Predictors:

# Income

anes2016$Income<-recode(anes2016$V161361X, recodes = "-9:-5=NA; 1:14='49.9k or less';15:22='50k to 99.9k';23:28='100k to 150k'", as.factor = T )
table(anes2016$Income)

## 
##  100k to 150k 49.9k or less  50k to 99.9k 
##           988          1809          1272

# Generation

anes2016$Generation<-recode(anes2016$V161267C, recodes = "1928:1945=NA;1946:1964='BabyBoomer';1965:1980='X';1981:1996='Millenial'; 1997:1998='Z'; 1926:1927=NA; -9:-8=NA", as.factor = T)
table(anes2016$Generation)

## 
## BabyBoomer  Millenial          X          Z 
##       1473       1051       1046         67

# Working status
anes2016$Employed<-recode(anes2016$V161277, recodes = "-9=NA; 2:8='0'", as.factor = T) 
table(anes2016$Employed)

## 
##    0    1 
## 1708 2547

# College education
anes2016$CollegeEd<-recode(anes2016$V161270, recodes = "90:95=NA;-9=NA;1:10='0'; 11:16='1'", as.factor = T)
table(anes2016$CollegeEd)

## 
##    0    1 
## 1991 2236

# Racial Minority

anes2016$MinorityRace<-recode(anes2016$V161310X, recodes = "-9=NA; 1='0'; 2:6='1'", as.factor = T)
table(anes2016$MinorityRace)

## 
##    0    1 
## 3038 1200

Logit Regression Analysis Original Data:

sub1<-dplyr::select(anes2016, SelfratedHlth, Income, Generation, Employed, CollegeEd, MinorityRace) %>%
  filter(complete.cases(.))


options(survey.lonely.psu = "adjust")
des<-svydesign(nest = TRUE, ids=~anes2016$V160202, strata =~anes2016$V160201, weights =~anes2016$V160101, data=anes2016)

fit.logit1<-svyglm(SelfratedHlth~ Income+Generation+Employed+CollegeEd+MinorityRace, 
          design= des,
          family=binomial)

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

summary(fit.logit1)

## 
## Call:
## svyglm(formula = SelfratedHlth ~ Income + Generation + Employed + 
##     CollegeEd + MinorityRace, design = des, family = binomial)
## 
## Survey design:
## svydesign(nest = TRUE, ids = ~anes2016$V160202, strata = ~anes2016$V160201, 
##     weights = ~anes2016$V160101, data = anes2016)
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.41778    0.18900   7.501 1.02e-11 ***
## Income49.9k or less -1.22422    0.18287  -6.695 6.54e-10 ***
## Income50k to 99.9k  -0.08791    0.20620  -0.426 0.670620    
## GenerationMillenial  0.80325    0.15240   5.271 5.77e-07 ***
## GenerationX          0.10448    0.13100   0.798 0.426626    
## GenerationZ          1.58992    0.46924   3.388 0.000941 ***
## Employed1            0.77979    0.12082   6.454 2.17e-09 ***
## CollegeEd1           0.49084    0.11576   4.240 4.31e-05 ***
## MinorityRace1       -0.10761    0.12884  -0.835 0.405166    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1.082511)
## 
## Number of Fisher Scoring iterations: 5

Missingness Pattern:

summary(anes2016[, c("SelfratedHlth", "Income", "Generation", "Employed", "CollegeEd", "MinorityRace")])

##   SelfratedHlth            Income          Generation   Employed    CollegeEd  
##  Negative: 750   100k to 150k : 988   BabyBoomer:1473   0   :1708   0   :1991  
##  Positive:3513   49.9k or less:1809   Millenial :1051   1   :2547   1   :2236  
##  NA's    :   8   50k to 99.9k :1272   X         :1046   NA's:  16   NA's:  44  
##                  NA's         : 202   Z         :  67                          
##                                       NA's      : 634                          
##  MinorityRace
##  0   :3038   
##  1   :1200   
##  NA's:  33   
##              
##

The pattern of missingness for each indicator is shown above, I tanslate these into percentages of the sample missing:

Self-rated health data are missing for 0.18% of the sample.
Total family income data are missing for 4.7% of the sample.
Birth year data are missing for 14.8% of the sample.
Working status data are missing for 0.37% of the sample.
Highesti education level data are missing for 1.03% of the sample.
Self-identified race data are missing for .77% of the sample.

Self-rated health has the lowest number of missing cases among these indicators.

Modal Imputation:

Outcome:

# Self-rated health
table(anes2016$SelfratedHlth)

## 
## Negative Positive 
##      750     3513

# most common value
mcv.SelfratedHlth<-factor(names(which.max(table(anes2016$SelfratedHlth))), levels=levels(anes2016$SelfratedHlth))
mcv.SelfratedHlth

## [1] Positive
## Levels: Negative Positive

# Imputed cases and proportion 
anes2016$SelfratedHlth.imp<-as.factor(ifelse(is.na(anes2016$SelfratedHlth)==T, mcv.SelfratedHlth, anes2016$SelfratedHlth))
levels(anes2016$SelfratedHlth.imp)<-levels(anes2016$SelfratedHlth)

prop.table(table(anes2016$SelfratedHlth))

## 
##  Negative  Positive 
## 0.1759324 0.8240676

prop.table(table(anes2016$SelfratedHlth.imp))

## 
##  Negative  Positive 
## 0.1756029 0.8243971

barplot(prop.table(table(anes2016$SelfratedHlth)), main="Original Data", ylim=c(0, 1))

barplot(prop.table(table(anes2016$SelfratedHlth.imp)), main="Imputed Data", ylim=c(0, 1))

Predictors:

table(anes2016$Income)

## 
##  100k to 150k 49.9k or less  50k to 99.9k 
##           988          1809          1272

mcv.Income<-factor(names(which.max(table(anes2016$Income))), levels=levels(anes2016$Income))
mcv.Income

## [1] 49.9k or less
## Levels: 100k to 150k 49.9k or less 50k to 99.9k

anes2016$Income.imp<-as.factor(ifelse(is.na(anes2016$Income)==T, mcv.Income, anes2016$Income))
levels(anes2016$Income.imp)<-levels(anes2016$Income)

prop.table(table(anes2016$Income))

## 
##  100k to 150k 49.9k or less  50k to 99.9k 
##     0.2428115     0.4445810     0.3126075

prop.table(table(anes2016$Income))

## 
##  100k to 150k 49.9k or less  50k to 99.9k 
##     0.2428115     0.4445810     0.3126075

barplot(prop.table(table(anes2016$Income)), main="Original Data", ylim=c(0, 1))

barplot(prop.table(table(anes2016$Income.imp)), main="Imputed Data", ylim=c(0, 1))

table(anes2016$Generation)

## 
## BabyBoomer  Millenial          X          Z 
##       1473       1051       1046         67

mcv.Generation<-factor(names(which.max(table(anes2016$Generation))), levels=levels(anes2016$Generation))
mcv.Generation

## [1] BabyBoomer
## Levels: BabyBoomer Millenial X Z

anes2016$Generation.imp<-as.factor(ifelse(is.na(anes2016$Generation)==T, mcv.Generation, anes2016$Generation))
levels(anes2016$Generation.imp)<-levels(anes2016$Generation)

prop.table(table(anes2016$Generation))

## 
## BabyBoomer  Millenial          X          Z 
## 0.40500412 0.28897443 0.28759967 0.01842178

prop.table(table(anes2016$Generation))

## 
## BabyBoomer  Millenial          X          Z 
## 0.40500412 0.28897443 0.28759967 0.01842178

barplot(prop.table(table(anes2016$Generation)), main="Original Data", ylim=c(0, 1))

barplot(prop.table(table(anes2016$Generation.imp)), main="Imputed Data", ylim=c(0, 1))

table(anes2016$Employed)

## 
##    0    1 
## 1708 2547

mcv.Employed<-factor(names(which.max(table(anes2016$Employed))), levels=levels(anes2016$Employed))
mcv.Employed

## [1] 1
## Levels: 0 1

anes2016$Employed.imp<-as.factor(ifelse(is.na(anes2016$Employed)==T, mcv.Employed, anes2016$Employed))
levels(anes2016$Employed.imp)<-levels(anes2016$Employed)

prop.table(table(anes2016$Employed))

## 
##         0         1 
## 0.4014101 0.5985899

prop.table(table(anes2016$Employed))

## 
##         0         1 
## 0.4014101 0.5985899

barplot(prop.table(table(anes2016$Employed)), main="Original Data", ylim=c(0, 1))

barplot(prop.table(table(anes2016$Employed.imp)), main="Imputed Data", ylim=c(0, 1))

table(anes2016$CollegeEd)

## 
##    0    1 
## 1991 2236

mcv.CollegeEd<-factor(names(which.max(table(anes2016$CollegeEd))), levels=levels(anes2016$CollegeEd))
mcv.CollegeEd

## [1] 1
## Levels: 0 1

anes2016$CollegeEd.imp<-as.factor(ifelse(is.na(anes2016$CollegeEd)==T, mcv.CollegeEd, anes2016$CollegeEd))
levels(anes2016$CollegeEd.imp)<-levels(anes2016$CollegeEd)

prop.table(table(anes2016$CollegeEd))

## 
##         0         1 
## 0.4710196 0.5289804

prop.table(table(anes2016$CollegeEd))

## 
##         0         1 
## 0.4710196 0.5289804

barplot(prop.table(table(anes2016$CollegeEd)), main="Original Data", ylim=c(0, 1))

barplot(prop.table(table(anes2016$CollegeEd.imp)), main="Imputed Data", ylim=c(0, 1))

table(anes2016$MinorityRace)

## 
##    0    1 
## 3038 1200

mcv.MinorityRace<-factor(names(which.max(table(anes2016$MinorityRace))), levels=levels(anes2016$MinorityRace))
mcv.MinorityRace

## [1] 0
## Levels: 0 1

anes2016$MinorityRace.imp<-as.factor(ifelse(is.na(anes2016$MinorityRace)==T, mcv.MinorityRace, anes2016$MinorityRace))
levels(anes2016$MinorityRace.imp)<-levels(anes2016$MinorityRace)

prop.table(table(anes2016$MinorityRace))

## 
##         0         1 
## 0.7168476 0.2831524

prop.table(table(anes2016$MinorityRace))

## 
##         0         1 
## 0.7168476 0.2831524

barplot(prop.table(table(anes2016$MinorityRace)), main="Original Data", ylim=c(0, 1))

barplot(prop.table(table(anes2016$MinorityRace.imp)), main="Imputed Data", ylim=c(0, 1))

Logit Regression Analysis Modal Imputation Data:

sub2<-dplyr::select(anes2016, SelfratedHlth.imp, Income.imp, Generation.imp, Employed.imp, CollegeEd.imp, MinorityRace.imp) %>%
  filter(complete.cases(.))


options(survey.lonely.psu = "adjust")
des<-svydesign(nest = TRUE, ids=~anes2016$V160202, strata =~anes2016$V160201, weights =~anes2016$V160101, data=anes2016)

fit.logit2<-svyglm(SelfratedHlth.imp~ Income.imp+Generation.imp+Employed.imp+CollegeEd.imp+MinorityRace.imp, 
          design= des,
          family=binomial)

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

summary(fit.logit2)

## 
## Call:
## svyglm(formula = SelfratedHlth.imp ~ Income.imp + Generation.imp + 
##     Employed.imp + CollegeEd.imp + MinorityRace.imp, design = des, 
##     family = binomial)
## 
## Survey design:
## svydesign(nest = TRUE, ids = ~anes2016$V160202, strata = ~anes2016$V160201, 
##     weights = ~anes2016$V160101, data = anes2016)
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              1.35505    0.16302   8.312 1.34e-13 ***
## Income.imp49.9k or less -1.01197    0.15751  -6.425 2.51e-09 ***
## Income.imp50k to 99.9k  -0.02984    0.17608  -0.169  0.86569    
## Generation.impMillenial  0.65585    0.13737   4.774 4.95e-06 ***
## Generation.impX         -0.03743    0.11740  -0.319  0.75038    
## Generation.impZ          1.49678    0.45707   3.275  0.00137 ** 
## Employed.imp1            0.88109    0.10986   8.020 6.50e-13 ***
## CollegeEd.imp1           0.48587    0.10060   4.830 3.92e-06 ***
## MinorityRace.imp1       -0.15367    0.10820  -1.420  0.15803    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1.027584)
## 
## Number of Fisher Scoring iterations: 5

Comparison of Model and Original Data:

The dispersion parameter for model 2 (fit.logit2) using the imputed data is lower than the dispersion parameter of model 1 - difference -0.054927. Therefore, modal imputation improved the model fit of this analysis.

Multiple Imputation:

md.pattern(anes2016[,c("SelfratedHlth", "Income", "Generation", "Employed", "CollegeEd", "MinorityRace")])

##      SelfratedHlth Employed MinorityRace CollegeEd Income Generation    
## 3494             1        1            1         1      1          1   0
## 522              1        1            1         1      1          0   1
## 103              1        1            1         1      0          1   1
## 75               1        1            1         1      0          0   2
## 20               1        1            1         0      1          1   1
## 9                1        1            1         0      1          0   2
## 2                1        1            1         0      0          1   2
## 2                1        1            1         0      0          0   3
## 12               1        1            0         1      1          1   1
## 3                1        1            0         1      1          0   2
## 7                1        1            0         1      0          0   3
## 1                1        1            0         0      0          0   4
## 1                1        0            1         1      1          0   2
## 1                1        0            1         1      0          1   2
## 1                1        0            1         1      0          0   3
## 1                1        0            1         0      1          1   2
## 3                1        0            1         0      0          0   4
## 2                1        0            0         1      1          0   3
## 1                1        0            0         0      0          1   4
## 3                1        0            0         0      0          0   5
## 2                0        1            1         1      1          1   1
## 1                0        1            1         1      1          0   2
## 1                0        1            1         1      0          0   3
## 1                0        1            0         1      1          1   2
## 1                0        0            0         1      1          0   4
## 2                0        0            0         0      0          0   6
##                  8       16           33        44    202        634 937

md.pairs(anes2016[,c("SelfratedHlth", "Income", "Generation", "Employed", "CollegeEd", "MinorityRace")])

## $rr
##               SelfratedHlth Income Generation Employed CollegeEd MinorityRace
## SelfratedHlth          4263   4064       3634     4250      4221         4234
## Income                 4064   4069       3530     4064      4039         4050
## Generation             3634   3530       3637     3634      3613         3623
## Employed               4250   4064       3634     4255      4221         4231
## CollegeEd              4221   4039       3613     4221      4227         4201
## MinorityRace           4234   4050       3623     4231      4201         4238
## 
## $rm
##               SelfratedHlth Income Generation Employed CollegeEd MinorityRace
## SelfratedHlth             0    199        629       13        42           29
## Income                    5      0        539        5        30           19
## Generation                3    107          0        3        24           14
## Employed                  5    191        621        0        34           24
## CollegeEd                 6    188        614        6         0           26
## MinorityRace              4    188        615        7        37            0
## 
## $mr
##               SelfratedHlth Income Generation Employed CollegeEd MinorityRace
## SelfratedHlth             0      5          3        5         6            4
## Income                  199      0        107      191       188          188
## Generation              629    539          0      621       614          615
## Employed                 13      5          3        0         6            7
## CollegeEd                42     30         24       34         0           37
## MinorityRace             29     19         14       24        26            0
## 
## $mm
##               SelfratedHlth Income Generation Employed CollegeEd MinorityRace
## SelfratedHlth             8      3          5        3         2            4
## Income                    3    202         95       11        14           14
## Generation                5     95        634       13        20           19
## Employed                  3     11         13       16        10            9
## CollegeEd                 2     14         20       10        44            7
## MinorityRace              4     14         19        9         7           33

dat2<-anes2016
samp2<-sample(1:dim(dat2)[1], replace = F, size = 500)

imp<-mice(data = dat2[,c("SelfratedHlth", "Income", "Generation", "Employed", "CollegeEd", "MinorityRace")], seed = 22, m = 5)

## 
##  iter imp variable
##   1   1  SelfratedHlth  Income  Generation  Employed  CollegeEd  MinorityRace
##   1   2  SelfratedHlth  Income  Generation  Employed  CollegeEd  MinorityRace
##   1   3  SelfratedHlth  Income  Generation  Employed  CollegeEd  MinorityRace
##   1   4  SelfratedHlth  Income  Generation  Employed  CollegeEd  MinorityRace
##   1   5  SelfratedHlth  Income  Generation  Employed  CollegeEd  MinorityRace
##   2   1  SelfratedHlth  Income  Generation  Employed  CollegeEd  MinorityRace
##   2   2  SelfratedHlth  Income  Generation  Employed  CollegeEd  MinorityRace
##   2   3  SelfratedHlth  Income  Generation  Employed  CollegeEd  MinorityRace
##   2   4  SelfratedHlth  Income  Generation  Employed  CollegeEd  MinorityRace
##   2   5  SelfratedHlth  Income  Generation  Employed  CollegeEd  MinorityRace
##   3   1  SelfratedHlth  Income  Generation  Employed  CollegeEd  MinorityRace
##   3   2  SelfratedHlth  Income  Generation  Employed  CollegeEd  MinorityRace
##   3   3  SelfratedHlth  Income  Generation  Employed  CollegeEd  MinorityRace
##   3   4  SelfratedHlth  Income  Generation  Employed  CollegeEd  MinorityRace
##   3   5  SelfratedHlth  Income  Generation  Employed  CollegeEd  MinorityRace
##   4   1  SelfratedHlth  Income  Generation  Employed  CollegeEd  MinorityRace
##   4   2  SelfratedHlth  Income  Generation  Employed  CollegeEd  MinorityRace
##   4   3  SelfratedHlth  Income  Generation  Employed  CollegeEd  MinorityRace
##   4   4  SelfratedHlth  Income  Generation  Employed  CollegeEd  MinorityRace
##   4   5  SelfratedHlth  Income  Generation  Employed  CollegeEd  MinorityRace
##   5   1  SelfratedHlth  Income  Generation  Employed  CollegeEd  MinorityRace
##   5   2  SelfratedHlth  Income  Generation  Employed  CollegeEd  MinorityRace
##   5   3  SelfratedHlth  Income  Generation  Employed  CollegeEd  MinorityRace
##   5   4  SelfratedHlth  Income  Generation  Employed  CollegeEd  MinorityRace
##   5   5  SelfratedHlth  Income  Generation  Employed  CollegeEd  MinorityRace

print(imp)

## Class: mids
## Number of multiple imputations:  5 
## Imputation methods:
## SelfratedHlth        Income    Generation      Employed     CollegeEd 
##      "logreg"     "polyreg"     "polyreg"      "logreg"      "logreg" 
##  MinorityRace 
##      "logreg" 
## PredictorMatrix:
##               SelfratedHlth Income Generation Employed CollegeEd MinorityRace
## SelfratedHlth             0      1          1        1         1            1
## Income                    1      0          1        1         1            1
## Generation                1      1          0        1         1            1
## Employed                  1      1          1        0         1            1
## CollegeEd                 1      1          1        1         0            1
## MinorityRace              1      1          1        1         1            0

head(imp$imp$SelfratedHlth)

##             1        2        3        4        5
## 982  Positive Negative Positive Positive Positive
## 1573 Positive Negative Negative Positive Positive
## 2190 Negative Negative Positive Negative Negative
## 2459 Positive Positive Positive Positive Positive
## 2465 Positive Positive Positive Positive Positive
## 3151 Negative Positive Positive Positive Negative

summary(imp$imp$SelfratedHlth)

##         1            2            3            4            5    
##  Negative:3   Negative:3   Negative:2   Negative:1   Negative:2  
##  Positive:5   Positive:5   Positive:6   Positive:7   Positive:6

summary(anes2016$SelfratedHlth)

## Negative Positive     NA's 
##      750     3513        8

head(imp$imp$Income)

##                1             2             3             4             5
## 23  100k to 150k 49.9k or less  50k to 99.9k  100k to 150k  100k to 150k
## 27  50k to 99.9k  100k to 150k  50k to 99.9k  100k to 150k  50k to 99.9k
## 33  50k to 99.9k 49.9k or less 49.9k or less  50k to 99.9k 49.9k or less
## 41 49.9k or less  50k to 99.9k 49.9k or less  50k to 99.9k 49.9k or less
## 61 49.9k or less 49.9k or less 49.9k or less 49.9k or less 49.9k or less
## 93 49.9k or less 49.9k or less  50k to 99.9k 49.9k or less  100k to 150k

summary(imp$imp$Income)

##              1                  2                   3                  4     
##  100k to 150k :54   100k to 150k : 43   100k to 150k :55   100k to 150k :46  
##  49.9k or less:96   49.9k or less:102   49.9k or less:92   49.9k or less:91  
##  50k to 99.9k :52   50k to 99.9k : 57   50k to 99.9k :55   50k to 99.9k :65  
##              5     
##  100k to 150k :45  
##  49.9k or less:87  
##  50k to 99.9k :70

summary(anes2016$Income)

##  100k to 150k 49.9k or less  50k to 99.9k          NA's 
##           988          1809          1272           202

head(imp$imp$Generation)

##             1          2          3          4          5
## 12  Millenial BabyBoomer BabyBoomer          Z BabyBoomer
## 29 BabyBoomer BabyBoomer BabyBoomer BabyBoomer          X
## 33 BabyBoomer BabyBoomer  Millenial BabyBoomer          Z
## 41  Millenial BabyBoomer          X BabyBoomer          X
## 57 BabyBoomer          X  Millenial          X  Millenial
## 62 BabyBoomer BabyBoomer          Z          X BabyBoomer

summary(imp$imp$Generation)

##           1                2                3                4      
##  BabyBoomer:340   BabyBoomer:346   BabyBoomer:338   BabyBoomer:333  
##  Millenial :148   Millenial :147   Millenial :148   Millenial :137  
##  X         :136   X         :121   X         :130   X         :145  
##  Z         : 10   Z         : 20   Z         : 18   Z         : 19  
##           5      
##  BabyBoomer:353  
##  Millenial :145  
##  X         :116  
##  Z         : 20

summary(anes2016$Generation)

## BabyBoomer  Millenial          X          Z       NA's 
##       1473       1051       1046         67        634

head(imp$imp$Employed)

##      1 2 3 4 5
## 982  0 0 0 1 0
## 1224 0 1 1 1 1
## 1799 0 0 1 0 0
## 1844 1 1 0 1 1
## 1885 0 0 1 0 1
## 2076 1 0 1 1 1

summary(imp$imp$Employed)

##  1     2      3      4      5    
##  0:7   0: 5   0: 4   0: 3   0:7  
##  1:9   1:11   1:12   1:13   1:9

summary(anes2016$Employed)

##    0    1 NA's 
## 1708 2547   16

head(imp$imp$CollegeEd)

##     1 2 3 4 5
## 49  0 0 1 1 1
## 132 1 1 0 1 1
## 259 1 0 1 0 0
## 429 1 1 0 0 0
## 691 0 0 1 1 1
## 816 1 0 0 1 1

summary(imp$imp$CollegeEd)

##  1      2      3      4      5     
##  0:17   0:24   0:23   0:16   0:21  
##  1:27   1:20   1:21   1:28   1:23

summary(anes2016$CollegeEd)

##    0    1 NA's 
## 1991 2236   44

head(imp$imp$MinorityRace)

##      1 2 3 4 5
## 572  0 0 1 0 0
## 904  1 0 0 0 0
## 982  1 1 0 0 1
## 1059 0 0 0 0 0
## 1232 0 1 0 0 0
## 1411 0 1 1 0 1

summary(imp$imp$MinorityRace)

##  1      2      3      4      5     
##  0:26   0:23   0:20   0:22   0:23  
##  1: 7   1:10   1:13   1:11   1:10

summary(anes2016$MinorityRace)

##    0    1 NA's 
## 3038 1200   33

library(lattice)
stripplot(imp,SelfratedHlth~Income|.imp, pch=20)

dat.imp <-complete(imp, action = 1)
head(dat.imp, n=10)

##    SelfratedHlth        Income Generation Employed CollegeEd MinorityRace
## 1       Positive 49.9k or less  Millenial        1         0            0
## 2       Negative  50k to 99.9k  Millenial        1         1            0
## 3       Positive 49.9k or less  Millenial        1         0            0
## 4       Positive  50k to 99.9k BabyBoomer        1         0            0
## 5       Positive 49.9k or less          X        1         0            0
## 6       Positive 49.9k or less BabyBoomer        1         1            0
## 7       Negative  50k to 99.9k BabyBoomer        1         0            1
## 8       Positive  100k to 150k BabyBoomer        1         0            0
## 9       Positive 49.9k or less          X        1         0            1
## 10      Positive  50k to 99.9k  Millenial        1         0            0

head(anes2016[,c("SelfratedHlth", "Income", "Generation", "Employed", "CollegeEd", "MinorityRace")], n=10)

## # A tibble: 10 x 6
##    SelfratedHlth Income        Generation Employed CollegeEd MinorityRace
##    <fct>         <fct>         <fct>      <fct>    <fct>     <fct>       
##  1 Positive      49.9k or less Millenial  1        0         0           
##  2 Negative      50k to 99.9k  Millenial  1        1         0           
##  3 Positive      49.9k or less Millenial  1        0         0           
##  4 Positive      50k to 99.9k  BabyBoomer 1        0         0           
##  5 Positive      49.9k or less X          1        0         0           
##  6 Positive      49.9k or less BabyBoomer 1        1         0           
##  7 Negative      50k to 99.9k  BabyBoomer 1        0         1           
##  8 Positive      100k to 150k  BabyBoomer 1        0         0           
##  9 Positive      49.9k or less X          1        0         1           
## 10 Positive      50k to 99.9k  Millenial  1        0         0

Most of the data are categorical and the resemblance is robust.

head(dat.imp[is.na(anes2016$SelfratedHlth)==T,], n=10)

##      SelfratedHlth        Income Generation Employed CollegeEd MinorityRace
## 982       Positive 49.9k or less BabyBoomer        0         1            1
## 1573      Positive 49.9k or less BabyBoomer        0         1            0
## 2190      Negative 49.9k or less BabyBoomer        0         1            0
## 2459      Positive  50k to 99.9k  Millenial        1         1            0
## 2465      Positive  100k to 150k  Millenial        1         1            0
## 3151      Negative 49.9k or less BabyBoomer        1         0            1
## 3788      Positive 49.9k or less          X        1         0            1
## 3922      Negative 49.9k or less BabyBoomer        0         1            0

After doing this, greater variation in missingness between the original and imputed data is apparent.

fit.SelfratedHlth<-with(data=imp ,expr=svyglm(SelfratedHlth~ Income+Generation+Employed+CollegeEd+MinorityRace,
          design= des,
          family=binomial))

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

summary(fit.SelfratedHlth)

## # A tibble: 45 x 5
##    term                estimate std.error statistic  p.value
##    <chr>                  <dbl>     <dbl>     <dbl>    <dbl>
##  1 (Intercept)           1.42       0.189     7.50  1.02e-11
##  2 Income49.9k or less  -1.22       0.183    -6.69  6.54e-10
##  3 Income50k to 99.9k   -0.0879     0.206    -0.426 6.71e- 1
##  4 GenerationMillenial   0.803      0.152     5.27  5.77e- 7
##  5 GenerationX           0.104      0.131     0.798 4.27e- 1
##  6 GenerationZ           1.59       0.469     3.39  9.41e- 4
##  7 Employed1             0.780      0.121     6.45  2.17e- 9
##  8 CollegeEd1            0.491      0.116     4.24  4.31e- 5
##  9 MinorityRace1        -0.108      0.129    -0.835 4.05e- 1
## 10 (Intercept)           1.42       0.189     7.50  1.02e-11
## # ... with 35 more rows

Comparison Mulitple Imputed and Original Data:

fit.imp<-svyglm(SelfratedHlth~ Income+Generation+Employed+CollegeEd+MinorityRace, design= des,
          family=binomial, data=dat.imp)

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

summary(fit.imp)

## 
## Call:
## svyglm(formula = SelfratedHlth ~ Income + Generation + Employed + 
##     CollegeEd + MinorityRace, design = des, family = binomial, 
##     data = dat.imp)
## 
## Survey design:
## svydesign(nest = TRUE, ids = ~anes2016$V160202, strata = ~anes2016$V160201, 
##     weights = ~anes2016$V160101, data = anes2016)
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.41778    0.18900   7.501 1.02e-11 ***
## Income49.9k or less -1.22422    0.18287  -6.695 6.54e-10 ***
## Income50k to 99.9k  -0.08791    0.20620  -0.426 0.670620    
## GenerationMillenial  0.80325    0.15240   5.271 5.77e-07 ***
## GenerationX          0.10448    0.13100   0.798 0.426626    
## GenerationZ          1.58992    0.46924   3.388 0.000941 ***
## Employed1            0.77979    0.12082   6.454 2.17e-09 ***
## CollegeEd1           0.49084    0.11576   4.240 4.31e-05 ***
## MinorityRace1       -0.10761    0.12884  -0.835 0.405166    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1.082511)
## 
## Number of Fisher Scoring iterations: 5

summary(fit.logit1)

## 
## Call:
## svyglm(formula = SelfratedHlth ~ Income + Generation + Employed + 
##     CollegeEd + MinorityRace, design = des, family = binomial)
## 
## Survey design:
## svydesign(nest = TRUE, ids = ~anes2016$V160202, strata = ~anes2016$V160201, 
##     weights = ~anes2016$V160101, data = anes2016)
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          1.41778    0.18900   7.501 1.02e-11 ***
## Income49.9k or less -1.22422    0.18287  -6.695 6.54e-10 ***
## Income50k to 99.9k  -0.08791    0.20620  -0.426 0.670620    
## GenerationMillenial  0.80325    0.15240   5.271 5.77e-07 ***
## GenerationX          0.10448    0.13100   0.798 0.426626    
## GenerationZ          1.58992    0.46924   3.388 0.000941 ***
## Employed1            0.77979    0.12082   6.454 2.17e-09 ***
## CollegeEd1           0.49084    0.11576   4.240 4.31e-05 ***
## MinorityRace1       -0.10761    0.12884  -0.835 0.405166    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1.082511)
## 
## Number of Fisher Scoring iterations: 5

summary(fit.logit2)

## 
## Call:
## svyglm(formula = SelfratedHlth.imp ~ Income.imp + Generation.imp + 
##     Employed.imp + CollegeEd.imp + MinorityRace.imp, design = des, 
##     family = binomial)
## 
## Survey design:
## svydesign(nest = TRUE, ids = ~anes2016$V160202, strata = ~anes2016$V160201, 
##     weights = ~anes2016$V160101, data = anes2016)
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              1.35505    0.16302   8.312 1.34e-13 ***
## Income.imp49.9k or less -1.01197    0.15751  -6.425 2.51e-09 ***
## Income.imp50k to 99.9k  -0.02984    0.17608  -0.169  0.86569    
## Generation.impMillenial  0.65585    0.13737   4.774 4.95e-06 ***
## Generation.impX         -0.03743    0.11740  -0.319  0.75038    
## Generation.impZ          1.49678    0.45707   3.275  0.00137 ** 
## Employed.imp1            0.88109    0.10986   8.020 6.50e-13 ***
## CollegeEd.imp1           0.48587    0.10060   4.830 3.92e-06 ***
## MinorityRace.imp1       -0.15367    0.10820  -1.420  0.15803    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1.027584)
## 
## Number of Fisher Scoring iterations: 5

Concluding remarks on model fit:

After attempting the multiple imputation I received an output that showed no difference from the original data (fit.logit1 & fit.imp). The modal imputation provided the best results for the data used in the analysis. Model 3 (fit.logit2) has a dispersion parameter closest to 1, and provides the best model fit to measure the variation in the outcome.

Multiple Imputation

Jairemy Edwards

4/27/2020

Data

The American National Election Studies. These materials are based on work supported by the National Science Foundation under grant numbers SES 1444721, 2014-2017, the University of Michigan, and Stanford University. N = 4271

Outcome:

Predictors:

Logit Regression Analysis Original Data:

Missingness Pattern:

Comparison of Model and Original Data:

Multiple Imputation:

Most of the data are categorical and the resemblance is robust.

After doing this, greater variation in missingness between the original and imputed data is apparent.

Comparison Mulitple Imputed and Original Data:

Concluding remarks on model fit: