library(foreign)
library(tidyverse)
## -- Attaching packages ------------------------------------------------ tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.3
## v tidyr   1.1.1     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts --------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(mice)
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
## 
##     cluster
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart

Loading my merged NHANES data

load("C:/Users/maman/OneDrive/DEM Spring 2021/DEM 7283 Stats 2/NHANES data/nhanes_10_18_merged.Rdata")
summary(nhanes_all)
##       seqn           sdmvpsu         sdmvstra         weight      
##  Min.   : 51653   Min.   :1.000   Min.   : 75.0   Min.   :  2181  
##  1st Qu.: 63811   1st Qu.:1.000   1st Qu.: 91.0   1st Qu.:  8237  
##  Median : 78435   Median :2.000   Median :110.0   Median : 12551  
##  Mean   : 77733   Mean   :1.534   Mean   :110.8   Mean   : 19843  
##  3rd Qu.: 91538   3rd Qu.:2.000   3rd Qu.:131.0   3rd Qu.: 23325  
##  Max.   :102956   Max.   :3.000   Max.   :148.0   Max.   :216543  
##                                                                   
##     newpsu              year               sex          bmicat         
##  Length:12660       Length:12660       Female:6519   Length:12660      
##  Class :character   Class :character   Male  :6141   Class :character  
##  Mode  :character   Mode  :character                 Mode  :character  
##                                                                        
##                                                                        
##                                                                        
##                                                                        
##      bmxbmi            race_eth       ridageyr         agec       marst      
##  Min.   :13.40   1 white   :4964   Min.   :18.0   (0,17] :   0   NA's:12660  
##  1st Qu.:24.14   2 hispanic:3289   1st Qu.:33.0   (17,65]:9970               
##  Median :27.93   3 black   :2612   Median :48.0   (65,80]:2690               
##  Mean   :29.15   4 other   :1280   Mean   :48.5                              
##  3rd Qu.:32.70   NA's      : 515   3rd Qu.:63.0                              
##  Max.   :86.20                     Max.   :80.0                              
##  NA's   :146                                                                 
##          educ          pov              pov2          fpg           
##  1 lths    :2863   Length:12660       0   :8783   Length:12660      
##  2 hs      :2671   Class :character   1   :2644   Class :character  
##  3 some col:3585   Mode  :character   NA's:1233   Mode  :character  
##  4 col grad:2926                                                    
##  NA's      : 615                                                    
##                                                                     
##                                                                     
##      diab               pred           cur_smoke    doctor      pre_doctor 
##  Length:12660       Length:12660       0   :2898   0   :10978   0   :9829  
##  Class :character   Class :character   1   :2370   1   : 1674   1   : 825  
##  Mode  :character   Mode  :character   NA's:7392   NA's:    8   NA's:2006  
##                                                                            
##                                                                            
##                                                                            
## 

Measure an outcome variable and at least 5 predictors

# The outcome variable that will be measured is pre-diabetes or greater using levels of a1c. The predictors are bmi category, race/ethnicity, sex, education, and poverty.

Patterns of missingness

md.pattern(nhanes_all[,c("bmxbmi", "sex", "pov2","educ","race_eth")])

##       sex bmxbmi race_eth educ pov2     
## 10359   1      1        1    1    1    0
## 1064    1      1        1    1    0    1
## 511     1      1        1    0    1    1
## 73      1      1        1    0    0    2
## 413     1      1        0    1    1    1
## 78      1      1        0    1    0    2
## 14      1      1        0    0    1    2
## 2       1      1        0    0    0    3
## 111     1      0        1    1    1    1
## 12      1      0        1    1    0    2
## 13      1      0        1    0    1    2
## 2       1      0        1    0    0    3
## 6       1      0        0    1    1    2
## 2       1      0        0    1    0    3
##         0    146      515  615 1233 2509
# There were 1064 cases where poverty was the only missing variable, the next highest was education at 511 cases. The largest number of missing cases among pairs was 78 individuals who had missing values for race_eth and poverty.

Mean imputation for numeric data

summary(nhanes_all$bmxbmi)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   13.40   24.14   27.93   29.15   32.70   86.20     146
#what happens when we replace the missings with the mean?
nhanes_all$bmxbmi.imp.mean<-ifelse(is.na(nhanes_all$bmxbmi)==T, mean(nhanes_all$bmxbmi, na.rm=T), nhanes_all$bmxbmi)

mean(nhanes_all$bmxbmi)
## [1] NA
mean(nhanes_all$bmxbmi.imp.mean)
## [1] 29.14705

Multiple imputation

imp<-mice(data = nhanes_all[,c("bmxbmi", "sex", "pov2","educ","race_eth")], seed=22,  m = 5)
## 
##  iter imp variable
##   1   1  bmxbmi  pov2  educ  race_eth
##   1   2  bmxbmi  pov2  educ  race_eth
##   1   3  bmxbmi  pov2  educ  race_eth
##   1   4  bmxbmi  pov2  educ  race_eth
##   1   5  bmxbmi  pov2  educ  race_eth
##   2   1  bmxbmi  pov2  educ  race_eth
##   2   2  bmxbmi  pov2  educ  race_eth
##   2   3  bmxbmi  pov2  educ  race_eth
##   2   4  bmxbmi  pov2  educ  race_eth
##   2   5  bmxbmi  pov2  educ  race_eth
##   3   1  bmxbmi  pov2  educ  race_eth
##   3   2  bmxbmi  pov2  educ  race_eth
##   3   3  bmxbmi  pov2  educ  race_eth
##   3   4  bmxbmi  pov2  educ  race_eth
##   3   5  bmxbmi  pov2  educ  race_eth
##   4   1  bmxbmi  pov2  educ  race_eth
##   4   2  bmxbmi  pov2  educ  race_eth
##   4   3  bmxbmi  pov2  educ  race_eth
##   4   4  bmxbmi  pov2  educ  race_eth
##   4   5  bmxbmi  pov2  educ  race_eth
##   5   1  bmxbmi  pov2  educ  race_eth
##   5   2  bmxbmi  pov2  educ  race_eth
##   5   3  bmxbmi  pov2  educ  race_eth
##   5   4  bmxbmi  pov2  educ  race_eth
##   5   5  bmxbmi  pov2  educ  race_eth
print(imp)
## Class: mids
## Number of multiple imputations:  5 
## Imputation methods:
##    bmxbmi       sex      pov2      educ  race_eth 
##     "pmm"        ""  "logreg" "polyreg" "polyreg" 
## PredictorMatrix:
##          bmxbmi sex pov2 educ race_eth
## bmxbmi        0   1    1    1        1
## sex           1   0    1    1        1
## pov2          1   1    0    1        1
## educ          1   1    1    0        1
## race_eth      1   1    1    1        0

Analysis with multiple imputed data

fit.logit3<-with(data=imp,expr=svyglm(pred2~race_eth+educ+sex+pov2+bmxbmi,
                  design= nodes,
                  family=binomial))
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.logit3)
## # A tibble: 50 x 4
##    term               estimate std.error  nobs
##    <chr>                 <dbl>     <dbl> <int>
##  1 (Intercept)        -2.41          Inf 10284
##  2 race_eth2 hispanic  0.00606       Inf 10284
##  3 race_eth3 black     0.616         Inf 10284
##  4 race_eth4 other     0.442         Inf 10284
##  5 educ2 hs           -0.256         Inf 10284
##  6 educ3 some col     -0.694         Inf 10284
##  7 educ4 col grad     -0.813         Inf 10284
##  8 sexMale             0.0346        Inf 10284
##  9 pov21              -0.180         Inf 10284
## 10 bmxbmi              0.0725        Inf 10284
## # ... with 40 more rows
# Coefficients remain largely similar between the model using mean and modal imputation and the one with multiple imputation. The gradient seen in education remains present. The sign for the Hispanic coefficient changed from negative, in the mean/modal model, to positive in the multiple imputation model.

Looking at analysis using original missing data and imputed data

# After comparing the coefficients from imputed models to the original data with missingness, the multiple imputation appears to match results from the original the best. Regression coefficient signs matched across both models. The mean and modal imputation switched the coefficient signs of a few variables.