library(foreign)
library(tidyverse)
## -- Attaching packages ------------------------------------------------ tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.0.3 v dplyr 1.0.3
## v tidyr 1.1.1 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts --------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(mice)
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(survey)
## Loading required package: grid
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
Loading my merged NHANES data
load("C:/Users/maman/OneDrive/DEM Spring 2021/DEM 7283 Stats 2/NHANES data/nhanes_10_18_merged.Rdata")
summary(nhanes_all)
## seqn sdmvpsu sdmvstra weight
## Min. : 51653 Min. :1.000 Min. : 75.0 Min. : 2181
## 1st Qu.: 63811 1st Qu.:1.000 1st Qu.: 91.0 1st Qu.: 8237
## Median : 78435 Median :2.000 Median :110.0 Median : 12551
## Mean : 77733 Mean :1.534 Mean :110.8 Mean : 19843
## 3rd Qu.: 91538 3rd Qu.:2.000 3rd Qu.:131.0 3rd Qu.: 23325
## Max. :102956 Max. :3.000 Max. :148.0 Max. :216543
##
## newpsu year sex bmicat
## Length:12660 Length:12660 Female:6519 Length:12660
## Class :character Class :character Male :6141 Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## bmxbmi race_eth ridageyr agec marst
## Min. :13.40 1 white :4964 Min. :18.0 (0,17] : 0 NA's:12660
## 1st Qu.:24.14 2 hispanic:3289 1st Qu.:33.0 (17,65]:9970
## Median :27.93 3 black :2612 Median :48.0 (65,80]:2690
## Mean :29.15 4 other :1280 Mean :48.5
## 3rd Qu.:32.70 NA's : 515 3rd Qu.:63.0
## Max. :86.20 Max. :80.0
## NA's :146
## educ pov pov2 fpg
## 1 lths :2863 Length:12660 0 :8783 Length:12660
## 2 hs :2671 Class :character 1 :2644 Class :character
## 3 some col:3585 Mode :character NA's:1233 Mode :character
## 4 col grad:2926
## NA's : 615
##
##
## diab pred cur_smoke doctor pre_doctor
## Length:12660 Length:12660 0 :2898 0 :10978 0 :9829
## Class :character Class :character 1 :2370 1 : 1674 1 : 825
## Mode :character Mode :character NA's:7392 NA's: 8 NA's:2006
##
##
##
##
# The outcome variable that will be measured is pre-diabetes or greater using levels of a1c. The predictors are bmi category, race/ethnicity, sex, education, and poverty.
md.pattern(nhanes_all[,c("bmxbmi", "sex", "pov2","educ","race_eth")])
## sex bmxbmi race_eth educ pov2
## 10359 1 1 1 1 1 0
## 1064 1 1 1 1 0 1
## 511 1 1 1 0 1 1
## 73 1 1 1 0 0 2
## 413 1 1 0 1 1 1
## 78 1 1 0 1 0 2
## 14 1 1 0 0 1 2
## 2 1 1 0 0 0 3
## 111 1 0 1 1 1 1
## 12 1 0 1 1 0 2
## 13 1 0 1 0 1 2
## 2 1 0 1 0 0 3
## 6 1 0 0 1 1 2
## 2 1 0 0 1 0 3
## 0 146 515 615 1233 2509
# There were 1064 cases where poverty was the only missing variable, the next highest was education at 511 cases. The largest number of missing cases among pairs was 78 individuals who had missing values for race_eth and poverty.
summary(nhanes_all$bmxbmi)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 13.40 24.14 27.93 29.15 32.70 86.20 146
#what happens when we replace the missings with the mean?
nhanes_all$bmxbmi.imp.mean<-ifelse(is.na(nhanes_all$bmxbmi)==T, mean(nhanes_all$bmxbmi, na.rm=T), nhanes_all$bmxbmi)
mean(nhanes_all$bmxbmi)
## [1] NA
mean(nhanes_all$bmxbmi.imp.mean)
## [1] 29.14705
table(nhanes_all$educ)
##
## 1 lths 2 hs 3 some col 4 col grad
## 2863 2671 3585 2926
#find the most common value
mcv.educ<-factor(names(which.max(table(nhanes_all$educ))), levels=levels(nhanes_all$educ))
mcv.educ
## [1] 3 some col
## Levels: 1 lths 2 hs 3 some col 4 col grad
#impute the cases
nhanes_all$educ.imp<-as.factor(ifelse(is.na(nhanes_all$educ)==T, mcv.educ, nhanes_all$educ))
levels(nhanes_all$educ.imp)<-levels(nhanes_all$educ)
prop.table(table(nhanes_all$educ))
##
## 1 lths 2 hs 3 some col 4 col grad
## 0.2376920 0.2217518 0.2976339 0.2429224
prop.table(table(nhanes_all$educ.imp))
##
## 1 lths 2 hs 3 some col 4 col grad
## 0.2261453 0.2109795 0.3317536 0.2311216
barplot(prop.table(table(nhanes_all$educ)), main="Original Data")
barplot(prop.table(table(nhanes_all$educ.imp)), main="Imputed Data")
table(nhanes_all$race_eth)
##
## 1 white 2 hispanic 3 black 4 other
## 4964 3289 2612 1280
#find the most common value
mcv.race_eth<-factor(names(which.max(table(nhanes_all$race_eth))), levels=levels(nhanes_all$race_eth))
mcv.race_eth
## [1] 1 white
## Levels: 1 white 2 hispanic 3 black 4 other
#impute the cases
nhanes_all$race_eth.imp<-as.factor(ifelse(is.na(nhanes_all$race_eth)==T, mcv.race_eth, nhanes_all$race_eth))
levels(nhanes_all$race_eth.imp)<-levels(nhanes_all$race_eth)
prop.table(table(nhanes_all$race_eth))
##
## 1 white 2 hispanic 3 black 4 other
## 0.4087279 0.2708110 0.2150679 0.1053932
prop.table(table(nhanes_all$race_eth.imp))
##
## 1 white 2 hispanic 3 black 4 other
## 0.4327804 0.2597946 0.2063191 0.1011058
barplot(prop.table(table(nhanes_all$race_eth)), main="Original Data")
barplot(prop.table(table(nhanes_all$race_eth.imp)), main="Imputed Data")
table(nhanes_all$pov2)
##
## 0 1
## 8783 2644
#find the most common value
mcv.pov2<-factor(names(which.max(table(nhanes_all$pov2))), levels=levels(nhanes_all$pov2))
mcv.pov2
## [1] 0
## Levels: 0 1
#impute the cases
nhanes_all$pov2.imp<-as.factor(ifelse(is.na(nhanes_all$pov2)==T, mcv.pov2, nhanes_all$pov2))
levels(nhanes_all$pov2.imp)<-levels(nhanes_all$pov2)
prop.table(table(nhanes_all$pov2))
##
## 0 1
## 0.7686182 0.2313818
prop.table(table(nhanes_all$pov2.imp))
##
## 0 1
## 0.7911532 0.2088468
barplot(prop.table(table(nhanes_all$pov2)), main="Original Data")
barplot(prop.table(table(nhanes_all$pov2.imp)), main="Imputed Data")
non-imputed data
nhanes_all$pred2 <- Recode(nhanes_all$pred, recodes= "'yes'=1; 'no'=0; else=NA", as.factor=T)
nodes<-svydesign(ids = ~newpsu, strata=~sdmvstra, weights = ~weight, data =nhanes_all, nest=TRUE)
options(survey.lonely.psu = "adjust")
fit.logit<-svyglm(pred2~race_eth+educ+sex+pov2+bmxbmi,
design= nodes,
family=binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.logit)
##
## Call:
## svyglm(formula = pred2 ~ race_eth + educ + sex + pov2 + bmxbmi,
## design = nodes, family = binomial)
##
## Survey design:
## svydesign(ids = ~newpsu, strata = ~sdmvstra, weights = ~weight,
## data = nhanes_all, nest = TRUE)
##
## Coefficients:
## Estimate Std. Error
## (Intercept) -2.41364 Inf
## race_eth2 hispanic 0.00606 Inf
## race_eth3 black 0.61560 Inf
## race_eth4 other 0.44235 Inf
## educ2 hs -0.25591 Inf
## educ3 some col -0.69396 Inf
## educ4 col grad -0.81337 Inf
## sexMale 0.03463 Inf
## pov21 -0.17972 Inf
## bmxbmi 0.07246 Inf
##
## (Dispersion parameter for binomial family taken to be 1.057936)
##
## Number of Fisher Scoring iterations: 4
Model with mean and modal imputation
fit.logit2<-svyglm(pred2~race_eth+educ+sex+pov2.imp+bmxbmi.imp.mean,
design= nodes,
family=binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.logit2)
##
## Call:
## svyglm(formula = pred2 ~ race_eth + educ + sex + pov2.imp + bmxbmi.imp.mean,
## design = nodes, family = binomial)
##
## Survey design:
## svydesign(ids = ~newpsu, strata = ~sdmvstra, weights = ~weight,
## data = nhanes_all, nest = TRUE)
##
## Coefficients:
## Estimate Std. Error
## (Intercept) -2.40021 Inf
## race_eth2 hispanic -0.01589 Inf
## race_eth3 black 0.57171 Inf
## race_eth4 other 0.43104 Inf
## educ2 hs -0.25144 Inf
## educ3 some col -0.66537 Inf
## educ4 col grad -0.80548 Inf
## sexMale 0.02316 Inf
## pov2.imp1 -0.17941 Inf
## bmxbmi.imp.mean 0.07270 Inf
##
## (Dispersion parameter for binomial family taken to be 1.032756)
##
## Number of Fisher Scoring iterations: 4
# Educational attainment has a negative gradient in coefficients obtained from the logistic regression, signaling a lower odds of having prediabetes with each level of education. The imputed values for poverty had a negative coefficient, suggesting that being below the poverty line decreased the odds of prediabetes. The imputed bmi values had positive coefficients, showing that as bmi increases, odds for prediabetes increase.
imp<-mice(data = nhanes_all[,c("bmxbmi", "sex", "pov2","educ","race_eth")], seed=22, m = 5)
##
## iter imp variable
## 1 1 bmxbmi pov2 educ race_eth
## 1 2 bmxbmi pov2 educ race_eth
## 1 3 bmxbmi pov2 educ race_eth
## 1 4 bmxbmi pov2 educ race_eth
## 1 5 bmxbmi pov2 educ race_eth
## 2 1 bmxbmi pov2 educ race_eth
## 2 2 bmxbmi pov2 educ race_eth
## 2 3 bmxbmi pov2 educ race_eth
## 2 4 bmxbmi pov2 educ race_eth
## 2 5 bmxbmi pov2 educ race_eth
## 3 1 bmxbmi pov2 educ race_eth
## 3 2 bmxbmi pov2 educ race_eth
## 3 3 bmxbmi pov2 educ race_eth
## 3 4 bmxbmi pov2 educ race_eth
## 3 5 bmxbmi pov2 educ race_eth
## 4 1 bmxbmi pov2 educ race_eth
## 4 2 bmxbmi pov2 educ race_eth
## 4 3 bmxbmi pov2 educ race_eth
## 4 4 bmxbmi pov2 educ race_eth
## 4 5 bmxbmi pov2 educ race_eth
## 5 1 bmxbmi pov2 educ race_eth
## 5 2 bmxbmi pov2 educ race_eth
## 5 3 bmxbmi pov2 educ race_eth
## 5 4 bmxbmi pov2 educ race_eth
## 5 5 bmxbmi pov2 educ race_eth
print(imp)
## Class: mids
## Number of multiple imputations: 5
## Imputation methods:
## bmxbmi sex pov2 educ race_eth
## "pmm" "" "logreg" "polyreg" "polyreg"
## PredictorMatrix:
## bmxbmi sex pov2 educ race_eth
## bmxbmi 0 1 1 1 1
## sex 1 0 1 1 1
## pov2 1 1 0 1 1
## educ 1 1 1 0 1
## race_eth 1 1 1 1 0
Analysis with multiple imputed data
fit.logit3<-with(data=imp,expr=svyglm(pred2~race_eth+educ+sex+pov2+bmxbmi,
design= nodes,
family=binomial))
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.logit3)
## # A tibble: 50 x 4
## term estimate std.error nobs
## <chr> <dbl> <dbl> <int>
## 1 (Intercept) -2.41 Inf 10284
## 2 race_eth2 hispanic 0.00606 Inf 10284
## 3 race_eth3 black 0.616 Inf 10284
## 4 race_eth4 other 0.442 Inf 10284
## 5 educ2 hs -0.256 Inf 10284
## 6 educ3 some col -0.694 Inf 10284
## 7 educ4 col grad -0.813 Inf 10284
## 8 sexMale 0.0346 Inf 10284
## 9 pov21 -0.180 Inf 10284
## 10 bmxbmi 0.0725 Inf 10284
## # ... with 40 more rows
# Coefficients remain largely similar between the model using mean and modal imputation and the one with multiple imputation. The gradient seen in education remains present. The sign for the Hispanic coefficient changed from negative, in the mean/modal model, to positive in the multiple imputation model.
Looking at analysis using original missing data and imputed data
# After comparing the coefficients from imputed models to the original data with missingness, the multiple imputation appears to match results from the original the best. Regression coefficient signs matched across both models. The mean and modal imputation switched the coefficient signs of a few variables.