Libraries

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(tidyverse)
## -- Attaching packages ------------------------------------------------ tidyverse 1.3.0 --
## v tibble  3.0.3     v dplyr   1.0.3
## v tidyr   1.1.1     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## v purrr   0.3.4
## -- Conflicts --------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x purrr::lift()   masks caret::lift()
library(patchwork)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
library(ggsn)
## Loading required package: grid
library(foreign)
library(survey)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
## 
##     cluster
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(VGAM)
## Warning: package 'VGAM' was built under R version 4.0.4
## Loading required package: stats4
## Loading required package: splines
## 
## Attaching package: 'VGAM'
## The following object is masked from 'package:survey':
## 
##     calibrate
## The following object is masked from 'package:car':
## 
##     logit
## The following object is masked from 'package:tidyr':
## 
##     fill
## The following object is masked from 'package:caret':
## 
##     predictors

Importing NHANES data

DEMO_J <- read.xport("C:/Users/maman/OneDrive/DEM Spring 2021/DEM 7283 Stats 2/NHANES data/DEMO_J.XPT", 
    NULL)
ALQ_J <-  read.xport("C:/Users/maman/OneDrive/DEM Spring 2021/DEM 7283 Stats 2/NHANES data/ALQ_J.XPT", 
    NULL)
BMX_J <-  read.xport("C:/Users/maman/OneDrive/DEM Spring 2021/DEM 7283 Stats 2/NHANES data/BMX_J.XPT", 
    NULL)
DIQ_J <-  read.xport("C:/Users/maman/OneDrive/DEM Spring 2021/DEM 7283 Stats 2/NHANES data/DIQ_J.XPT", 
    NULL)
SMQ_J <-  read.xport("C:/Users/maman/OneDrive/DEM Spring 2021/DEM 7283 Stats 2/NHANES data/SMQ_J.XPT", 
    NULL)

Merging data into master file

Master1 <- merge(DEMO_J, ALQ_J, all.x=TRUE)
Master2 <- merge(Master1, BMX_J, all.x=TRUE)
Master3 <- merge(Master2, DIQ_J, all.x=TRUE)
Master4 <- merge(Master3, SMQ_J, all.x=TRUE)

Recoding, filtering, and cleaning up the data

nams<-names(Master4)
head(nams, n=10)
##  [1] "SEQN"     "SDDSRVYR" "RIDSTATR" "RIAGENDR" "RIDAGEYR" "RIDAGEMN"
##  [7] "RIDRETH1" "RIDRETH3" "RIDEXMON" "RIDEXAGM"
newnames<-tolower(gsub(pattern = "_",replacement =  "",x =  nams))
names(Master4)<-newnames
#sex
Master4$sex<-as.factor(ifelse(Master4$riagendr==1,
                                "Male",
                                "Female"))
#race/ethnicity
Master4$hispanic <- Recode(Master4$ridreth3, recodes = "1=1; 2=1; else=0")
Master4$black<-Recode(Master4$ridreth3, recodes="4=1; else=0")
Master4$white<-Recode(Master4$ridreth3, recodes="3=1; else=0")
Master4$asian<-Recode(Master4$ridreth3, recodes="6=1; else=0")
Master4$other<-Recode(Master4$ridreth3, recodes="7=1; else=0")

Master4$race_eth<- Recode(Master4$ridreth3, recodes= "3= '1 white'; 1:2='2 hispanic'; 4= '3 black'; 6= '4 asian'; 7= '5 other' ", as.factor=T )

# at or below poverty threshold using income to poverty ratio
Master4$pov<-ifelse(Master4$indfmpir <= 1, "at or below poverty", 
                      ifelse(Master4$indfmpir > 1 & Master4$indfmpir <=2, "1-2 times pov",
                             ifelse(Master4$indfmpir >2 & Master4$indfmpir <=3, "2-3 times pov",
                                    ifelse(Master4$indfmpir >3 & Master4$indfmpir <=4, "3-4 times pov",
                                           ifelse(Master4$indfmpir >4 & Master4$indfmpir <=5, "4-5 times pov",
                                                  ifelse(Master4$indfmpir >5, "5+ times pov",NA))))))


Master4$pov2<- Recode(Master4$pov, recodes= "'at or below poverty'=1; '1-2 times pov'=2; '2-3 times pov'=3; '3-4 times pov'=4; '4-5 times pov'=5; '5+ times pov'=5",  as.factor=T)

# education

Master4$educ <- Recode(Master4$dmdeduc2, recodes="1:2='1 lths'; 3='2 hs'; 4='3 some col'; 5='4 col grad'; 7:9=NA; else=NA", as.factor=T)

# marital status
Master4$marst <- Recode(Master4$dmdmartl, recodes="1='1 married'; 2='2 widowed'; 3='3 divorced'; 4='4 separated'; 5='5 nm'; 6='6 cohab'; else=NA", as.factor=T)

# age
Master4$agec<-cut(Master4$ridageyr,
                   breaks=c(0,17,65,80))
# smoking status
Master4$cur_smk <- Recode(Master4$smq040, recodes="1:2 = 1; else=0", as.factor=T)

# ever used e-cig
Master4$evr_ecig <- Recode(Master4$smq900, recodes="1= 1; 2=0; else=NA", as.factor=T)

# ever consumed alcohol
Master4$evr_alc <- Recode(Master4$alq111, recodes="1= 1; 2=0; else=NA", as.factor=T)

# diabetes
Master4$diabetes <- Recode(Master4$diq010, recodes="1= 1; 2:3=0; else=NA", as.factor=T)

# prediabetes
Master4$pre_dbts <- Recode(Master4$diq160, recodes="1= 1; 2=0; else=NA", as.factor=T)

# health risk for diabetes
Master4$hlth_rsk <- Recode(Master4$diq170, recodes="1= 1; 2=0; else=NA", as.factor=T)

# BMI
Master4$bmicat <- ifelse(Master4$bmxbmi < 18.5, "1 underweight",
                         ifelse(Master4$bmxbmi >=18.5 & Master4$bmxbmi < 25, "2 normal",
                                ifelse(Master4$bmxbmi >=25 & Master4$bmxbmi< 30, "3 overweight",
                                       ifelse(Master4$bmxbmi >= 30, "4 obese", NA))))

Master4$bmicat2 <- Recode (Master4$bmicat, recodes= "'1 underweight'=1; '2 normal'=2; '3 overweight'=3; '4 obese'=4; else=NA ", as.factor=T)

Master4$bmicat2 <- relevel(Master4$bmicat2, ref ="1")

Master4$bminum <- Recode (Master4$bmicat, recodes= "'1 underweight'=1; '2 normal'=2; '3 overweight'=3; '4 obese'=4; else=NA ", as.factor=F)

Master4$obese <- Recode (Master4$bmicat, recodes= "'4 obese'=1; else=0 ", as.factor=T)

1. Define an ordinal or multinomial outcome variable of your choosing and define how you will recode the original variable.

# The outcome variable used for this assignment will be Body Mass Index (BMI). This variable will be ordinally categorized by: underweight (BMI<18.5), normal (18.5<=BMI<25), overweight (25<=BMI<30), and obese (BMI>=30)

2. State a research question about what factors you believe will affect your outcome variable.

# Independent variables like age, race/ethniciy, education, and poverty status will be significant influences on BMI. Are older individuals more likely to have higher BMI category? Will Blacks and Hispanics have greater odds of being in a higher BMi category than Whites? As education increases will the odds of being in a lower BMi category decrease? Lastly, will poverty status be a significant predictor in BMI status.

3. Fit both the ordinal or the multinomial logistic regression models to your outcome.

library(dplyr)
sub<-Master4%>%
  select(bmicat, bmicat2, bminum, hlth_rsk, obese, cur_smk, agec,ridageyr, marst, educ, pov, pov2, race_eth, hispanic, black, white, asian, other, sex, wtint2yr, sdmvpsu, sdmvstra) %>%
  filter( complete.cases(.))

#First we tell R our survey design
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~sdmvpsu, strata=~sdmvstra, weights=~wtint2yr, data =sub, nest = TRUE )

fit.ordr<-svyolr(bmicat2~race_eth+educ+agec+pov2,des)
## Warning in svyolr.survey.design2(bmicat2 ~ race_eth + educ + agec + pov2, :
## design appears to be rank-deficient, so dropping some coefs
summary(fit.ordr)
## Call:
## svyolr(bmicat2 ~ race_eth + educ + agec + pov2, des)
## 
## Coefficients:
##                          Value Std. Error    t value
## race_eth2 hispanic  0.40074709 0.09461744  4.2354466
## race_eth3 black     0.31747488 0.12344109  2.5718736
## race_eth4 asian    -0.80774992 0.09398224 -8.5947085
## race_eth5 other     0.32139849 0.20318206  1.5818252
## educ2 hs            0.13000447 0.14158091  0.9182344
## educ3 some col      0.13406388 0.13970558  0.9596172
## educ4 col grad     -0.18892269 0.17729830 -1.0655640
## agec(17,65]        -0.04312386 0.14346676 -0.3005843
## pov22               0.08391165 0.12758609  0.6576865
## pov23               0.23387889 0.16794868  1.3925617
## pov24               0.20145693 0.14926907  1.3496227
## pov25               0.30457147 0.19774241  1.5402436
## 
## Intercepts:
##     Value    Std. Error t value 
## 1|2  -3.8094   0.2853   -13.3528
## 2|3  -0.7210   0.1979    -3.6440
## 3|4   0.6364   0.1944     3.2734
mfit<-vglm(bmicat2~race_eth+educ+agec+pov2,
           family=multinomial(refLevel = 1),
           data=Master4,
           weights=wtint2yr/mean(wtint2yr, na.rm=T))

summary(mfit)
## 
## Call:
## vglm(formula = bmicat2 ~ race_eth + educ + agec + pov2, family = multinomial(refLevel = 1), 
##     data = Master4, weights = wtint2yr/mean(wtint2yr, na.rm = T))
## 
## Coefficients: 
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept):1          1.6132     0.3668   4.398 1.09e-05 ***
## (Intercept):2          1.3373     0.3666   3.648 0.000264 ***
## (Intercept):3          1.9053     0.3611   5.277 1.31e-07 ***
## race_eth2 hispanic:1   1.0237     0.4753   2.154 0.031234 *  
## race_eth2 hispanic:2   1.9218     0.4709   4.081 4.48e-05 ***
## race_eth2 hispanic:3   1.6455     0.4693   3.507 0.000454 ***
## race_eth3 black:1      0.3830     0.3551   1.079 0.280784    
## race_eth3 black:2      0.5227     0.3543   1.475 0.140097    
## race_eth3 black:3      0.7201     0.3488   2.064 0.038989 *  
## race_eth4 asian:1      0.6069     0.4547   1.335 0.181960    
## race_eth4 asian:2      0.3931     0.4569   0.860 0.389569    
## race_eth4 asian:3     -0.8293     0.4683  -1.771 0.076596 .  
## race_eth5 other:1      0.8475     0.6796   1.247 0.212342    
## race_eth5 other:2      1.3636     0.6746   2.021 0.043231 *  
## race_eth5 other:3      1.2643     0.6711   1.884 0.059576 .  
## educ2 hs:1            -0.4247     0.3671  -1.157 0.247295    
## educ2 hs:2            -0.3703     0.3660  -1.012 0.311628    
## educ2 hs:3            -0.2143     0.3617  -0.592 0.553562    
## educ3 some col:1       0.2790     0.4024   0.693 0.488160    
## educ3 some col:2       0.4243     0.4012   1.058 0.290250    
## educ3 some col:3       0.4896     0.3975   1.232 0.218094    
## educ4 col grad:1       0.2512     0.4308   0.583 0.559765    
## educ4 col grad:2       0.2330     0.4299   0.542 0.587777    
## educ4 col grad:3      -0.1095     0.4271  -0.256 0.797641    
## agec(65,80]:1          1.2836     0.5011   2.562 0.010420 *  
## agec(65,80]:2          1.8645     0.4991   3.736 0.000187 ***
## agec(65,80]:3          1.5161     0.4984   3.042 0.002350 ** 
## pov22:1                0.7220     0.3014   2.395 0.016607 *  
## pov22:2                0.7563     0.3013   2.510 0.012083 *  
## pov22:3                0.6722     0.2958   2.273 0.023041 *  
## pov23:1                0.8552     0.3627   2.358 0.018380 *  
## pov23:2                0.9255     0.3625   2.553 0.010679 *  
## pov23:3                1.0824     0.3563   3.038 0.002379 ** 
## pov24:1                0.5157     0.3449   1.495 0.134912    
## pov24:2                0.8186     0.3435   2.383 0.017183 *  
## pov24:3                0.6899     0.3382   2.040 0.041343 *  
## pov25:1                1.6658     0.3747   4.446 8.76e-06 ***
## pov25:2                2.0712     0.3741   5.536 3.09e-08 ***
## pov25:3                1.9992     0.3700   5.403 6.57e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Names of linear predictors: log(mu[,2]/mu[,1]), log(mu[,3]/mu[,1]), 
## log(mu[,4]/mu[,1])
## 
## Residual deviance: 12648.79 on 13425 degrees of freedom
## 
## Log-likelihood: -6324.395 on 13425 degrees of freedom
## 
## Number of Fisher scoring iterations: 8 
## 
## No Hauck-Donner effect found in any of the estimates
## 
## 
## Reference group is level  1  of the response

3a Are the assumptions of the proportional odds model met?

ex1<-svyglm(I(bminum>1)~race_eth+educ+agec+pov2,des, family="binomial")
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
ex2<-svyglm(I(bminum>2)~race_eth+educ+agec+pov2,des, family="binomial")
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
ex3<-svyglm(I(bminum>3)~race_eth+educ+agec+pov2,des, family="binomial")
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
coefs<-data.frame(parms =c( coefficients(ex1)[-1],coefficients(ex2)[-1],coefficients(ex3)[-1]),
                  beta = rep(names(coefficients(ex1))[-1], 2, 3 ), 
                  mod = c(rep("I(bminum>1)",12 ),
                          rep("I(bminum>2)",12 ),
                          rep("I(bminum>3)",12 )))

coefs%>%
  ggplot()+
  geom_point(aes(x=beta, y=parms, color=mod))+
  theme(axis.text.x = element_text(angle = 45))+
  labs(title = "Graphical Summary of Proportional Odds Assumption",
       x= "Beta",
       y= "Estimate")

#Just print the odds ratios, 
round(exp(rbind(coef(ex1)[-1],
                coef(ex2)[-1],
                coef(ex3))),3)
## Warning in rbind(coef(ex1)[-1], coef(ex2)[-1], coef(ex3)): number of columns of
## result is not a multiple of vector length (arg 1)
##      (Intercept) race_eth2 hispanic race_eth3 black race_eth4 asian
## [1,]       4.890              1.790           1.098           3.075
## [2,]       2.304              1.373           0.520           1.669
## [3,]       0.641              1.168           1.326           0.288
##      race_eth5 other educ2 hs educ3 some col educ4 col grad agec(65,80] pov22
## [1,]           0.793    1.626          1.270          3.893       2.202 2.568
## [2,]           1.147    1.197          0.882          1.372       1.138 1.259
## [3,]           1.194    1.144          1.100          0.772       0.847 1.019
##      pov23 pov24 pov25
## [1,] 2.015 6.780 4.890
## [2,] 1.449 1.630 2.304
## [3,] 1.215 1.059 1.139
# Proportional odds assumption is not met

3b How did you evaluate the proportional odds assumption?

# I graphed the coafficients for each transition and calculated odds ratios. After viewing the graph and looking at the OR's its clear that the assumption of proportionality is not met

3c Which model (Proportional odds or Multinomial) fits the data better? how did you decide upon this?

# Since the assumption for the proportional odds model is not met, by default the multinomial modial would work better since it is not bound by the proportional assumption.

4. For the best fitting model from part c, describe the results of your model and present output from the model in terms of odds ratios and confidence intervals for all model parameters in a table.

round(exp(coef(mfit)), 3)
##        (Intercept):1        (Intercept):2        (Intercept):3 
##                5.019                3.809                6.722 
## race_eth2 hispanic:1 race_eth2 hispanic:2 race_eth2 hispanic:3 
##                2.784                6.833                5.184 
##    race_eth3 black:1    race_eth3 black:2    race_eth3 black:3 
##                1.467                1.687                2.055 
##    race_eth4 asian:1    race_eth4 asian:2    race_eth4 asian:3 
##                1.835                1.482                0.436 
##    race_eth5 other:1    race_eth5 other:2    race_eth5 other:3 
##                2.334                3.910                3.541 
##           educ2 hs:1           educ2 hs:2           educ2 hs:3 
##                0.654                0.691                0.807 
##     educ3 some col:1     educ3 some col:2     educ3 some col:3 
##                1.322                1.528                1.632 
##     educ4 col grad:1     educ4 col grad:2     educ4 col grad:3 
##                1.286                1.262                0.896 
##        agec(65,80]:1        agec(65,80]:2        agec(65,80]:3 
##                3.610                6.453                4.554 
##              pov22:1              pov22:2              pov22:3 
##                2.058                2.130                1.959 
##              pov23:1              pov23:2              pov23:3 
##                2.352                2.523                2.952 
##              pov24:1              pov24:2              pov24:3 
##                1.675                2.267                1.994 
##              pov25:1              pov25:2              pov25:3 
##                5.290                7.935                7.383
round(exp(confint(mfit)), 3)
##                      2.5 % 97.5 %
## (Intercept):1        2.446 10.299
## (Intercept):2        1.857  7.813
## (Intercept):3        3.312 13.640
## race_eth2 hispanic:1 1.097  7.065
## race_eth2 hispanic:2 2.715 17.197
## race_eth2 hispanic:3 2.066 13.004
## race_eth3 black:1    0.731  2.942
## race_eth3 black:2    0.842  3.377
## race_eth3 black:3    1.037  4.070
## race_eth4 asian:1    0.753  4.473
## race_eth4 asian:2    0.605  3.628
## race_eth4 asian:3    0.174  1.093
## race_eth5 other:1    0.616  8.841
## race_eth5 other:2    1.042 14.669
## race_eth5 other:3    0.950 13.193
## educ2 hs:1           0.318  1.343
## educ2 hs:2           0.337  1.415
## educ2 hs:3           0.397  1.640
## educ3 some col:1     0.601  2.909
## educ3 some col:2     0.696  3.355
## educ3 some col:3     0.749  3.556
## educ4 col grad:1     0.553  2.991
## educ4 col grad:2     0.544  2.932
## educ4 col grad:3     0.388  2.070
## agec(65,80]:1        1.352  9.638
## agec(65,80]:2        2.426 17.161
## agec(65,80]:3        1.715 12.097
## pov22:1              1.140  3.716
## pov22:2              1.180  3.845
## pov22:3              1.097  3.497
## pov23:1              1.155  4.788
## pov23:2              1.240  5.135
## pov23:3              1.468  5.934
## pov24:1              0.852  3.293
## pov24:2              1.156  4.446
## pov24:3              1.027  3.868
## pov25:1              2.538 11.026
## pov25:2              3.811 16.518
## pov25:3              3.575 15.248
# Compared to Whites, Blacks were more likely to be overweight, than obese (OR=1.46; CI= 0.73, 2.94). College graduates were about 26% more likely than individuals with some high school to be normal than overweight (OR=1.26; CI= 0.54, 2.93). Individuals 1-2 times over the poverty threshold were more likely than those at or below poverty to be overweight than obese (OR=2.05; CI= 1.14, 3.72)