Dem5283-Ordinal level analysis using Birth weight and smoking mothers

#Load Library

library(foreign)
library(dplyr)
library(haven)
library(tableone)
library(MASS)
library(car)
library(VGAM)
library(stargazer)

#Load data of National Vital Statistics Birth data of 2015

dat <- read_sas ("C:/Users/nahin/Google Drive/MSc Demography/Spring 2020/Stat II 5283/Data/OneDrive_1_3-6-2020/Single year/birth2015.sas7bdat")

#lowering capital case
names(dat)<-tolower(names(dat))
#names(dat)

#Selecting variables

dat <- dat %>% 
  dplyr::select(cig_n_t1, cig_n_t2, cig_n_t3, b_wt_grp, b_m_age, b_m_educ, m_hisnot, m_hismex,m_hispr,m_hiscub, m_hisoth, m_hisdes, m_rwhite, m_rblack,m_ramind, m_rindes, m_rasnin, m_rchina, m_rfilip, m_rjapan, m_rkorea, m_rviet, m_rothas, m_rasdes, m_rhawai, m_rguam, m_rsamoa, m_rothpa,m_rpacis, m_rother, m_rotdes)

#class convertion

dat %>%
   mutate_all(as.numeric)

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

#a) Define an ordinal or multinomial outcome variable of your choosing and define how you will recode the original variable.

Recoding dependent/ordinal variable. Low birth weight of the babies at birth is the depedent variable which is a ranked variable into 12 ordinal level. What we need to do is, compress those lavels into 3 catergories like as Very low, morderately-low, and High .

class(dat$b_wt_grp)

## [1] "character"

dat$b_wt_grp <- as.numeric(dat$b_wt_grp)
dat$bwt<-Recode(dat$b_wt_grp, recodes="1:3=1; 4:5=2; 6:11=3; else=NA", as.factor=T)
table(dat$bwt)

## 
##      1      2      3 
##   5725  27750 369908

Recoding smoking mothers.As smoking and non smoking mothers will be my two independent variables. Those who smoked During pregnancy will be counted as smoker and who did not smoked will be recoded as non-smoker irrespective of the number of cigaretts they had during pregnancy.

#Converting the class variables numeric variables
# dat$cig_n_t1 <- as.numeric(dat$cig_n_t1)
# dat$cig_n_t2 <- as.numeric(dat$cig_n_t2)
# dat$cig_n_t3 <- as.numeric(dat$cig_n_t3)

#Independent Variables: Smoker and non-smoker during pregnancy
dat$cigprt1<-Recode(dat$cig_n_t1, recodes="0=0; .5:50=1; else=NA")
dat$cigprt2<-Recode(dat$cig_n_t2, recodes="0=0; .3:50=1; else=NA")
dat$cigprt3<-Recode(dat$cig_n_t3, recodes="0=0; .1:41=1; else=NA")

dat$smoker <- ifelse(dat$cigprt1==0 & dat$cigprt2==0 & dat$cigprt3==0, 0, 1)


#Hispanic Ethnicity
dat$mhismex <- Recode(dat$m_hismex, recodes="0=0; 1=1")
dat$mhispr <-  Recode(dat$m_hispr, recodes="0=0; 1=1")
dat$mhiscub <- Recode(dat$m_hiscub, recodes="0=0; 1=1")
dat$mhisoth <- Recode(dat$m_hisoth, recodes="0=0; 1=1")

dat$hispan <- ifelse(dat$mhismex==1 | dat$mhispr==1 | dat$mhiscub==1 | dat$mhisoth==1, 1, 0)

#Race
dat$white <- Recode(dat$m_rwhite, recodes="0=0; 1=1")
dat$black <- Recode(dat$m_rblack, recodes="0=0; 1=1")

#American Indian Native
dat$amindnative <- ifelse(dat$m_ramind==1 | dat$m_rhawai==1 | dat$m_rguam ==1 | dat$m_rsamoa==1 | dat$m_rothpa==1| dat$m_rpacis==1,1,0)

#Asians
dat$india <- Recode(dat$m_rasnin, recodes="0=0; 1=1")
dat$china <- Recode(dat$m_rchina, recodes="0=0; 1=1")
dat$filip <- Recode(dat$m_rfilip, recodes="0=0; 1=1")
dat$japan <- Recode(dat$m_rjapan, recodes="0=0; 1=1")
dat$korea <- Recode(dat$m_rkorea, recodes="0=0; 1=1")
dat$viet <- Recode(dat$m_rviet, recodes="0=0; 1=1")
dat$other <- Recode(dat$m_rothas, recodes="0=0; 1=1")
dat$asian <- ifelse(dat$india==1 | dat$china==1 | dat$filip==1 | dat$japan==1 | dat$korea==1 | dat$viet==1 | dat$other==1,1,0)

#Other
dat$other <- Recode(dat$m_rother, recodes="0=0; 1=1")


#Race/Ethnicity Varible
dat$race_eth <- factor(ifelse(dat$hispan==1, "hispanic",
                  ifelse(dat$hispan==0 & dat$white==1, "nhwhite",
                    ifelse(dat$hispan==0 & dat$black==1, "nhblack",
                      ifelse(dat$hispan==0 & dat$asian==1, "nhasian",
                        ifelse(dat$hispan==0 & dat$other==1, "nhothe", NA))))))

#dat$race_eth <- relevel(dat$race_eth, ref = "nhwhite") 


#Age
dat$mage <- Recode(dat$b_m_age, recodes="99=NA")

dat$mage2<-cut(dat$mage, breaks=c(11,19,24,35,56))

table(dat$mage2)

## 
## (11,19] (19,24] (24,35] (35,56] 
##   33137   97732  229704   42819

b)State a research question about what factors you believe will affect your outcome variable.

Does smoking during pregnancy affects birth weight outcomes. Here, I will use two covariates, race and age to see how the realtionship changes with the inclusion of this two varibables in the model.

Fit either the ordinal or the multinomial logistic regression models to your outcome.

Are the assumptions of the proportional odds model met? How did you evaluate the proportional odds assumption?

fit1 <- polr(bwt~smoker+race_eth+mage2, data=dat)
summary(fit1)

## 
## Re-fitting to get Hessian

## Call:
## polr(formula = bwt ~ smoker + race_eth + mage2, data = dat)
## 
## Coefficients:
##                    Value Std. Error t value
## smoker          -0.41852    0.02346 -17.839
## race_ethnhasian -0.20612    0.02587  -7.967
## race_ethnhblack -0.60348    0.01602 -37.665
## race_ethnhothe  -0.09155    0.07467  -1.226
## race_ethnhwhite  0.11043    0.01381   7.998
## mage2(19,24]     0.16906    0.02253   7.505
## mage2(24,35]     0.15877    0.02086   7.611
## mage2(35,56]    -0.14990    0.02509  -5.974
## 
## Intercepts:
##     Value     Std. Error t value  
## 1|2   -4.2265    0.0231  -182.9186
## 2|3   -2.3842    0.0197  -121.1282
## 
## Residual Deviance: 255848.92 
## AIC: 255868.92 
## (4182 observations deleted due to missingness)

fit1$deviance+2*length(fit1$coefficients)

## [1] 255864.9

Now we can “examine” the proportional odds assumption by fitting logits for each change

dat$birth<-Recode(dat$bwt, recodes="1:3=1; 4:5=2; 6:11=3; else=NA", as.factor=F)

#dat$age <- Recode(dat$mage, recodes="11:19=1;20:24=2;25:35=3;36:56=4", as.factor=F)

ex1 <- glm(I(birth>1)~smoker+race_eth+mage2, data = dat, family="binomial")

## Warning: glm.fit: algorithm did not converge

ex2 <- glm(I(birth>2)~smoker+race_eth+mage2,data = dat, family="binomial")

## Warning: glm.fit: algorithm did not converge

plot(coef(ex1)[-1], xlim=c(1, 11), ylim=c(-3, 3), type="l", xaxt="n",
     ylab="Beta", main=c("Comparison of betas for", " proportional odds assumption"))
lines(coef(ex2)[-1], col=1, lty=2) 
axis(side=1, at=1:12, labels=F)
text(x=1:12, y=-4,  srt = 90, pos = 1, xpd = TRUE,cex=.8,
     labels = c("Nonsmoker", "Smoker", "Hispanic", "NHasin", "NHBlack","NHOther", "NHwhite",
                "11-19", "19-24", "24-35", "35-56" ))
legend("bottomright",col=c(1,1),lty=c(1,2), legend=c(">1", ">2"))

lines(coef(fit1)[c(-13:-16)], col=4, lwd=3)

round(exp(rbind(coef(ex1)[-1], coef(ex2)[-1])),3)

##      smoker race_ethnhasian race_ethnhblack race_ethnhothe race_ethnhwhite
## [1,]      1               1               1              1               1
## [2,]      1               1               1              1               1
##      mage2(19,24] mage2(24,35] mage2(35,56]
## [1,]            1            1            1
## [2,]            1            1            1

The transaction of betas are almost similar in the graph. That is the proportional odds assumption is made.

#Proportional and Non-proportional model

#<-parallel = T == proportional odds
fit.vgam<-vglm(as.ordered(bwt)~smoker+race_eth+mage2,
               dat, na.rm=T, family=cumulative(parallel = T, reverse = T))  
summary(fit.vgam)

## 
## Call:
## vglm(formula = as.ordered(bwt) ~ smoker + race_eth + mage2, family = cumulative(parallel = T, 
##     reverse = T), data = dat, na.rm = T)
## 
## Pearson residuals:
##                       Min      1Q  Median      3Q    Max
## logitlink(P[Y>=2]) -9.448 0.07562 0.07977 0.09247 0.6055
## logitlink(P[Y>=3]) -4.132 0.25434 0.26883 0.29694 0.5249
## 
## Coefficients: 
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept):1    4.22661    0.02311 182.885  < 2e-16 ***
## (Intercept):2    2.38423    0.01969 121.082  < 2e-16 ***
## smoker          -0.41852    0.02348 -17.821  < 2e-16 ***
## race_ethnhasian -0.20613    0.02593  -7.950 1.86e-15 ***
## race_ethnhblack -0.60348    0.01600 -37.729  < 2e-16 ***
## race_ethnhothe  -0.09157    0.07454  -1.229    0.219    
## race_ethnhwhite  0.11043    0.01381   7.998 1.27e-15 ***
## mage2(19,24]     0.16905    0.02253   7.502 6.27e-14 ***
## mage2(24,35]     0.15876    0.02087   7.609 2.77e-14 ***
## mage2(35,56]    -0.14991    0.02509  -5.975 2.30e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Names of linear predictors: logitlink(P[Y>=2]), logitlink(P[Y>=3])
## 
## Residual deviance: 255848.9 on 798504 degrees of freedom
## 
## Log-likelihood: -127924.5 on 798504 degrees of freedom
## 
## Number of Fisher scoring iterations: 4 
## 
## No Hauck-Donner effect found in any of the estimates
## 
## 
## Exponentiated coefficients:
##          smoker race_ethnhasian race_ethnhblack  race_ethnhothe race_ethnhwhite 
##       0.6580222       0.8137312       0.5469035       0.9124953       1.1167563 
##    mage2(19,24]    mage2(24,35]    mage2(35,56] 
##       1.1841779       1.1720608       0.8607870

#<-parallel = F == Nonproportional odds
fit.vgam2<-vglm(as.ordered(bwt)~smoker+race_eth+mage2,
               dat, na.rm=T, family=cumulative(parallel = F, reverse = T))  
summary(fit.vgam2)

## 
## Call:
## vglm(formula = as.ordered(bwt) ~ smoker + race_eth + mage2, family = cumulative(parallel = F, 
##     reverse = T), data = dat, na.rm = T)
## 
## Pearson residuals:
##                        Min      1Q  Median     3Q    Max
## logitlink(P[Y>=2]) -10.177 0.07521 0.07987 0.0896 0.6891
## logitlink(P[Y>=3])  -4.106 0.25448 0.26878 0.2961 0.5213
## 
## Coefficients: 
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept):1      4.31655    0.04681  92.219  < 2e-16 ***
## (Intercept):2      2.38277    0.01970 120.965  < 2e-16 ***
## smoker:1          -0.27174    0.05621  -4.835 1.33e-06 ***
## smoker:2          -0.42154    0.02349 -17.944  < 2e-16 ***
## race_ethnhasian:1  0.16704    0.07019   2.380 0.017318 *  
## race_ethnhasian:2 -0.21195    0.02589  -8.185 2.72e-16 ***
## race_ethnhblack:1 -0.81622    0.03432 -23.786  < 2e-16 ***
## race_ethnhblack:2 -0.59748    0.01604 -37.256  < 2e-16 ***
## race_ethnhothe:1  -0.29370    0.15814  -1.857 0.063279 .  
## race_ethnhothe:2  -0.08783    0.07469  -1.176 0.239650    
## race_ethnhwhite:1  0.12817    0.03297   3.887 0.000101 ***
## race_ethnhwhite:2  0.11017    0.01381   7.976 1.51e-15 ***
## mage2(19,24]:1     0.16339    0.05348   3.055 0.002248 ** 
## mage2(19,24]:2     0.16919    0.02254   7.505 6.14e-14 ***
## mage2(24,35]:1     0.06555    0.04918   1.333 0.182581    
## mage2(24,35]:2     0.16034    0.02088   7.680 1.59e-14 ***
## mage2(35,56]:1    -0.31363    0.05758  -5.446 5.14e-08 ***
## mage2(35,56]:2    -0.14656    0.02511  -5.836 5.36e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Names of linear predictors: logitlink(P[Y>=2]), logitlink(P[Y>=3])
## 
## Residual deviance: 255719.1 on 798496 degrees of freedom
## 
## Log-likelihood: -127859.5 on 798496 degrees of freedom
## 
## Number of Fisher scoring iterations: 5 
## 
## Warning: Hauck-Donner effect detected in the following estimate(s):
## '(Intercept):1'
## 
## 
## Exponentiated coefficients:
##          smoker:1          smoker:2 race_ethnhasian:1 race_ethnhasian:2 
##         0.7620504         0.6560326         1.1817964         0.8090020 
## race_ethnhblack:1 race_ethnhblack:2  race_ethnhothe:1  race_ethnhothe:2 
##         0.4420986         0.5501966         0.7454978         0.9159164 
## race_ethnhwhite:1 race_ethnhwhite:2    mage2(19,24]:1    mage2(19,24]:2 
##         1.1367455         1.1164636         1.1775004         1.1843415 
##    mage2(24,35]:1    mage2(24,35]:2    mage2(35,56]:1    mage2(35,56]:2 
##         1.0677408         1.1739146         0.7307892         0.8636780

AIC(fit.vgam)

## [1] 255868.9

AIC(fit.vgam2)

## [1] 255755.1

According to AIC value Non proportional model fits best.

For the best fitting model from part c, Describe the results of your model, and present output from the model in terms of odds ratios and confidence intervals for all model parameters in a table.

#Odds Ratio
round(exp(coef(fit.vgam2)), 3)

##     (Intercept):1     (Intercept):2          smoker:1          smoker:2 
##            74.930            10.835             0.762             0.656 
## race_ethnhasian:1 race_ethnhasian:2 race_ethnhblack:1 race_ethnhblack:2 
##             1.182             0.809             0.442             0.550 
##  race_ethnhothe:1  race_ethnhothe:2 race_ethnhwhite:1 race_ethnhwhite:2 
##             0.745             0.916             1.137             1.116 
##    mage2(19,24]:1    mage2(19,24]:2    mage2(24,35]:1    mage2(24,35]:2 
##             1.178             1.184             1.068             1.174 
##    mage2(35,56]:1    mage2(35,56]:2 
##             0.731             0.864

#Confidence Interval
round(exp(confint(fit.vgam2)), 3)

##                    2.5 % 97.5 %
## (Intercept):1     68.362 82.129
## (Intercept):2     10.425 11.261
## smoker:1           0.683  0.851
## smoker:2           0.627  0.687
## race_ethnhasian:1  1.030  1.356
## race_ethnhasian:2  0.769  0.851
## race_ethnhblack:1  0.413  0.473
## race_ethnhblack:2  0.533  0.568
## race_ethnhothe:1   0.547  1.016
## race_ethnhothe:2   0.791  1.060
## race_ethnhwhite:1  1.066  1.213
## race_ethnhwhite:2  1.087  1.147
## mage2(19,24]:1     1.060  1.308
## mage2(19,24]:2     1.133  1.238
## mage2(24,35]:1     0.970  1.176
## mage2(24,35]:2     1.127  1.223
## mage2(35,56]:1     0.653  0.818
## mage2(35,56]:2     0.822  0.907

#Interpretation: Smoker 1: Non smokers mothers were 34% less likely to have low birth weight babies compared to smoker mothers. Asians mothers are 18% more likely to have low birth weight babies compared to hispanics mothers. Black mothers are 56% less likely to have low weight babies compared to hispanics mothers. Other race mothers are 36% of the less likely to have low weight babies compared to hispanic mothers. White mothers are 13% more likely to have low weight babies compared to hispanic mothers.

Compared to 11-19years age group mothers, 19-24years age group mothers are more 18% more likely to have low birth weight babies.the 35-56year age group mothers have 27% less likely to have a low birth weight baby.

Smoker 2: Non smoker mother are 35% less likely to have low weight babies compared to smoker mothers. Asian mothers are 20% less likely to have low birth weight babies compared to hispanics mothers. Black mothers are 45% less likely to have low weight babies compared to hispanic mothers.

Other race mothers are 9% less likely to have low weight babies compared to hispanic mothers.

White mothers are 11% more likely to habe low weight babies compared to smoker mother.

Compared to 11-19years age group mothers, 19-24years age group mothers are more 18% more likely to have low birth weight babies.the 35-56year age group mothers have 14% less likely to have a low birth weight baby.