#Load Library
library(foreign)
library(dplyr)
library(haven)
library(tableone)
library(MASS)
library(car)
library(VGAM)
library(stargazer)
#Load data of National Vital Statistics Birth data of 2015
dat <- read_sas ("C:/Users/nahin/Google Drive/MSc Demography/Spring 2020/Stat II 5283/Data/OneDrive_1_3-6-2020/Single year/birth2015.sas7bdat")
#lowering capital case
names(dat)<-tolower(names(dat))
#names(dat)
#Selecting variables
dat <- dat %>%
dplyr::select(cig_n_t1, cig_n_t2, cig_n_t3, b_wt_grp, b_m_age, b_m_educ, m_hisnot, m_hismex,m_hispr,m_hiscub, m_hisoth, m_hisdes, m_rwhite, m_rblack,m_ramind, m_rindes, m_rasnin, m_rchina, m_rfilip, m_rjapan, m_rkorea, m_rviet, m_rothas, m_rasdes, m_rhawai, m_rguam, m_rsamoa, m_rothpa,m_rpacis, m_rother, m_rotdes)
#class convertion
dat %>%
mutate_all(as.numeric)
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
#a) Define an ordinal or multinomial outcome variable of your choosing and define how you will recode the original variable.
Recoding dependent/ordinal variable. Low birth weight of the babies at birth is the depedent variable which is a ranked variable into 12 ordinal level. What we need to do is, compress those lavels into 3 catergories like as Very low, morderately-low, and High .
class(dat$b_wt_grp)
## [1] "character"
dat$b_wt_grp <- as.numeric(dat$b_wt_grp)
dat$bwt<-Recode(dat$b_wt_grp, recodes="1:3=1; 4:5=2; 6:11=3; else=NA", as.factor=T)
table(dat$bwt)
##
## 1 2 3
## 5725 27750 369908
Recoding smoking mothers.As smoking and non smoking mothers will be my two independent variables. Those who smoked During pregnancy will be counted as smoker and who did not smoked will be recoded as non-smoker irrespective of the number of cigaretts they had during pregnancy.
#Converting the class variables numeric variables
# dat$cig_n_t1 <- as.numeric(dat$cig_n_t1)
# dat$cig_n_t2 <- as.numeric(dat$cig_n_t2)
# dat$cig_n_t3 <- as.numeric(dat$cig_n_t3)
#Independent Variables: Smoker and non-smoker during pregnancy
dat$cigprt1<-Recode(dat$cig_n_t1, recodes="0=0; .5:50=1; else=NA")
dat$cigprt2<-Recode(dat$cig_n_t2, recodes="0=0; .3:50=1; else=NA")
dat$cigprt3<-Recode(dat$cig_n_t3, recodes="0=0; .1:41=1; else=NA")
dat$smoker <- ifelse(dat$cigprt1==0 & dat$cigprt2==0 & dat$cigprt3==0, 0, 1)
#Hispanic Ethnicity
dat$mhismex <- Recode(dat$m_hismex, recodes="0=0; 1=1")
dat$mhispr <- Recode(dat$m_hispr, recodes="0=0; 1=1")
dat$mhiscub <- Recode(dat$m_hiscub, recodes="0=0; 1=1")
dat$mhisoth <- Recode(dat$m_hisoth, recodes="0=0; 1=1")
dat$hispan <- ifelse(dat$mhismex==1 | dat$mhispr==1 | dat$mhiscub==1 | dat$mhisoth==1, 1, 0)
#Race
dat$white <- Recode(dat$m_rwhite, recodes="0=0; 1=1")
dat$black <- Recode(dat$m_rblack, recodes="0=0; 1=1")
#American Indian Native
dat$amindnative <- ifelse(dat$m_ramind==1 | dat$m_rhawai==1 | dat$m_rguam ==1 | dat$m_rsamoa==1 | dat$m_rothpa==1| dat$m_rpacis==1,1,0)
#Asians
dat$india <- Recode(dat$m_rasnin, recodes="0=0; 1=1")
dat$china <- Recode(dat$m_rchina, recodes="0=0; 1=1")
dat$filip <- Recode(dat$m_rfilip, recodes="0=0; 1=1")
dat$japan <- Recode(dat$m_rjapan, recodes="0=0; 1=1")
dat$korea <- Recode(dat$m_rkorea, recodes="0=0; 1=1")
dat$viet <- Recode(dat$m_rviet, recodes="0=0; 1=1")
dat$other <- Recode(dat$m_rothas, recodes="0=0; 1=1")
dat$asian <- ifelse(dat$india==1 | dat$china==1 | dat$filip==1 | dat$japan==1 | dat$korea==1 | dat$viet==1 | dat$other==1,1,0)
#Other
dat$other <- Recode(dat$m_rother, recodes="0=0; 1=1")
#Race/Ethnicity Varible
dat$race_eth <- factor(ifelse(dat$hispan==1, "hispanic",
ifelse(dat$hispan==0 & dat$white==1, "nhwhite",
ifelse(dat$hispan==0 & dat$black==1, "nhblack",
ifelse(dat$hispan==0 & dat$asian==1, "nhasian",
ifelse(dat$hispan==0 & dat$other==1, "nhothe", NA))))))
#dat$race_eth <- relevel(dat$race_eth, ref = "nhwhite")
#Age
dat$mage <- Recode(dat$b_m_age, recodes="99=NA")
dat$mage2<-cut(dat$mage, breaks=c(11,19,24,35,56))
table(dat$mage2)
##
## (11,19] (19,24] (24,35] (35,56]
## 33137 97732 229704 42819
b)State a research question about what factors you believe will affect your outcome variable.
Does smoking during pregnancy affects birth weight outcomes. Here, I will use two covariates, race and age to see how the realtionship changes with the inclusion of this two varibables in the model.
fit1 <- polr(bwt~smoker+race_eth+mage2, data=dat)
summary(fit1)
##
## Re-fitting to get Hessian
## Call:
## polr(formula = bwt ~ smoker + race_eth + mage2, data = dat)
##
## Coefficients:
## Value Std. Error t value
## smoker -0.41852 0.02346 -17.839
## race_ethnhasian -0.20612 0.02587 -7.967
## race_ethnhblack -0.60348 0.01602 -37.665
## race_ethnhothe -0.09155 0.07467 -1.226
## race_ethnhwhite 0.11043 0.01381 7.998
## mage2(19,24] 0.16906 0.02253 7.505
## mage2(24,35] 0.15877 0.02086 7.611
## mage2(35,56] -0.14990 0.02509 -5.974
##
## Intercepts:
## Value Std. Error t value
## 1|2 -4.2265 0.0231 -182.9186
## 2|3 -2.3842 0.0197 -121.1282
##
## Residual Deviance: 255848.92
## AIC: 255868.92
## (4182 observations deleted due to missingness)
fit1$deviance+2*length(fit1$coefficients)
## [1] 255864.9
Now we can “examine” the proportional odds assumption by fitting logits for each change
dat$birth<-Recode(dat$bwt, recodes="1:3=1; 4:5=2; 6:11=3; else=NA", as.factor=F)
#dat$age <- Recode(dat$mage, recodes="11:19=1;20:24=2;25:35=3;36:56=4", as.factor=F)
ex1 <- glm(I(birth>1)~smoker+race_eth+mage2, data = dat, family="binomial")
## Warning: glm.fit: algorithm did not converge
ex2 <- glm(I(birth>2)~smoker+race_eth+mage2,data = dat, family="binomial")
## Warning: glm.fit: algorithm did not converge
plot(coef(ex1)[-1], xlim=c(1, 11), ylim=c(-3, 3), type="l", xaxt="n",
ylab="Beta", main=c("Comparison of betas for", " proportional odds assumption"))
lines(coef(ex2)[-1], col=1, lty=2)
axis(side=1, at=1:12, labels=F)
text(x=1:12, y=-4, srt = 90, pos = 1, xpd = TRUE,cex=.8,
labels = c("Nonsmoker", "Smoker", "Hispanic", "NHasin", "NHBlack","NHOther", "NHwhite",
"11-19", "19-24", "24-35", "35-56" ))
legend("bottomright",col=c(1,1),lty=c(1,2), legend=c(">1", ">2"))
lines(coef(fit1)[c(-13:-16)], col=4, lwd=3)
round(exp(rbind(coef(ex1)[-1], coef(ex2)[-1])),3)
## smoker race_ethnhasian race_ethnhblack race_ethnhothe race_ethnhwhite
## [1,] 1 1 1 1 1
## [2,] 1 1 1 1 1
## mage2(19,24] mage2(24,35] mage2(35,56]
## [1,] 1 1 1
## [2,] 1 1 1
The transaction of betas are almost similar in the graph. That is the proportional odds assumption is made.
#Proportional and Non-proportional model
#<-parallel = T == proportional odds
fit.vgam<-vglm(as.ordered(bwt)~smoker+race_eth+mage2,
dat, na.rm=T, family=cumulative(parallel = T, reverse = T))
summary(fit.vgam)
##
## Call:
## vglm(formula = as.ordered(bwt) ~ smoker + race_eth + mage2, family = cumulative(parallel = T,
## reverse = T), data = dat, na.rm = T)
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## logitlink(P[Y>=2]) -9.448 0.07562 0.07977 0.09247 0.6055
## logitlink(P[Y>=3]) -4.132 0.25434 0.26883 0.29694 0.5249
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept):1 4.22661 0.02311 182.885 < 2e-16 ***
## (Intercept):2 2.38423 0.01969 121.082 < 2e-16 ***
## smoker -0.41852 0.02348 -17.821 < 2e-16 ***
## race_ethnhasian -0.20613 0.02593 -7.950 1.86e-15 ***
## race_ethnhblack -0.60348 0.01600 -37.729 < 2e-16 ***
## race_ethnhothe -0.09157 0.07454 -1.229 0.219
## race_ethnhwhite 0.11043 0.01381 7.998 1.27e-15 ***
## mage2(19,24] 0.16905 0.02253 7.502 6.27e-14 ***
## mage2(24,35] 0.15876 0.02087 7.609 2.77e-14 ***
## mage2(35,56] -0.14991 0.02509 -5.975 2.30e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Names of linear predictors: logitlink(P[Y>=2]), logitlink(P[Y>=3])
##
## Residual deviance: 255848.9 on 798504 degrees of freedom
##
## Log-likelihood: -127924.5 on 798504 degrees of freedom
##
## Number of Fisher scoring iterations: 4
##
## No Hauck-Donner effect found in any of the estimates
##
##
## Exponentiated coefficients:
## smoker race_ethnhasian race_ethnhblack race_ethnhothe race_ethnhwhite
## 0.6580222 0.8137312 0.5469035 0.9124953 1.1167563
## mage2(19,24] mage2(24,35] mage2(35,56]
## 1.1841779 1.1720608 0.8607870
#<-parallel = F == Nonproportional odds
fit.vgam2<-vglm(as.ordered(bwt)~smoker+race_eth+mage2,
dat, na.rm=T, family=cumulative(parallel = F, reverse = T))
summary(fit.vgam2)
##
## Call:
## vglm(formula = as.ordered(bwt) ~ smoker + race_eth + mage2, family = cumulative(parallel = F,
## reverse = T), data = dat, na.rm = T)
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## logitlink(P[Y>=2]) -10.177 0.07521 0.07987 0.0896 0.6891
## logitlink(P[Y>=3]) -4.106 0.25448 0.26878 0.2961 0.5213
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept):1 4.31655 0.04681 92.219 < 2e-16 ***
## (Intercept):2 2.38277 0.01970 120.965 < 2e-16 ***
## smoker:1 -0.27174 0.05621 -4.835 1.33e-06 ***
## smoker:2 -0.42154 0.02349 -17.944 < 2e-16 ***
## race_ethnhasian:1 0.16704 0.07019 2.380 0.017318 *
## race_ethnhasian:2 -0.21195 0.02589 -8.185 2.72e-16 ***
## race_ethnhblack:1 -0.81622 0.03432 -23.786 < 2e-16 ***
## race_ethnhblack:2 -0.59748 0.01604 -37.256 < 2e-16 ***
## race_ethnhothe:1 -0.29370 0.15814 -1.857 0.063279 .
## race_ethnhothe:2 -0.08783 0.07469 -1.176 0.239650
## race_ethnhwhite:1 0.12817 0.03297 3.887 0.000101 ***
## race_ethnhwhite:2 0.11017 0.01381 7.976 1.51e-15 ***
## mage2(19,24]:1 0.16339 0.05348 3.055 0.002248 **
## mage2(19,24]:2 0.16919 0.02254 7.505 6.14e-14 ***
## mage2(24,35]:1 0.06555 0.04918 1.333 0.182581
## mage2(24,35]:2 0.16034 0.02088 7.680 1.59e-14 ***
## mage2(35,56]:1 -0.31363 0.05758 -5.446 5.14e-08 ***
## mage2(35,56]:2 -0.14656 0.02511 -5.836 5.36e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Names of linear predictors: logitlink(P[Y>=2]), logitlink(P[Y>=3])
##
## Residual deviance: 255719.1 on 798496 degrees of freedom
##
## Log-likelihood: -127859.5 on 798496 degrees of freedom
##
## Number of Fisher scoring iterations: 5
##
## Warning: Hauck-Donner effect detected in the following estimate(s):
## '(Intercept):1'
##
##
## Exponentiated coefficients:
## smoker:1 smoker:2 race_ethnhasian:1 race_ethnhasian:2
## 0.7620504 0.6560326 1.1817964 0.8090020
## race_ethnhblack:1 race_ethnhblack:2 race_ethnhothe:1 race_ethnhothe:2
## 0.4420986 0.5501966 0.7454978 0.9159164
## race_ethnhwhite:1 race_ethnhwhite:2 mage2(19,24]:1 mage2(19,24]:2
## 1.1367455 1.1164636 1.1775004 1.1843415
## mage2(24,35]:1 mage2(24,35]:2 mage2(35,56]:1 mage2(35,56]:2
## 1.0677408 1.1739146 0.7307892 0.8636780
AIC(fit.vgam)
## [1] 255868.9
AIC(fit.vgam2)
## [1] 255755.1
According to AIC value Non proportional model fits best.
#Odds Ratio
round(exp(coef(fit.vgam2)), 3)
## (Intercept):1 (Intercept):2 smoker:1 smoker:2
## 74.930 10.835 0.762 0.656
## race_ethnhasian:1 race_ethnhasian:2 race_ethnhblack:1 race_ethnhblack:2
## 1.182 0.809 0.442 0.550
## race_ethnhothe:1 race_ethnhothe:2 race_ethnhwhite:1 race_ethnhwhite:2
## 0.745 0.916 1.137 1.116
## mage2(19,24]:1 mage2(19,24]:2 mage2(24,35]:1 mage2(24,35]:2
## 1.178 1.184 1.068 1.174
## mage2(35,56]:1 mage2(35,56]:2
## 0.731 0.864
#Confidence Interval
round(exp(confint(fit.vgam2)), 3)
## 2.5 % 97.5 %
## (Intercept):1 68.362 82.129
## (Intercept):2 10.425 11.261
## smoker:1 0.683 0.851
## smoker:2 0.627 0.687
## race_ethnhasian:1 1.030 1.356
## race_ethnhasian:2 0.769 0.851
## race_ethnhblack:1 0.413 0.473
## race_ethnhblack:2 0.533 0.568
## race_ethnhothe:1 0.547 1.016
## race_ethnhothe:2 0.791 1.060
## race_ethnhwhite:1 1.066 1.213
## race_ethnhwhite:2 1.087 1.147
## mage2(19,24]:1 1.060 1.308
## mage2(19,24]:2 1.133 1.238
## mage2(24,35]:1 0.970 1.176
## mage2(24,35]:2 1.127 1.223
## mage2(35,56]:1 0.653 0.818
## mage2(35,56]:2 0.822 0.907
#Interpretation: Smoker 1: Non smokers mothers were 34% less likely to have low birth weight babies compared to smoker mothers. Asians mothers are 18% more likely to have low birth weight babies compared to hispanics mothers. Black mothers are 56% less likely to have low weight babies compared to hispanics mothers. Other race mothers are 36% of the less likely to have low weight babies compared to hispanic mothers. White mothers are 13% more likely to have low weight babies compared to hispanic mothers.
Compared to 11-19years age group mothers, 19-24years age group mothers are more 18% more likely to have low birth weight babies.the 35-56year age group mothers have 27% less likely to have a low birth weight baby.
Smoker 2: Non smoker mother are 35% less likely to have low weight babies compared to smoker mothers. Asian mothers are 20% less likely to have low birth weight babies compared to hispanics mothers. Black mothers are 45% less likely to have low weight babies compared to hispanic mothers.
Other race mothers are 9% less likely to have low weight babies compared to hispanic mothers.
White mothers are 11% more likely to habe low weight babies compared to smoker mother.
Compared to 11-19years age group mothers, 19-24years age group mothers are more 18% more likely to have low birth weight babies.the 35-56year age group mothers have 14% less likely to have a low birth weight baby.