#Load data for analysis#sub-setting and Re-codding variables for analysis purposes
brfss <- readRDS("brfss_177.rds")
# Cleaning the variable names for space, underscore & Uppercase Characters
renam<-names(brfss)
newnames<-tolower(gsub(pattern = "_",replacement = "",x = renam))
names(brfss)<-newnames
homewk3 <- brfss %>%
dplyr::select(state,llcpwt,dispcode,seqno,psu,bmi5cat,racegr3,educa,ststr,smoke100,exerany2,sex,ageg,menthlth,smokday2,sleptim1) %>%
mutate( smokers=ifelse(smoke100==1 & smokday2%in%c(1,2),"Current",
ifelse(smoke100==1 & smokday2==3, "Former",
ifelse(smoke100==2, "Never", NA)))) %>%
mutate(bmi = car::Recode(bmi5cat,recodes="1='Underweight';2='Normal';3='Overweight'; 4='Obese';else=NA",as.factor=T ),
educa = car::Recode(educa,recodes="1:2='<HS';3:4='HS';5:6='>HS';else=NA", as.factor=T ),
racegr3 = car::Recode(racegr3,recodes="1='NH-White';2='NH-Black';3:4='Other';5='Hispanic';else=NA",as.factor=T ),
ageg = car::Recode(ageg,recodes="1='18-24';2='25-30';3='35-44';4='45-54';5='55-64';else='65+'" ),
Physicala = car::Recode(exerany2,recodes="='Yes';2='No';else=NA",as.factor=T ), sex = car::Recode(sex,recodes="1='Male';2='Female';else=NA",as.factor=T ),
sleepdurat=car::Recode(sleptim1, recodes="77=NA ;99= NA"),
mentald=ifelse(menthlth<31, "Yes",
ifelse(menthlth==88, "No",NA)))
#Data containing only variables used in the analysis
Final_data <- homewk3 %>%
dplyr:: select(state,llcpwt,dispcode,seqno,psu,smokers,bmi,educa,ageg,Physicala, sleepdurat,sex,racegr3,mentald,ststr) %>%
filter(complete.cases(.))
# removing unwanted data from the R environment
rm(brfss,newnames,renam,rg);gc()
## Warning in rm(brfss, newnames, renam, rg): object 'rg' not found
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2576943 137.7 5325135 284.4 2995657 160.0
## Vcells 12708632 97.0 156031371 1190.5 194979110 1487.6
#rm(list = ls())
# Survey Design
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~1, strata=~ststr, weights=~llcpwt, data = Final_data )
For the purpose of this analysis, the outcome variable is self reported average hours of sleep in a 24 period. The questiuon on survey is: “On average, how many hours of sleep do you have in a day” was re-coded such that 3 will be the lowest (Inadequate) sleep duration and 1 being the Highest (More than adequate) sleep duration any of the respondents reported.
what is the relationship between average daily sleep duration and Body Mass Index(BMI) ?
Since i am not interested in rates but counts of the outcome variable, i will not be using an offset for the poisson model
svyhist(~sleepdurat, des)
svyby(~sleepdurat, ~bmi+educa, des, svymean, na.rm=T)
svyby(~sleepdurat, ~ageg, des, svymean, na.rm=T)
#Poisson glm fit to survey data
fit1<-svyglm(sleepdurat~factor(bmi)+factor(educa)+factor(ageg), design=des, family=poisson)
summary(fit1)
##
## Call:
## svyglm(formula = sleepdurat ~ factor(bmi) + factor(educa) + factor(ageg),
## design = des, family = poisson)
##
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~llcpwt, data = Final_data)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.983388 0.026176 75.772 < 2e-16 ***
## factor(bmi)Obese -0.004991 0.009544 -0.523 0.60103
## factor(bmi)Overweight -0.005767 0.009420 -0.612 0.54042
## factor(bmi)Underweight 0.018093 0.025691 0.704 0.48128
## factor(educa)>HS -0.037761 0.018476 -2.044 0.04100 *
## factor(educa)HS -0.038523 0.018851 -2.044 0.04102 *
## factor(ageg)25-30 -0.028610 0.020649 -1.386 0.16592
## factor(ageg)35-44 -0.017171 0.020079 -0.855 0.39249
## factor(ageg)45-54 -0.044254 0.019932 -2.220 0.02642 *
## factor(ageg)55-64 -0.006065 0.018283 -0.332 0.74008
## factor(ageg)65+ 0.048138 0.017598 2.735 0.00624 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 0.3740781)
##
## Number of Fisher Scoring iterations: 4
#here are the poisson model "risk ratios", which just show the change in the mean
round(exp(summary(fit1)$coef[-1,1]), 3)
## factor(bmi)Obese factor(bmi)Overweight factor(bmi)Underweight
## 0.995 0.994 1.018
## factor(educa)>HS factor(educa)HS factor(ageg)25-30
## 0.963 0.962 0.972
## factor(ageg)35-44 factor(ageg)45-54 factor(ageg)55-64
## 0.983 0.957 0.994
## factor(ageg)65+
## 1.049
# fitting logit for each changes
library(AER)
## Loading required package: lmtest
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
dispersiontest(fit1,trafo=1)
##
## Overdispersion test
##
## data: fit1
## z = -67.147, p-value = 1
## alternative hypothesis: true alpha is greater than 0
## sample estimates:
## alpha
## -0.6416407
Since the p-value from the test above is greater than 0 it means that there is no overdispersion. However, the alpha from the test is less than 0 this indiccates underdispersion. The result from the test suggests that Poisson model is not a good choice for the data.
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
fit2<-glm.nb(sleepdurat~factor(bmi)+factor(racegr3)+factor(educa),
data=Final_data,
weights=llcpwt/mean(llcpwt, na.rm=T))
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
coeftest(fit2, vcov=vcovHC(fit2, type="HC1",cluster="ststr"))
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.9917218 0.0203111 98.0609 < 2.2e-16 ***
## factor(bmi)Obese -0.0081711 0.0098870 -0.8265 0.408548
## factor(bmi)Overweight -0.0075330 0.0097022 -0.7764 0.437503
## factor(bmi)Underweight 0.0251463 0.0261481 0.9617 0.336207
## factor(racegr3)NH-Black -0.0265255 0.0194589 -1.3632 0.172834
## factor(racegr3)NH-White -0.0123831 0.0126914 -0.9757 0.329211
## factor(racegr3)Other -0.0599543 0.0195331 -3.0694 0.002145 **
## factor(educa)>HS -0.0287362 0.0192925 -1.4895 0.136357
## factor(educa)HS -0.0330634 0.0195729 -1.6892 0.091172 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
fit.nb1<-glm.nb(sleepdurat~factor(bmi)+factor(racegr3)+factor(educa),
data=Final_data,
weights=llcpwt/mean(llcpwt, na.rm=T))
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
fit.nb2<-glm.nb(sleepdurat~factor(bmi)+factor(racegr3)+factor(educa)+factor(ageg),
data=Final_data,
weights=llcpwt/mean(llcpwt, na.rm=T))
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
tests1<-coeftest(fit.nb1, vcov=vcovHC(fit.nb2, type="HC1",cluster="ststr"))
tests<-coeftest(fit.nb2, vcov=vcovHC(fit.nb2, type="HC1",cluster="ststr"))
stargazer(fit.nb1, fit.nb2,style="demography", type = "text", t.auto=F,p.auto=F,coef=list(tests1[, 1],tests[,1]), se =list(tests1[, 2], tests[, 2]), p=list(tests1[,4],tests[, 4]) )
##
## ---------------------------------------------------------------------------
## sleepdurat
## Model 1 Model 2
## ---------------------------------------------------------------------------
## factor(bmi)Obese -0.008 -0.006
## (0.010) (0.010)
## factor(bmi)Overweight -0.008 -0.008
## (0.009) (0.009)
## factor(bmi)Underweight 0.025 0.019
## (0.026) (0.026)
## factor(racegr3)NH-Black -0.027 -0.037
## (0.019) (0.019)
## factor(racegr3)NH-White -0.012 -0.033**
## (0.013) (0.013)
## factor(racegr3)Other -0.060** -0.067***
## (0.019) (0.019)
## factor(educa)> HS -0.029 -0.020
## (0.019) (0.019)
## factor(educa)HS -0.033 -0.024
## (0.020) (0.020)
## factor(ageg)25-30 -0.030
## (0.021)
## factor(ageg)35-44 -0.017
## (0.020)
## factor(ageg)45-54 -0.042*
## (0.020)
## factor(ageg)55-64 -0.002
## (0.018)
## factor(ageg)65+ 0.052**
## (0.018)
## Constant 1.992*** 1.998***
## (0.027) (0.027)
## N 11,432 11,432
## Log Likelihood -23,761.520 -23,715.180
## theta 333,562.800 (471,726.300) 339,828.300 (481,934.700)
## AIC 47,541.040 47,458.370
## ---------------------------------------------------------------------------
## *p < .05; **p < .01; ***p < .001
The second model appears to fit the data better. It has the smallest AIC 47,458.370