library(car)
library(stargazer)
library(survey)
library(questionr)
library(dplyr)
library(haven)
load(url("https://github.com/coreysparks/data/blob/master/brfss_2017.Rdata?raw=true"))
#names(brfss_17)
#converting to lower cases
nams<-names(brfss_17)
head(nams, n=10)
## [1] "dispcode" "statere1" "safetime" "hhadult" "genhlth" "physhlth"
## [7] "menthlth" "poorhlth" "hlthpln1" "persdoc2"
newnames<-tolower(gsub(pattern = "_",replacement = "",x = nams))
names(brfss_17)<-newnames
#a)Define a binary outcome variable of your choosing and define how you recode the original variable
#I have recoded smokday2 variable into binary variable. I put “1” for those who smoked everyday and those said somedays. I coded “0” for those who smoked last month and prior to that. Hence I have created smoker=1 and non-smoker=0
mydat<- brfss_17
mydat$smoker<-Recode(mydat$smokday2,recodes="1:2=1; 3=0; else=NA")
table(mydat$smoker)
##
## 0 1
## 61659 29620
#b) State a research question about what factors you believe will affect your outcome variable.
#Does smoking status is varied by age? #Does smoking status is varied by education?
#Here I have selected age and education as predictor variable which are categorical. I have selected age to see which age group has most tendency to smoke compare to other. On the otherhand, as higher education provides more awareness compare to less educated people, this varible would reveal that whether that has any reflection on that or not.
#d)Descriptive analysis
#Age cut into intervals
mydat$agec<-cut(mydat$age80, breaks=c(0,24,39,59,79,99))
#Education
mydat$educ<-Recode(mydat$educa, recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA", as.factor=T)
mydat$educ<-relevel(mydat$educ, ref='2hsgrad')
table(mydat$smoker, mydat$agec)
##
## (0,24] (24,39] (39,59] (59,79] (79,99]
## 0 929 6760 16184 30801 6985
## 1 1604 6922 11688 8864 542
table(mydat$smoker, mydat$educ)
##
## 2hsgrad 0Prim 1somehs 3somecol 4colgrad
## 0 15622 1106 2442 18252 24050
## 1 10574 763 2752 9579 5867
#column percentages
prop.table(table(mydat$smoker, mydat$agec), margin=2)
##
## (0,24] (24,39] (39,59] (59,79] (79,99]
## 0 0.36675878 0.49407981 0.58065442 0.77652843 0.92799256
## 1 0.63324122 0.50592019 0.41934558 0.22347157 0.07200744
prop.table(table(mydat$smoker, mydat$educ), margin=2)
##
## 2hsgrad 0Prim 1somehs 3somecol 4colgrad
## 0 0.5963506 0.5917603 0.4701579 0.6558155 0.8038908
## 1 0.4036494 0.4082397 0.5298421 0.3441845 0.1961092
#basic chi square test of independence
chisq.test(table(mydat$smoker, mydat$agec))
##
## Pearson's Chi-squared test
##
## data: table(mydat$smoker, mydat$agec)
## X-squared = 8335.8, df = 4, p-value < 2.2e-16
chisq.test(table(mydat$smoker, mydat$educ))
##
## Pearson's Chi-squared test
##
## data: table(mydat$smoker, mydat$educ)
## X-squared = 4106.4, df = 4, p-value < 2.2e-16
mydat$tx<-NA
mydat$tx[grep(pattern = "TX", mydat$mmsaname)]<-1
mydat<-mydat%>%
filter(tx==1, is.na(mmsawt)==F)
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~1, strata=~ststr, weights=~mmsawt, data = mydat )
Re-doing the analysis from above using only weights:
cat<-wtd.table(mydat$smoker, mydat$agec, weights = mydat$mmsawt)
prop.table(wtd.table(mydat$smoker, mydat$agec, weights = mydat$mmsawt), margin=2)
## (0,24] (24,39] (39,59] (59,79] (79,99]
## 0 0.41177038 0.41814904 0.58384826 0.73355673 0.92424106
## 1 0.58822962 0.58185096 0.41615174 0.26644327 0.07575894
#compare that with the original
prop.table(table(mydat$smoker, mydat$agec), margin=2)
##
## (0,24] (24,39] (39,59] (59,79] (79,99]
## 0 0.37234043 0.51483051 0.57678781 0.75658363 0.92556634
## 1 0.62765957 0.48516949 0.42321219 0.24341637 0.07443366
cat<-wtd.table(mydat$smoker, mydat$educ, weights = mydat$mmsawt)
prop.table(wtd.table(mydat$smoker, mydat$educ, weights = mydat$mmsawt), margin=2)
## 2hsgrad 0Prim 1somehs 3somecol 4colgrad
## 0 0.4927144 0.5698109 0.2322339 0.5860218 0.7928775
## 1 0.5072856 0.4301891 0.7677661 0.4139782 0.2071225
#compare that with the original
prop.table(table(mydat$smoker, mydat$educ), margin=2)
##
## 2hsgrad 0Prim 1somehs 3somecol 4colgrad
## 0 0.6105960 0.6120690 0.4205128 0.6274921 0.8143116
## 1 0.3894040 0.3879310 0.5794872 0.3725079 0.1856884
#Standard errors of these percentages
n<-table(is.na(mydat$smoker)==F)
n
FALSE TRUE 5500 3133
p<-prop.table(wtd.table(mydat$smoker, mydat$educ, weights = mydat$mmsawt), margin=2)
se<-(p*(1-p))/n[2]
stargazer(data.frame(proportion=p, se=sqrt(se)), summary = F, type = "html", digits = 2)
| proportion.Var1 | proportion.Var2 | proportion.Freq | se.Var1 | se.Var2 | se.Freq | |
| 1 | 0 | 2hsgrad | 0.49 | 0 | 2hsgrad | 0.01 |
| 2 | 1 | 2hsgrad | 0.51 | 1 | 2hsgrad | 0.01 |
| 3 | 0 | 0Prim | 0.57 | 0 | 0Prim | 0.01 |
| 4 | 1 | 0Prim | 0.43 | 1 | 0Prim | 0.01 |
| 5 | 0 | 1somehs | 0.23 | 0 | 1somehs | 0.01 |
| 6 | 1 | 1somehs | 0.77 | 1 | 1somehs | 0.01 |
| 7 | 0 | 3somecol | 0.59 | 0 | 3somecol | 0.01 |
| 8 | 1 | 3somecol | 0.41 | 1 | 3somecol | 0.01 |
| 9 | 0 | 4colgrad | 0.79 | 0 | 4colgrad | 0.01 |
| 10 | 1 | 4colgrad | 0.21 | 1 | 4colgrad | 0.01 |
#Since we have weighted, the standard error is too small. This is the reason of using survey statistical methods, to get the right standard error for a statistic.
#a) Calculate descriptive statistics (mean or percentage) for each variable using no weights or survey design, as well as with full survey and weights.
cat<-svytable(~smoker+educ, design = des)
stargazer(data.frame(prop.table(svytable(~smoker+educ, design = des), margin = 2)),summary = F, type = "html", digits=3)
| smoker | educ | Freq | |
| 1 | 0 | 2hsgrad | 0.493 |
| 2 | 1 | 2hsgrad | 0.507 |
| 3 | 0 | 0Prim | 0.570 |
| 4 | 1 | 0Prim | 0.430 |
| 5 | 0 | 1somehs | 0.232 |
| 6 | 1 | 1somehs | 0.768 |
| 7 | 0 | 3somecol | 0.586 |
| 8 | 1 | 3somecol | 0.414 |
| 9 | 0 | 4colgrad | 0.793 |
| 10 | 1 | 4colgrad | 0.207 |
#This gives the same value as in the weighted one. Therefore we need to correct our standard error for smoker prevalence.
sv.table<-svyby(formula = ~smoker, by = ~agec, design = des, FUN = svymean, na.rm=T)
stargazer(sv.table, summary = F, type = "html", digits = 2)
| agec | smoker | se | |
| (0,24] | (0,24] | 0.59 | 0.09 |
| (24,39] | (24,39] | 0.58 | 0.04 |
| (39,59] | (39,59] | 0.42 | 0.03 |
| (59,79] | (59,79] | 0.27 | 0.03 |
| (79,99] | (79,99] | 0.08 | 0.04 |
sv.table<-svyby(formula = ~smoker, by = ~educ, design = des, FUN = svymean, na.rm=T)
stargazer(sv.table, summary = F, type = "html", digits = 2)
| educ | smoker | se | |
| 2hsgrad | 2hsgrad | 0.51 | 0.04 |
| 0Prim | 0Prim | 0.43 | 0.09 |
| 1somehs | 1somehs | 0.77 | 0.05 |
| 3somecol | 3somecol | 0.41 | 0.03 |
| 4colgrad | 4colgrad | 0.21 | 0.02 |
#And I see the prevalences as in the simple weighted table, but the standard errors have now been adjusted for survey design. They are much larger than the ones we computed above, which assumed random sampling.
#b) Calculate percentages, or means, for each of your independent variables for each level of your outcome variable and present this in a table with appropriate survey-corrected test statistics. (tableone package helps)
library(tableone)
#not using survey design
t1<-CreateTableOne(vars = c( "agec", "educ"), strata = "smoker", test = T, data = mydat)
#t1<-print(t1, format="p")
print(t1,format="p")
## Stratified by smoker
## 0 1 p test
## n 2119 1014
## agec (%) <0.001
## (0,24] 1.7 5.8
## (24,39] 11.5 22.6
## (39,59] 23.2 35.6
## (59,79] 50.2 33.7
## (79,99] 13.5 2.3
## educ (%) <0.001
## 2hsgrad 21.8 29.1
## 0Prim 3.4 4.4
## 1somehs 3.9 11.2
## 3somecol 28.3 35.1
## 4colgrad 42.6 20.3
#using survey design
st1<-svyCreateTableOne(vars = c( "agec", "educ"), strata = "smoker", test = T, data = des)
#st1<-print(st1, format="p")
print(st1, format="p")
## Stratified by smoker
## 0 1 p test
## n 2842271.3 2136165.3
## agec (%) <0.001
## (0,24] 5.7 10.9
## (24,39] 20.5 37.9
## (39,59] 36.9 35.0
## (59,79] 32.7 15.8
## (79,99] 4.2 0.5
## educ (%) <0.001
## 2hsgrad 23.3 31.9
## 0Prim 7.1 7.1
## 1somehs 3.7 16.4
## 3somecol 37.0 34.7
## 4colgrad 28.8 10.0
Next we apply this logic to a regression case. First we fit the OLS model for smoker outcome using education and age as predictors:
fit1<-lm(smoker~educ+agec, data=mydat)
fit1
##
## Call:
## lm(formula = smoker ~ educ + agec, data = mydat)
##
## Coefficients:
## (Intercept) educ0Prim educ1somehs educ3somecol educ4colgrad
## 0.629158 0.007865 0.170637 -0.017477 -0.186240
## agec(24,39] agec(39,59] agec(59,79] agec(79,99]
## -0.088199 -0.150414 -0.319223 -0.484784
Next we incorporate case weights
fit2<-lm(smoker~educ+agec, data=mydat, weights = mmsawt)
fit2
##
## Call:
## lm(formula = smoker ~ educ + agec, data = mydat, weights = mmsawt)
##
## Coefficients:
## (Intercept) educ0Prim educ1somehs educ3somecol educ4colgrad
## 0.62330 -0.05924 0.24399 -0.08997 -0.27268
## agec(24,39] agec(39,59] agec(59,79] agec(79,99]
## 0.02183 -0.14110 -0.26170 -0.44284
To incorporate design effects as well:
fit3<-svyglm(smoker~educ+agec,des, family=gaussian)
fit3
## Stratified Independent Sampling design (with replacement)
## svydesign(ids = ~1, strata = ~ststr, weights = ~mmsawt, data = mydat)
##
## Call: svyglm(formula = smoker ~ educ + agec, design = des, family = gaussian)
##
## Coefficients:
## (Intercept) educ0Prim educ1somehs educ3somecol educ4colgrad
## 0.62330 -0.05924 0.24399 -0.08997 -0.27268
## agec(24,39] agec(39,59] agec(59,79] agec(79,99]
## 0.02183 -0.14110 -0.26170 -0.44284
##
## Degrees of Freedom: 3122 Total (i.e. Null); 3069 Residual
## (5510 observations deleted due to missingness)
## Null Deviance: 681.8
## Residual Deviance: 578.9 AIC: 9359
To show the results of the three models:
stargazer(fit1, fit2, fit3, style="demography", type="html",
column.labels = c("OLS", "Weights Only", "Survey"),
title = "Regression models for smoker using survey data - BRFSS 2017",
covariate.labels=c("PrimarySchool", "SomeHS", "SomeColl", "CollGrad", "Age 24-39","Age 39-59" ,"Age 59-79", "Age 80+"),
keep.stat="n", model.names=F, align=T, ci=T)
| smoker | |||
| OLS | Weights Only | Survey | |
| Model 1 | Model 2 | Model 3 | |
| PrimarySchool | 0.008 | -0.059 | -0.059 |
| (-0.078, 0.093) | (-0.127, 0.009) | (-0.237, 0.119) | |
| SomeHS | 0.171*** | 0.244*** | 0.244*** |
| (0.102, 0.239) | (0.182, 0.306) | (0.120, 0.368) | |
| SomeColl | -0.017 | -0.090*** | -0.090 |
| (-0.059, 0.024) | (-0.131, -0.049) | (-0.188, 0.008) | |
| CollGrad | -0.186*** | -0.273*** | -0.273*** |
| (-0.227, -0.146) | (-0.320, -0.225) | (-0.359, -0.186) | |
| Age 24-39 | -0.088 | 0.022 | 0.022 |
| (-0.186, 0.009) | (-0.044, 0.087) | (-0.157, 0.200) | |
| Age 39-59 | -0.150** | -0.141*** | -0.141 |
| (-0.244, -0.057) | (-0.205, -0.077) | (-0.318, 0.036) | |
| Age 59-79 | -0.319*** | -0.262*** | -0.262** |
| (-0.411, -0.227) | (-0.328, -0.195) | (-0.439, -0.085) | |
| Age 80+ | -0.485*** | -0.443*** | -0.443*** |
| (-0.586, -0.383) | (-0.558, -0.328) | (-0.626, -0.260) | |
| Constant | 0.629*** | 0.623*** | 0.623*** |
| (0.538, 0.720) | (0.563, 0.684) | (0.462, 0.785) | |
| N | 3,123 | 3,123 | 3,123 |
| p < .05; p < .01; p < .001 | |||
#The result of education and age models are less significant compared to prior two models. This is because of the too small parameters. The survey and weight model have similar beta but the standard error is larger in survey model.
library(ggplot2)
library(dplyr)
coefs<-data.frame(coefs=c(coef(fit1)[-1], coef(fit3)[-1]),
mod=c(rep("Non Survey Model", 8),rep("Survey Model", 8)),
effect=rep(names(coef(fit1)[-1]), 2))
coefs%>%
ggplot()+
geom_point(aes( x=effect, y=coefs, group=effect,color=effect, shape=mod),
position=position_jitterdodge(jitter.width = 1),
size=2)+
ylab("Regression Coefficient")+
xlab("Beta")+
geom_abline(intercept = 0, slope=0)+
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
ggtitle(label = "Comparison of Survey and Non-Survey Regression effects")
#Which shows us that the betas are similar but have some differences between the two models.