library(car)
library(stargazer)
library(survey)
library(questionr)
library(dplyr)
library(haven)

Loading Data

load(url("https://github.com/coreysparks/data/blob/master/brfss_2017.Rdata?raw=true"))
#names(brfss_17)
#converting to lower cases
nams<-names(brfss_17)
head(nams, n=10)
##  [1] "dispcode" "statere1" "safetime" "hhadult"  "genhlth"  "physhlth"
##  [7] "menthlth" "poorhlth" "hlthpln1" "persdoc2"
newnames<-tolower(gsub(pattern = "_",replacement =  "",x =  nams))
names(brfss_17)<-newnames

Recoding variables

#a)Define a binary outcome variable of your choosing and define how you recode the original variable

#I have recoded smokday2 variable into binary variable. I put “1” for those who smoked everyday and those said somedays. I coded “0” for those who smoked last month and prior to that. Hence I have created smoker=1 and non-smoker=0

mydat<- brfss_17

mydat$smoker<-Recode(mydat$smokday2,recodes="1:2=1; 3=0; else=NA")

table(mydat$smoker)
## 
##     0     1 
## 61659 29620

#b) State a research question about what factors you believe will affect your outcome variable.

#Does smoking status is varied by age? #Does smoking status is varied by education?

#Here I have selected age and education as predictor variable which are categorical. I have selected age to see which age group has most tendency to smoke compare to other. On the otherhand, as higher education provides more awareness compare to less educated people, this varible would reveal that whether that has any reflection on that or not.

#d)Descriptive analysis

#Age cut into intervals
mydat$agec<-cut(mydat$age80, breaks=c(0,24,39,59,79,99))

#Education
mydat$educ<-Recode(mydat$educa, recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA", as.factor=T)
mydat$educ<-relevel(mydat$educ, ref='2hsgrad')

Descriptive Analysis

table(mydat$smoker, mydat$agec)
##    
##     (0,24] (24,39] (39,59] (59,79] (79,99]
##   0    929    6760   16184   30801    6985
##   1   1604    6922   11688    8864     542
table(mydat$smoker, mydat$educ)
##    
##     2hsgrad 0Prim 1somehs 3somecol 4colgrad
##   0   15622  1106    2442    18252    24050
##   1   10574   763    2752     9579     5867
#column percentages
prop.table(table(mydat$smoker, mydat$agec), margin=2)
##    
##         (0,24]    (24,39]    (39,59]    (59,79]    (79,99]
##   0 0.36675878 0.49407981 0.58065442 0.77652843 0.92799256
##   1 0.63324122 0.50592019 0.41934558 0.22347157 0.07200744
prop.table(table(mydat$smoker, mydat$educ), margin=2)
##    
##       2hsgrad     0Prim   1somehs  3somecol  4colgrad
##   0 0.5963506 0.5917603 0.4701579 0.6558155 0.8038908
##   1 0.4036494 0.4082397 0.5298421 0.3441845 0.1961092
#basic chi square test of independence
chisq.test(table(mydat$smoker, mydat$agec))
## 
##  Pearson's Chi-squared test
## 
## data:  table(mydat$smoker, mydat$agec)
## X-squared = 8335.8, df = 4, p-value < 2.2e-16
chisq.test(table(mydat$smoker, mydat$educ))
## 
##  Pearson's Chi-squared test
## 
## data:  table(mydat$smoker, mydat$educ)
## X-squared = 4106.4, df = 4, p-value < 2.2e-16

Creating survey design

mydat$tx<-NA
mydat$tx[grep(pattern = "TX", mydat$mmsaname)]<-1
mydat<-mydat%>%
  filter(tx==1, is.na(mmsawt)==F)

options(survey.lonely.psu = "adjust")

des<-svydesign(ids=~1, strata=~ststr, weights=~mmsawt, data = mydat )

Simple weighted analysis

Re-doing the analysis from above using only weights:

cat<-wtd.table(mydat$smoker, mydat$agec, weights = mydat$mmsawt)
prop.table(wtd.table(mydat$smoker, mydat$agec, weights = mydat$mmsawt), margin=2)
##       (0,24]    (24,39]    (39,59]    (59,79]    (79,99]
## 0 0.41177038 0.41814904 0.58384826 0.73355673 0.92424106
## 1 0.58822962 0.58185096 0.41615174 0.26644327 0.07575894
#compare that with the original
prop.table(table(mydat$smoker, mydat$agec), margin=2)
##    
##         (0,24]    (24,39]    (39,59]    (59,79]    (79,99]
##   0 0.37234043 0.51483051 0.57678781 0.75658363 0.92556634
##   1 0.62765957 0.48516949 0.42321219 0.24341637 0.07443366
cat<-wtd.table(mydat$smoker, mydat$educ, weights = mydat$mmsawt)
prop.table(wtd.table(mydat$smoker, mydat$educ, weights = mydat$mmsawt), margin=2)
##     2hsgrad     0Prim   1somehs  3somecol  4colgrad
## 0 0.4927144 0.5698109 0.2322339 0.5860218 0.7928775
## 1 0.5072856 0.4301891 0.7677661 0.4139782 0.2071225
#compare that with the original
prop.table(table(mydat$smoker, mydat$educ), margin=2)
##    
##       2hsgrad     0Prim   1somehs  3somecol  4colgrad
##   0 0.6105960 0.6120690 0.4205128 0.6274921 0.8143116
##   1 0.3894040 0.3879310 0.5794872 0.3725079 0.1856884

#Standard errors of these percentages

n<-table(is.na(mydat$smoker)==F)
n

FALSE TRUE 5500 3133

p<-prop.table(wtd.table(mydat$smoker, mydat$educ, weights = mydat$mmsawt), margin=2)
se<-(p*(1-p))/n[2]

stargazer(data.frame(proportion=p, se=sqrt(se)), summary = F, type = "html", digits = 2)
proportion.Var1 proportion.Var2 proportion.Freq se.Var1 se.Var2 se.Freq
1 0 2hsgrad 0.49 0 2hsgrad 0.01
2 1 2hsgrad 0.51 1 2hsgrad 0.01
3 0 0Prim 0.57 0 0Prim 0.01
4 1 0Prim 0.43 1 0Prim 0.01
5 0 1somehs 0.23 0 1somehs 0.01
6 1 1somehs 0.77 1 1somehs 0.01
7 0 3somecol 0.59 0 3somecol 0.01
8 1 3somecol 0.41 1 3somecol 0.01
9 0 4colgrad 0.79 0 4colgrad 0.01
10 1 4colgrad 0.21 1 4colgrad 0.01

#Since we have weighted, the standard error is too small. This is the reason of using survey statistical methods, to get the right standard error for a statistic.

#a) Calculate descriptive statistics (mean or percentage) for each variable using no weights or survey design, as well as with full survey and weights.

cat<-svytable(~smoker+educ, design = des)

stargazer(data.frame(prop.table(svytable(~smoker+educ, design = des), margin = 2)),summary = F, type = "html", digits=3)
smoker educ Freq
1 0 2hsgrad 0.493
2 1 2hsgrad 0.507
3 0 0Prim 0.570
4 1 0Prim 0.430
5 0 1somehs 0.232
6 1 1somehs 0.768
7 0 3somecol 0.586
8 1 3somecol 0.414
9 0 4colgrad 0.793
10 1 4colgrad 0.207

#This gives the same value as in the weighted one. Therefore we need to correct our standard error for smoker prevalence.

sv.table<-svyby(formula = ~smoker, by = ~agec, design = des, FUN = svymean, na.rm=T)
stargazer(sv.table, summary = F, type = "html", digits = 2)
agec smoker se
(0,24] (0,24] 0.59 0.09
(24,39] (24,39] 0.58 0.04
(39,59] (39,59] 0.42 0.03
(59,79] (59,79] 0.27 0.03
(79,99] (79,99] 0.08 0.04
sv.table<-svyby(formula = ~smoker, by = ~educ, design = des, FUN = svymean, na.rm=T)
stargazer(sv.table, summary = F, type = "html", digits = 2)
educ smoker se
2hsgrad 2hsgrad 0.51 0.04
0Prim 0Prim 0.43 0.09
1somehs 1somehs 0.77 0.05
3somecol 3somecol 0.41 0.03
4colgrad 4colgrad 0.21 0.02

#And I see the prevalences as in the simple weighted table, but the standard errors have now been adjusted for survey design. They are much larger than the ones we computed above, which assumed random sampling.

#b) Calculate percentages, or means, for each of your independent variables for each level of your outcome variable and present this in a table with appropriate survey-corrected test statistics. (tableone package helps)

library(tableone)
#not using survey design
t1<-CreateTableOne(vars = c( "agec", "educ"), strata = "smoker", test = T, data = mydat)
#t1<-print(t1, format="p")
print(t1,format="p")
##              Stratified by smoker
##               0    1    p      test
##   n           2119 1014            
##   agec (%)              <0.001     
##      (0,24]    1.7  5.8            
##      (24,39]  11.5 22.6            
##      (39,59]  23.2 35.6            
##      (59,79]  50.2 33.7            
##      (79,99]  13.5  2.3            
##   educ (%)              <0.001     
##      2hsgrad  21.8 29.1            
##      0Prim     3.4  4.4            
##      1somehs   3.9 11.2            
##      3somecol 28.3 35.1            
##      4colgrad 42.6 20.3
#using survey design
st1<-svyCreateTableOne(vars = c( "agec", "educ"), strata = "smoker", test = T, data = des)
#st1<-print(st1, format="p")
print(st1, format="p")
##              Stratified by smoker
##               0         1         p      test
##   n           2842271.3 2136165.3            
##   agec (%)                        <0.001     
##      (0,24]         5.7      10.9            
##      (24,39]       20.5      37.9            
##      (39,59]       36.9      35.0            
##      (59,79]       32.7      15.8            
##      (79,99]        4.2       0.5            
##   educ (%)                        <0.001     
##      2hsgrad       23.3      31.9            
##      0Prim          7.1       7.1            
##      1somehs        3.7      16.4            
##      3somecol      37.0      34.7            
##      4colgrad      28.8      10.0

Regression example

Next we apply this logic to a regression case. First we fit the OLS model for smoker outcome using education and age as predictors:

fit1<-lm(smoker~educ+agec, data=mydat)
fit1
## 
## Call:
## lm(formula = smoker ~ educ + agec, data = mydat)
## 
## Coefficients:
##  (Intercept)     educ0Prim   educ1somehs  educ3somecol  educ4colgrad  
##     0.629158      0.007865      0.170637     -0.017477     -0.186240  
##  agec(24,39]   agec(39,59]   agec(59,79]   agec(79,99]  
##    -0.088199     -0.150414     -0.319223     -0.484784

Next we incorporate case weights

fit2<-lm(smoker~educ+agec, data=mydat, weights = mmsawt)
fit2
## 
## Call:
## lm(formula = smoker ~ educ + agec, data = mydat, weights = mmsawt)
## 
## Coefficients:
##  (Intercept)     educ0Prim   educ1somehs  educ3somecol  educ4colgrad  
##      0.62330      -0.05924       0.24399      -0.08997      -0.27268  
##  agec(24,39]   agec(39,59]   agec(59,79]   agec(79,99]  
##      0.02183      -0.14110      -0.26170      -0.44284

To incorporate design effects as well:

fit3<-svyglm(smoker~educ+agec,des, family=gaussian)
fit3
## Stratified Independent Sampling design (with replacement)
## svydesign(ids = ~1, strata = ~ststr, weights = ~mmsawt, data = mydat)
## 
## Call:  svyglm(formula = smoker ~ educ + agec, design = des, family = gaussian)
## 
## Coefficients:
##  (Intercept)     educ0Prim   educ1somehs  educ3somecol  educ4colgrad  
##      0.62330      -0.05924       0.24399      -0.08997      -0.27268  
##  agec(24,39]   agec(39,59]   agec(59,79]   agec(79,99]  
##      0.02183      -0.14110      -0.26170      -0.44284  
## 
## Degrees of Freedom: 3122 Total (i.e. Null);  3069 Residual
##   (5510 observations deleted due to missingness)
## Null Deviance:       681.8 
## Residual Deviance: 578.9     AIC: 9359

To show the results of the three models:

stargazer(fit1, fit2, fit3, style="demography", type="html",
          column.labels = c("OLS", "Weights Only", "Survey"),
          title = "Regression models for smoker using survey data - BRFSS 2017", 
          covariate.labels=c("PrimarySchool", "SomeHS", "SomeColl", "CollGrad", "Age 24-39","Age 39-59" ,"Age 59-79", "Age 80+"), 
          keep.stat="n", model.names=F, align=T, ci=T)
Regression models for smoker using survey data - BRFSS 2017
smoker
OLS Weights Only Survey
Model 1 Model 2 Model 3
PrimarySchool 0.008 -0.059 -0.059
(-0.078, 0.093) (-0.127, 0.009) (-0.237, 0.119)
SomeHS 0.171*** 0.244*** 0.244***
(0.102, 0.239) (0.182, 0.306) (0.120, 0.368)
SomeColl -0.017 -0.090*** -0.090
(-0.059, 0.024) (-0.131, -0.049) (-0.188, 0.008)
CollGrad -0.186*** -0.273*** -0.273***
(-0.227, -0.146) (-0.320, -0.225) (-0.359, -0.186)
Age 24-39 -0.088 0.022 0.022
(-0.186, 0.009) (-0.044, 0.087) (-0.157, 0.200)
Age 39-59 -0.150** -0.141*** -0.141
(-0.244, -0.057) (-0.205, -0.077) (-0.318, 0.036)
Age 59-79 -0.319*** -0.262*** -0.262**
(-0.411, -0.227) (-0.328, -0.195) (-0.439, -0.085)
Age 80+ -0.485*** -0.443*** -0.443***
(-0.586, -0.383) (-0.558, -0.328) (-0.626, -0.260)
Constant 0.629*** 0.623*** 0.623***
(0.538, 0.720) (0.563, 0.684) (0.462, 0.785)
N 3,123 3,123 3,123
p < .05; p < .01; p < .001

#The result of education and age models are less significant compared to prior two models. This is because of the too small parameters. The survey and weight model have similar beta but the standard error is larger in survey model.

library(ggplot2)
library(dplyr)
coefs<-data.frame(coefs=c(coef(fit1)[-1], coef(fit3)[-1]),
                  mod=c(rep("Non Survey Model", 8),rep("Survey Model", 8)),
                  effect=rep(names(coef(fit1)[-1]), 2))

coefs%>%
  ggplot()+
  geom_point(aes( x=effect, y=coefs, group=effect,color=effect, shape=mod),
             position=position_jitterdodge(jitter.width = 1),
             size=2)+
  ylab("Regression Coefficient")+
  xlab("Beta")+
  geom_abline(intercept = 0, slope=0)+
  theme(axis.text.x = element_text(angle = 45, hjust = 1))+
  ggtitle(label = "Comparison of Survey and Non-Survey Regression effects")

#Which shows us that the betas are similar but have some differences between the two models.