Question# a) Define a binary outcome variable of your choosing and define how you recode the original variable.

A binary outcome variable of my choosing is alcohol consumption in past 30 days. This variable is re coded as: 1: Yes 0: No 7 and 9:NA

Question# b) State a research question about what factors you believe will affect your outcome variable.

To understand the effects of race/ethnicity and education level in alcohol consumption.

Question# c) Define at least 2 predictor variables, based on your research question. For this assignment, it’s best if these are categorical variables.

Predictor variables: Race and Education level

Loading libraries

#load brfss
library(car)
## Loading required package: carData
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(questionr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(forcats)

Reading the BRFSS data

brfss20<-readRDS(url
  ("https://github.com/coreysparks/DEM7283/blob/master/data/brfss20sm.rds?raw=true"))
### Fix variable names
names(brfss20) <- tolower(gsub(pattern = "_",replacement =  "",x =  
                                 names(brfss20)))

Recoding of outcome and presictor variables

#Alcohol Intake within past 30 days
brfss20$alcohol<-Recode(brfss20$drnkany5, recodes="1=1; 2=0; else = NA")


#race/ethnicity
brfss20$black<-Recode(brfss20$racegr3,
                      recodes="2='black'; 9=NA; else='Others'")

brfss20$white<-Recode(brfss20$racegr3, 
                      recodes="1='white'; 9=NA; else='Others'")

brfss20$other<-Recode(brfss20$racegr3,
                      recodes="3:4='other'; 9=NA; else='Others'")

brfss20$hispanic<-Recode(brfss20$racegr3,
                         recodes="5=1; 9=NA; else='Others'")

brfss20$race_eth<-Recode(brfss20$racegr3,
recodes="1='NH-White'; 2='NH-Black'; 3='NH-Other';4='NH-Multirace'; 5='Hispanic'; else=NA",
as.factor = T)
brfss20$race_eth<-relevel(brfss20$race_eth,
                          ref = "NH-White")

#education level
brfss20$educ<-Recode(brfss20$educa,
                     recodes="1:2='Primary School'; 3='Some HS'; 4='HS Graduate'; 
                     5='Some College'; 6='College Graduate';9=NA",
                     as.factor=T)
brfss20$educ<-fct_relevel(brfss20$educ,'Primary School','Some HS','HS Graduate',
                          'Some College','College Graduate' ) 

Filter cases

brfss20<-brfss20%>%
  filter(
         is.na(educ)==F,
         is.na(alcohol)==F)

Analysis

Assuming Random Sampling

Frequencies and 2*k tables

Tables below show proportion of population grouped by race/ethnicity (first) and education level(second) who responded they have/haven’t taken alcoholic drinks in past 30 days.

#column percentages
100*prop.table(table(brfss20$alcohol, brfss20$race_eth), margin=2)
##    
##     NH-White Hispanic NH-Black NH-Multirace NH-Other
##   0 42.60512 54.85990 53.99919     45.91488 55.33052
##   1 57.39488 45.14010 46.00081     54.08512 44.66948
#column percentages
100*prop.table(table(brfss20$alcohol, brfss20$educ), margin=2)
##    
##     Primary School  Some HS HS Graduate Some College College Graduate
##   0       73.77336 65.01203    55.90546     47.96961         36.33952
##   1       26.22664 34.98797    44.09454     52.03039         63.66048

Bivariate tests of independence

#basic chi square test of independence
chisq.test(table(brfss20$alcohol, brfss20$educ))
## 
##  Pearson's Chi-squared test
## 
## data:  table(brfss20$alcohol, brfss20$educ)
## X-squared = 6779.5, df = 4, p-value < 2.2e-16
#basic chi square test of independence
chisq.test(table(brfss20$alcohol, brfss20$race_eth))
## 
##  Pearson's Chi-squared test
## 
## data:  table(brfss20$alcohol, brfss20$race_eth)
## X-squared = 1916.9, df = 4, p-value < 2.2e-16

Chi-squared test show that alcohol intake is dependent on both race/ethnicity and education level.

Assuming Stratified Survey Sampling

Creating a survey design object

Identifying the survey design.

ids = PSU identifers, strata=strata identifiers, weights=sampling weights, data= the data frame where these variables are located.

library(srvyr)
## 
## Attaching package: 'srvyr'
## The following object is masked from 'package:stats':
## 
##     filter
options(survey.lonely.psu = "adjust")

des<-svydesign(ids=~1, strata=~ststr, weights=~mmsawt, data = brfss20 )
des
## Stratified Independent Sampling design (with replacement)
## svydesign(ids = ~1, strata = ~ststr, weights = ~mmsawt, data = brfss20)

Simple weighted analysis

Now , redoing the analysis from above using only weights:

Education level

#counts
cat<-wtd.table(brfss20$alcohol,
               brfss20$educ,
               weights = brfss20$mmsawt)

#proportions
100*prop.table(
  cat,
  margin=2)
##   Primary School  Some HS HS Graduate Some College College Graduate
## 0       67.73505 63.01914    54.57821     45.74783         34.73858
## 1       32.26495 36.98086    45.42179     54.25217         65.26142

Race/Ethnicity

#counts
cat<-wtd.table(brfss20$alcohol,
               brfss20$race_eth,
               weights = brfss20$mmsawt)

#proportions
100*prop.table(
  cat,
  margin=2)
##   NH-White Hispanic NH-Black NH-Multirace NH-Other
## 0 41.03215 54.10302 51.42393     46.05683 57.36187
## 1 58.96785 45.89698 48.57607     53.94317 42.63813

When the weighted means were calculated more NH White population (58.9 %) responded yes to consuming alcohol within last 30 days compared to 57 % when no weights were used to calculated proportions. Similar trend was seen among Hispanic and NH Black population, with higher proportion of population saying they consumed alcohol within last 30 days.

Similarly, the weight percentage of population saying they consumed alcohol within last 30 days is higher that unweighted percentage of population across all eduation level.

Proper survey design analysis

Using tableone

library(tableone)
#not using survey design
t1<-CreateTableOne(vars = c( "alcohol"),
                   strata="educ",
                   data = brfss20)



#t1<-print(t1, format="p")
print(t1,format="p" )
##                      Stratified by educ
##                       Primary School Some HS     HS Graduate  Some College
##   n                   3424           6648        42266        48045       
##   alcohol (mean (SD)) 0.26 (0.44)    0.35 (0.48)  0.44 (0.50)  0.52 (0.50)
##                      Stratified by educ
##                       College Graduate p      test
##   n                   79924                       
##   alcohol (mean (SD))  0.64 (0.48)     <0.001
#, strata = "alcohol", test = T,
#using survey design
st1<-svyCreateTableOne(vars = c("alcohol"),
                       strata = "educ",
                       test = T,
                       data = des)

print(st1,
      format="p")
##                      Stratified by educ
##                       Primary School    Some HS           HS Graduate       
##   n                   5730845.02        9545128.90        34486371.92       
##   alcohol (mean (SD))       0.32 (0.47)       0.37 (0.48)        0.45 (0.50)
##                      Stratified by educ
##                       Some College       College Graduate   p      test
##   n                   40779618.01        46293493.92                   
##   alcohol (mean (SD))        0.54 (0.50)        0.65 (0.48) <0.001

Race and Ethnicity

library(tableone)
#not using survey design
t1<-CreateTableOne(vars = c( "alcohol"),
                   strata="race_eth",
                   data = brfss20)



#t1<-print(t1, format="p")
print(t1,format="p" )
##                      Stratified by race_eth
##                       NH-White      Hispanic     NH-Black     NH-Multirace
##   n                   129299        18879        17241        3219        
##   alcohol (mean (SD))   0.57 (0.49)  0.45 (0.50)  0.46 (0.50) 0.54 (0.50) 
##                      Stratified by race_eth
##                       NH-Other    p      test
##   n                   8048                   
##   alcohol (mean (SD)) 0.45 (0.50) <0.001
#, strata = "alcohol", test = T,
#using survey design
st1<-svyCreateTableOne(vars = c("alcohol"),
                       strata = "race_eth",
                       test = T,
                       data = des)

print(st1,
      format="p")
##                      Stratified by race_eth
##                       NH-White           Hispanic           NH-Black          
##   n                   78099561.55        25735404.43        18421693.91       
##   alcohol (mean (SD))        0.59 (0.49)        0.46 (0.50)        0.49 (0.50)
##                      Stratified by race_eth
##                       NH-Multirace      NH-Other           p      test
##   n                   1623799.16        10287034.72                   
##   alcohol (mean (SD))       0.54 (0.50)        0.43 (0.49) <0.001

Yet another way using gtsummary package, it provides a way to summarize

and create tables from a survey data source.

library(gtsummary)

brfss20%>%
  as_survey_design( strata =ststr,
                    weights = mmsawt)%>%
  select(alcohol, educ)%>%
  tbl_svysummary(by = educ, 
              label = list(alcohol = "Alcohol Use"))%>%
  add_p()%>%
  add_n()
Characteristic N Primary School, N = 5,730,8451 Some HS, N = 9,545,1291 HS Graduate, N = 34,486,3721 Some College, N = 40,779,6181 College Graduate, N = 46,293,4941 p-value2
Alcohol Use 136,835,458 1,849,054 (32%) 3,529,871 (37%) 15,664,328 (45%) 22,123,827 (54%) 30,211,792 (65%) <0.001

1 n (%)

2 chi-squared test with Rao & Scott's second-order correction

Race and Ethnicity

library(gtsummary)

brfss20%>%
  as_survey_design( strata =ststr,
                    weights = mmsawt)%>%
  select(alcohol, race_eth)%>%
  tbl_svysummary(by = race_eth, 
              label = list(alcohol = "Alcohol Use"))%>%
  add_p()%>%
  add_n()
## 3621 observations missing `race_eth` have been removed. To include these observations, use `forcats::fct_explicit_na()` on `race_eth` column before passing to `tbl_svysummary()`.
Characteristic N NH-White, N = 78,099,5621 Hispanic, N = 25,735,4041 NH-Black, N = 18,421,6941 NH-Multirace, N = 1,623,7991 NH-Other, N = 10,287,0351 p-value2
Alcohol Use 134,167,494 46,053,634 (59%) 11,811,774 (46%) 8,948,535 (49%) 875,929 (54%) 4,386,199 (43%) <0.001

1 n (%)

2 chi-squared test with Rao & Scott's second-order correction

Question d c) Are there substantive differences in the descriptive results between the analysis using survey design and that not using survey design?

The results show that in general for both education level and race/ethnicity (NH White, Hispanic, NH Black) analysis using survey design produced mean of alcohol consumption that is higher than analysis not using survey design. Furthermore, the test statistics indicate that the differences in alcohol consumption in terms of race/ethnicity and education level are statistically significant (p<0.001).