Uploading the dataset

library(car)
## Loading required package: carData
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(questionr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(forcats)
brfss2020<- readRDS(url("https://github.com/coreysparks/DEM7283/blob/master/data/brfss20sm.rds?raw=true"))


names(brfss2020) <- tolower(gsub(pattern = "_",replacement =  "",x =  names(brfss2020)))

1. Define a binary outcome variable of your choosing and define how you recode the original variable.

I have chosen variable genhealth for question: Would you say in general your health is with possible responses of: 1-excellent, 2-very good, 3-good, 4-fair, 5-poor 7-Don’t know/not sure, and 9-refused. I recoded it to group responses 1 through 3 together, 4 and 5 together, and everything else as NA.

brfss2020$badhealth<-Recode(brfss2020$genhlth, recodes="4:5=1; 1:3=0; else=NA")

2. State a research question about what factors you believe will affect your outcome variable.

Is general health better among women or men and does having an annual check up or health coverage contribute to this as well?

3. Define at least 2 predictor variables, based on your research question. For this assignment, it’s best if these are categorical variables.

The first predictor variable I chose is sex indicating the sex of the individual with responses of 1-male, 2-female, and 9-refused. I recoded it so that those who refused are eliminated.

The second variable I chose is hlthpln1 for question:Do you have any kind of health care coverage, including health insurance, prepaid plans such as HMOs, orgovernment plans such as Medicare, or Indian Health Service? with possible responses of: 1-yes, 2-no, 3-Don’t know/not sure, and 9-refused. I recoded it to eliminate the “refused” and the “I dont know/not sure”.

Lastly, I have chosen variable checkup1 for question:About how long has it been since you last visited a doctor for a routine checkup? with possible responses of: 1-within the last year, 2-within the last 2 years, 3-within the last 5 years, 4-5 years or more ago, 7-Don’t know/not sure, 8-never and 9-refused. I recoded it group responses 1 and 2 together, 3 and 4 together, 5 as its own category and eliminated the “refused” and the “I dont know/not sure”.

brfss2020$male<-as.factor(ifelse(brfss2020$sex==1, "Male", "Female"))

brfss2020$hlthpln1<-Recode(brfss2020$hlthpln1, recodes="1='hp' ; 2='nohp'; else=NA")


brfss2020$checkup1<-Recode(brfss2020$checkup1,
                     recodes="1:2='0last2yrs'; 3:4='1last5yrs'; 8='2never'; 7=NA; 9=NA",
                     as.factor=T)
brfss2020$checkup1<-fct_relevel(brfss2020$checkup1,'0last2yrs','1last5yrs','2never') 
brfss2020<-brfss2020%>%
  filter(sex!=9,
         is.na(hlthpln1)==F,
         is.na(checkup1)==F, 
         is.na(badhealth)==F)

4. Perform a descriptive analysis of the outcome variable by each of the variables you defined in part b. (e.g. 2 x 2 table, 2 x k table).

table(brfss2020$badhealth, brfss2020$male)
##    
##     Female  Male
##   0  88341 77312
##   1  14991 11092
table(brfss2020$badhealth, brfss2020$hlthpln1)
##    
##         hp   nohp
##   0 152431  13222
##   1  23556   2527
table(brfss2020$badhealth, brfss2020$checkup1)
##    
##     0last2yrs 1last5yrs 2never
##   0    148736     16132    785
##   1     24161      1789    133

1. Calculate descriptive statistics (mean or percentages) for each variable using no weights or survey design, as well as with full survey design and weights.

No weight

100*prop.table(table(brfss2020$badhealth, brfss2020$male), margin=2)
##    
##       Female     Male
##   0 85.49239 87.45306
##   1 14.50761 12.54694
100*prop.table(table(brfss2020$badhealth, brfss2020$hlthpln1), margin=2)
##    
##           hp     nohp
##   0 86.61492 83.95454
##   1 13.38508 16.04546
100*prop.table(table(brfss2020$badhealth, brfss2020$checkup1), margin=2)
##    
##     0last2yrs 1last5yrs    2never
##   0 86.025784 90.017298 85.511983
##   1 13.974216  9.982702 14.488017

With weight

cat<-wtd.table(brfss2020$badhealth,
               brfss2020$male,
               weights = brfss2020$mmsawt)

100*prop.table(cat, margin = 2)
##     Female     Male
## 0 85.90853 87.63072
## 1 14.09147 12.36928
cat2<-wtd.table(brfss2020$badhealth,
               brfss2020$hlthpln1,
               weights = brfss2020$mmsawt)

100*prop.table(cat2, margin = 2)
##         hp     nohp
## 0 87.14108 83.83737
## 1 12.85892 16.16263
cat3<-wtd.table(brfss2020$badhealth,
               brfss2020$checkup1,
               weights = brfss2020$mmsawt)

100*prop.table(cat3, margin = 2)
##   0last2yrs 1last5yrs   2never
## 0  86.48424  88.98423 84.15421
## 1  13.51576  11.01577 15.84579

2. Calculate percentages, or means, for each of your independent variables for each level of your outcome variable and present this in a table, with appropriate survey-corrected test statistics.

No weight

library(tableone)

t1<-CreateTableOne(vars = c( "badhealth"),
                   strata="male",
                   data = brfss2020)

print(t1,format="p" )
##                        Stratified by male
##                         Female        Male         p      test
##   n                     103332        88404                   
##   badhealth (mean (SD))   0.15 (0.35)  0.13 (0.33) <0.001
t2<-CreateTableOne(vars = c( "badhealth"),
                   strata="hlthpln1",
                   data = brfss2020)

print(t2,format="p" )
##                        Stratified by hlthpln1
##                         hp            nohp         p      test
##   n                     175987        15749                   
##   badhealth (mean (SD))   0.13 (0.34)  0.16 (0.37) <0.001
t3<-CreateTableOne(vars = c( "badhealth"),
                   strata="checkup1",
                   data = brfss2020)

print(t3,format="p" )
##                        Stratified by checkup1
##                         0last2yrs     1last5yrs    2never      p      test
##   n                     172897        17921         918                   
##   badhealth (mean (SD))   0.14 (0.35)  0.10 (0.30) 0.14 (0.35) <0.001

With Weight

library(srvyr)
## 
## Attaching package: 'srvyr'
## The following object is masked from 'package:stats':
## 
##     filter
options(survey.lonely.psu = "adjust")

des<-svydesign(ids=~1, strata=~ststr, weights=~mmsawt, data = brfss2020 )
des
## Stratified Independent Sampling design (with replacement)
## svydesign(ids = ~1, strata = ~ststr, weights = ~mmsawt, data = brfss2020)
st1<-svyCreateTableOne(vars = c("badhealth"),
                       strata = "male",
                       test = T,
                       data = des)

print(st1,
      format="p")
##                        Stratified by male
##                         Female             Male               p      test
##   n                     76432959.80        71003927.46                   
##   badhealth (mean (SD))        0.14 (0.35)        0.12 (0.33) <0.001
st2<-svyCreateTableOne(vars = c("badhealth"),
                       strata = "hlthpln1",
                       test = T,
                       data = des)

print(st2,
      format="p")
##                        Stratified by hlthpln1
##                         hp                  nohp               p      test
##   n                     129444470.73        17992416.53                   
##   badhealth (mean (SD))         0.13 (0.33)        0.16 (0.37) <0.001
st3<-svyCreateTableOne(vars = c("badhealth"),
                       strata = "checkup1",
                       test = T,
                       data = des)

print(st3,
      format="p")
##                        Stratified by checkup1
##                         0last2yrs           1last5yrs          2never          
##   n                     130756385.37        15790212.77        890289.12       
##   badhealth (mean (SD))         0.14 (0.34)        0.11 (0.31)      0.16 (0.37)
##                        Stratified by checkup1
##                         p      test
##   n                                
##   badhealth (mean (SD)) <0.001

3. Are there substantive differences in the descriptive results between the analysis using survey design and that not using survey design?

There is a difference in the averages for both women and men regarding general health by .01 between the means with no weight and the means with weight.

There is no difference in the health means accounting for health plan coverage and no health coverage.

Health averages for those who have had check ups in the last 5 years or never did have higher averages than the ones without weights with a .01 and .02 difference respectively.

There is a small difference in the descriptive results. However, it is enough of a difference for me to ensure weights and survey design are always included in analysis because it can make the difference in what your final rsults reflect.