#load brfss
library(car)
## Loading required package: carData
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(questionr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
load(url("https://github.com/coreysparks/data/blob/master/brfss_2017.Rdata?raw=true"))

Fix variable names

nams<-names(brfss_17)
head(nams, n=10)
##  [1] "dispcode" "statere1" "safetime" "hhadult"  "genhlth"  "physhlth"
##  [7] "menthlth" "poorhlth" "hlthpln1" "persdoc2"
newnames<-tolower(gsub(pattern = "_",replacement =  "",x =  nams))
names(brfss_17)<-newnames

Select Texas as the design object

brfss_17$tx<-NA
brfss_17$tx[grep(pattern = "TX", brfss_17$mmsaname)]<-1

filtering the outcome variable

brfss_17<-brfss_17%>%
  filter(tx==1, is.na(mmsawt)==F, is.na(medcost)==F)

1. Define a binary outcome variable of your choosing and define how you recode the original variable.

I am using medcost as the binary variable for homework 2. The question is: Was there a time in the past 12 months when you needed to see a doctor but could not because of cost?

Recoding of original variable

brfss_17$medcost<-Recode(brfss_17$medcost, recodes ="7:9=NA; 1=1;2=0")
View(brfss_17)

2. State a research question about what factors you believe will affect your outcome variable.

How does medical cost affact people’s ability to visit a doctor in the past 12 months?

3. Define at least 2 predictor variables, based on your research question. For this assignment, it’s best if these are categorical variables.

My predictor variables are incomeg and educag and these two variables affacts an individual’s propensity to visit a doctor. These variables are recoded as follows.

Recode educational level

#education level
brfss_17$educ<-Recode(brfss_17$educa, recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA", as.factor=T)
brfss_17$educ<-relevel(brfss_17$educ, ref='2hsgrad')

Recode Income group

brfss_17$inc<-ifelse(brfss_17$incomg==9, NA, brfss_17$incomg)
table(brfss_17$inc)
## 
##    1    2    3    4    5 
##  783 1190  769  953 3679

4. Perform a descriptive analysis of the outcome variable by each of the variables you defined in part b. (e.g. 2 x 2 table, 2 x k table). Follow a similar approach to presenting your statistics as presented in Sparks 2009 (in the Google drive). This can be done easily using the tableone package!

library(tableone)

#not using survey design

t1<-CreateTableOne(vars = c("educ", "incomg"), strata = "medcost", test = T, data = brfss_17)
#t1<-print(t1, format="p")
print(t1,format="p")
##                     Stratified by medcost
##                      0           1           p      test
##   n                  7454        1156                   
##   educ (%)                                   <0.001     
##      2hsgrad         21.4        28.5                   
##      0Prim            3.9         7.4                   
##      1somehs          4.8        10.3                   
##      3somecol        25.5        28.6                   
##      4colgrad        44.5        25.2                   
##   incomg (mean (SD)) 4.66 (2.24) 3.61 (2.44) <0.001
print(t1)
##                     Stratified by medcost
##                      0            1            p      test
##   n                  7454         1156                    
##   educ (%)                                     <0.001     
##      2hsgrad         1586 (21.4)   329 (28.5)             
##      0Prim            287 ( 3.9)    85 ( 7.4)             
##      1somehs          357 ( 4.8)   119 (10.3)             
##      3somecol        1890 (25.5)   330 (28.6)             
##      4colgrad        3297 (44.5)   290 (25.2)             
##   incomg (mean (SD)) 4.66 (2.24)  3.61 (2.44)  <0.001

4.1 Calculate descriptive statistics (mean or percentages) for each variable using no weights or survey design, as well as with full survey design and weights.

Educ variable - Raw frequencies - no weights

table(brfss_17$medcost, brfss_17$educ)
##    
##     2hsgrad 0Prim 1somehs 3somecol 4colgrad
##   0    1586   287     357     1890     3297
##   1     329    85     119      330      290

Educ variable - Raw frequencies - with weights

wad<-wtd.table(brfss_17$medcost, brfss_17$educ, weights = brfss_17$mmsawt)
print(wad)
##     2hsgrad     0Prim   1somehs  3somecol  4colgrad
## 0 2953938.2  949873.4  866181.6 3874500.8 3858732.2
## 1  842522.2  324965.0  373120.8  872696.0  403573.7

#column percentages - no weights

prop.table(table(brfss_17$medcost, brfss_17$educ), margin=2)
##    
##       2hsgrad     0Prim   1somehs  3somecol  4colgrad
##   0 0.8281984 0.7715054 0.7500000 0.8513514 0.9191525
##   1 0.1718016 0.2284946 0.2500000 0.1486486 0.0808475

Percentages - weights

prop.table(wtd.table(brfss_17$medcost, brfss_17$educ, weights = brfss_17$mmsawt), margin = 2)
##      2hsgrad      0Prim    1somehs   3somecol   4colgrad
## 0 0.77807691 0.74509317 0.69892673 0.81616603 0.90531564
## 1 0.22192309 0.25490683 0.30107327 0.18383397 0.09468436

basic chi square test of independence

chisq.test(table(brfss_17$medcost, brfss_17$educ))
## 
##  Pearson's Chi-squared test
## 
## data:  table(brfss_17$medcost, brfss_17$educ)
## X-squared = 198.14, df = 4, p-value < 2.2e-16

Incomg variable - raw frequencies - no weights

table(brfss_17$medcost, brfss_17$incomg)
##    
##        1    2    3    4    5    9
##   0  552  902  634  824 3428 1114
##   1  229  286  132  125  246  138

Frequencies with weights

nat<-wtd.table(brfss_17$medcost, brfss_17$incomg, weights = brfss_17$mmsawt)
print(nat)
##           1         2         3         4         5         9
## 0 1023008.9 1714506.2  897777.1 1371338.7 5757999.9 1784342.9
## 1  526778.1  722580.6  375231.6  336149.6  493196.0  369202.9

Column Percentages - no weights

prop.table(table(brfss_17$medcost, brfss_17$incomg), margin=2)
##    
##             1         2         3         4         5         9
##   0 0.7067862 0.7592593 0.8276762 0.8682824 0.9330430 0.8897764
##   1 0.2932138 0.2407407 0.1723238 0.1317176 0.0669570 0.1102236

Percentages - weights

prop.table(wtd.table(brfss_17$medcost, brfss_17$incomg, weights = brfss_17$mmsawt), margin = 2)
##            1          2          3          4          5          9
## 0 0.66009645 0.70350641 0.70524036 0.80313211 0.92110374 0.82856046
## 1 0.33990355 0.29649359 0.29475964 0.19686789 0.07889626 0.17143954

basic chi square test of independence

chisq.test(table(brfss_17$medcost, brfss_17$incomg))
## 
##  Pearson's Chi-squared test
## 
## data:  table(brfss_17$medcost, brfss_17$incomg)
## X-squared = 444.65, df = 5, p-value < 2.2e-16

4.2 Calculate percentages, or means, for each of your independent variables for each level of your outcome variable and present this in a table, with appropriate survey-corrected test statistics. (tableone package helps)

not using survey design

t1<-CreateTableOne(vars = c("educ", "incomg"), strata = "medcost", test = T, data = brfss_17)
#t1<-print(t1, format="p")
print(t1,format="p")
##                     Stratified by medcost
##                      0           1           p      test
##   n                  7454        1156                   
##   educ (%)                                   <0.001     
##      2hsgrad         21.4        28.5                   
##      0Prim            3.9         7.4                   
##      1somehs          4.8        10.3                   
##      3somecol        25.5        28.6                   
##      4colgrad        44.5        25.2                   
##   incomg (mean (SD)) 4.66 (2.24) 3.61 (2.44) <0.001

using survey design

des<-svydesign(ids=~1, strata=~ststr, weights=~mmsawt, data = brfss_17 )
options(survey.lonely.psu = "adjust")
st1<-svyCreateTableOne(vars = c("educ", "incomg"), strata = "medcost", test = T, data = des)
#st1<-print(st1, format="p")
print(st1, format="p")
##                     Stratified by medcost
##                      0                  1                 p      test
##   n                   12548973.9         2823138.8                   
##   educ (%)                                                <0.001     
##      2hsgrad                23.6              29.9                   
##      0Prim                   7.6              11.5                   
##      1somehs                 6.9              13.2                   
##      3somecol               31.0              31.0                   
##      4colgrad               30.9              14.3                   
##   incomg (mean (SD))        4.58 (2.24)       3.62 (2.48) <0.001

4.3 Are there substantive differences in the descriptive results between the analysis using survey design and that not using survey design?

Yes, in most cases, there were significant differences in the descriptive results using survey design and not using survey design. The differences were evident with the education variable. For instance, whiles 44.5% of people college graduate noted that due to cost, they had not visited a doctor, and 25.2% answered in the affirmative of seeing a doctor in the last 12 months when not using survey data. The results were 30.9% for people that did not visit a doctor and 25.2% for people that visited a doctor in the last 12 months when the survey design technique was used. I also noticed a significant difference in the respondents’ total number when I use each method for the descriptive analysis.