#load brfss
library(car)
## Loading required package: carData
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
library(questionr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
load(url("https://github.com/coreysparks/data/blob/master/brfss_2017.Rdata?raw=true"))
Fix variable names
nams<-names(brfss_17)
head(nams, n=10)
## [1] "dispcode" "statere1" "safetime" "hhadult" "genhlth" "physhlth"
## [7] "menthlth" "poorhlth" "hlthpln1" "persdoc2"
newnames<-tolower(gsub(pattern = "_",replacement = "",x = nams))
names(brfss_17)<-newnames
Select Texas as the design object
brfss_17$tx<-NA
brfss_17$tx[grep(pattern = "TX", brfss_17$mmsaname)]<-1
filtering the outcome variable
brfss_17<-brfss_17%>%
filter(tx==1, is.na(mmsawt)==F, is.na(medcost)==F)
1. Define a binary outcome variable of your choosing and define how you recode the original variable.
I am using medcost as the binary variable for homework 2. The question is: Was there a time in the past 12 months when you needed to see a doctor but could not because of cost?
Recoding of original variable
brfss_17$medcost<-Recode(brfss_17$medcost, recodes ="7:9=NA; 1=1;2=0")
View(brfss_17)
2. State a research question about what factors you believe will affect your outcome variable.
How does medical cost affact people’s ability to visit a doctor in the past 12 months?
3. Define at least 2 predictor variables, based on your research question. For this assignment, it’s best if these are categorical variables.
My predictor variables are incomeg and educag and these two variables affacts an individual’s propensity to visit a doctor. These variables are recoded as follows.
Recode educational level
#education level
brfss_17$educ<-Recode(brfss_17$educa, recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA", as.factor=T)
brfss_17$educ<-relevel(brfss_17$educ, ref='2hsgrad')
Recode Income group
brfss_17$inc<-ifelse(brfss_17$incomg==9, NA, brfss_17$incomg)
table(brfss_17$inc)
##
## 1 2 3 4 5
## 783 1190 769 953 3679
#not using survey design
t1<-CreateTableOne(vars = c("educ", "incomg"), strata = "medcost", test = T, data = brfss_17)
#t1<-print(t1, format="p")
print(t1,format="p")
## Stratified by medcost
## 0 1 p test
## n 7454 1156
## educ (%) <0.001
## 2hsgrad 21.4 28.5
## 0Prim 3.9 7.4
## 1somehs 4.8 10.3
## 3somecol 25.5 28.6
## 4colgrad 44.5 25.2
## incomg (mean (SD)) 4.66 (2.24) 3.61 (2.44) <0.001
print(t1)
## Stratified by medcost
## 0 1 p test
## n 7454 1156
## educ (%) <0.001
## 2hsgrad 1586 (21.4) 329 (28.5)
## 0Prim 287 ( 3.9) 85 ( 7.4)
## 1somehs 357 ( 4.8) 119 (10.3)
## 3somecol 1890 (25.5) 330 (28.6)
## 4colgrad 3297 (44.5) 290 (25.2)
## incomg (mean (SD)) 4.66 (2.24) 3.61 (2.44) <0.001
4.1 Calculate descriptive statistics (mean or percentages) for each variable using no weights or survey design, as well as with full survey design and weights.
Educ variable - Raw frequencies - no weights
table(brfss_17$medcost, brfss_17$educ)
##
## 2hsgrad 0Prim 1somehs 3somecol 4colgrad
## 0 1586 287 357 1890 3297
## 1 329 85 119 330 290
Educ variable - Raw frequencies - with weights
wad<-wtd.table(brfss_17$medcost, brfss_17$educ, weights = brfss_17$mmsawt)
print(wad)
## 2hsgrad 0Prim 1somehs 3somecol 4colgrad
## 0 2953938.2 949873.4 866181.6 3874500.8 3858732.2
## 1 842522.2 324965.0 373120.8 872696.0 403573.7
#column percentages - no weights
prop.table(table(brfss_17$medcost, brfss_17$educ), margin=2)
##
## 2hsgrad 0Prim 1somehs 3somecol 4colgrad
## 0 0.8281984 0.7715054 0.7500000 0.8513514 0.9191525
## 1 0.1718016 0.2284946 0.2500000 0.1486486 0.0808475
Percentages - weights
prop.table(wtd.table(brfss_17$medcost, brfss_17$educ, weights = brfss_17$mmsawt), margin = 2)
## 2hsgrad 0Prim 1somehs 3somecol 4colgrad
## 0 0.77807691 0.74509317 0.69892673 0.81616603 0.90531564
## 1 0.22192309 0.25490683 0.30107327 0.18383397 0.09468436
basic chi square test of independence
chisq.test(table(brfss_17$medcost, brfss_17$educ))
##
## Pearson's Chi-squared test
##
## data: table(brfss_17$medcost, brfss_17$educ)
## X-squared = 198.14, df = 4, p-value < 2.2e-16
Incomg variable - raw frequencies - no weights
table(brfss_17$medcost, brfss_17$incomg)
##
## 1 2 3 4 5 9
## 0 552 902 634 824 3428 1114
## 1 229 286 132 125 246 138
Frequencies with weights
nat<-wtd.table(brfss_17$medcost, brfss_17$incomg, weights = brfss_17$mmsawt)
print(nat)
## 1 2 3 4 5 9
## 0 1023008.9 1714506.2 897777.1 1371338.7 5757999.9 1784342.9
## 1 526778.1 722580.6 375231.6 336149.6 493196.0 369202.9
Column Percentages - no weights
prop.table(table(brfss_17$medcost, brfss_17$incomg), margin=2)
##
## 1 2 3 4 5 9
## 0 0.7067862 0.7592593 0.8276762 0.8682824 0.9330430 0.8897764
## 1 0.2932138 0.2407407 0.1723238 0.1317176 0.0669570 0.1102236
Percentages - weights
prop.table(wtd.table(brfss_17$medcost, brfss_17$incomg, weights = brfss_17$mmsawt), margin = 2)
## 1 2 3 4 5 9
## 0 0.66009645 0.70350641 0.70524036 0.80313211 0.92110374 0.82856046
## 1 0.33990355 0.29649359 0.29475964 0.19686789 0.07889626 0.17143954
basic chi square test of independence
chisq.test(table(brfss_17$medcost, brfss_17$incomg))
##
## Pearson's Chi-squared test
##
## data: table(brfss_17$medcost, brfss_17$incomg)
## X-squared = 444.65, df = 5, p-value < 2.2e-16
4.2 Calculate percentages, or means, for each of your independent variables for each level of your outcome variable and present this in a table, with appropriate survey-corrected test statistics. (tableone package helps)
not using survey design
t1<-CreateTableOne(vars = c("educ", "incomg"), strata = "medcost", test = T, data = brfss_17)
#t1<-print(t1, format="p")
print(t1,format="p")
## Stratified by medcost
## 0 1 p test
## n 7454 1156
## educ (%) <0.001
## 2hsgrad 21.4 28.5
## 0Prim 3.9 7.4
## 1somehs 4.8 10.3
## 3somecol 25.5 28.6
## 4colgrad 44.5 25.2
## incomg (mean (SD)) 4.66 (2.24) 3.61 (2.44) <0.001
using survey design
des<-svydesign(ids=~1, strata=~ststr, weights=~mmsawt, data = brfss_17 )
options(survey.lonely.psu = "adjust")
st1<-svyCreateTableOne(vars = c("educ", "incomg"), strata = "medcost", test = T, data = des)
#st1<-print(st1, format="p")
print(st1, format="p")
## Stratified by medcost
## 0 1 p test
## n 12548973.9 2823138.8
## educ (%) <0.001
## 2hsgrad 23.6 29.9
## 0Prim 7.6 11.5
## 1somehs 6.9 13.2
## 3somecol 31.0 31.0
## 4colgrad 30.9 14.3
## incomg (mean (SD)) 4.58 (2.24) 3.62 (2.48) <0.001
4.3 Are there substantive differences in the descriptive results between the analysis using survey design and that not using survey design?
Yes, in most cases, there were significant differences in the descriptive results using survey design and not using survey design. The differences were evident with the education variable. For instance, whiles 44.5% of people college graduate noted that due to cost, they had not visited a doctor, and 25.2% answered in the affirmative of seeing a doctor in the last 12 months when not using survey data. The results were 30.9% for people that did not visit a doctor and 25.2% for people that visited a doctor in the last 12 months when the survey design technique was used. I also noticed a significant difference in the respondents’ total number when I use each method for the descriptive analysis.