library(car)
## Loading required package: carData
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
library(questionr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
load(url("https://github.com/coreysparks/data/blob/master/brfss_2017.Rdata?raw=true"))
Fix variable names
#The names in the data are very ugly, so I make them less ugly
nams<-names(brfss_17)
head(nams, n=10)
## [1] "dispcode" "statere1" "safetime" "hhadult" "genhlth" "physhlth"
## [7] "menthlth" "poorhlth" "hlthpln1" "persdoc2"
#we see some names are lower case, some are upper and some have a little _ in the first position. This is a nightmare.
newnames<-tolower(gsub(pattern = "_",replacement = "",x = nams))
names(brfss_17)<-newnames
1. Define a binary outcome variable of your choosing and define how you recode the original variable.
Depression - (Ever told) you have a depressive disorder (including depression, major depression, dysthymia, or minor depression)?
2. State a research question about what factors you believe will affect your outcome variable.
What relationship do veteran status and labor participation have with depression?
3. Define at least 2 predictor variables, based on your research question. For this assignment, it’s best if these are categorical variables.
EMPLOY1 - Employment Status; VETERAN3 - Are You A Veteran
Recoding of variables
#depression
brfss_17$depr<-Recode(brfss_17$addepev2, recodes ="7:9=NA; 1=1;2=0")
#veteran
#brfss_17$vet<-Recode(brfss_17$veteran3, recodes ="7:9=NA; 1=1;2=0", as.factor=T)
brfss_17$vet<-Recode(brfss_17$veteran3, recodes="1='Veteran'; 2='NV'; else=NA", as.factor=T)
#employment
brfss_17$employ<-Recode(brfss_17$employ1, recodes="1:2='Employed'; 3:6='nilf'; 7='retired'; 8='unable'; else=NA", as.factor=T)
brfss_17$employ<-relevel(brfss_17$employ, ref='Employed')
Filter cases
brfss_17<-brfss_17%>%
filter(sex!=9, is.na(vet)==F,is.na(employ)==F)
4. Perform a descriptive analysis of the outcome variable by each of the variables you defined in part b. (e.g. 2 x 2 table, 2 x k table). Follow a similar approach to presenting your statistics as presented in Sparks 2009 (in the Google drive). This can be done easily using the tableone package!
#First we will do some tables
#Raw frequencies
table(brfss_17$depr, brfss_17$vet )
##
## NV Veteran
## 0 158907 23743
## 1 39937 4691
table(brfss_17$depr, brfss_17$employ)
##
## Employed nilf retired unable
## 0 98732 22363 54988 6567
## 1 19175 6892 10783 7778
#column percentages
prop.table(table(brfss_17$depr, brfss_17$vet ), margin=2)
##
## NV Veteran
## 0 0.7991541 0.8350215
## 1 0.2008459 0.1649785
prop.table(table(brfss_17$depr, brfss_17$employ ), margin=2)
##
## Employed nilf retired unable
## 0 0.8373718 0.7644163 0.8360524 0.4577902
## 1 0.1626282 0.2355837 0.1639476 0.5422098
#basic chi square test of independence
chisq.test(table(brfss_17$depr, brfss_17$vet))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(brfss_17$depr, brfss_17$vet)
## X-squared = 202.58, df = 1, p-value < 2.2e-16
chisq.test(table(brfss_17$depr, brfss_17$employ))
##
## Pearson's Chi-squared test
##
## data: table(brfss_17$depr, brfss_17$employ)
## X-squared = 12447, df = 3, p-value < 2.2e-16
Create survey design object
brfss_17$tx<-NA
brfss_17$tx[grep(pattern = "TX", brfss_17$mmsaname)]<-1
brfss_17<-brfss_17%>%
filter(tx==1, is.na(mmsawt)==F, is.na(depr)==F)
#
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~1, strata=~ststr, weights=~mmsawt, data = brfss_17 )
4.1 Calculate descriptive statistics (mean or percentages) for each variable using no weights or survey design, as well as with full survey design and weights.
sv.table<-svyby(formula = ~depr, by = ~vet, design = des, FUN = svymean, na.rm=T)
knitr::kable(sv.table,
caption = "Survey Estimates of Depression by Veteran Status",
align = 'c',
format = "html")
Survey Estimates of Depression by Veteran Status
|
|
vet
|
depr
|
se
|
|
NV
|
NV
|
0.1605685
|
0.0091099
|
|
Veteran
|
Veteran
|
0.2085911
|
0.0300425
|
#Make a survey design that is random sampling - no survey information
nodes<-svydesign(ids = ~1, weights = ~1, data = brfss_17)
sv.table<-svyby(formula = ~depr, by = ~vet, design = nodes, FUN = svymean, na.rm=T)
knitr::kable(sv.table,
caption = "Estimates of Depression by Veteran Status - No survey design",
align = 'c',
format = "html")
Estimates of Depression by Veteran Status - No survey design
|
|
vet
|
depr
|
se
|
|
NV
|
NV
|
0.1804511
|
0.0044966
|
|
Veteran
|
Veteran
|
0.1565957
|
0.0106027
|
sv.table<-svyby(formula = ~depr, by = ~employ, design = des, FUN = svymean, na.rm=T)
knitr::kable(sv.table,
caption = "Survey Estimates of Depression by Employment Status",
align = 'c',
format = "html")
Survey Estimates of Depression by Employment Status
|
|
employ
|
depr
|
se
|
|
Employed
|
Employed
|
0.1191093
|
0.0086004
|
|
nilf
|
nilf
|
0.1788280
|
0.0239301
|
|
retired
|
retired
|
0.1509674
|
0.0184346
|
|
unable
|
unable
|
0.6205544
|
0.0469430
|
#Make a survey design that is random sampling - no survey information
nodes<-svydesign(ids = ~1, weights = ~1, data = brfss_17)
sv.table<-svyby(formula = ~depr, by = ~employ, design = nodes, FUN = svymean, na.rm=T)
knitr::kable(sv.table,
caption = "Estimates of Depression by Employment Status - No survey design",
align = 'c',
format = "html")
Estimates of Depression by Employment Status - No survey design
|
|
employ
|
depr
|
se
|
|
Employed
|
Employed
|
0.1394644
|
0.0055069
|
|
nilf
|
nilf
|
0.1841693
|
0.0108520
|
|
retired
|
retired
|
0.1582840
|
0.0070198
|
|
unable
|
unable
|
0.5235507
|
0.0212590
|
4.2 Calculate percentages, or means, for each of your independent variables for each level of your outcome variable and present this in a table, with appropriate survey-corrected test statistics. (tableone package helps)
library(tableone)
#not using survey design
t1<-CreateTableOne(vars = c("employ", "vet"), strata = "depr", test = T, data = brfss_17)
#t1<-print(t1, format="p")
print(t1,format="p")
## Stratified by depr
## 0 1 p test
## n 6986 1504
## employ (%) <0.001
## Employed 48.8 36.7
## nilf 14.9 15.6
## retired 32.6 28.5
## unable 3.8 19.2
## vet = Veteran (%) 14.2 12.2 0.052
#using survey design
st1<-svyCreateTableOne(vars = c("employ", "vet"), strata = "depr", test = T, data = des)
#st1<-print(st1, format="p")
print(st1, format="p")
## Stratified by depr
## 0 1 p test
## n 12597218.4 2498940.0
## employ (%) <0.001
## Employed 61.4 41.8
## nilf 21.6 23.7
## retired 14.4 12.9
## unable 2.6 21.5
## vet = Veteran (%) 9.8 13.0 0.097
4.3 Are there substantive differences in the descriptive results between the analysis using survey design and that not using survey design?
Yes, the percent estimates are differet for for both independent variables across all levels and when looking at depression by veteran status, the survey design results show a greater proportion of veterans report depression whereas the non-survey design results show the opposite.