DEM 7283 - HW2

library(car)

## Loading required package: carData

library(stargazer)

## 
## Please cite as:

##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.

##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer

library(survey)

## Loading required package: grid

## Loading required package: Matrix

## Loading required package: survival

## 
## Attaching package: 'survey'

## The following object is masked from 'package:graphics':
## 
##     dotchart

library(questionr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:car':
## 
##     recode

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

load(url("https://github.com/coreysparks/data/blob/master/brfss_2017.Rdata?raw=true"))

Fix variable names

#The names in the data are very ugly, so I make them less ugly
nams<-names(brfss_17)
head(nams, n=10)

##  [1] "dispcode" "statere1" "safetime" "hhadult"  "genhlth"  "physhlth"
##  [7] "menthlth" "poorhlth" "hlthpln1" "persdoc2"

#we see some names are lower case, some are upper and some have a little _ in the first position. This is a nightmare.
newnames<-tolower(gsub(pattern = "_",replacement =  "",x =  nams))
names(brfss_17)<-newnames

1. Define a binary outcome variable of your choosing and define how you recode the original variable.

Depression - (Ever told) you have a depressive disorder (including depression, major depression, dysthymia, or minor depression)?

2. State a research question about what factors you believe will affect your outcome variable.

What relationship do veteran status and labor participation have with depression?

3. Define at least 2 predictor variables, based on your research question. For this assignment, it’s best if these are categorical variables.

EMPLOY1 - Employment Status; VETERAN3 - Are You A Veteran

Recoding of variables

#depression
brfss_17$depr<-Recode(brfss_17$addepev2, recodes ="7:9=NA; 1=1;2=0")

#veteran
#brfss_17$vet<-Recode(brfss_17$veteran3, recodes ="7:9=NA; 1=1;2=0", as.factor=T)

brfss_17$vet<-Recode(brfss_17$veteran3, recodes="1='Veteran'; 2='NV'; else=NA", as.factor=T)

#employment
brfss_17$employ<-Recode(brfss_17$employ1, recodes="1:2='Employed'; 3:6='nilf'; 7='retired'; 8='unable'; else=NA", as.factor=T)
brfss_17$employ<-relevel(brfss_17$employ, ref='Employed')

Filter cases

brfss_17<-brfss_17%>%
  filter(sex!=9, is.na(vet)==F,is.na(employ)==F)

4. Perform a descriptive analysis of the outcome variable by each of the variables you defined in part b. (e.g. 2 x 2 table, 2 x k table). Follow a similar approach to presenting your statistics as presented in Sparks 2009 (in the Google drive). This can be done easily using the tableone package!

#First we will do some tables
#Raw frequencies
table(brfss_17$depr, brfss_17$vet )

##    
##         NV Veteran
##   0 158907   23743
##   1  39937    4691

table(brfss_17$depr, brfss_17$employ)

##    
##     Employed  nilf retired unable
##   0    98732 22363   54988   6567
##   1    19175  6892   10783   7778

#column percentages
prop.table(table(brfss_17$depr, brfss_17$vet  ), margin=2)

##    
##            NV   Veteran
##   0 0.7991541 0.8350215
##   1 0.2008459 0.1649785

prop.table(table(brfss_17$depr, brfss_17$employ  ), margin=2)

##    
##      Employed      nilf   retired    unable
##   0 0.8373718 0.7644163 0.8360524 0.4577902
##   1 0.1626282 0.2355837 0.1639476 0.5422098

#basic chi square test of independence
chisq.test(table(brfss_17$depr, brfss_17$vet))

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(brfss_17$depr, brfss_17$vet)
## X-squared = 202.58, df = 1, p-value < 2.2e-16

chisq.test(table(brfss_17$depr, brfss_17$employ))

## 
##  Pearson's Chi-squared test
## 
## data:  table(brfss_17$depr, brfss_17$employ)
## X-squared = 12447, df = 3, p-value < 2.2e-16

Create survey design object

brfss_17$tx<-NA
brfss_17$tx[grep(pattern = "TX", brfss_17$mmsaname)]<-1

brfss_17<-brfss_17%>%
  filter(tx==1, is.na(mmsawt)==F, is.na(depr)==F)

#
options(survey.lonely.psu = "adjust")

des<-svydesign(ids=~1, strata=~ststr, weights=~mmsawt, data = brfss_17 )

4.1 Calculate descriptive statistics (mean or percentages) for each variable using no weights or survey design, as well as with full survey design and weights.

sv.table<-svyby(formula = ~depr, by = ~vet, design = des, FUN = svymean, na.rm=T)


knitr::kable(sv.table,
      caption = "Survey Estimates of Depression by Veteran Status",
      align = 'c',  
      format = "html")

Survey Estimates of Depression by Veteran Status
	vet	depr	se
NV	NV	0.1605685	0.0091099
Veteran	Veteran	0.2085911	0.0300425

#Make a survey design that is random sampling - no survey information
nodes<-svydesign(ids = ~1,  weights = ~1, data = brfss_17)

sv.table<-svyby(formula = ~depr, by = ~vet, design = nodes, FUN = svymean, na.rm=T)


knitr::kable(sv.table,
      caption = "Estimates of Depression by Veteran Status - No survey design",
      align = 'c',  
      format = "html")

Estimates of Depression by Veteran Status - No survey design
	vet	depr	se
NV	NV	0.1804511	0.0044966
Veteran	Veteran	0.1565957	0.0106027

sv.table<-svyby(formula = ~depr, by = ~employ, design = des, FUN = svymean, na.rm=T)


knitr::kable(sv.table,
      caption = "Survey Estimates of Depression by Employment Status",
      align = 'c',  
      format = "html")

Survey Estimates of Depression by Employment Status
	employ	depr	se
Employed	Employed	0.1191093	0.0086004
nilf	nilf	0.1788280	0.0239301
retired	retired	0.1509674	0.0184346
unable	unable	0.6205544	0.0469430

#Make a survey design that is random sampling - no survey information
nodes<-svydesign(ids = ~1,  weights = ~1, data = brfss_17)

sv.table<-svyby(formula = ~depr, by = ~employ, design = nodes, FUN = svymean, na.rm=T)


knitr::kable(sv.table,
      caption = "Estimates of Depression by Employment Status - No survey design",
      align = 'c',  
      format = "html")

Estimates of Depression by Employment Status - No survey design
	employ	depr	se
Employed	Employed	0.1394644	0.0055069
nilf	nilf	0.1841693	0.0108520
retired	retired	0.1582840	0.0070198
unable	unable	0.5235507	0.0212590

4.2 Calculate percentages, or means, for each of your independent variables for each level of your outcome variable and present this in a table, with appropriate survey-corrected test statistics. (tableone package helps)

library(tableone)
#not using survey design
t1<-CreateTableOne(vars = c("employ", "vet"), strata = "depr", test = T, data = brfss_17)
#t1<-print(t1, format="p")
print(t1,format="p")

##                    Stratified by depr
##                     0    1    p      test
##   n                 6986 1504            
##   employ (%)                  <0.001     
##      Employed       48.8 36.7            
##      nilf           14.9 15.6            
##      retired        32.6 28.5            
##      unable          3.8 19.2            
##   vet = Veteran (%) 14.2 12.2  0.052

#using survey design
st1<-svyCreateTableOne(vars = c("employ", "vet"), strata = "depr", test = T, data = des)
#st1<-print(st1, format="p")
print(st1, format="p")

##                    Stratified by depr
##                     0          1         p      test
##   n                 12597218.4 2498940.0            
##   employ (%)                             <0.001     
##      Employed             61.4      41.8            
##      nilf                 21.6      23.7            
##      retired              14.4      12.9            
##      unable                2.6      21.5            
##   vet = Veteran (%)        9.8      13.0  0.097

4.3 Are there substantive differences in the descriptive results between the analysis using survey design and that not using survey design?
Yes, the percent estimates are differet for for both independent variables across all levels and when looking at depression by veteran status, the survey design results show a greater proportion of veterans report depression whereas the non-survey design results show the opposite.

DEM 7283 - HW2

David Rodriguez

08 February, 2021

Fix variable names

1. Define a binary outcome variable of your choosing and define how you recode the original variable.

2. State a research question about what factors you believe will affect your outcome variable.

3. Define at least 2 predictor variables, based on your research question. For this assignment, it’s best if these are categorical variables.

Recoding of variables

Filter cases

4. Perform a descriptive analysis of the outcome variable by each of the variables you defined in part b. (e.g. 2 x 2 table, 2 x k table). Follow a similar approach to presenting your statistics as presented in Sparks 2009 (in the Google drive). This can be done easily using the tableone package!

Create survey design object

4.1 Calculate descriptive statistics (mean or percentages) for each variable using no weights or survey design, as well as with full survey design and weights.

4.2 Calculate percentages, or means, for each of your independent variables for each level of your outcome variable and present this in a table, with appropriate survey-corrected test statistics. (tableone package helps)

4.3 Are there substantive differences in the descriptive results between the analysis using survey design and that not using survey design?