1. Define a binary outcome variable of your choosing and define how you recode the original variable.

Proposed topic: Advserse health conditions associated with opportunity youths

Outcome variabels: Health conditions such Health status, depression, anxiety, smoking etc. The purpose of this assignment I would focus on self rated health status of opportunity youths

2 State a research question about what factors you believe will affect your outcome variable.

Qst: What socio-demographic factors are associated with opportunity youths?. Facors that might affect the outcome variable include gender, country of birth(Us born or not), racial-ethnicity, etc.

Define at least 2 predictor variables, based on your research question

– Racial ethnicity – Opportunity youth status

library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(questionr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(forcats)
library(ipumsr)
ddi <- read_ipums_ddi("nhis_00002.xml")
data <- read_ipums_micro(ddi)
## Use of data from IPUMS NHIS is subject to conditions including that users
## should cite the data appropriately. Use command `ipums_conditions()` for more
## details.
names(data) <- tolower(gsub(pattern = "_",replacement =  "",x =  names(data)))
#sex
data$male<-as.factor(ifelse(data$sex==1, "Male", "Female"))


#race/ethnicity
data$black<- car::Recode(data$hisprace,
                       recodes="03=1; 99=NA; else=0")

data$white<- car::Recode(data$hisprace,
                       recodes="02=1; 99=NA; else=0")

data$other<- car::Recode(data$hisprace,
                      recodes="4:7=1; 99=NA; else=0")

data$hispanic<- car::Recode(data$hisprace,
                       recodes="01=1; 99=NA; else=0")

data$hisprace<- as.factor(data$hisprace)

data$race_eth<-car::Recode(data$hisprace,
recodes="01='hispanic'; 02='nhwhite'; 03='nh black';04:06='nh other'; 07='nh multirace'; else=NA",
as.factor = T)
data$race_eth<-relevel(data$race_eth,
                          ref = "nhwhite")


#education level

data$educ<- as.factor(data$educ)

data$educ<- car::Recode(data$educ,
                     recodes="102:103='0Prim'; 116='1somehs'; 201:202='2hsgrad'; 301:303='3somecol'; 400:503='4colgrad';997:999=NA;000=NA",
                     as.factor=T)
data$educ<-fct_relevel(data$educ,'0Prim','1somehs','2hsgrad','3somecol','4colgrad' ) 

#Urban-rural classification

data$urbrrl<- as.factor(data$urbrrl)

data$urban_rural<- car::Recode(data$urbrrl,
                     recodes="1:3='urban'; 4='rural';000=NA",
                     as.factor=T)
data$urban_rural<-relevel(data$urban_rural, ref='rural') 


#employment status


data$unemploy<- car::Recode(data$empstat,
                       recodes="200=1; 00=NA; 999=NA; else=0")

data$empstat<- as.factor(data$empstat)

data$employ_status<- car::Recode(data$empstat,
                       recodes="100='Employed'; 200='unemployed';else=NA",
                       as.factor=T)
data$employ_status<-relevel(data$employ_status, ref='Employed')


# currently in school

data$non_schooling<- car::Recode(data$schoolnow,
                       recodes="1=1; 0=NA; 7:9=NA; else=0")

data$schoolnow<- as.factor(data$schoolnow)

data$schoolstatus<- car::Recode(data$schoolnow,
                       recodes="1='no'; 2='yes';else=NA",
                       as.factor=T)
data$schoolstatus<-relevel(data$schoolstatus, ref='no')


# merging schooling and working

data$opportunity_youth <- paste( data$unemploy, data$non_schooling, sep ="")


data$opportunity_youth<- as.factor(data$opportunity_youth)
data$opportunity_youth_cat<- car::Recode(data$opportunity_youth,
                       recodes="11='Opportunity youth';00:11='Not opportunity youth';else=NA",
                       as.factor=T)
data$opportunity_youth_cat<-relevel(data$opportunity_youth_cat, ref='Opportunity youth')

#income grouping

data$familyincome <- data$incfam07on

# born in the US

data$usborn<- car::Recode(data$usborn,
                       recodes="20=1; 97:98=NA; else=0")

# US Citizen
data$citizen<- car::Recode(data$citizen,
                       recodes="2=1; 8:9=NA; else=0")


#last employed

data$emplast<- as.factor(data$emplast)

data$employedlast<- car::Recode(data$emplast,
                       recodes="1='Within past 12months'; 2='1-5years ago'; 3='over 5years ago'; 4='never worked';else=NA",
                       as.factor=T)
data$employedlast<-relevel(data$employedlast, ref='1-5years ago')


#HEALTH VARIABLES

#Poor or fair self rated health
data$badhealth<-car::Recode(data$health, recodes="4:5=1; 1:3=0; else=NA")


#place for medical care

data$usualpl<- as.factor(data$usualpl)

data$medicalplace<- car::Recode(data$usualpl,
                       recodes="1='no'; 2:3:='yes';else=NA",
                       as.factor=T)
data$medicalplace<-relevel(data$medicalplace, ref='no')


# delayed medical care due to cost

data$delaycost<- as.factor(data$delaycost)

data$medical_care_cost<- car::Recode(data$delaycost,
                       recodes="2='yes'; 1='no';else=NA",
                       as.factor=T)
data$medical_care_cost<-relevel(data$medical_care_cost, ref='yes')

# worried about paying medical bills

data$wormedbill<- as.factor(data$wormedbill)

data$medical_bill_worried<- car::Recode(data$wormedbill,
                       recodes="1='very worried'; 2='somewhat worried'; 3='not at all';else=NA",
                       as.factor=T)
data$medical_bill_worried<-relevel(data$medical_bill_worried, ref='very worried')

# unable to pay medical bills

data$hiunablepay<- as.factor(data$hiunablepay)

data$medical_bill_unabletopay<- car::Recode(data$hiunablepay,
                       recodes="1='no'; 2='yes';else=NA",
                       as.factor=T)
data$medical_bill_unabletopay<-relevel(data$medical_bill_unabletopay, ref='yes')


# health insurance coverage

data$hinotcove <- as.factor(data$hinotcove)

data$healthinsurace_coverage<- car::Recode(data$hinotcove,
                       recodes="1='no, has coverage'; 2='yes, no coverage';else=NA",
                       as.factor=T)
data$healthinsurace_coverage<-relevel(data$healthinsurace_coverage, ref='yes, no coverage')

# dont have health insurance cuz of cost

data$hinocostr <- as.factor(data$hinocostr)

data$nohealthinsurace_cost<- car::Recode(data$hinocostr,
                       recodes="1='no'; 2='yes';else=NA",
                       as.factor=T)
data$nohealthinsurace_cost<-relevel(data$nohealthinsurace_cost, ref='yes')

# used medication in the past year

data$premedyr <- as.factor(data$premedyr)

data$usedmedications<- car::Recode(data$premedyr,
                       recodes="1='no'; 2='yes';else=NA",
                       as.factor=T)
data$usedmedications<-relevel(data$usedmedications, ref='yes')




# if they smoked

data$smokfreqnow <- as.factor(data$smokfreqnow)

data$smoke_frequently<- car::Recode(data$smokfreqnow,
                       recodes="1='no'; 2:3='somedays/everyday';else=NA",
                       as.factor=T)
data$smoke_frequently<-relevel(data$smoke_frequently, ref='somedays/everyday')

# smoked up to 100 cigarreth in life time

data$smokev <- as.factor(data$smokev)

data$smoke_100cig<- car::Recode(data$smokev,
                       recodes="1='no'; 2='yes';else=NA",
                       as.factor=T)
data$smoke_100cig<-relevel(data$smoke_100cig, ref='yes')


#Mental health

# ever had anxiety disoreder

data$anxietyev <- as.factor(data$anxietyev)

data$anxiety_disoreder<- car::Recode(data$anxietyev,
                       recodes="1='no'; 2='yes';else=NA",
                       as.factor=T)
data$anxiety_disoreder<-relevel(data$anxiety_disoreder, ref='yes')

# medication for worrying

data$worrx <- as.factor(data$worrx)

data$medication_for_worry<- car::Recode(data$worrx,
                       recodes="1='no'; 2='yes';else=NA",
                       as.factor=T)
data$medication_for_worry<-relevel(data$medication_for_worry, ref='yes')


# medication for depression

data$deprx <- as.factor(data$deprx)

data$medication_for_depression<- car::Recode(data$deprx,
                       recodes="1='no'; 2='yes';else=NA",
                       as.factor=T)
data$medication_for_depression<-relevel(data$medication_for_depression, ref='yes')

# level of worry

data$worfeelevl <- as.factor(data$worfeelevl)

data$level_of_worry<- car::Recode(data$worfeelevl,
                       recodes="1='alot'; 2='a little'; 3='btw little and alot';else=NA",
                       as.factor=T)
data$level_of_worry<-relevel(data$level_of_worry, ref='alot')


# level of depression

data$depfeelevl <- as.factor(data$depfeelevl)

data$level_of_depression<- car::Recode(data$depfeelevl,
                       recodes="1='alot'; 2='a little'; 3='btw little and alot';else=NA",
                       as.factor=T)
data$level_of_depression<-relevel(data$level_of_depression, ref='alot')
data <- data%>%
filter(age >=16 & age<=24)

data<-data%>%
  filter(is.na(educ)==F,
         is.na(badhealth)==F)

Outcome variable- Opportunity youths

table(data$opportunity_youth_cat)
## 
##     Opportunity youth Not opportunity youth 
##                   399                  3366

4. Perform a descriptive analysis of the outcome variable by each of the variables you defined in part b.

Socio-demographic characteristics of Opportunity youths

opportunity youth by gender

table(data$opportunity_youth_cat, data$male)
##                        
##                         Female Male
##   Opportunity youth        231  168
##   Not opportunity youth   1669 1697

There are more female opportunity youth

opportunity youth by race-ethnicity

table(data$opportunity_youth_cat, data$race_eth)
##                        
##                         nhwhite hispanic nh black nh multirace nh other
##   Opportunity youth         192      100       68           13       26
##   Not opportunity youth    1897      758      340           93      278

Non-hispanic whites are more likely to be opportunity youths

opportunity youth by urban-rural residence

table(data$opportunity_youth_cat, data$urban_rural)
##                        
##                         rural urban
##   Opportunity youth        62   337
##   Not opportunity youth   417  2949

There are more opportunity youths whites in rural areas

table(data$opportunity_youth_cat, data$badhealth)
##                        
##                            0    1
##   Opportunity youth      349   50
##   Not opportunity youth 3211  155
100*prop.table(table(data$opportunity_youth_cat, data$badhealth), margin=2)
##                        
##                                 0         1
##   Opportunity youth      9.803371 24.390244
##   Not opportunity youth 90.196629 75.609756
#basic chi square test of independence
chisq.test(table(data$badhealth, data$opportunity_youth_cat))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(data$badhealth, data$opportunity_youth_cat)
## X-squared = 42.006, df = 1, p-value = 9.1e-11
library(srvyr)
## 
## Attaching package: 'srvyr'
## The following object is masked from 'package:stats':
## 
##     filter
options(survey.lonely.psu = "adjust")

des<-svydesign(ids=~1, strata=~strata, weights=~sampweight, data = data )
des
## Stratified Independent Sampling design (with replacement)
## svydesign(ids = ~1, strata = ~strata, weights = ~sampweight, 
##     data = data)
#compare that with the original, unweighted proportions
prop.table(table(data$badhealth,
                 data$opportunity_youth_cat),
           margin=2)
##    
##     Opportunity youth Not opportunity youth
##   0        0.87468672            0.95395128
##   1        0.12531328            0.04604872
cat<-svyby(formula = ~badhealth,
           by=~opportunity_youth_cat,
           design = des,
           FUN=svymean)

svychisq(~badhealth+sex, design = des)
## 
##  Pearson's X^2: Rao & Scott adjustment
## 
## data:  svychisq(~badhealth + sex, design = des)
## F = 4.4058, ndf = 1, ddf = 3809, p-value = 0.03588
knitr::kable(cat,
      caption = "Survey Estimates of Poor SRH by Opportunity Youths",
      align = 'c',  
      format = "html")
Survey Estimates of Poor SRH by Opportunity Youths
opportunity_youth_cat badhealth se
Opportunity youth Opportunity youth 0.1058106 0.0167357
Not opportunity youth Not opportunity youth 0.0430556 0.0039799
sv.table<-svyby(formula = ~badhealth,
                by = ~opportunity_youth_cat,
                design = des,
                FUN = svymean,
                na.rm=T)


knitr::kable(sv.table,
      caption = "Survey Estimates of Poor SRH by Opportunity Youths",
      align = 'c',  
      format = "html")
Survey Estimates of Poor SRH by Opportunity Youths
opportunity_youth_cat badhealth se
Opportunity youth Opportunity youth 0.1058106 0.0167357
Not opportunity youth Not opportunity youth 0.0430556 0.0039799
#Make a survey design that is random sampling - no survey information
nodes<-svydesign(ids = ~1,  weights = ~1, data = data)

sv.table<-svyby(formula = ~factor(badhealth),
                by = ~opportunity_youth_cat,
                design = nodes,
                FUN = svymean,
                na.rm=T)
knitr::kable(sv.table,
      caption = "Estimates of Poor SRH by Opportunity youth - No survey design",
      align = 'c',  
      format = "html")
Estimates of Poor SRH by Opportunity youth - No survey design
opportunity_youth_cat factor(badhealth)0 factor(badhealth)1 se.factor(badhealth)0 se.factor(badhealth)1
Opportunity youth Opportunity youth 0.8746867 0.1253133 0.0165766 0.0165766
Not opportunity youth Not opportunity youth 0.9539513 0.0460487 0.0036130 0.0036130
library(srvyr)

data%>%
  as_survey_design(strata = strata,
                   weights = sampweight)%>%
  group_by(opportunity_youth_cat)%>%
  summarise(mean_bh = survey_mean(badhealth, na.rm=T))%>%
  ungroup()
## # A tibble: 3 × 3
##   opportunity_youth_cat mean_bh mean_bh_se
##   <fct>                   <dbl>      <dbl>
## 1 Opportunity youth      0.106     0.0167 
## 2 Not opportunity youth  0.0431    0.00398
## 3 <NA>                   0.0654    0.0256
library(gtsummary)

data%>%
  as_survey_design( strata =strata,
                    weights = sampweight)%>%
  select(badhealth, opportunity_youth_cat)%>%
  tbl_svysummary(by = opportunity_youth_cat, 
              label = list(badhealth = "Fair/Poor Health"))%>%
  add_p()%>%
  add_n()
## 96 observations missing `opportunity_youth_cat` have been removed. To include these observations, use `forcats::fct_explicit_na()` on `opportunity_youth_cat` column before passing to `tbl_svysummary()`.
## ℹ Column(s) badhealth are class "haven_labelled". This is an intermediate datastructure not meant for analysis. Convert columns with `haven::as_factor()`, `labelled::to_factor()`, `labelled::unlabelled()`, and `unclass()`. "haven_labelled" value labels are ignored when columns are not converted. Failure to convert may have unintended consequences or result in error.
## • https://haven.tidyverse.org/articles/semantics.html
## • https://larmarange.github.io/labelled/articles/intro_labelled.html#unlabelled
Characteristic N Opportunity youth, N = 6,943,2031 Not opportunity youth, N = 50,000,9401 p-value2
Fair/Poor Health 56,944,143 734,664 (11%) 2,152,821 (4.3%) <0.001

1 n (%)

2 chi-squared test with Rao & Scott's second-order correction

fit1<-lm(badhealth~opportunity_youth_cat+race_eth,
         data=data)
fit2<-lm(badhealth~opportunity_youth_cat+race_eth,
         data=data,
         weights = sampweight)
fit3<-svyglm(badhealth~opportunity_youth_cat+race_eth,
             design = des, 
             family=gaussian)

Are there substantive differences in the descriptive results between the analysis using survey design and that not using survey design?

Ans: There are no much substantive differences in the descriptive statistics

library(ggplot2)
library(dplyr)
coefs<-data.frame(coefs=c(coef(fit1)[-1], coef(fit3)[-1]),
                  mod=c(rep("Non Survey Model", 10),rep("Survey Model", 10)),
                  effect=rep(names(coef(fit1)[-1]), 2))

coefs%>%
  ggplot()+
  geom_point(aes( x=effect,
                  y=coefs,
                  group=effect,
                  color=effect,
                  shape=mod),
             position=position_jitterdodge(jitter.width = 1),
             size=2)+
  ylab("Regression Coefficient")+
  xlab("Beta")+
  geom_abline(intercept = 0, slope=0)+
  theme(axis.text.x = element_text(angle = 45, hjust = 1))+
  ggtitle(label = "Comparison of Survey and Non-Survey Regression effects")