#Load required libraries
library(foreign)
library(survival)
library(car)
library(survey)
library(eha)
library(tidyverse)
options(survey.lonely.psu = "adjust")

Read wave 3, wave 4 and weight

w3 <- as_tibble(read.delim("21600-0008-Data.tsv", sep = '\t', header = TRUE ))
w4 <- as_tibble(read.delim("21600-0022-Data.tsv", sep = '\t', header = TRUE ))
w5 <- as_tibble(read.delim("21600-0032-Data.tsv", sep = '\t', header = TRUE ))
wt <- as_tibble(read.delim("21600-0004-Data.tsv", sep = '\t', header = TRUE ))


names(w3)<-tolower(names(w3))
names(w4)<-tolower(names(w4))
names(w5)<-tolower(names(w5))
names(wt)<-tolower(names(wt))

Data Cleaning

# Year of survey for wave 3
summary(w3$iyear3)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2001    2001    2001    2001    2002    2002
#Age at birth
summary(w3$h3od1y)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1974    1978    1979    1979    1981    1983
# Current age is year of survey minus age at birth
w3$age <- (w3$iyear3 -w3$h3od1y)
summary(w3$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    18.0    21.0    22.0    22.2    24.0    28.0
# Year of survey for wave 4
summary(w4$iyear4)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2007    2008    2008    2008    2008    2009
#Age at birth
summary(w4$h4od1y)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1974    1978    1979    1979    1980    1983
# Current age is year of survey minus age at birth
w4$age <- (w4$iyear4 -w4$h4od1y)
summary(w4$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      25      28      29      29      30      34
# Year of survey for wave 5
summary(w5$iyear5)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2016    2016    2017    2017    2017    2018
#Age at birth
summary(w5$h5od1y)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1976    1978    1980    1980    1981    1983
# Current age is year of survey minus age at birth
w5$age <- (w5$iyear5 -w5$h5od1y)
summary(w5$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   33.00   36.00   37.00   37.32   39.00   42.00
# Filter respondent to only 18-24

w3 <- w3%>%
  filter(age >=18 & age <=24)

# Filter respondent to those not serving in he military
table(w3$h3lm39)
## 
##    0    1    8    9 
## 3591   61    3    2
w3 <- w3%>%
  filter(h3lm39 ==0)

# Opportunity Youth Status

#School status- Are you currently enrolled in school or in a job training or vocational education program?
table(w3$h3da36)
## 
##    0    1    6    8    9  997 
## 2158 1426    2    2    2    1
# currently in school

w3$schoolstatus<- car::Recode(w3$h3da36,
                       recodes="1=1; 0=0; else=NA")

#Employment status
table(w3$h3da28)
## 
##    0    1    6    8    9 
##  999 2585    2    2    3
#Employment status recode

w3$empstat<- car::Recode(w3$h3da28,
                       recodes="1=1; 0=0; else=NA")

# merging schooling and working

w3$opport<- paste( w3$schoolstatus , w3$empstat, sep ="")
table(w3$opport)
## 
##   00   01  0NA   10   11  1NA  NA0  NA1 NANA 
##  533 1624    1  465  960    1    1    1    5
# Opportunity Youth status
w3$opportunity_youth_three<- car::Recode(w3$opport,
                       recodes="'00'='Opportunity Youth'; '01'='Schooling/Employed'; '10'='Schooling/Employed'; '11'='Schooling & Employed'; else=NA",
                       as.factor=T)

w3$opportunity_youth_two<- car::Recode(w3$opport,
                       recodes="'00'='Opportunity Youth'; '01'='Not Opport Youth'; '10'='Not Opport Youth'; '11'='Not Opport Youth'; else=NA",
                       as.factor=T)


# Insurance at Wave 3

table(w3$h3hs5)
## 
##    0    1    2    3    4    5    6    7    8    9   10   96   98   99 
##  814 1162  137  991   17  103    1   71  234   13   29    1   11    7
w3$insurance<- car::Recode(w3$h3hs5,
                       recodes="0='Not Covered'; 1:9='Covered'; else=NA",
                       as.factor=T)

# Highest level of education in wave 4
# Might not be able to use education because most are still trying to to school, not available
#table(w4$h4ed2)
#w4$educ<- car::Recode(w4$h4ed2,
                      # recodes="1='Less than HS'; 2:5='HS Degree'; 6:13='More than HS'; else=NA",
                      # as.factor=T)

# heallth insurance at wave 4
#table(w4$h4hs1)

#w4$insurance<- car::Recode(w4$h4hs1,
                       #recodes="1='Not Covered'; 2:9='Covered'; else=NA",
                       #as.factor=T)


# BMI for wave 4
#table(w4$h4bmicls)
#w4$bmi<- car::Recode(w4$h4bmicls,
                       #recodes="1='Underweight'; 2='Normal Weight'; 3='Over Weight'; 4:6='Obesity';else=NA",
                      # as.factor=T)


# Race ethnicity for wave 3
table(w3$h3od2)
## 
##    0    1    8 
## 3214  372    5
table(w3$h3ir4)
## 
##    0    1    2    3    4 
##    1 2482  895   61  147
w3$hisp <- car::Recode(w3$h3od2, recodes = "1= 'hisp' ; 0= 'not_hisp'; else=NA")
w3$race <- car::Recode(w3$h3ir4, recodes = "1= 'wh'; 2= 'bl'; 3 = 'asian'; 4='other'; else=NA")
w3$race_eth<-ifelse(w3$hisp ==1, "hisp", w3$race)
w3$race_eth <- interaction(w3$hisp, w3$race)


  w3$race_ethR<- mutate(w3, ifelse(hisp==0 & race==1, 1,
                    ifelse(hisp==0 & race==2, 2,
                     ifelse(hisp==1, 3,
                    ifelse(hisp==0 & race==3, 4,
                     ifelse(hisp==0 & race==4, 5, "NA"))))))
w3$race_ethR<- ifelse(substr(w3$race_eth, 1, 4)=="hisp", "hisp", as.character(w3$race_eth))


w3$race_ethR <- car::Recode(w3$race_ethR, recodes = "'not_hisp.asian'= 'not_hisp.other'")

#w3$race_ethR <- car::Recode(w3$race_ethR, recodes = "'hisp'= 'minority'; 'not_hisp.bl'='minority'; 'not_hisp.other'= 'minority'; 'not_hisp.wh' = 'majority'")



# Event of interest 

# SRH- Health status - WAVE 3
table(w3$h3gh1)
## 
##    1    2    3    4    5 
## 1172 1451  801  148   19
w3$badhealth <-car::Recode(w3$h3gh1, recodes="4:5 =1 ; 1:3=0 ; else=NA")

# SRH- Health status - WAVE 4
table(w4$h4gh1)
## 
##    1    2    3    4    5 
##  979 1963 1683  434   55
w4$badhealth <-car::Recode(w4$h4gh1, recodes="4:5 =1 ; 1:3=0 ; else=NA")


# SRH- Health status - WAVE 5
table(w5$h5id1)
## 
##    1    2    3    4    5 
##  701 1503 1393  496   99
w5$badhealth <-car::Recode(w5$h5id1, recodes="4:5 =1 ; 1:3=0 ; else=NA")
# Combine all ways
allwaves <- left_join(w3, w4, by="aid")
allwaves <- left_join(allwaves, w5, by="aid")
allwaves<- left_join(allwaves, wt, by="aid")
#get out only the variables for use 

myvars<-c( "aid","badhealth.x", "badhealth.y", "badhealth", "opportunity_youth_three","opportunity_youth_two",
           "age.x", "age.y", "bio_sex3", "age", "insurance", "gswgt1", "cluster2",
           "race_ethR")
allwaves2<-allwaves[,myvars]
# time varying variables- Health and age
allwaves2$age_1<- allwaves2$age.x
allwaves2$age_2<- allwaves2$age.y
allwaves2$age_3<- allwaves2$age



allwaves2$badhealth_1<-allwaves2$badhealth.x
allwaves2$badhealth_2<-allwaves2$badhealth.y
allwaves2$badhealth_3<-allwaves2$badhealth

#Time constant variables- Opportunity youth stat, sex, bmi, insurance, race
#Recode race with white, non Hispanic as reference using dummy vars

#Opportunity Youth status
#bmi
#Insurance
#race
allwaves2$sex<- car::Recode(allwaves2$bio_sex3, recodes="1='Male'; 2='Female'")

Remove young adult who already experienced bad health as they are not at risk of experience bad health in wave 4

allwaves2<-allwaves2 %>% filter(is.na(badhealth_1)==F &
                  is.na(badhealth_2)==F &
                  is.na(badhealth_3)==F & 
                  is.na(age_1)==F &
                  is.na(age_2)==F &
                  is.na(age_3)==F &
                    badhealth_1!=1)%>%
  mutate(badhealthtran_1 =ifelse(badhealth_1==0 & badhealth_2==0, 0,1), 
        badhealthtran_2 = ifelse(badhealthtran_1==1, NA,
                         ifelse(badhealth_2==0 & badhealth_3==0,0,1)))
e.long1 <- allwaves2 %>%
  #rename(wt = w4c4p_40,strata= w4c4p_4str, psu = w4c4p_4psu)%>%
  select(opportunity_youth_two,insurance, race_ethR, sex,opportunity_youth_three,aid,gswgt1,cluster2,    #time constant
        age_1, age_2,age_3, #t-varying variables
         badhealth_1, badhealth_2, badhealth_3 )%>%
   pivot_longer(cols = c(-opportunity_youth_two,-insurance, -race_ethR, -sex, -opportunity_youth_three, -aid, -gswgt1, -cluster2), #time constant variables go here
               names_to  = c(".value", "wave"), #make wave variable and put t-v vars into columns
               names_sep = "_") %>% #all t-v variables have _ between name and time, like age_1, age_2
   group_by(aid)%>%
  mutate(age_enter = age, 
         age_exit = lead(age, 1, order_by=aid))%>%
  mutate(nexbadhealth = dplyr::lead(badhealth,n=1, order_by = age))%>%
  mutate(badhealthtran = ifelse(nexbadhealth == 1 & badhealth == 0, 1, 0))%>%
  filter(is.na(age_exit)==F)%>%
  ungroup()%>%
  filter(complete.cases(age_enter, age_exit, badhealthtran,
                        opportunity_youth_two, insurance, race_ethR,sex, gswgt1,cluster2))
options(survey.lonely.psu = "adjust")


#des<-svydesign(ids=~psuscid, strata = ~region , weights=~gswgt4_2, data=dat3, nest = T)

# cluster
#des<-svydesign(ids=~psuscid, strata = ~1 , weights=~gswgt4_2, data=dat3, nest = T)



des2<-svydesign(ids = ~cluster2,
                #strata = 1,
                weights=~gswgt1, 
                data=e.long1,
                nest=T)

Working topic: Health status transition among opportunity youths and young adults in the United States

Selected predictors: Opportunity Youth status, Insurance, and sex

Cox model

#Fit the model
fitl1<-svycoxph(Surv(time = age_enter, time2=age_exit, event = badhealthtran)~opportunity_youth_two+insurance+ sex, design=des2)
summary(fitl1) 
## 1 - level Cluster Sampling design (with replacement)
## With (131) clusters.
## svydesign(ids = ~cluster2, weights = ~gswgt1, data = e.long1, 
##     nest = T)
## Call:
## svycoxph(formula = Surv(time = age_enter, time2 = age_exit, event = badhealthtran) ~ 
##     opportunity_youth_two + insurance + sex, design = des2)
## 
##   n= 4400, number of events= 383 
## 
##                                           coef exp(coef) se(coef) robust se
## opportunity_youth_twoOpportunity Youth 0.44442   1.55958  0.13300   0.13429
## insuranceNot Covered                   0.46143   1.58634  0.11091   0.12345
## sexMale                                0.02855   1.02896  0.10319   0.10461
##                                            z Pr(>|z|)    
## opportunity_youth_twoOpportunity Youth 3.309 0.000935 ***
## insuranceNot Covered                   3.738 0.000186 ***
## sexMale                                0.273 0.784898    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                                        exp(coef) exp(-coef) lower .95 upper .95
## opportunity_youth_twoOpportunity Youth     1.560     0.6412    1.1987     2.029
## insuranceNot Covered                       1.586     0.6304    1.2454     2.021
## sexMale                                    1.029     0.9719    0.8382     1.263
## 
## Concordance= 0.566  (se = 0.018 )
## Likelihood ratio test= NA  on 3 df,   p=NA
## Wald test            = 29.3  on 3 df,   p=2e-06
## Score (logrank) test = NA  on 3 df,   p=NA
## 
##   (Note: the likelihood ratio and score tests assume independence of
##      observations within a cluster, the Wald and robust score tests do not).

Model residuals

From the below results, it shows that one of the variables(insurance) is correlated with the timing of transition. Opportunity youth status is not correlated

schoenresid<-resid(fitl1, type="schoenfeld")

fit.sr<-lm(schoenresid~des2$variables$age_enter[des2$variables$badhealthtran==1])

fit.sr%>%
  broom::tidy()%>%
  filter(term != "(Intercept)")%>%
  select(response, estimate, statistic, p.value)
## # A tibble: 3 × 4
##   response                               estimate statistic p.value
##   <chr>                                     <dbl>     <dbl>   <dbl>
## 1 opportunity_youth_twoOpportunity Youth  0.00483     0.854  0.394 
## 2 insuranceNot Covered                   -0.0154     -2.35   0.0191
## 3 sexMale                                 0.00686     1.00   0.318

using weighted residuals

fit.test<-cox.zph(fitl1)
fit.test
##                          chisq df    p
## opportunity_youth_two 4.68e-04  1 0.98
## insurance             7.09e-05  1 0.99
## sex                   1.50e-04  1 0.99
## GLOBAL                6.45e-04  3 1.00

Produce plots of the survival function and the cumulative hazard function for different “risk profiles”,

plot(fit.test, df=2)

The result below also shows non-linearity

#extract Martingale residuals
res.mar<-resid(fitl1, type="martingale")

#plot vs opportunity youth status
scatter.smooth(des2$variables$opportunity_youth_two, res.mar,degree = 2,
               span = 1, ylab="Martingale Residual",
               col=1,  cex=.5, lpars=list(col = "red", lwd = 3))
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## pseudoinverse used at 0.995
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## neighborhood radius 1.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## reciprocal condition number 1.6709e-28
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## There are other near singularities as well. 1.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## pseudoinverse used at 0.995
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## neighborhood radius 1.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## reciprocal condition number 4.6045e-17
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## There are other near singularities as well. 1.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## pseudoinverse used at 0.995
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## neighborhood radius 1.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## reciprocal condition number 5.7824e-17
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## There are other near singularities as well. 1.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## pseudoinverse used at 0.995
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## neighborhood radius 1.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## reciprocal condition number 5.6608e-17
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## There are other near singularities as well. 1.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## pseudoinverse used at 0.995
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## neighborhood radius 1.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## reciprocal condition number 1.6709e-28
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## There are other near singularities as well. 1.01
title(main="Martingale residuals for Opportunity youth status")

stratification

fitl2<-svycoxph(Surv(time = age_enter, time2 = age_exit, event = badhealthtran)~opportunity_youth_two+strata(insurance),
                design=des2)
summary(fitl2) 
## 1 - level Cluster Sampling design (with replacement)
## With (131) clusters.
## svydesign(ids = ~cluster2, weights = ~gswgt1, data = e.long1, 
##     nest = T)
## Call:
## svycoxph(formula = Surv(time = age_enter, time2 = age_exit, event = badhealthtran) ~ 
##     opportunity_youth_two + strata(insurance), design = des2)
## 
##   n= 4400, number of events= 383 
## 
##                                          coef exp(coef) se(coef) robust se
## opportunity_youth_twoOpportunity Youth 0.4431    1.5575   0.1330    0.1331
##                                            z Pr(>|z|)    
## opportunity_youth_twoOpportunity Youth 3.329 0.000873 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                                        exp(coef) exp(-coef) lower .95 upper .95
## opportunity_youth_twoOpportunity Youth     1.558      0.642       1.2     2.022
## 
## Concordance= 0.526  (se = 0.012 )
## Likelihood ratio test= NA  on 1 df,   p=NA
## Wald test            = 11.08  on 1 df,   p=9e-04
## Score (logrank) test = NA  on 1 df,   p=NA
## 
##   (Note: the likelihood ratio and score tests assume independence of
##      observations within a cluster, the Wald and robust score tests do not).

Anova like test for factors

fit3<-svycoxph(Surv(time = age_enter, time2=age_exit, event = badhealthtran)~opportunity_youth_two+insurance,
               design=des2)
summary(fit3)
## 1 - level Cluster Sampling design (with replacement)
## With (131) clusters.
## svydesign(ids = ~cluster2, weights = ~gswgt1, data = e.long1, 
##     nest = T)
## Call:
## svycoxph(formula = Surv(time = age_enter, time2 = age_exit, event = badhealthtran) ~ 
##     opportunity_youth_two + insurance, design = des2)
## 
##   n= 4400, number of events= 383 
## 
##                                          coef exp(coef) se(coef) robust se
## opportunity_youth_twoOpportunity Youth 0.4432    1.5576   0.1329    0.1340
## insuranceNot Covered                   0.4634    1.5894   0.1107    0.1227
##                                            z Pr(>|z|)    
## opportunity_youth_twoOpportunity Youth 3.308 0.000938 ***
## insuranceNot Covered                   3.776 0.000159 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                                        exp(coef) exp(-coef) lower .95 upper .95
## opportunity_youth_twoOpportunity Youth     1.558     0.6420     1.198     2.025
## insuranceNot Covered                       1.589     0.6292     1.250     2.022
## 
## Concordance= 0.566  (se = 0.016 )
## Likelihood ratio test= NA  on 2 df,   p=NA
## Wald test            = 29.13  on 2 df,   p=5e-07
## Score (logrank) test = NA  on 2 df,   p=NA
## 
##   (Note: the likelihood ratio and score tests assume independence of
##      observations within a cluster, the Wald and robust score tests do not).
regTermTest(fit3, ~opportunity_youth_two, method="LRT")
## Working (Rao-Scott+F) LRT for opportunity_youth_two
##  in svycoxph(formula = Surv(time = age_enter, time2 = age_exit, event = badhealthtran) ~ 
##     opportunity_youth_two + insurance, design = des2)
## Working 2logLR =  10.15211 p= 0.0019237 
## df=1;  denominator df= 129

comment: After evaluating the assumptions of the cox model(using the residuals) that the time-constant covariate effect, i.e. the effect does not vary with time, the cox model seems to be a good model for this analysis