#Load required libraries
library(foreign)
library(survival)
library(car)
library(survey)
library(eha)
library(tidyverse)
options(survey.lonely.psu = "adjust")
Read wave 3, wave 4 and weight
w3 <- as_tibble(read.delim("21600-0008-Data.tsv", sep = '\t', header = TRUE ))
w4 <- as_tibble(read.delim("21600-0022-Data.tsv", sep = '\t', header = TRUE ))
w5 <- as_tibble(read.delim("21600-0032-Data.tsv", sep = '\t', header = TRUE ))
wt <- as_tibble(read.delim("21600-0004-Data.tsv", sep = '\t', header = TRUE ))
names(w3)<-tolower(names(w3))
names(w4)<-tolower(names(w4))
names(w5)<-tolower(names(w5))
names(wt)<-tolower(names(wt))
Data Cleaning
# Year of survey for wave 3
summary(w3$iyear3)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2001 2001 2001 2001 2002 2002
#Age at birth
summary(w3$h3od1y)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1974 1978 1979 1979 1981 1983
# Current age is year of survey minus age at birth
w3$age <- (w3$iyear3 -w3$h3od1y)
summary(w3$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.0 21.0 22.0 22.2 24.0 28.0
# Year of survey for wave 4
summary(w4$iyear4)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2007 2008 2008 2008 2008 2009
#Age at birth
summary(w4$h4od1y)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1974 1978 1979 1979 1980 1983
# Current age is year of survey minus age at birth
w4$age <- (w4$iyear4 -w4$h4od1y)
summary(w4$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 25 28 29 29 30 34
# Year of survey for wave 5
summary(w5$iyear5)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2016 2016 2017 2017 2017 2018
#Age at birth
summary(w5$h5od1y)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1976 1978 1980 1980 1981 1983
# Current age is year of survey minus age at birth
w5$age <- (w5$iyear5 -w5$h5od1y)
summary(w5$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 33.00 36.00 37.00 37.32 39.00 42.00
# Filter respondent to only 18-24
w3 <- w3%>%
filter(age >=18 & age <=24)
# Filter respondent to those not serving in he military
table(w3$h3lm39)
##
## 0 1 8 9
## 3591 61 3 2
w3 <- w3%>%
filter(h3lm39 ==0)
# Opportunity Youth Status
#School status- Are you currently enrolled in school or in a job training or vocational education program?
table(w3$h3da36)
##
## 0 1 6 8 9 997
## 2158 1426 2 2 2 1
# currently in school
w3$schoolstatus<- car::Recode(w3$h3da36,
recodes="1=1; 0=0; else=NA")
#Employment status
table(w3$h3da28)
##
## 0 1 6 8 9
## 999 2585 2 2 3
#Employment status recode
w3$empstat<- car::Recode(w3$h3da28,
recodes="1=1; 0=0; else=NA")
# merging schooling and working
w3$opport<- paste( w3$schoolstatus , w3$empstat, sep ="")
table(w3$opport)
##
## 00 01 0NA 10 11 1NA NA0 NA1 NANA
## 533 1624 1 465 960 1 1 1 5
# Opportunity Youth status
w3$opportunity_youth_three<- car::Recode(w3$opport,
recodes="'00'='Opportunity Youth'; '01'='Schooling/Employed'; '10'='Schooling/Employed'; '11'='Schooling & Employed'; else=NA",
as.factor=T)
w3$opportunity_youth_two<- car::Recode(w3$opport,
recodes="'00'='Opportunity Youth'; '01'='Not Opport Youth'; '10'='Not Opport Youth'; '11'='Not Opport Youth'; else=NA",
as.factor=T)
# Insurance at Wave 3
table(w3$h3hs5)
##
## 0 1 2 3 4 5 6 7 8 9 10 96 98 99
## 814 1162 137 991 17 103 1 71 234 13 29 1 11 7
w3$insurance<- car::Recode(w3$h3hs5,
recodes="0='Not Covered'; 1:9='Covered'; else=NA",
as.factor=T)
# Highest level of education in wave 4
# Might not be able to use education because most are still trying to to school, not available
#table(w4$h4ed2)
#w4$educ<- car::Recode(w4$h4ed2,
# recodes="1='Less than HS'; 2:5='HS Degree'; 6:13='More than HS'; else=NA",
# as.factor=T)
# heallth insurance at wave 4
#table(w4$h4hs1)
#w4$insurance<- car::Recode(w4$h4hs1,
#recodes="1='Not Covered'; 2:9='Covered'; else=NA",
#as.factor=T)
# BMI for wave 4
#table(w4$h4bmicls)
#w4$bmi<- car::Recode(w4$h4bmicls,
#recodes="1='Underweight'; 2='Normal Weight'; 3='Over Weight'; 4:6='Obesity';else=NA",
# as.factor=T)
# Race ethnicity for wave 3
table(w3$h3od2)
##
## 0 1 8
## 3214 372 5
table(w3$h3ir4)
##
## 0 1 2 3 4
## 1 2482 895 61 147
w3$hisp <- car::Recode(w3$h3od2, recodes = "1= 'hisp' ; 0= 'not_hisp'; else=NA")
w3$race <- car::Recode(w3$h3ir4, recodes = "1= 'wh'; 2= 'bl'; 3 = 'asian'; 4='other'; else=NA")
w3$race_eth<-ifelse(w3$hisp ==1, "hisp", w3$race)
w3$race_eth <- interaction(w3$hisp, w3$race)
w3$race_ethR<- mutate(w3, ifelse(hisp==0 & race==1, 1,
ifelse(hisp==0 & race==2, 2,
ifelse(hisp==1, 3,
ifelse(hisp==0 & race==3, 4,
ifelse(hisp==0 & race==4, 5, "NA"))))))
w3$race_ethR<- ifelse(substr(w3$race_eth, 1, 4)=="hisp", "hisp", as.character(w3$race_eth))
w3$race_ethR <- car::Recode(w3$race_ethR, recodes = "'not_hisp.asian'= 'not_hisp.other'")
#w3$race_ethR <- car::Recode(w3$race_ethR, recodes = "'hisp'= 'minority'; 'not_hisp.bl'='minority'; 'not_hisp.other'= 'minority'; 'not_hisp.wh' = 'majority'")
# Event of interest
# SRH- Health status - WAVE 3
table(w3$h3gh1)
##
## 1 2 3 4 5
## 1172 1451 801 148 19
w3$badhealth <-car::Recode(w3$h3gh1, recodes="4:5 =1 ; 1:3=0 ; else=NA")
# SRH- Health status - WAVE 4
table(w4$h4gh1)
##
## 1 2 3 4 5
## 979 1963 1683 434 55
w4$badhealth <-car::Recode(w4$h4gh1, recodes="4:5 =1 ; 1:3=0 ; else=NA")
# SRH- Health status - WAVE 5
table(w5$h5id1)
##
## 1 2 3 4 5
## 701 1503 1393 496 99
w5$badhealth <-car::Recode(w5$h5id1, recodes="4:5 =1 ; 1:3=0 ; else=NA")
# Combine all ways
allwaves <- left_join(w3, w4, by="aid")
allwaves <- left_join(allwaves, w5, by="aid")
allwaves<- left_join(allwaves, wt, by="aid")
#get out only the variables for use
myvars<-c( "aid","badhealth.x", "badhealth.y", "badhealth", "opportunity_youth_three","opportunity_youth_two",
"age.x", "age.y", "bio_sex3", "age", "insurance", "gswgt1", "cluster2",
"race_ethR")
allwaves2<-allwaves[,myvars]
# time varying variables- Health and age
allwaves2$age_1<- allwaves2$age.x
allwaves2$age_2<- allwaves2$age.y
allwaves2$age_3<- allwaves2$age
allwaves2$badhealth_1<-allwaves2$badhealth.x
allwaves2$badhealth_2<-allwaves2$badhealth.y
allwaves2$badhealth_3<-allwaves2$badhealth
#Time constant variables- Opportunity youth stat, sex, bmi, insurance, race
#Recode race with white, non Hispanic as reference using dummy vars
#Opportunity Youth status
#bmi
#Insurance
#race
allwaves2$sex<- car::Recode(allwaves2$bio_sex3, recodes="1='Male'; 2='Female'")
Remove young adult who already experienced bad health as they are
not at risk of experience bad health in wave 4
allwaves2<-allwaves2 %>% filter(is.na(badhealth_1)==F &
is.na(badhealth_2)==F &
is.na(badhealth_3)==F &
is.na(age_1)==F &
is.na(age_2)==F &
is.na(age_3)==F &
badhealth_1!=1)%>%
mutate(badhealthtran_1 =ifelse(badhealth_1==0 & badhealth_2==0, 0,1),
badhealthtran_2 = ifelse(badhealthtran_1==1, NA,
ifelse(badhealth_2==0 & badhealth_3==0,0,1)))
e.long1 <- allwaves2 %>%
#rename(wt = w4c4p_40,strata= w4c4p_4str, psu = w4c4p_4psu)%>%
select(opportunity_youth_two,insurance, race_ethR, sex,opportunity_youth_three,aid,gswgt1,cluster2, #time constant
age_1, age_2,age_3, #t-varying variables
badhealth_1, badhealth_2, badhealth_3 )%>%
pivot_longer(cols = c(-opportunity_youth_two,-insurance, -race_ethR, -sex, -opportunity_youth_three, -aid, -gswgt1, -cluster2), #time constant variables go here
names_to = c(".value", "wave"), #make wave variable and put t-v vars into columns
names_sep = "_") %>% #all t-v variables have _ between name and time, like age_1, age_2
group_by(aid)%>%
mutate(age_enter = age,
age_exit = lead(age, 1, order_by=aid))%>%
mutate(nexbadhealth = dplyr::lead(badhealth,n=1, order_by = age))%>%
mutate(badhealthtran = ifelse(nexbadhealth == 1 & badhealth == 0, 1, 0))%>%
filter(is.na(age_exit)==F)%>%
ungroup()%>%
filter(complete.cases(age_enter, age_exit, badhealthtran,
opportunity_youth_two, insurance, race_ethR,sex, gswgt1,cluster2))
options(survey.lonely.psu = "adjust")
#des<-svydesign(ids=~psuscid, strata = ~region , weights=~gswgt4_2, data=dat3, nest = T)
# cluster
#des<-svydesign(ids=~psuscid, strata = ~1 , weights=~gswgt4_2, data=dat3, nest = T)
des2<-svydesign(ids = ~cluster2,
#strata = 1,
weights=~gswgt1,
data=e.long1,
nest=T)
Working topic: Health status transition among opportunity youths and
young adults in the United States
Selected predictors: Opportunity Youth status, Insurance, and
sex
Cox model
#Fit the model
fitl1<-svycoxph(Surv(time = age_enter, time2=age_exit, event = badhealthtran)~opportunity_youth_two+insurance+ sex, design=des2)
summary(fitl1)
## 1 - level Cluster Sampling design (with replacement)
## With (131) clusters.
## svydesign(ids = ~cluster2, weights = ~gswgt1, data = e.long1,
## nest = T)
## Call:
## svycoxph(formula = Surv(time = age_enter, time2 = age_exit, event = badhealthtran) ~
## opportunity_youth_two + insurance + sex, design = des2)
##
## n= 4400, number of events= 383
##
## coef exp(coef) se(coef) robust se
## opportunity_youth_twoOpportunity Youth 0.44442 1.55958 0.13300 0.13429
## insuranceNot Covered 0.46143 1.58634 0.11091 0.12345
## sexMale 0.02855 1.02896 0.10319 0.10461
## z Pr(>|z|)
## opportunity_youth_twoOpportunity Youth 3.309 0.000935 ***
## insuranceNot Covered 3.738 0.000186 ***
## sexMale 0.273 0.784898
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## opportunity_youth_twoOpportunity Youth 1.560 0.6412 1.1987 2.029
## insuranceNot Covered 1.586 0.6304 1.2454 2.021
## sexMale 1.029 0.9719 0.8382 1.263
##
## Concordance= 0.566 (se = 0.018 )
## Likelihood ratio test= NA on 3 df, p=NA
## Wald test = 29.3 on 3 df, p=2e-06
## Score (logrank) test = NA on 3 df, p=NA
##
## (Note: the likelihood ratio and score tests assume independence of
## observations within a cluster, the Wald and robust score tests do not).
using weighted residuals
fit.test<-cox.zph(fitl1)
fit.test
## chisq df p
## opportunity_youth_two 4.68e-04 1 0.98
## insurance 7.09e-05 1 0.99
## sex 1.50e-04 1 0.99
## GLOBAL 6.45e-04 3 1.00
Produce plots of the survival function and the cumulative hazard
function for different “risk profiles”,
plot(fit.test, df=2)



The result below also shows non-linearity
#extract Martingale residuals
res.mar<-resid(fitl1, type="martingale")
#plot vs opportunity youth status
scatter.smooth(des2$variables$opportunity_youth_two, res.mar,degree = 2,
span = 1, ylab="Martingale Residual",
col=1, cex=.5, lpars=list(col = "red", lwd = 3))
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## pseudoinverse used at 0.995
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## neighborhood radius 1.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## reciprocal condition number 1.6709e-28
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## There are other near singularities as well. 1.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## pseudoinverse used at 0.995
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## neighborhood radius 1.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## reciprocal condition number 4.6045e-17
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## There are other near singularities as well. 1.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## pseudoinverse used at 0.995
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## neighborhood radius 1.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## reciprocal condition number 5.7824e-17
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## There are other near singularities as well. 1.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## pseudoinverse used at 0.995
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## neighborhood radius 1.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## reciprocal condition number 5.6608e-17
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## There are other near singularities as well. 1.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## pseudoinverse used at 0.995
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## neighborhood radius 1.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## reciprocal condition number 1.6709e-28
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## There are other near singularities as well. 1.01
title(main="Martingale residuals for Opportunity youth status")

stratification
fitl2<-svycoxph(Surv(time = age_enter, time2 = age_exit, event = badhealthtran)~opportunity_youth_two+strata(insurance),
design=des2)
summary(fitl2)
## 1 - level Cluster Sampling design (with replacement)
## With (131) clusters.
## svydesign(ids = ~cluster2, weights = ~gswgt1, data = e.long1,
## nest = T)
## Call:
## svycoxph(formula = Surv(time = age_enter, time2 = age_exit, event = badhealthtran) ~
## opportunity_youth_two + strata(insurance), design = des2)
##
## n= 4400, number of events= 383
##
## coef exp(coef) se(coef) robust se
## opportunity_youth_twoOpportunity Youth 0.4431 1.5575 0.1330 0.1331
## z Pr(>|z|)
## opportunity_youth_twoOpportunity Youth 3.329 0.000873 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## opportunity_youth_twoOpportunity Youth 1.558 0.642 1.2 2.022
##
## Concordance= 0.526 (se = 0.012 )
## Likelihood ratio test= NA on 1 df, p=NA
## Wald test = 11.08 on 1 df, p=9e-04
## Score (logrank) test = NA on 1 df, p=NA
##
## (Note: the likelihood ratio and score tests assume independence of
## observations within a cluster, the Wald and robust score tests do not).
Anova like test for factors
fit3<-svycoxph(Surv(time = age_enter, time2=age_exit, event = badhealthtran)~opportunity_youth_two+insurance,
design=des2)
summary(fit3)
## 1 - level Cluster Sampling design (with replacement)
## With (131) clusters.
## svydesign(ids = ~cluster2, weights = ~gswgt1, data = e.long1,
## nest = T)
## Call:
## svycoxph(formula = Surv(time = age_enter, time2 = age_exit, event = badhealthtran) ~
## opportunity_youth_two + insurance, design = des2)
##
## n= 4400, number of events= 383
##
## coef exp(coef) se(coef) robust se
## opportunity_youth_twoOpportunity Youth 0.4432 1.5576 0.1329 0.1340
## insuranceNot Covered 0.4634 1.5894 0.1107 0.1227
## z Pr(>|z|)
## opportunity_youth_twoOpportunity Youth 3.308 0.000938 ***
## insuranceNot Covered 3.776 0.000159 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## opportunity_youth_twoOpportunity Youth 1.558 0.6420 1.198 2.025
## insuranceNot Covered 1.589 0.6292 1.250 2.022
##
## Concordance= 0.566 (se = 0.016 )
## Likelihood ratio test= NA on 2 df, p=NA
## Wald test = 29.13 on 2 df, p=5e-07
## Score (logrank) test = NA on 2 df, p=NA
##
## (Note: the likelihood ratio and score tests assume independence of
## observations within a cluster, the Wald and robust score tests do not).
regTermTest(fit3, ~opportunity_youth_two, method="LRT")
## Working (Rao-Scott+F) LRT for opportunity_youth_two
## in svycoxph(formula = Surv(time = age_enter, time2 = age_exit, event = badhealthtran) ~
## opportunity_youth_two + insurance, design = des2)
## Working 2logLR = 10.15211 p= 0.0019237
## df=1; denominator df= 129
comment: After evaluating the assumptions of the cox model(using the residuals) that the time-constant covariate effect, i.e. the effect does not vary with time, the cox model seems to be a good model for this analysis