library(haven)
library(ipumsr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(ggplot2)
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
library(muhaz)
library(survival)
library(eha)
#Load data
## I am using IPUMS CPS data for the period of COVID19 pandemic (January through July 2020). Research question is the same as in my previous works
ddi<-read_ipums_ddi("/Users/asiyavalidova/Dropbox/Demography/Event history analysis/IPUMS CPS/cps_00003.xml")
cpsdat<-read_ipums_micro(ddi)
## Use of data from IPUMS CPS is subject to conditions including that users should
## cite the data appropriately. Use command `ipums_conditions()` for more details.
cpsdat<-zap_labels(cpsdat)
variables<-c("YEAR" ,"SERIAL","MONTH" ,"HWTFINL" ,"CPSID" , "ASECFLAG" ,
"MISH" ,"STATEFIP" ,"PERNUM" , "WTFINL" ,"CPSIDP" , "AGE" ,
"SEX" , "RACE" , "MARST" , "BPL" , "YRIMMIG" , "CITIZEN" ,
"NATIVITY" , "EMPSTAT" , "LABFORCE" , "OCC" , "IND" , "CLASSWKR" ,
"DURUNEM2" , "DURUNEMP" , "WHYUNEMP" , "WKSTAT" , "EDUC" , "EDUC99" ,
"LNKFW1MWT")
cpsdat<-cpsdat%>%
select(all_of(variables))%>%
filter(YEAR>2019, AGE>16 & AGE<70,EMPSTAT%in%c(10,12,21,22))
cntpep<-cpsdat%>%
group_by(CPSIDP)%>%
summarise(ntime=n())%>%
arrange(ntime)
## `summarise()` ungrouping output (override with `.groups` argument)
cpsdat<-merge(cpsdat, cntpep, by="CPSIDP")
cpsdat2<-cpsdat%>%
filter(ntime>1)
cpsdat2<-cpsdat2%>%
mutate( curremp = ifelse(EMPSTAT%in%c(10,12) , 1, 0),
nativity=Recode(NATIVITY,recodes="1='USA born';2:4='2nd gen immigrant';5='1st gen immigrant'; else=NA",as.factor=T),
sex=Recode(SEX, recodes = "1='Male'; 2='Female'; else=NA", as.factor=T),
educ=Recode(EDUC,recodes="1:70='Low';71:91='Middle';92:125='High';else=NA",as.factor=T),
age=Recode(AGE, recodes="17:24='Age 17 to 24';
25:29='Age 25 to 29';
30:34='Age 30 to 34';
35:39='Age 35 to 39';
40:44='Age 40 to 44';
45:49='Age 45 to 49';
50:54='Age 50 to 54';
55:59='Age 55 to 59';
60:64='Age 60 to 64';
65:69='Age 65 to 69'",as.factor=TRUE)
)%>%
arrange(CPSIDP, MONTH)
cpsdat2$educ<-relevel(cpsdat2$educ,ref="Low")
cpsdat2$nativity<-relevel(cpsdat2$nativity,ref="USA born")
cpsdat2<-cpsdat2%>%
group_by(CPSIDP)%>%
mutate(prevwork= lag(curremp, group_by = CPSIDP))%>%
arrange(CPSIDP, MONTH)
#Event history analysis
#Time and status
# 1. Event variable: lost_job
cpsdat2$lost_job<-0
cpsdat2$lost_job[cpsdat2$curremp==0&cpsdat2$prevwork==1]<-1
# 2. time: duration(in months)
cpsdat2$duration<-ifelse(cpsdat2$lost_job==1,cpsdat2$MONTH,8-cpsdat2$MONTH) # Month when a person lost job(1-7)
# 3. Censoring indicator: d.event
cpsdat2$d.event<-ifelse(cpsdat2$curremp==1 | cpsdat2$curremp==0 & is.na(cpsdat2$prevwork) | !(cpsdat2$curremp==0 & cpsdat2$prevwork==0),0,1)
sub<-data_frame(id=cpsdat2$CPSIDP,
sex=cpsdat2$sex,
age=cpsdat2$age,
educ=cpsdat2$educ,
nativity=cpsdat2$nativity,
duration=cpsdat2$duration,
censor=cpsdat2$d.event,
weight=cpsdat2$LNKFW1MWT)
## Warning: `data_frame()` is deprecated as of tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
sum(complete.cases(sub))
## [1] 294244
sub_complete<- sub[complete.cases(sub), ]
##Cox model
CoxModel<-coxph(Surv(duration,censor)~educ+age+nativity,method="efron",data=sub_complete)
summary(CoxModel)
## Call:
## coxph(formula = Surv(duration, censor) ~ educ + age + nativity,
## data = sub_complete, method = "efron")
##
## n= 294244, number of events= 7652
##
## coef exp(coef) se(coef) z Pr(>|z|)
## educHigh -0.76822 0.46384 0.04368 -17.587 < 2e-16 ***
## educMiddle -0.29977 0.74099 0.04115 -7.286 3.20e-13 ***
## ageAge 25 to 29 -0.25831 0.77236 0.04254 -6.072 1.26e-09 ***
## ageAge 30 to 34 -0.52078 0.59406 0.04493 -11.590 < 2e-16 ***
## ageAge 35 to 39 -0.61347 0.54147 0.04568 -13.429 < 2e-16 ***
## ageAge 40 to 44 -0.73454 0.47972 0.04859 -15.117 < 2e-16 ***
## ageAge 45 to 49 -0.70919 0.49204 0.04806 -14.757 < 2e-16 ***
## ageAge 50 to 54 -0.58601 0.55655 0.04581 -12.793 < 2e-16 ***
## ageAge 55 to 59 -0.63059 0.53228 0.04583 -13.758 < 2e-16 ***
## ageAge 60 to 64 -0.59095 0.55380 0.04892 -12.079 < 2e-16 ***
## ageAge 65 to 69 -0.45786 0.63264 0.06027 -7.597 3.03e-14 ***
## nativity1st gen immigrant 0.23807 1.26879 0.03100 7.679 1.61e-14 ***
## nativity2nd gen immigrant 0.23692 1.26734 0.03810 6.218 5.03e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## educHigh 0.4638 2.1559 0.4258 0.5053
## educMiddle 0.7410 1.3496 0.6836 0.8032
## ageAge 25 to 29 0.7724 1.2947 0.7106 0.8395
## ageAge 30 to 34 0.5941 1.6833 0.5440 0.6487
## ageAge 35 to 39 0.5415 1.8468 0.4951 0.5922
## ageAge 40 to 44 0.4797 2.0845 0.4361 0.5277
## ageAge 45 to 49 0.4920 2.0324 0.4478 0.5406
## ageAge 50 to 54 0.5565 1.7968 0.5088 0.6088
## ageAge 55 to 59 0.5323 1.8787 0.4865 0.5823
## ageAge 60 to 64 0.5538 1.8057 0.5032 0.6095
## ageAge 65 to 69 0.6326 1.5807 0.5622 0.7120
## nativity1st gen immigrant 1.2688 0.7881 1.1940 1.3483
## nativity2nd gen immigrant 1.2673 0.7891 1.1761 1.3656
##
## Concordance= 0.611 (se = 0.003 )
## Likelihood ratio test= 1222 on 13 df, p=<2e-16
## Wald test = 1278 on 13 df, p=<2e-16
## Score (logrank) test = 1338 on 13 df, p=<2e-16
#Plotting data
##Survival functions for profiles according to nativity and education level
#USA born
plot(survfit(CoxModel,weights=weight,conf.int=F),xlim=c(1,7),ylim=c(.94,1),xlab="Months",ylab="S(t)",main="Survivorship Function for losing job (USA born)")
lines(survfit(CoxModel,weights=weight, newdata = data.frame(age='Age 40 to 44',educ="Low",nativity="USA born"),conf.int=F) ,col="red", lty=1)
lines(survfit(CoxModel,weights=weight, newdata = data.frame(age='Age 40 to 44',educ="High",nativity="USA born"),conf.int=F) ,col="green", lty=1)
lines(survfit(CoxModel,weights=weight, newdata = data.frame(age='Age 40 to 44',educ="Middle",nativity="USA born"),conf.int=F) ,col="blue", lty=1)
legend("bottomleft",c("mean","Low educ","Middle educ","High educ"),col=c(1,"red","blue","green"),lty=c(1,1,1))
#1st gen imm
plot(survfit(CoxModel,weights=weight,conf.int=F),xlim=c(1,7),ylim=c(.92,1),xlab="Months",ylab="S(t)",main="Survivorship Function for losing job (1st gen immigrant)")
lines(survfit(CoxModel,weights=weight, newdata = data.frame(age='Age 40 to 44',educ="Low",nativity="1st gen immigrant"),conf.int=F) ,col="red", lty=1)
lines(survfit(CoxModel,weights=weight, newdata = data.frame(age='Age 40 to 44',educ="High",nativity="1st gen immigrant"),conf.int=F) ,col="green", lty=1)
lines(survfit(CoxModel,weights=weight, newdata = data.frame(age='Age 40 to 44',educ="Middle",nativity="1st gen immigrant"),conf.int=F) ,col="blue", lty=1)
legend("bottomleft",c("mean","Low educ","Middle educ","High educ"),col=c(1,"red","blue","green"),lty=c(1,1,1))
#2nd gen immigrant
plot(survfit(CoxModel,weights=weight,conf.int=F),xlim=c(1,7),ylim=c(.92,1),xlab="Months",ylab="S(t)",main="Survivorship Function for losing job (2nd gen immigrant)")
lines(survfit(CoxModel,weights=weight, newdata = data.frame(age='Age 40 to 44',educ="Low",nativity="2nd gen immigrant"),conf.int=F) ,col="red", lty=1)
lines(survfit(CoxModel,weights=weight, newdata = data.frame(age='Age 40 to 44',educ="High",nativity="2nd gen immigrant"),conf.int=F) ,col="green", lty=1)
lines(survfit(CoxModel,weights=weight, newdata = data.frame(age='Age 40 to 44',educ="Middle",nativity="2nd gen immigrant"),conf.int=F) ,col="blue", lty=1)
legend("bottomleft",c("mean","Low educ","Middle educ","High educ"),col=c(1,"red","blue","green"),lty=c(1,1,1))
##Cumulative hazard function (for those who have high level of education - by nativity/immigration status, I took mean age category which is 40-44)
sf1<-survfit(CoxModel,weights=weight,conf.int=F)
H1<- -log(sf1$surv)
times<-sf1$time
#USA born
sf2<-survfit(CoxModel,weights=weight, newdata = data.frame(age='Age 40 to 44',educ="High",nativity="USA born"),conf.int=F)
H2<- -log(sf2$surv)
#1st gen imm
sf3<-survfit(CoxModel,weights=weight, newdata = data.frame(age='Age 40 to 44',educ="High",nativity="1st gen immigrant"),conf.int=F)
H3<- -log(sf3$surv)
#2nd gen imm
sf4<-survfit(CoxModel,weights=weight, newdata = data.frame(age='Age 40 to 44',educ="High",nativity="2nd gen immigrant"),conf.int=F)
H4<- -log(sf4$surv)
plot(H1~times, type="l", ylab="H(t)",xlab="Months",ylim=c(0,.06),
main="Cumulative hazard plot for losing job (high education)")
lines(H2~times,col="green")
lines(H3~times,col="blue")
lines(H4~times,col="red")
legend("topleft",c("mean","USA born","1st gen imm","2nd gen imm"),col=c(1,"green","blue","red"),lty=c(1,1,1))
#Interaction and risk score
#Is there interaction between education and nativity?
CoxModelInt<-coxph(Surv(duration,censor)~educ+age+nativity+educ*nativity,method="efron",data=sub_complete)
summary(CoxModelInt)
## Call:
## coxph(formula = Surv(duration, censor) ~ educ + age + nativity +
## educ * nativity, data = sub_complete, method = "efron")
##
## n= 294244, number of events= 7652
##
## coef exp(coef) se(coef) z
## educHigh -0.88183 0.41403 0.05803 -15.197
## educMiddle -0.37899 0.68455 0.05476 -6.920
## ageAge 25 to 29 -0.24689 0.78123 0.04273 -5.778
## ageAge 30 to 34 -0.50789 0.60177 0.04515 -11.248
## ageAge 35 to 39 -0.59991 0.54886 0.04591 -13.066
## ageAge 40 to 44 -0.72009 0.48671 0.04883 -14.745
## ageAge 45 to 49 -0.69489 0.49913 0.04830 -14.386
## ageAge 50 to 54 -0.57079 0.56508 0.04607 -12.389
## ageAge 55 to 59 -0.61848 0.53876 0.04604 -13.433
## ageAge 60 to 64 -0.57816 0.56093 0.04914 -11.766
## ageAge 65 to 69 -0.44431 0.64127 0.06045 -7.350
## nativity1st gen immigrant 0.03394 1.03453 0.07957 0.427
## nativity2nd gen immigrant 0.24392 1.27625 0.13335 1.829
## educHigh:nativity1st gen immigrant 0.33475 1.39759 0.09443 3.545
## educMiddle:nativity1st gen immigrant 0.17638 1.19289 0.09043 1.950
## educHigh:nativity2nd gen immigrant 0.01871 1.01888 0.14896 0.126
## educMiddle:nativity2nd gen immigrant -0.02031 0.97990 0.14201 -0.143
## Pr(>|z|)
## educHigh < 2e-16 ***
## educMiddle 4.50e-12 ***
## ageAge 25 to 29 7.55e-09 ***
## ageAge 30 to 34 < 2e-16 ***
## ageAge 35 to 39 < 2e-16 ***
## ageAge 40 to 44 < 2e-16 ***
## ageAge 45 to 49 < 2e-16 ***
## ageAge 50 to 54 < 2e-16 ***
## ageAge 55 to 59 < 2e-16 ***
## ageAge 60 to 64 < 2e-16 ***
## ageAge 65 to 69 1.99e-13 ***
## nativity1st gen immigrant 0.669694
## nativity2nd gen immigrant 0.067367 .
## educHigh:nativity1st gen immigrant 0.000392 ***
## educMiddle:nativity1st gen immigrant 0.051137 .
## educHigh:nativity2nd gen immigrant 0.900053
## educMiddle:nativity2nd gen immigrant 0.886305
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## educHigh 0.4140 2.4153 0.3695 0.4639
## educMiddle 0.6846 1.4608 0.6149 0.7621
## ageAge 25 to 29 0.7812 1.2800 0.7185 0.8495
## ageAge 30 to 34 0.6018 1.6618 0.5508 0.6575
## ageAge 35 to 39 0.5489 1.8220 0.5016 0.6005
## ageAge 40 to 44 0.4867 2.0546 0.4423 0.5356
## ageAge 45 to 49 0.4991 2.0035 0.4540 0.5487
## ageAge 50 to 54 0.5651 1.7697 0.5163 0.6185
## ageAge 55 to 59 0.5388 1.8561 0.4923 0.5896
## ageAge 60 to 64 0.5609 1.7828 0.5094 0.6176
## ageAge 65 to 69 0.6413 1.5594 0.5696 0.7219
## nativity1st gen immigrant 1.0345 0.9666 0.8851 1.2091
## nativity2nd gen immigrant 1.2762 0.7835 0.9827 1.6575
## educHigh:nativity1st gen immigrant 1.3976 0.7155 1.1615 1.6817
## educMiddle:nativity1st gen immigrant 1.1929 0.8383 0.9991 1.4242
## educHigh:nativity2nd gen immigrant 1.0189 0.9815 0.7609 1.3643
## educMiddle:nativity2nd gen immigrant 0.9799 1.0205 0.7418 1.2944
##
## Concordance= 0.612 (se = 0.003 )
## Likelihood ratio test= 1236 on 17 df, p=<2e-16
## Wald test = 1286 on 17 df, p=<2e-16
## Score (logrank) test = 1357 on 17 df, p=<2e-16
#The p-values in the output shows that interaction term is statistically significant for the category of highly educated 1st generation immigrants who have slightly higher risk of losing a job than USA born low educated respondents.
#Risk score
sub_complete$risk<-exp(CoxModel$linear.predictors)
#Highest risk group
sub_complete[which.max(sub_complete$risk),c("educ", "sex", "age", "nativity", "risk")]
## # A tibble: 1 x 5
## educ sex age nativity risk
## <fct> <fct> <fct> <fct> <dbl>
## 1 Low Male Age 17 to 24 1st gen immigrant 3.34
#The risk for this group (young male respondents with low level of education who are 1st generation immigrants) is 3.3 times that of the baseline category (USA born low educated respondents)
#Lowest risk group
sub_complete[which.min(sub_complete$risk),c("educ", "sex", "age", "nativity", "risk")]
## # A tibble: 1 x 5
## educ sex age nativity risk
## <fct> <fct> <fct> <fct> <dbl>
## 1 High Male Age 40 to 44 USA born 0.585
#The lowest risk group (highly educated male USA born respondents aged 40-44) had a risk score of .59 that means their score is 59% less than the baseline category.