##install.packages("survminer")
library("survminer")
## Loading required package: ggplot2
## Loading required package: ggpubr
library(tidycensus)
library(haven)
library(ipumsr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(ggplot2)
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
library(muhaz)
#Load data
## I am using IPUMS CPS data for the period of COVID19 pandemic (January through July 2020). The research question for this assignment is to compare the risks of losing a job during the COVID19 pandemic by nativity group: native born (both parents native born), 1st and 2nd generation immigrants. The grouping variable is nativity (categorical). The hypothesis states that immigrants experience higher risks of losing their jobs during the times of COVID19 pandemic than native born; the most vulnerable group in the labor market is the 1st generation immigrants.
ddi<-read_ipums_ddi("/Users/asiyavalidova/Dropbox/Demography/Event history analysis/IPUMS CPS/cps_00002.xml")
cpsdat<-read_ipums_micro(ddi)
## Use of data from IPUMS CPS is subject to conditions including that users should
## cite the data appropriately. Use command `ipums_conditions()` for more details.
cpsdat<-cpsdat%>%
select(CPSID,CPSIDP,STATEFIP, YEAR,MISH,MONTH, SERIAL, AGE, SEX, RACE,CITIZEN,EMPSTAT, NATIVITY, BPL, LABFORCE, WHYUNEMP,YRIMMIG)%>%
filter(YEAR>2019, AGE>16,EMPSTAT%in%c(10,12,21,22))
cntpep<-cpsdat%>%
group_by(CPSIDP)%>%
summarise(ntime=n())%>%
arrange(ntime)
## `summarise()` ungrouping output (override with `.groups` argument)
cpsdat<-merge(cpsdat, cntpep, by="CPSIDP")
cpsdat2<-cpsdat%>%
filter(ntime>1)
cpsdat2<-cpsdat2%>%
mutate(curremp = ifelse(EMPSTAT%in%c(10,12) , 1, 0))%>% ## 1 employed, 0 - unemployed
arrange(CPSIDP,MONTH)
cpsdat2<-cpsdat2%>%
mutate(nativity=Recode(NATIVITY,recodes="1=1;2:4=2;5=3; else=NA"))
cpsdat2$nativity<-factor(cpsdat2$nativity)
levels(cpsdat2$nativity)=c("USA born","2nd gen imm","1st gen imm")
cpsdat2<-cpsdat2%>%
group_by(CPSIDP)%>%
mutate(prevwork= lag(curremp, group_by = CPSIDP))%>%
arrange(CPSIDP, MONTH)
#Analysis
# 1. Event variable: lost_job
cpsdat2$lost_job<-0
cpsdat2$lost_job[cpsdat2$curremp==0&cpsdat2$prevwork==1]<-1 ## 1 lost job
# 2. time: duration(in months)
cpsdat2$duration<-ifelse(cpsdat2$lost_job==1,cpsdat2$MONTH,8) # Month when job is lost (2-7)
# 3. Censoring indicator: d.event
cpsdat2$d.event<-ifelse(cpsdat2$curremp==1 | cpsdat2$curremp==0 & is.na(cpsdat2$prevwork) | !(cpsdat2$curremp==0 & cpsdat2$prevwork==0),1,0) ## 1 censored (when job loss cannot be determined)
#Survival function
St<-survfit(Surv(duration, d.event)~1, data=cpsdat2,conf.int=T)
plot(St, ylim=c(.97,1), xlim=c(0,8), main="Survival Function for losing job")
#Hazard function
haz<-kphaz.fit(time=cpsdat2$duration, status=cpsdat2$d.event, method = "product-limit")
#kphaz.plot(haz, main="Hazard function plot")
haz1<-data.frame(haz)
haz1<-haz1%>%
filter(time<7)
plot(haz1$time,haz1$haz,type="b",xlab="time",ylab="hazard",main="Hazard function plot",col="green")
#Cumulative hazard
plot(cumsum(haz$haz)~haz$time,
main = "Cumulative Hazard function",
ylab="H(t)",xlab="Time in Months",
type="l",xlim=c(0,8), lwd=2,col=3)
#Groups
St_nat<-survfit(Surv(duration, d.event)~nativity, data=cpsdat2,conf.int=T)
summary(St_nat)
## Call: survfit(formula = Surv(duration, d.event) ~ nativity, data = cpsdat2,
## conf.int = T)
##
## 292 observations deleted due to missingness
## nativity=USA born
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 2 234711 213 0.9991 6.22e-05 0.9990 0.9992
## 3 234498 361 0.9976 1.02e-04 0.9974 0.9978
## 4 234137 2685 0.9861 2.42e-04 0.9856 0.9866
## 5 231452 890 0.9823 2.72e-04 0.9818 0.9829
## 6 230562 573 0.9799 2.90e-04 0.9793 0.9804
## 7 229989 497 0.9778 3.04e-04 0.9772 0.9784
## 8 229492 223835 0.0241 3.17e-04 0.0235 0.0247
##
## nativity=2nd gen imm
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 2 24552 28 0.9989 0.000215 0.9984 0.999
## 3 24524 72 0.9959 0.000406 0.9951 0.997
## 4 24452 339 0.9821 0.000846 0.9805 0.984
## 5 24113 126 0.9770 0.000957 0.9751 0.979
## 6 23987 66 0.9743 0.001010 0.9723 0.976
## 7 23921 59 0.9719 0.001055 0.9698 0.974
## 8 23862 23036 0.0336 0.001151 0.0315 0.036
##
## nativity=1st gen imm
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 2 46259 69 0.999 0.000179 0.9982 0.9989
## 3 46190 134 0.996 0.000307 0.9950 0.9962
## 4 46056 694 0.981 0.000641 0.9794 0.9819
## 5 45362 249 0.975 0.000723 0.9738 0.9766
## 6 45113 161 0.972 0.000770 0.9702 0.9733
## 7 44952 109 0.969 0.000801 0.9678 0.9710
## 8 44843 43411 0.031 0.000805 0.0294 0.0326
plot(St_nat, ylim=c(.97,1), xlim=c(0,7),xlab="time",ylab="survival rate",col=c(2,3,4), main="Survival functions for nativity groups")
legend("topright",c("USA born","2nd gen imm","1st gen imm"),col=c("red","green","blue"),lty=c(1,1,1))
##The results show that non-immigrants have higher survival rates than immigrant groups; and the 1st generation immigrants have higher risks to lose their jobs than the 2nd generation immigrants.
#Tests
#Log-rank test
survdiff(Surv(duration, d.event)~nativity, data=cpsdat2)
## Call:
## survdiff(formula = Surv(duration, d.event) ~ nativity, data = cpsdat2)
##
## n=305522, 292 observations deleted due to missingness.
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## nativity=USA born 234711 229054 229032 0.00209 0.18
## nativity=2nd gen imm 24552 23726 23816 0.34127 7.37
## nativity=1st gen imm 46259 44827 44759 0.10418 2.43
##
## Chisq= 8.9 on 2 degrees of freedom, p= 0.01
#THe test results show significant variation in survival status between nativity groups.