library(haven)
library(ipumsr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(ggplot2)
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
library(muhaz)
library(survival)
library(eha)
#Load data
## I am using IPUMS CPS data for the period of COVID19 pandemic (January through July 2020). The research question for this assignment is to compare the risks of losing a job during the COVID19 pandemic by nativity group: native born (both parents native born), 1st and 2nd generation immigrants. The outcome variable is duration until the job loss, and the covariates are nativity, age and education.
ddi<-read_ipums_ddi("/Users/asiyavalidova/Dropbox/Demography/Event history analysis/IPUMS CPS/cps_00003.xml")
cpsdat<-read_ipums_micro(ddi)
## Use of data from IPUMS CPS is subject to conditions including that users should
## cite the data appropriately. Use command `ipums_conditions()` for more details.
cpsdat<-zap_labels(cpsdat)
variables<-c("YEAR" ,"SERIAL","MONTH" ,"HWTFINL" ,"CPSID" , "ASECFLAG" ,
"MISH" ,"STATEFIP" ,"PERNUM" , "WTFINL" ,"CPSIDP" , "AGE" ,
"SEX" , "RACE" , "MARST" , "BPL" , "YRIMMIG" , "CITIZEN" ,
"NATIVITY" , "EMPSTAT" , "LABFORCE" , "OCC" , "IND" , "CLASSWKR" ,
"DURUNEM2" , "DURUNEMP" , "WHYUNEMP" , "WKSTAT" , "EDUC" , "EDUC99" ,
"LNKFW1MWT")
cpsdat<-cpsdat%>%
select(all_of(variables))%>%
filter(YEAR>2019, AGE>16,EMPSTAT%in%c(10,12,21,22))
cntpep<-cpsdat%>%
group_by(CPSIDP)%>%
summarise(ntime=n())%>%
arrange(ntime)
## `summarise()` ungrouping output (override with `.groups` argument)
cpsdat<-merge(cpsdat, cntpep, by="CPSIDP")
cpsdat2<-cpsdat%>%
filter(ntime>1)
cpsdat2<-cpsdat2%>%
mutate( curremp = ifelse(EMPSTAT%in%c(10,12) , 1, 0),
nativity=Recode(NATIVITY,recodes="1='USA born';2:4='2nd gen immigrant';5='1st gen immigrant'; else=NA",as.factor=T),
sex=Recode(SEX, recodes = "1='Male'; 2='Female'; else=NA", as.factor=T),
educ=Recode(EDUC,recodes="1:70='Low';71:91='Middle';92:125='High';else=NA",as.factor=T)
)%>%
arrange(CPSIDP, MONTH)
cpsdat2<-cpsdat2%>%
group_by(CPSIDP)%>%
mutate(prevwork= lag(curremp, group_by = CPSIDP))%>%
arrange(CPSIDP, MONTH)
#Event history analysis
#Time and status
# 1. Event variable: lost_job
cpsdat2$lost_job<-0
cpsdat2$lost_job[cpsdat2$curremp==0&cpsdat2$prevwork==1]<-1
# 2. time: duration(in months)
cpsdat2$duration<-ifelse(cpsdat2$lost_job==1,cpsdat2$MONTH,8-cpsdat2$MONTH) # Month when a person lost job(1-7)
# 3. Censoring indicator: d.event
cpsdat2$d.event<-ifelse(cpsdat2$curremp==1 | cpsdat2$curremp==0 & is.na(cpsdat2$prevwork) | !(cpsdat2$curremp==0 & cpsdat2$prevwork==0),0,1)
## Parameters
#I chose PH model since it is convenient to find hazard ratio.
#After comparing the models based on the minimum AIC criteria I chose log-logistic model.
sub<-data_frame(sex=cpsdat2$SEX,
age=cpsdat2$AGE,
educ=cpsdat2$educ,
nativity=cpsdat2$nativity,
duration=cpsdat2$duration,
censor=cpsdat2$d.event,
weight=cpsdat2$LNKFW1MWT)
## Warning: `data_frame()` is deprecated as of tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
#Empirical hazard function
fit.haz.km<-kphaz.fit(sub$duration,
sub$censor,
method = "product-limit")
#Overlay the smoothed version
fit.haz.sm<-muhaz(sub$duration,sub$censor,min.time=0,max.time=7)
kphaz.plot(fit.haz.km,main="Empirical hazard function plot of loosing job")
lines(fit.haz.sm, col=2, lwd=3)
legend("topleft", legend = c("KM hazard", "Smoothed Hazard"),
col=c(1,2), lty=c(1,1))
#log-logistic distribution for hazard
fit.loglogistic<-phreg(Surv(duration, censor)~age+educ+nativity,
data=sub, dist="loglogistic", center=T)
summary(fit.loglogistic)
## Call:
## phreg(formula = Surv(duration, censor) ~ age + educ + nativity,
## data = sub, dist = "loglogistic", center = T)
##
## Covariate W.mean Coef Exp(Coef) se(Coef) Wald p
## (Intercept) -4.230 0.066 0.000
## age 43.961 -0.011 0.989 0.001 0.000
## educ
## High 0.466 0 1 (reference)
## Low 0.054 0.817 2.263 0.042 0.000
## Middle 0.480 0.498 1.646 0.025 0.000
## nativity
## 1st gen immigran 0.154 0 1 (reference)
## 2nd gen immigran 0.081 0.075 1.078 0.045 0.091
## USA born 0.765 -0.181 0.834 0.030 0.000
##
## log(scale) 0.597 0.027 0.000
## log(shape) 1.039 0.024 0.000
##
## Events 7915
## Total time at risk 1293077
## Max. log. likelihood -46739
## LR test statistic 951.37
## Degrees of freedom 5
## Overall p-value 0
##Interpreting the model coefficients
#1. Age
# beta<0 - an increase of age lowers the risk of losing the job.
# For example, compare hazard for 20-years old and 40.
# hazard ratio = exp(-0.011*20)/exp(-0.011*40)= 1.246077
# i.e. probability of loosing job during the pandemic was 1.246077 times higher for 20-years old than for 4--years old
#2. Education
#Higher the level of education - lower the risk of job loss.
#3. Nativity
#USA born have the lowest risk of job loss, and 2nd generation of immigrants have the highest risk of job loss.
##Correlation test
chisq <- chisq.test(sub$educ, sub$nativity)
chisq
##
## Pearson's Chi-squared test
##
## data: sub$educ and sub$nativity
## X-squared = 11153, df = 4, p-value < 2.2e-16
#p-value shows that the null hypothesis of independency of the variables should be rejected.Education and nativity are not independent.
plot(fit.loglogistic)
plot(fit.loglogistic,fn="haz")
lines(fit.haz.sm,col=2)
# Cumulative hazard function
emp<-coxreg(Surv(duration, censor)~age+educ+nativity,
data=sub)
check.dist(sp=emp,pp=fit.loglogistic, main = "Empirical vs. Log-logistic")