#Libraries
library(haven)
library(ipumsr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(ggplot2)
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
library(survival)
library(foreign)
library(knitr)
library(splines)
#Load and recode data
## I am using IPUMS CPS data for the period of COVID19 pandemic (January through August 2020). Research question is the same as in my previous works
ddi<-read_ipums_ddi("/Users/asiyavalidova/Dropbox/Demography/Event history analysis/IPUMS CPS/cps_00004.xml")
cpsdat<-read_ipums_micro(ddi)
## Use of data from IPUMS CPS is subject to conditions including that users should
## cite the data appropriately. Use command `ipums_conditions()` for more details.
cpsdat<-zap_labels(cpsdat)
variables<-c("YEAR" ,"SERIAL","MONTH" ,"HWTFINL" ,"CPSID" , "ASECFLAG" ,
"MISH" ,"STATEFIP" ,"PERNUM" , "WTFINL" ,"CPSIDP" , "AGE" ,
"SEX" , "RACE" , "MARST" , "BPL" , "YRIMMIG" , "CITIZEN" ,
"NATIVITY" , "EMPSTAT" , "LABFORCE" , "OCC" , "IND" , "CLASSWKR" ,
"DURUNEM2" , "DURUNEMP" , "WHYUNEMP" , "WKSTAT" , "EDUC" , "EDUC99" ,
"LNKFW1MWT")
cpsdat<-cpsdat%>%
select(all_of(variables))%>%
filter(YEAR>2019, AGE>16 & AGE<70,MONTH<8,EMPSTAT%in%c(10,12,21,22))
cntpep<-cpsdat%>%
group_by(CPSIDP)%>%
summarise(ntime=n())%>%
arrange(ntime)
## `summarise()` ungrouping output (override with `.groups` argument)
cpsdat<-merge(cpsdat, cntpep, by="CPSIDP")
cpsdat2<-cpsdat%>%
filter(ntime>1)
cpsdat2<-cpsdat2%>%
mutate( curremp = ifelse(EMPSTAT%in%c(10,12) , 1, 0),
nativity=Recode(NATIVITY,recodes="1='USA born';2:4='2nd gen immigrant';5='1st gen immigrant'; else=NA",as.factor=T),
sex=Recode(SEX, recodes = "1='Male'; 2='Female'; else=NA", as.factor=T),
educ=Recode(EDUC,recodes="1:70='Low';71:91='Middle';92:125='High';else=NA",as.factor=T),
age=Recode(AGE, recodes="17:24='Age 17 to 24';
25:29='Age 25 to 29';
30:34='Age 30 to 34';
35:39='Age 35 to 39';
40:44='Age 40 to 44';
45:49='Age 45 to 49';
50:54='Age 50 to 54';
55:59='Age 55 to 59';
60:64='Age 60 to 64';
65:69='Age 65 to 69'; else=NA",as.factor=TRUE)
)%>%
arrange(CPSIDP, MONTH)
cpsdat2$educ<-relevel(cpsdat2$educ,ref="Low")
cpsdat2$nativity<-relevel(cpsdat2$nativity,ref="USA born")
cpsdat2<-cpsdat2%>%
group_by(CPSIDP)%>%
mutate(prevwork= lag(curremp, group_by = CPSIDP))%>%
arrange(CPSIDP, MONTH)
#Event history analysis
#Time and status
# 1. Event variable: lost_job
cpsdat2$lost_job<-0
cpsdat2$lost_job[cpsdat2$curremp==0&cpsdat2$prevwork==1]<-1
# 2. time: duration (in months)
cpsdat2$duration<-ifelse(cpsdat2$lost_job==1,cpsdat2$MONTH,8-cpsdat2$MONTH) # Month when a person lost job (1-7)
# 3. Censoring indicator: d.event
cpsdat2$d.event<-ifelse(cpsdat2$curremp==0,ifelse(cpsdat2$MONTH!=1,1,0),0)
#Variables for regression
sub<-data_frame(id=cpsdat2$CPSIDP,
sex=cpsdat2$sex,
month=cpsdat2$MONTH,
lj=cpsdat2$lost_job,
age=cpsdat2$age,
educ=cpsdat2$educ,
nativity=cpsdat2$nativity,
duration=cpsdat2$duration,
m_lj=ifelse(cpsdat2$lost_job==1,cpsdat2$MONTH,0),
lj_event=cpsdat2$d.event,
weight=cpsdat2$LNKFW1MWT)
## Warning: `data_frame()` is deprecated as of tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
sub<-subset(sub,sub$duration>1)
sub<-sub[order(sub$id,sub$month),]
#General model
fit.gm<-glm(lj_event~as.factor(duration)-1,data=sub,family="binomial")
summary(fit.gm)
##
## Call:
## glm(formula = lj_event ~ as.factor(duration) - 1, family = "binomial",
## data = sub)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.5272 -0.4526 -0.3217 -0.2817 2.8776
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## as.factor(duration)2 -2.44071 0.01853 -131.7 <2e-16 ***
## as.factor(duration)3 -2.22703 0.01645 -135.4 <2e-16 ***
## as.factor(duration)4 -1.90339 0.01412 -134.8 <2e-16 ***
## as.factor(duration)5 -2.93554 0.02082 -141.0 <2e-16 ***
## as.factor(duration)6 -3.20724 0.02239 -143.2 <2e-16 ***
## as.factor(duration)7 -4.12427 0.04025 -102.5 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 370059 on 266941 degrees of freedom
## Residual deviance: 126428 on 266935 degrees of freedom
## AIC: 126440
##
## Number of Fisher Scoring iterations: 6
#Plot the hazard function on the probability scale
haz<-1/(1+exp(-coef(fit.gm)))
time<-2:7
plot(haz~time, type="l",xlab="duration", ylab="h(t)",main="Discrete Time Hazard for losing job")
"Plot the survival function estimate"
## [1] "Plot the survival function estimate"
St<-NA
time<-1:length(haz)
St[1]<-1-haz[1]
for(i in 2:length(haz)){
St[i]<-St[i-1]* (1-haz[i])
}
plot(y=St,x=time, type="l", main="Survival function for losing job")
#Linear model
fit.lm<-glm(lj_event~duration,data=sub ,family="binomial")
summary(fit.lm)
##
## Call:
## glm(formula = lj_event ~ duration, family = "binomial", data = sub)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.5021 -0.4417 -0.3402 -0.2981 2.6064
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.465297 0.020541 -71.34 <2e-16 ***
## duration -0.271039 0.004828 -56.14 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 132656 on 266940 degrees of freedom
## Residual deviance: 129363 on 266939 degrees of freedom
## AIC: 129367
##
## Number of Fisher Scoring iterations: 5
#Quadratic model
fit.s<-glm(lj_event~duration+I(duration^2),data=sub ,family="binomial")
summary(fit.s)
##
## Call:
## glm(formula = lj_event ~ duration + I(duration^2), family = "binomial",
## data = sub)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.4739 -0.4647 -0.3873 -0.2727 2.9487
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.045197 0.063132 -64.08 <2e-16 ***
## duration 1.148185 0.032590 35.23 <2e-16 ***
## I(duration^2) -0.169928 0.003903 -43.54 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 132656 on 266940 degrees of freedom
## Residual deviance: 127215 on 266938 degrees of freedom
## AIC: 127221
##
## Number of Fisher Scoring iterations: 6
#Cubic model
fit.c<-glm(lj_event~duration+I(duration^2)+I(duration^3),data=sub ,family="binomial")
summary(fit.c)
##
## Call:
## glm(formula = lj_event ~ duration + I(duration^2) + I(duration^3),
## family = "binomial", data = sub)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.4927 -0.4702 -0.3726 -0.2642 2.8678
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.739485 0.176104 -32.59 <2e-16 ***
## duration 2.562051 0.140342 18.26 <2e-16 ***
## I(duration^2) -0.527305 0.034616 -15.23 <2e-16 ***
## I(duration^3) 0.027835 0.002669 10.43 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 132656 on 266940 degrees of freedom
## Residual deviance: 127109 on 266937 degrees of freedom
## AIC: 127117
##
## Number of Fisher Scoring iterations: 6
#Quartic model
fit.q<-glm(lj_event~duration+I(duration^2)+I(duration^3)+I(duration^4),data=sub ,family="binomial")
summary(fit.q)
##
## Call:
## glm(formula = lj_event ~ duration + I(duration^2) + I(duration^3) +
## I(duration^4), family = "binomial", data = sub)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.4828 -0.4775 -0.3770 -0.2497 2.8318
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.265156 0.584739 -2.164 0.030493 *
## duration -2.394018 0.633517 -3.779 0.000158 ***
## I(duration^2) 1.384031 0.240705 5.750 8.93e-09 ***
## I(duration^3) -0.279245 0.038354 -7.281 3.32e-13 ***
## I(duration^4) 0.017491 0.002179 8.029 9.84e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 132656 on 266940 degrees of freedom
## Residual deviance: 127043 on 266936 degrees of freedom
## AIC: 127053
##
## Number of Fisher Scoring iterations: 6
#Spline model
fit.sp<-glm(lj_event~ns(duration,df=2),data=sub, family="binomial")
summary(fit.sp)
##
## Call:
## glm(formula = lj_event ~ ns(duration, df = 2), family = "binomial",
## data = sub)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.4636 -0.4606 -0.3953 -0.2701 2.9492
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.37825 0.01594 -149.24 <2e-16 ***
## ns(duration, df = 2)1 -0.82808 0.03858 -21.46 <2e-16 ***
## ns(duration, df = 2)2 -2.24703 0.03790 -59.29 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 132656 on 266940 degrees of freedom
## Residual deviance: 127310 on 266938 degrees of freedom
## AIC: 127316
##
## Number of Fisher Scoring iterations: 6
#Hazards
dat<-expand.grid(duration=2:7)
dat$genmod<-predict(fit.gm, newdata=data.frame(duration=as.factor(2:7 )), type="response")
dat$lin<-predict(fit.lm, newdata=dat, type="response")
dat$sq<-predict(fit.s, newdata=dat, type="response")
dat$cub<-predict(fit.c, newdata=dat, type="response")
dat$quart<-predict(fit.q, newdata=dat, type="response")
dat$spline<-predict(fit.sp, newdata=expand.grid(duration=2:7), type="response")
dat
## duration genmod lin sq cub quart spline
## 1 2 0.08012042 0.11843076 0.08102236 0.07572339 0.07790381 0.08484673
## 2 3 0.09734934 0.09292653 0.10621600 0.11430159 0.10776395 0.10062393
## 3 4 0.12972511 0.07246322 0.10235777 0.10463722 0.10999885 0.10190631
## 4 5 0.05042435 0.05622679 0.07226136 0.06707611 0.06860578 0.07517285
## 5 6 0.03889410 0.04345792 0.03649231 0.03429615 0.03068247 0.03582909
## 6 7 0.01591775 0.03348592 0.01294070 0.01637151 0.01814212 0.01292012
plot(genmod~duration, dat, type="l", ylab="h(t)", xlab="Time")
title(main="Hazard function from different time parameterizations")
lines(lin~duration, dat, col=2, lwd=2)
lines(sq~duration, dat, col=3, lwd=2)
lines(cub~duration, dat, col=4, lwd=2)
lines(quart~duration, dat, col=5, lwd=2)
lines(spline~duration, dat, col=6, lwd=2)
legend("topright",
legend=c("General Model","Linear", "Square", "Cubic", "Quartic", "Natural spline"),
col=1:6, lwd=1.5)
#AIC table
aic<-round(c(
fit.lm$deviance+2*length(fit.lm$coefficients),
fit.s$deviance+2*length(fit.s$coefficients),
fit.c$deviance+2*length(fit.c$coefficients),
fit.q$deviance+2*length(fit.q$coefficients),
fit.sp$deviance+2*length(fit.sp$coefficients),
fit.gm$deviance+2*length(fit.gm$coefficients)),2)
#compare all AICs to the one from the general model
dif.aic<-round(aic-aic[6],2)
data.frame(model =c("linear", "square", "cubic", "quartic","spline", "general"), aic=aic, aic_dif=dif.aic)
## model aic aic_dif
## 1 linear 129367.3 2927.50
## 2 square 127221.4 781.60
## 3 cubic 127116.6 676.82
## 4 quartic 127053.5 613.68
## 5 spline 127315.5 875.74
## 6 general 126439.8 0.00
#Minimal value gives general model and so it fits well, but we can see that quartic and cubic time specifications are the closest to general model. Linear model is the worst fit.