#Libraries
library(haven)
library(ipumsr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(ggplot2)
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
library(survival)
library(foreign)
library(knitr)
#Load and recode data
## I am using IPUMS CPS data for the period of COVID19 pandemic (January through July 2020). Research question is the same as in my previous works
ddi<-read_ipums_ddi("/Users/asiyavalidova/Dropbox/Demography/Event history analysis/IPUMS CPS/cps_00003.xml")
cpsdat<-read_ipums_micro(ddi)
## Use of data from IPUMS CPS is subject to conditions including that users should
## cite the data appropriately. Use command `ipums_conditions()` for more details.
cpsdat<-zap_labels(cpsdat)
variables<-c("YEAR" ,"SERIAL","MONTH" ,"HWTFINL" ,"CPSID" , "ASECFLAG" ,
"MISH" ,"STATEFIP" ,"PERNUM" , "WTFINL" ,"CPSIDP" , "AGE" ,
"SEX" , "RACE" , "MARST" , "BPL" , "YRIMMIG" , "CITIZEN" ,
"NATIVITY" , "EMPSTAT" , "LABFORCE" , "OCC" , "IND" , "CLASSWKR" ,
"DURUNEM2" , "DURUNEMP" , "WHYUNEMP" , "WKSTAT" , "EDUC" , "EDUC99" ,
"LNKFW1MWT")
cpsdat<-cpsdat%>%
select(all_of(variables))%>%
filter(YEAR>2019, AGE>16,EMPSTAT%in%c(10,12,21,22))
cntpep<-cpsdat%>%
group_by(CPSIDP)%>%
summarise(ntime=n())%>%
arrange(ntime)
## `summarise()` ungrouping output (override with `.groups` argument)
cpsdat<-merge(cpsdat, cntpep, by="CPSIDP")
cpsdat2<-cpsdat%>%
filter(ntime>1)
cpsdat2<-cpsdat2%>%
mutate( curremp = ifelse(EMPSTAT%in%c(10,12) , 1, 0),
nativity=Recode(NATIVITY,recodes="1='USA born';2:4='2nd gen immigrant';5='1st gen immigrant'; else=NA",as.factor=T),
sex=Recode(SEX, recodes = "1='Male'; 2='Female'; else=NA", as.factor=T),
educ=Recode(EDUC,recodes="1:70='Low';71:91='Middle';92:125='High';else=NA",as.factor=T),
age=Recode(AGE, recodes="17:24='Age 17 to 24';
25:29='Age 25 to 29';
30:34='Age 30 to 34';
35:39='Age 35 to 39';
40:44='Age 40 to 44';
45:49='Age 45 to 49';
50:54='Age 50 to 54';
55:59='Age 55 to 59';
60:64='Age 60 to 64';
65:69='Age 65 to 69'; else=NA",as.factor=TRUE)
)%>%
arrange(CPSIDP, MONTH)
cpsdat2$educ<-relevel(cpsdat2$educ,ref="Low")
cpsdat2$nativity<-relevel(cpsdat2$nativity,ref="USA born")
cpsdat2<-cpsdat2%>%
group_by(CPSIDP)%>%
mutate(prevwork= lag(curremp, group_by = CPSIDP))%>%
arrange(CPSIDP, MONTH)
#Event history analysis
#Time and status
# 1. Event variable: lost_job
cpsdat2$lost_job<-0
cpsdat2$lost_job[cpsdat2$curremp==0&cpsdat2$prevwork==1]<-1
# 2. time: duration (in months)
cpsdat2$duration<-ifelse(cpsdat2$lost_job==1,cpsdat2$MONTH,8-cpsdat2$MONTH) # Month when a person lost job (1-7)
# 3. Censoring indicator: d.event
cpsdat2$d.event<-ifelse(cpsdat2$curremp==1 | cpsdat2$curremp==0 & is.na(cpsdat2$prevwork) | !(cpsdat2$curremp==0 & cpsdat2$prevwork==0),0,1)
#Variables for regression
sub<-data_frame(id=cpsdat2$CPSIDP,
sex=cpsdat2$sex,
month=cpsdat2$MONTH,
lj=cpsdat2$lost_job,
age=cpsdat2$age,
educ=cpsdat2$educ,
nativity=cpsdat2$nativity,
duration=cpsdat2$duration,
m_lj=ifelse(cpsdat2$lost_job==1,cpsdat2$MONTH,0),
censor=cpsdat2$d.event,
weight=cpsdat2$LNKFW1MWT)
## Warning: `data_frame()` is deprecated as of tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
sub<- sub[complete.cases(sub), ]
#The data in our case has long format
sub<-sub[order(sub$id,sub$month),]
head(sub)
## # A tibble: 6 x 11
## id sex month lj age educ nativity duration m_lj censor weight
## <dbl> <fct> <int> <dbl> <fct> <fct> <fct> <dbl> <dbl> <dbl> <dbl>
## 1 2.02e13 Male 1 0 Age 55… High USA born 7 0 0 1845.
## 2 2.02e13 Male 2 0 Age 55… High USA born 6 0 0 0
## 3 2.02e13 Female 1 0 Age 50… High USA born 7 0 0 1896.
## 4 2.02e13 Female 2 0 Age 50… High USA born 6 0 0 0
## 5 2.02e13 Male 1 0 Age 60… High USA born 7 0 0 1804.
## 6 2.02e13 Male 2 0 Age 45… Midd… USA born 6 0 0 0
#Basic discrete time model
fit.cloglog<-glm(lj~duration+educ+age+nativity,data=sub,weights=weight,family=binomial(link="cloglog"))
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.cloglog)
##
## Call:
## glm(formula = lj ~ duration + educ + age + nativity, family = binomial(link = "cloglog"),
## data = sub, weights = weight)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -39.93 -12.33 -8.37 0.00 355.39
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.5054776 0.0012091 -2899.1 <2e-16 ***
## duration 0.0714968 0.0001382 517.4 <2e-16 ***
## educHigh -0.6133520 0.0010040 -610.9 <2e-16 ***
## educMiddle -0.1568924 0.0009535 -164.6 <2e-16 ***
## ageAge 25 to 29 -0.4401449 0.0009597 -458.6 <2e-16 ***
## ageAge 30 to 34 -0.5459739 0.0010031 -544.3 <2e-16 ***
## ageAge 35 to 39 -0.6088170 0.0010266 -593.0 <2e-16 ***
## ageAge 40 to 44 -0.6910088 0.0010805 -639.5 <2e-16 ***
## ageAge 45 to 49 -0.5788655 0.0010386 -557.3 <2e-16 ***
## ageAge 50 to 54 -0.6726755 0.0010745 -626.0 <2e-16 ***
## ageAge 55 to 59 -0.5564449 0.0010409 -534.6 <2e-16 ***
## ageAge 60 to 64 -0.5086986 0.0011290 -450.6 <2e-16 ***
## ageAge 65 to 69 -0.3917173 0.0014401 -272.0 <2e-16 ***
## nativity1st gen immigrant 0.3247732 0.0006530 497.4 <2e-16 ***
## nativity2nd gen immigrant 0.1346562 0.0008444 159.5 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 145644464 on 210435 degrees of freedom
## Residual deviance: 143251983 on 210421 degrees of freedom
## AIC: 143252038
##
## Number of Fisher Scoring iterations: 7
#round(fit.cloglog$coefficients,3)
# Cox model
#fit.Cox<-coxph(Surv(duration,censor)~educ+age+nativity,method="efron",data=sub)
#summary(fit.Cox)
#tab<-data.frame(cloglog=fit.cloglog$coefficients,
# Cox=c(NA,fit.Cox$coefficients)
# )
#kable(tab,digits=3)
#Interaction
#Is there interaction between educ and nativity?
fit.cloglog1<- glm(lj~educ*nativity+age+duration,data=sub,weights=weight,family=binomial(link="cloglog"))
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.cloglog1)
##
## Call:
## glm(formula = lj ~ educ * nativity + age + duration, family = binomial(link = "cloglog"),
## data = sub, weights = weight)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -40.22 -12.29 -8.38 0.00 354.83
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.5284947 0.0015559 -2267.77 <2e-16
## educHigh -0.5689436 0.0015124 -376.18 <2e-16
## educMiddle -0.1339638 0.0014522 -92.25 <2e-16
## nativity1st gen immigrant 0.4215793 0.0018302 230.34 <2e-16
## nativity2nd gen immigrant -0.1892180 0.0037963 -49.84 <2e-16
## ageAge 25 to 29 -0.4468154 0.0009626 -464.18 <2e-16
## ageAge 30 to 34 -0.5541252 0.0010068 -550.38 <2e-16
## ageAge 35 to 39 -0.6162379 0.0010304 -598.08 <2e-16
## ageAge 40 to 44 -0.6991641 0.0010848 -644.51 <2e-16
## ageAge 45 to 49 -0.5867239 0.0010433 -562.38 <2e-16
## ageAge 50 to 54 -0.6814479 0.0010790 -631.56 <2e-16
## ageAge 55 to 59 -0.5637194 0.0010446 -539.67 <2e-16
## ageAge 60 to 64 -0.5159551 0.0011322 -455.69 <2e-16
## ageAge 65 to 69 -0.3988274 0.0014431 -276.37 <2e-16
## duration 0.0714787 0.0001382 517.24 <2e-16
## educHigh:nativity1st gen immigrant -0.1664764 0.0021394 -77.81 <2e-16
## educMiddle:nativity1st gen immigrant -0.0823420 0.0020243 -40.68 <2e-16
## educHigh:nativity2nd gen immigrant 0.3307658 0.0040616 81.44 <2e-16
## educMiddle:nativity2nd gen immigrant 0.3491669 0.0039430 88.55 <2e-16
##
## (Intercept) ***
## educHigh ***
## educMiddle ***
## nativity1st gen immigrant ***
## nativity2nd gen immigrant ***
## ageAge 25 to 29 ***
## ageAge 30 to 34 ***
## ageAge 35 to 39 ***
## ageAge 40 to 44 ***
## ageAge 45 to 49 ***
## ageAge 50 to 54 ***
## ageAge 55 to 59 ***
## ageAge 60 to 64 ***
## ageAge 65 to 69 ***
## duration ***
## educHigh:nativity1st gen immigrant ***
## educMiddle:nativity1st gen immigrant ***
## educHigh:nativity2nd gen immigrant ***
## educMiddle:nativity2nd gen immigrant ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 145644464 on 210435 degrees of freedom
## Residual deviance: 143232165 on 210417 degrees of freedom
## AIC: 143232227
##
## Number of Fisher Scoring iterations: 7
#The p-values in the output shows that interaction term is statistically significant.
prob1<-function(x)
{
beta_x<-sum(fit.cloglog1$coefficients*x)
pr<-1-exp(-exp(beta_x))
return (round(pr,3))
}
df<-data.frame(nativity=rep(c("USA born","1st gen","2nd gen"),each=6),
duration=rep(c(2:7),3),
prob=rep(0,18)
)
df
## nativity duration prob
## 1 USA born 2 0
## 2 USA born 3 0
## 3 USA born 4 0
## 4 USA born 5 0
## 5 USA born 6 0
## 6 USA born 7 0
## 7 1st gen 2 0
## 8 1st gen 3 0
## 9 1st gen 4 0
## 10 1st gen 5 0
## 11 1st gen 6 0
## 12 1st gen 7 0
## 13 2nd gen 2 0
## 14 2nd gen 3 0
## 15 2nd gen 4 0
## 16 2nd gen 5 0
## 17 2nd gen 6 0
## 18 2nd gen 7 0
### USA born
for (i in 2:7)
df$prob[i-1]<-prob1(c(1,0,1,0,0,1,0,0,0,0,0,0,0,0,i))
## Warning in fit.cloglog1$coefficients * x: longer object length is not a multiple
## of shorter object length
## Warning in fit.cloglog1$coefficients * x: longer object length is not a multiple
## of shorter object length
## Warning in fit.cloglog1$coefficients * x: longer object length is not a multiple
## of shorter object length
## Warning in fit.cloglog1$coefficients * x: longer object length is not a multiple
## of shorter object length
## Warning in fit.cloglog1$coefficients * x: longer object length is not a multiple
## of shorter object length
## Warning in fit.cloglog1$coefficients * x: longer object length is not a multiple
## of shorter object length
### 1st gen
for (i in 2:7)
df$prob[i+5]<-prob1(c(1,0,1,0,0,1,0,0,0,0,0,0,1,0,i))
## Warning in fit.cloglog1$coefficients * x: longer object length is not a multiple
## of shorter object length
## Warning in fit.cloglog1$coefficients * x: longer object length is not a multiple
## of shorter object length
## Warning in fit.cloglog1$coefficients * x: longer object length is not a multiple
## of shorter object length
## Warning in fit.cloglog1$coefficients * x: longer object length is not a multiple
## of shorter object length
## Warning in fit.cloglog1$coefficients * x: longer object length is not a multiple
## of shorter object length
## Warning in fit.cloglog1$coefficients * x: longer object length is not a multiple
## of shorter object length
### 2nd gen
for (i in 2:7)
df$prob[i+11]<-prob1(c(1,0,1,0,0,1,0,0,0,0,0,0,0,1,i))
## Warning in fit.cloglog1$coefficients * x: longer object length is not a multiple
## of shorter object length
## Warning in fit.cloglog1$coefficients * x: longer object length is not a multiple
## of shorter object length
## Warning in fit.cloglog1$coefficients * x: longer object length is not a multiple
## of shorter object length
## Warning in fit.cloglog1$coefficients * x: longer object length is not a multiple
## of shorter object length
## Warning in fit.cloglog1$coefficients * x: longer object length is not a multiple
## of shorter object length
## Warning in fit.cloglog1$coefficients * x: longer object length is not a multiple
## of shorter object length
df%>%
ggplot()+
geom_bar(aes(y=prob,x=nativity,fill=nativity, group=nativity),stat="identity", position="dodge")+
facet_grid(~duration)+
ggtitle(label="Probability of losing job among people with middle level of education and age 35-39")