library(haven)
 library(ipumsr)
 library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
 library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
 library(ggplot2)
 library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
 library(muhaz)
 library(survival)
 library(eha)

#Load data

## I am using IPUMS CPS data for the period of COVID19 pandemic (January through July 2020). The research question for this assignment is to compare the risks of losing a job during the COVID19 pandemic by nativity group: native born (both parents native born), 1st and 2nd generation immigrants. The outcome variable is duration until the job loss, and the covariates are nativity, age and education.

ddi<-read_ipums_ddi("/Users/asiyavalidova/Dropbox/Demography/Event history analysis/IPUMS CPS/cps_00003.xml") 
cpsdat<-read_ipums_micro(ddi) 
## Use of data from IPUMS CPS is subject to conditions including that users should
## cite the data appropriately. Use command `ipums_conditions()` for more details.
cpsdat<-zap_labels(cpsdat)
variables<-c("YEAR" ,"SERIAL","MONTH" ,"HWTFINL" ,"CPSID" , "ASECFLAG" ,
              "MISH"  ,"STATEFIP" ,"PERNUM" , "WTFINL" ,"CPSIDP" , "AGE" ,     
              "SEX"  ,     "RACE"  ,    "MARST"  ,   "BPL"   ,    "YRIMMIG"  , "CITIZEN" , 
              "NATIVITY" , "EMPSTAT" ,  "LABFORCE" , "OCC"  ,     "IND"   ,    "CLASSWKR" ,
              "DURUNEM2" , "DURUNEMP" , "WHYUNEMP" , "WKSTAT" ,   "EDUC"   ,   "EDUC99"   ,
              "LNKFW1MWT")
 cpsdat<-cpsdat%>%
     select(all_of(variables))%>%
     filter(YEAR>2019, AGE>16,EMPSTAT%in%c(10,12,21,22)) 
 cntpep<-cpsdat%>%
   group_by(CPSIDP)%>%
   summarise(ntime=n())%>%
   arrange(ntime)
## `summarise()` ungrouping output (override with `.groups` argument)
cpsdat<-merge(cpsdat, cntpep, by="CPSIDP")
cpsdat2<-cpsdat%>%
   filter(ntime>1)
cpsdat2<-cpsdat2%>%
    mutate( curremp = ifelse(EMPSTAT%in%c(10,12) , 1, 0),
          nativity=Recode(NATIVITY,recodes="1='USA born';2:4='2nd gen immigrant';5='1st gen immigrant'; else=NA",as.factor=T),
          sex=Recode(SEX, recodes = "1='Male'; 2='Female'; else=NA", as.factor=T),
          educ=Recode(EDUC,recodes="1:70='Low';71:91='Middle';92:125='High';else=NA",as.factor=T)
          )%>%
    arrange(CPSIDP, MONTH)
cpsdat2<-cpsdat2%>%
    group_by(CPSIDP)%>%
    mutate(prevwork= lag(curremp, group_by = CPSIDP))%>%
    arrange(CPSIDP, MONTH)
#Event history analysis
#Time and status
# 1. Event variable: lost_job
 cpsdat2$lost_job<-0
 cpsdat2$lost_job[cpsdat2$curremp==0&cpsdat2$prevwork==1]<-1
 # 2. time: duration(in months)
 cpsdat2$duration<-ifelse(cpsdat2$lost_job==1,cpsdat2$MONTH,8-cpsdat2$MONTH) # Month when a person lost job(1-7)
 # 3. Censoring indicator: d.event
 cpsdat2$d.event<-ifelse(cpsdat2$curremp==1 | cpsdat2$curremp==0 & is.na(cpsdat2$prevwork) | !(cpsdat2$curremp==0 & cpsdat2$prevwork==0),0,1)
## Parameters
#I chose PH model since it is convenient to find hazard ratio.
#After comparing the models based on the minimum AIC criteria I chose log-logistic model.
sub<-data_frame(sex=cpsdat2$SEX,
                age=cpsdat2$AGE,
                educ=cpsdat2$educ,
                nativity=cpsdat2$nativity,
                duration=cpsdat2$duration,
                censor=cpsdat2$d.event,
                weight=cpsdat2$LNKFW1MWT)
## Warning: `data_frame()` is deprecated as of tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
#Empirical hazard function
fit.haz.km<-kphaz.fit(sub$duration,
                      sub$censor,
                      method = "product-limit")
#Overlay the smoothed version                      
fit.haz.sm<-muhaz(sub$duration,sub$censor,min.time=0,max.time=7)
kphaz.plot(fit.haz.km,main="Empirical hazard function plot of loosing job") 
lines(fit.haz.sm, col=2, lwd=3)
legend("topleft", legend = c("KM hazard", "Smoothed Hazard"),
     col=c(1,2), lty=c(1,1))

#log-logistic distribution for hazard
fit.loglogistic<-phreg(Surv(duration, censor)~age+educ+nativity,
    data=sub, dist="loglogistic", center=T)
summary(fit.loglogistic)
## Call:
## phreg(formula = Surv(duration, censor) ~ age + educ + nativity, 
##     data = sub, dist = "loglogistic", center = T)
## 
## Covariate          W.mean      Coef Exp(Coef)  se(Coef)    Wald p
## (Intercept)                  -4.230               0.066     0.000 
## age                43.961    -0.011     0.989     0.001     0.000 
## educ 
##             High    0.466     0         1           (reference)
##              Low    0.054     0.817     2.263     0.042     0.000 
##           Middle    0.480     0.498     1.646     0.025     0.000 
## nativity 
## 1st gen immigran    0.154     0         1           (reference)
## 2nd gen immigran    0.081     0.075     1.078     0.045     0.091 
##         USA born    0.765    -0.181     0.834     0.030     0.000 
## 
## log(scale)                    0.597               0.027     0.000 
## log(shape)                    1.039               0.024     0.000 
## 
## Events                    7915 
## Total time at risk        1293077 
## Max. log. likelihood      -46739 
## LR test statistic         951.37 
## Degrees of freedom        5 
## Overall p-value           0
##Interpreting the model coefficients
#1. Age
# beta<0 - an increase of age lowers the  risk of losing the job.
# For example, compare hazard for 20-years old and 40.
#  hazard ratio = exp(-0.011*20)/exp(-0.011*40)= 1.246077
#  i.e. probability of loosing job during the pandemic was 1.246077 times higher for 20-years old than for 4--years old
#2. Education
#Higher the level of education - lower the risk of job loss.
#3. Nativity
#USA born have the lowest risk of job loss, and 2nd generation of immigrants have the highest risk of job loss.

##Correlation test
chisq <- chisq.test(sub$educ, sub$nativity)
chisq
## 
##  Pearson's Chi-squared test
## 
## data:  sub$educ and sub$nativity
## X-squared = 11153, df = 4, p-value < 2.2e-16
#p-value shows that the null hypothesis of independency of the variables should be rejected.Education and nativity are not independent.
plot(fit.loglogistic)

plot(fit.loglogistic,fn="haz")
lines(fit.haz.sm,col=2)

# Cumulative hazard function
emp<-coxreg(Surv(duration, censor)~age+educ+nativity,
                data=sub)
check.dist(sp=emp,pp=fit.loglogistic, main = "Empirical vs. Log-logistic")