##install.packages("survminer")
library("survminer")
## Loading required package: ggplot2
## Loading required package: ggpubr
library(tidycensus)
library(haven)
library(ipumsr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library(ggplot2)
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(muhaz)

#Load data

## I am using IPUMS CPS data for the period of COVID19 pandemic (January through July 2020). The research question for this assignment is to compare the risks of losing a job during the COVID19 pandemic by nativity group: native born (both parents native born), 1st and 2nd generation immigrants. The grouping variable is nativity (categorical). The hypothesis states that immigrants experience higher risks of losing their jobs during the times of COVID19 pandemic than native born; the most vulnerable group in the labor market is the 1st generation immigrants.

ddi<-read_ipums_ddi("/Users/asiyavalidova/Dropbox/Demography/Event history analysis/IPUMS CPS/cps_00002.xml") 
cpsdat<-read_ipums_micro(ddi) 
## Use of data from IPUMS CPS is subject to conditions including that users should
## cite the data appropriately. Use command `ipums_conditions()` for more details.
cpsdat<-cpsdat%>%
  select(CPSID,CPSIDP,STATEFIP, YEAR,MISH,MONTH, SERIAL, AGE, SEX, RACE,CITIZEN,EMPSTAT, NATIVITY, BPL, LABFORCE, WHYUNEMP,YRIMMIG)%>%
  filter(YEAR>2019, AGE>16,EMPSTAT%in%c(10,12,21,22))
cntpep<-cpsdat%>%
   group_by(CPSIDP)%>%
   summarise(ntime=n())%>%
   arrange(ntime)
## `summarise()` ungrouping output (override with `.groups` argument)
cpsdat<-merge(cpsdat, cntpep, by="CPSIDP")
cpsdat2<-cpsdat%>%
filter(ntime>1)


cpsdat2<-cpsdat2%>%
    mutate(curremp = ifelse(EMPSTAT%in%c(10,12) , 1, 0))%>% ## 1 employed, 0 - unemployed
  arrange(CPSIDP,MONTH)
      
cpsdat2<-cpsdat2%>%    
mutate(nativity=Recode(NATIVITY,recodes="1=1;2:4=2;5=3; else=NA"))
cpsdat2$nativity<-factor(cpsdat2$nativity)
levels(cpsdat2$nativity)=c("USA born","2nd gen imm","1st gen imm")

cpsdat2<-cpsdat2%>%
    group_by(CPSIDP)%>%
    mutate(prevwork= lag(curremp, group_by = CPSIDP))%>%
    arrange(CPSIDP, MONTH)

#Analysis

# 1. Event variable: lost_job
 cpsdat2$lost_job<-0
 cpsdat2$lost_job[cpsdat2$curremp==0&cpsdat2$prevwork==1]<-1 ## 1 lost job
 # 2. time: duration(in months)
 cpsdat2$duration<-ifelse(cpsdat2$lost_job==1,cpsdat2$MONTH,8) # Month when job is lost (2-7)
 # 3. Censoring indicator: d.event
 cpsdat2$d.event<-ifelse(cpsdat2$curremp==1 | cpsdat2$curremp==0 & is.na(cpsdat2$prevwork) | !(cpsdat2$curremp==0 & cpsdat2$prevwork==0),1,0) ## 1 censored (when job loss cannot be determined)
#Survival function
 St<-survfit(Surv(duration, d.event)~1, data=cpsdat2,conf.int=T)
 plot(St, ylim=c(.97,1), xlim=c(0,8), main="Survival Function for losing job")

#Hazard function
haz<-kphaz.fit(time=cpsdat2$duration, status=cpsdat2$d.event, method = "product-limit")
#kphaz.plot(haz, main="Hazard function plot")
haz1<-data.frame(haz)
haz1<-haz1%>%
   filter(time<7)
plot(haz1$time,haz1$haz,type="b",xlab="time",ylab="hazard",main="Hazard function plot",col="green") 

#Cumulative hazard
plot(cumsum(haz$haz)~haz$time, 
     main = "Cumulative Hazard function",
     ylab="H(t)",xlab="Time in Months", 
     type="l",xlim=c(0,8), lwd=2,col=3)

#Groups
St_nat<-survfit(Surv(duration, d.event)~nativity, data=cpsdat2,conf.int=T)
summary(St_nat)
## Call: survfit(formula = Surv(duration, d.event) ~ nativity, data = cpsdat2, 
##     conf.int = T)
## 
## 292 observations deleted due to missingness 
##                 nativity=USA born 
##  time n.risk n.event survival  std.err lower 95% CI upper 95% CI
##     2 234711     213   0.9991 6.22e-05       0.9990       0.9992
##     3 234498     361   0.9976 1.02e-04       0.9974       0.9978
##     4 234137    2685   0.9861 2.42e-04       0.9856       0.9866
##     5 231452     890   0.9823 2.72e-04       0.9818       0.9829
##     6 230562     573   0.9799 2.90e-04       0.9793       0.9804
##     7 229989     497   0.9778 3.04e-04       0.9772       0.9784
##     8 229492  223835   0.0241 3.17e-04       0.0235       0.0247
## 
##                 nativity=2nd gen imm 
##  time n.risk n.event survival  std.err lower 95% CI upper 95% CI
##     2  24552      28   0.9989 0.000215       0.9984        0.999
##     3  24524      72   0.9959 0.000406       0.9951        0.997
##     4  24452     339   0.9821 0.000846       0.9805        0.984
##     5  24113     126   0.9770 0.000957       0.9751        0.979
##     6  23987      66   0.9743 0.001010       0.9723        0.976
##     7  23921      59   0.9719 0.001055       0.9698        0.974
##     8  23862   23036   0.0336 0.001151       0.0315        0.036
## 
##                 nativity=1st gen imm 
##  time n.risk n.event survival  std.err lower 95% CI upper 95% CI
##     2  46259      69    0.999 0.000179       0.9982       0.9989
##     3  46190     134    0.996 0.000307       0.9950       0.9962
##     4  46056     694    0.981 0.000641       0.9794       0.9819
##     5  45362     249    0.975 0.000723       0.9738       0.9766
##     6  45113     161    0.972 0.000770       0.9702       0.9733
##     7  44952     109    0.969 0.000801       0.9678       0.9710
##     8  44843   43411    0.031 0.000805       0.0294       0.0326
plot(St_nat, ylim=c(.97,1), xlim=c(0,7),xlab="time",ylab="survival rate",col=c(2,3,4), main="Survival functions for nativity groups")
legend("topright",c("USA born","2nd gen imm","1st gen imm"),col=c("red","green","blue"),lty=c(1,1,1))

##The results show that non-immigrants have higher survival rates than immigrant groups; and the 1st generation immigrants have higher risks to lose their jobs than the 2nd generation immigrants.

#Tests
#Log-rank test
survdiff(Surv(duration, d.event)~nativity, data=cpsdat2)
## Call:
## survdiff(formula = Surv(duration, d.event) ~ nativity, data = cpsdat2)
## 
## n=305522, 292 observations deleted due to missingness.
## 
##                           N Observed Expected (O-E)^2/E (O-E)^2/V
## nativity=USA born    234711   229054   229032   0.00209      0.18
## nativity=2nd gen imm  24552    23726    23816   0.34127      7.37
## nativity=1st gen imm  46259    44827    44759   0.10418      2.43
## 
##  Chisq= 8.9  on 2 degrees of freedom, p= 0.01
#THe test results show significant variation in survival status between nativity groups.