Homework 3

 library(haven)
 library(ipumsr)
 library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

 library(car)

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

 library(ggplot2)
 library(survey)

## Loading required package: grid

## Loading required package: Matrix

## Loading required package: survival

## 
## Attaching package: 'survey'

## The following object is masked from 'package:graphics':
## 
##     dotchart

 library(muhaz)
 library(survival)
 library(eha)

#Load data

## I am using IPUMS CPS data for the period of COVID19 pandemic (January through July 2020). Research question is the same as in my previous works
ddi<-read_ipums_ddi("/Users/asiyavalidova/Dropbox/Demography/Event history analysis/IPUMS CPS/cps_00003.xml") 
cpsdat<-read_ipums_micro(ddi)

## Use of data from IPUMS CPS is subject to conditions including that users should
## cite the data appropriately. Use command `ipums_conditions()` for more details.

cpsdat<-zap_labels(cpsdat)
variables<-c("YEAR" ,"SERIAL","MONTH" ,"HWTFINL" ,"CPSID" , "ASECFLAG" ,
              "MISH"  ,"STATEFIP" ,"PERNUM" , "WTFINL" ,"CPSIDP" , "AGE" ,     
              "SEX"  ,     "RACE"  ,    "MARST"  ,   "BPL"   ,    "YRIMMIG"  , "CITIZEN" , 
              "NATIVITY" , "EMPSTAT" ,  "LABFORCE" , "OCC"  ,     "IND"   ,    "CLASSWKR" ,
              "DURUNEM2" , "DURUNEMP" , "WHYUNEMP" , "WKSTAT" ,   "EDUC"   ,   "EDUC99"   ,
              "LNKFW1MWT")
 cpsdat<-cpsdat%>%
     select(all_of(variables))%>%
     filter(YEAR>2019, AGE>16 & AGE<70,EMPSTAT%in%c(10,12,21,22)) 
 cntpep<-cpsdat%>%
   group_by(CPSIDP)%>%
   summarise(ntime=n())%>%
   arrange(ntime)

## `summarise()` ungrouping output (override with `.groups` argument)

cpsdat<-merge(cpsdat, cntpep, by="CPSIDP")
cpsdat2<-cpsdat%>%
   filter(ntime>1)
cpsdat2<-cpsdat2%>%
    mutate( curremp = ifelse(EMPSTAT%in%c(10,12) , 1, 0),
          nativity=Recode(NATIVITY,recodes="1='USA born';2:4='2nd gen immigrant';5='1st gen immigrant'; else=NA",as.factor=T),
          sex=Recode(SEX, recodes = "1='Male'; 2='Female'; else=NA", as.factor=T),
          educ=Recode(EDUC,recodes="1:70='Low';71:91='Middle';92:125='High';else=NA",as.factor=T),
          age=Recode(AGE,  recodes="17:24='Age 17 to 24';
                                    25:29='Age 25 to 29';
                                    30:34='Age 30 to 34';
                                    35:39='Age 35 to 39';
                                    40:44='Age 40 to 44';
                                    45:49='Age 45 to 49';
                                    50:54='Age 50 to 54';
                                    55:59='Age 55 to 59';
                                    60:64='Age 60 to 64';
                                    65:69='Age 65 to 69'",as.factor=TRUE)
          )%>%
    arrange(CPSIDP, MONTH)
cpsdat2$educ<-relevel(cpsdat2$educ,ref="Low")
cpsdat2$nativity<-relevel(cpsdat2$nativity,ref="USA born")
cpsdat2<-cpsdat2%>%
    group_by(CPSIDP)%>%
    mutate(prevwork= lag(curremp, group_by = CPSIDP))%>%
    arrange(CPSIDP, MONTH)

#Event history analysis
#Time and status
# 1. Event variable: lost_job
cpsdat2$lost_job<-0
cpsdat2$lost_job[cpsdat2$curremp==0&cpsdat2$prevwork==1]<-1
 # 2. time: duration(in months)
cpsdat2$duration<-ifelse(cpsdat2$lost_job==1,cpsdat2$MONTH,8-cpsdat2$MONTH) # Month when a person lost job(1-7)
 # 3. Censoring indicator: d.event
 cpsdat2$d.event<-ifelse(cpsdat2$curremp==1 | cpsdat2$curremp==0 & is.na(cpsdat2$prevwork) | !(cpsdat2$curremp==0 & cpsdat2$prevwork==0),0,1)

sub<-data_frame(id=cpsdat2$CPSIDP,
                sex=cpsdat2$sex,
                age=cpsdat2$age,
                educ=cpsdat2$educ,
                nativity=cpsdat2$nativity,
                duration=cpsdat2$duration,
                censor=cpsdat2$d.event,
                weight=cpsdat2$LNKFW1MWT)

## Warning: `data_frame()` is deprecated as of tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

sum(complete.cases(sub))

## [1] 294244

sub_complete<- sub[complete.cases(sub), ]

##Cox model                
CoxModel<-coxph(Surv(duration,censor)~educ+age+nativity,method="efron",data=sub_complete)  
summary(CoxModel)

## Call:
## coxph(formula = Surv(duration, censor) ~ educ + age + nativity, 
##     data = sub_complete, method = "efron")
## 
##   n= 294244, number of events= 7652 
## 
##                               coef exp(coef) se(coef)       z Pr(>|z|)    
## educHigh                  -0.76822   0.46384  0.04368 -17.587  < 2e-16 ***
## educMiddle                -0.29977   0.74099  0.04115  -7.286 3.20e-13 ***
## ageAge 25 to 29           -0.25831   0.77236  0.04254  -6.072 1.26e-09 ***
## ageAge 30 to 34           -0.52078   0.59406  0.04493 -11.590  < 2e-16 ***
## ageAge 35 to 39           -0.61347   0.54147  0.04568 -13.429  < 2e-16 ***
## ageAge 40 to 44           -0.73454   0.47972  0.04859 -15.117  < 2e-16 ***
## ageAge 45 to 49           -0.70919   0.49204  0.04806 -14.757  < 2e-16 ***
## ageAge 50 to 54           -0.58601   0.55655  0.04581 -12.793  < 2e-16 ***
## ageAge 55 to 59           -0.63059   0.53228  0.04583 -13.758  < 2e-16 ***
## ageAge 60 to 64           -0.59095   0.55380  0.04892 -12.079  < 2e-16 ***
## ageAge 65 to 69           -0.45786   0.63264  0.06027  -7.597 3.03e-14 ***
## nativity1st gen immigrant  0.23807   1.26879  0.03100   7.679 1.61e-14 ***
## nativity2nd gen immigrant  0.23692   1.26734  0.03810   6.218 5.03e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                           exp(coef) exp(-coef) lower .95 upper .95
## educHigh                     0.4638     2.1559    0.4258    0.5053
## educMiddle                   0.7410     1.3496    0.6836    0.8032
## ageAge 25 to 29              0.7724     1.2947    0.7106    0.8395
## ageAge 30 to 34              0.5941     1.6833    0.5440    0.6487
## ageAge 35 to 39              0.5415     1.8468    0.4951    0.5922
## ageAge 40 to 44              0.4797     2.0845    0.4361    0.5277
## ageAge 45 to 49              0.4920     2.0324    0.4478    0.5406
## ageAge 50 to 54              0.5565     1.7968    0.5088    0.6088
## ageAge 55 to 59              0.5323     1.8787    0.4865    0.5823
## ageAge 60 to 64              0.5538     1.8057    0.5032    0.6095
## ageAge 65 to 69              0.6326     1.5807    0.5622    0.7120
## nativity1st gen immigrant    1.2688     0.7881    1.1940    1.3483
## nativity2nd gen immigrant    1.2673     0.7891    1.1761    1.3656
## 
## Concordance= 0.611  (se = 0.003 )
## Likelihood ratio test= 1222  on 13 df,   p=<2e-16
## Wald test            = 1278  on 13 df,   p=<2e-16
## Score (logrank) test = 1338  on 13 df,   p=<2e-16

#Plotting data

##Survival functions for profiles according to nativity and education level

#USA born
plot(survfit(CoxModel,weights=weight,conf.int=F),xlim=c(1,7),ylim=c(.94,1),xlab="Months",ylab="S(t)",main="Survivorship Function for losing job (USA born)")
lines(survfit(CoxModel,weights=weight, newdata = data.frame(age='Age 40 to 44',educ="Low",nativity="USA born"),conf.int=F) ,col="red", lty=1)
lines(survfit(CoxModel,weights=weight, newdata = data.frame(age='Age 40 to 44',educ="High",nativity="USA born"),conf.int=F) ,col="green", lty=1)
lines(survfit(CoxModel,weights=weight, newdata = data.frame(age='Age 40 to 44',educ="Middle",nativity="USA born"),conf.int=F) ,col="blue", lty=1)
legend("bottomleft",c("mean","Low educ","Middle educ","High educ"),col=c(1,"red","blue","green"),lty=c(1,1,1))

#1st gen imm
plot(survfit(CoxModel,weights=weight,conf.int=F),xlim=c(1,7),ylim=c(.92,1),xlab="Months",ylab="S(t)",main="Survivorship Function for losing job (1st gen immigrant)")
lines(survfit(CoxModel,weights=weight, newdata = data.frame(age='Age 40 to 44',educ="Low",nativity="1st gen immigrant"),conf.int=F) ,col="red", lty=1)
lines(survfit(CoxModel,weights=weight, newdata = data.frame(age='Age 40 to 44',educ="High",nativity="1st gen immigrant"),conf.int=F) ,col="green", lty=1)
lines(survfit(CoxModel,weights=weight, newdata = data.frame(age='Age 40 to 44',educ="Middle",nativity="1st gen immigrant"),conf.int=F) ,col="blue", lty=1)
legend("bottomleft",c("mean","Low educ","Middle educ","High educ"),col=c(1,"red","blue","green"),lty=c(1,1,1))

#2nd gen immigrant
plot(survfit(CoxModel,weights=weight,conf.int=F),xlim=c(1,7),ylim=c(.92,1),xlab="Months",ylab="S(t)",main="Survivorship Function for losing job (2nd gen immigrant)")
lines(survfit(CoxModel,weights=weight, newdata = data.frame(age='Age 40 to 44',educ="Low",nativity="2nd gen immigrant"),conf.int=F) ,col="red", lty=1)
lines(survfit(CoxModel,weights=weight, newdata = data.frame(age='Age 40 to 44',educ="High",nativity="2nd gen immigrant"),conf.int=F) ,col="green", lty=1)
lines(survfit(CoxModel,weights=weight, newdata = data.frame(age='Age 40 to 44',educ="Middle",nativity="2nd gen immigrant"),conf.int=F) ,col="blue", lty=1)
legend("bottomleft",c("mean","Low educ","Middle educ","High educ"),col=c(1,"red","blue","green"),lty=c(1,1,1))

##Cumulative hazard function (for those who have high level of education - by nativity/immigration status, I took mean age category which is 40-44)
sf1<-survfit(CoxModel,weights=weight,conf.int=F)
H1<- -log(sf1$surv)
times<-sf1$time

#USA born
sf2<-survfit(CoxModel,weights=weight, newdata = data.frame(age='Age 40 to 44',educ="High",nativity="USA born"),conf.int=F)
H2<- -log(sf2$surv)

#1st gen imm
sf3<-survfit(CoxModel,weights=weight, newdata = data.frame(age='Age 40 to 44',educ="High",nativity="1st gen immigrant"),conf.int=F)
H3<- -log(sf3$surv)

#2nd gen imm
sf4<-survfit(CoxModel,weights=weight, newdata = data.frame(age='Age 40 to 44',educ="High",nativity="2nd gen immigrant"),conf.int=F)
H4<- -log(sf4$surv)
plot(H1~times, type="l", ylab="H(t)",xlab="Months",ylim=c(0,.06),
     main="Cumulative hazard plot for losing job (high education)")
lines(H2~times,col="green")
lines(H3~times,col="blue")
lines(H4~times,col="red")
legend("topleft",c("mean","USA born","1st gen imm","2nd gen imm"),col=c(1,"green","blue","red"),lty=c(1,1,1))

#Interaction and risk score

#Is there interaction between education and nativity?
CoxModelInt<-coxph(Surv(duration,censor)~educ+age+nativity+educ*nativity,method="efron",data=sub_complete)
summary(CoxModelInt)

## Call:
## coxph(formula = Surv(duration, censor) ~ educ + age + nativity + 
##     educ * nativity, data = sub_complete, method = "efron")
## 
##   n= 294244, number of events= 7652 
## 
##                                          coef exp(coef) se(coef)       z
## educHigh                             -0.88183   0.41403  0.05803 -15.197
## educMiddle                           -0.37899   0.68455  0.05476  -6.920
## ageAge 25 to 29                      -0.24689   0.78123  0.04273  -5.778
## ageAge 30 to 34                      -0.50789   0.60177  0.04515 -11.248
## ageAge 35 to 39                      -0.59991   0.54886  0.04591 -13.066
## ageAge 40 to 44                      -0.72009   0.48671  0.04883 -14.745
## ageAge 45 to 49                      -0.69489   0.49913  0.04830 -14.386
## ageAge 50 to 54                      -0.57079   0.56508  0.04607 -12.389
## ageAge 55 to 59                      -0.61848   0.53876  0.04604 -13.433
## ageAge 60 to 64                      -0.57816   0.56093  0.04914 -11.766
## ageAge 65 to 69                      -0.44431   0.64127  0.06045  -7.350
## nativity1st gen immigrant             0.03394   1.03453  0.07957   0.427
## nativity2nd gen immigrant             0.24392   1.27625  0.13335   1.829
## educHigh:nativity1st gen immigrant    0.33475   1.39759  0.09443   3.545
## educMiddle:nativity1st gen immigrant  0.17638   1.19289  0.09043   1.950
## educHigh:nativity2nd gen immigrant    0.01871   1.01888  0.14896   0.126
## educMiddle:nativity2nd gen immigrant -0.02031   0.97990  0.14201  -0.143
##                                      Pr(>|z|)    
## educHigh                              < 2e-16 ***
## educMiddle                           4.50e-12 ***
## ageAge 25 to 29                      7.55e-09 ***
## ageAge 30 to 34                       < 2e-16 ***
## ageAge 35 to 39                       < 2e-16 ***
## ageAge 40 to 44                       < 2e-16 ***
## ageAge 45 to 49                       < 2e-16 ***
## ageAge 50 to 54                       < 2e-16 ***
## ageAge 55 to 59                       < 2e-16 ***
## ageAge 60 to 64                       < 2e-16 ***
## ageAge 65 to 69                      1.99e-13 ***
## nativity1st gen immigrant            0.669694    
## nativity2nd gen immigrant            0.067367 .  
## educHigh:nativity1st gen immigrant   0.000392 ***
## educMiddle:nativity1st gen immigrant 0.051137 .  
## educHigh:nativity2nd gen immigrant   0.900053    
## educMiddle:nativity2nd gen immigrant 0.886305    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                                      exp(coef) exp(-coef) lower .95 upper .95
## educHigh                                0.4140     2.4153    0.3695    0.4639
## educMiddle                              0.6846     1.4608    0.6149    0.7621
## ageAge 25 to 29                         0.7812     1.2800    0.7185    0.8495
## ageAge 30 to 34                         0.6018     1.6618    0.5508    0.6575
## ageAge 35 to 39                         0.5489     1.8220    0.5016    0.6005
## ageAge 40 to 44                         0.4867     2.0546    0.4423    0.5356
## ageAge 45 to 49                         0.4991     2.0035    0.4540    0.5487
## ageAge 50 to 54                         0.5651     1.7697    0.5163    0.6185
## ageAge 55 to 59                         0.5388     1.8561    0.4923    0.5896
## ageAge 60 to 64                         0.5609     1.7828    0.5094    0.6176
## ageAge 65 to 69                         0.6413     1.5594    0.5696    0.7219
## nativity1st gen immigrant               1.0345     0.9666    0.8851    1.2091
## nativity2nd gen immigrant               1.2762     0.7835    0.9827    1.6575
## educHigh:nativity1st gen immigrant      1.3976     0.7155    1.1615    1.6817
## educMiddle:nativity1st gen immigrant    1.1929     0.8383    0.9991    1.4242
## educHigh:nativity2nd gen immigrant      1.0189     0.9815    0.7609    1.3643
## educMiddle:nativity2nd gen immigrant    0.9799     1.0205    0.7418    1.2944
## 
## Concordance= 0.612  (se = 0.003 )
## Likelihood ratio test= 1236  on 17 df,   p=<2e-16
## Wald test            = 1286  on 17 df,   p=<2e-16
## Score (logrank) test = 1357  on 17 df,   p=<2e-16

#The p-values in the output shows that interaction term is statistically significant for the category of highly educated 1st generation immigrants who have slightly higher risk of losing a job than USA born low educated respondents.


#Risk score
sub_complete$risk<-exp(CoxModel$linear.predictors)

#Highest risk group
sub_complete[which.max(sub_complete$risk),c("educ", "sex", "age", "nativity", "risk")]

## # A tibble: 1 x 5
##   educ  sex   age          nativity           risk
##   <fct> <fct> <fct>        <fct>             <dbl>
## 1 Low   Male  Age 17 to 24 1st gen immigrant  3.34

#The risk for this group (young male respondents with low level of education who are 1st generation immigrants)  is 3.3 times that of the baseline category (USA born low educated respondents)

#Lowest risk group
sub_complete[which.min(sub_complete$risk),c("educ", "sex", "age", "nativity", "risk")]

## # A tibble: 1 x 5
##   educ  sex   age          nativity  risk
##   <fct> <fct> <fct>        <fct>    <dbl>
## 1 High  Male  Age 40 to 44 USA born 0.585

#The lowest risk group (highly educated male USA born respondents aged 40-44) had a risk score of .59 that means their score is 59% less than the baseline category.

Homework 3

Asiya

10/04/2020