#Libraries

 library(haven)
 library(ipumsr)
 library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

 library(car)

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

 library(ggplot2)
 library(survey)

## Loading required package: grid

## Loading required package: Matrix

## Loading required package: survival

## 
## Attaching package: 'survey'

## The following object is masked from 'package:graphics':
## 
##     dotchart

 library(survival)
 library(foreign)
 library(knitr)
 library(splines)

#Load and recode data

## I am using IPUMS CPS data for the period of COVID19 pandemic (January through August 2020). Research question is the same as in my previous works
ddi<-read_ipums_ddi("/Users/asiyavalidova/Dropbox/Demography/Event history analysis/IPUMS CPS/cps_00004.xml") 
cpsdat<-read_ipums_micro(ddi)

## Use of data from IPUMS CPS is subject to conditions including that users should
## cite the data appropriately. Use command `ipums_conditions()` for more details.

cpsdat<-zap_labels(cpsdat)
variables<-c("YEAR" ,"SERIAL","MONTH" ,"HWTFINL" ,"CPSID" , "ASECFLAG" ,
              "MISH"  ,"STATEFIP" ,"PERNUM" , "WTFINL" ,"CPSIDP" , "AGE" ,     
              "SEX"  ,     "RACE"  ,    "MARST"  ,   "BPL"   ,    "YRIMMIG"  , "CITIZEN" , 
              "NATIVITY" , "EMPSTAT" ,  "LABFORCE" , "OCC"  ,     "IND"   ,    "CLASSWKR" ,
              "DURUNEM2" , "DURUNEMP" , "WHYUNEMP" , "WKSTAT" ,   "EDUC"   ,   "EDUC99"   ,
              "LNKFW1MWT")
 cpsdat<-cpsdat%>%
     select(all_of(variables))%>%
     filter(YEAR>2019, AGE>16 & AGE<70,MONTH<8,EMPSTAT%in%c(10,12,21,22))
 cntpep<-cpsdat%>%
   group_by(CPSIDP)%>%
   summarise(ntime=n())%>%
   arrange(ntime)

## `summarise()` ungrouping output (override with `.groups` argument)

cpsdat<-merge(cpsdat, cntpep, by="CPSIDP")
cpsdat2<-cpsdat%>%
   filter(ntime>1)
cpsdat2<-cpsdat2%>%
    mutate( curremp = ifelse(EMPSTAT%in%c(10,12) , 1, 0),
          nativity=Recode(NATIVITY,recodes="1='USA born';2:4='2nd gen immigrant';5='1st gen immigrant'; else=NA",as.factor=T),
          sex=Recode(SEX, recodes = "1='Male'; 2='Female'; else=NA", as.factor=T),
          educ=Recode(EDUC,recodes="1:70='Low';71:91='Middle';92:125='High';else=NA",as.factor=T),
          age=Recode(AGE,  recodes="17:24='Age 17 to 24';
                                    25:29='Age 25 to 29';
                                    30:34='Age 30 to 34';
                                    35:39='Age 35 to 39';
                                    40:44='Age 40 to 44';
                                    45:49='Age 45 to 49';
                                    50:54='Age 50 to 54';
                                    55:59='Age 55 to 59';
                                    60:64='Age 60 to 64';
                                    65:69='Age 65 to 69';                                                else=NA",as.factor=TRUE)                            
          )%>%
    arrange(CPSIDP, MONTH)
cpsdat2$educ<-relevel(cpsdat2$educ,ref="Low")
cpsdat2$nativity<-relevel(cpsdat2$nativity,ref="USA born")
cpsdat2<-cpsdat2%>%
    group_by(CPSIDP)%>%
    mutate(prevwork= lag(curremp, group_by = CPSIDP))%>%
    arrange(CPSIDP, MONTH)

#Event history analysis

#Time and status
# 1. Event variable: lost_job
cpsdat2$lost_job<-0
cpsdat2$lost_job[cpsdat2$curremp==0&cpsdat2$prevwork==1]<-1
 # 2. time: duration (in months)
cpsdat2$duration<-ifelse(cpsdat2$lost_job==1,cpsdat2$MONTH,8-cpsdat2$MONTH) # Month when a person lost job (1-7)
 # 3. Censoring indicator: d.event
cpsdat2$d.event<-ifelse(cpsdat2$curremp==0,ifelse(cpsdat2$MONTH!=1,1,0),0)

#Variables for regression

sub<-data_frame(id=cpsdat2$CPSIDP,
                sex=cpsdat2$sex,
                month=cpsdat2$MONTH,
                lj=cpsdat2$lost_job,
                age=cpsdat2$age,
                educ=cpsdat2$educ,
                nativity=cpsdat2$nativity,
                duration=cpsdat2$duration,
                m_lj=ifelse(cpsdat2$lost_job==1,cpsdat2$MONTH,0),
                lj_event=cpsdat2$d.event,
                weight=cpsdat2$LNKFW1MWT)

## Warning: `data_frame()` is deprecated as of tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

sub<-subset(sub,sub$duration>1)
sub<-sub[order(sub$id,sub$month),]

#General model

fit.gm<-glm(lj_event~as.factor(duration)-1,data=sub,family="binomial")
summary(fit.gm)

## 
## Call:
## glm(formula = lj_event ~ as.factor(duration) - 1, family = "binomial", 
##     data = sub)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.5272  -0.4526  -0.3217  -0.2817   2.8776  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## as.factor(duration)2 -2.44071    0.01853  -131.7   <2e-16 ***
## as.factor(duration)3 -2.22703    0.01645  -135.4   <2e-16 ***
## as.factor(duration)4 -1.90339    0.01412  -134.8   <2e-16 ***
## as.factor(duration)5 -2.93554    0.02082  -141.0   <2e-16 ***
## as.factor(duration)6 -3.20724    0.02239  -143.2   <2e-16 ***
## as.factor(duration)7 -4.12427    0.04025  -102.5   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 370059  on 266941  degrees of freedom
## Residual deviance: 126428  on 266935  degrees of freedom
## AIC: 126440
## 
## Number of Fisher Scoring iterations: 6

#Plot the hazard function on the probability scale
haz<-1/(1+exp(-coef(fit.gm)))
time<-2:7
plot(haz~time, type="l",xlab="duration", ylab="h(t)",main="Discrete Time Hazard for losing job")

"Plot the survival function estimate"

## [1] "Plot the survival function estimate"

St<-NA
time<-1:length(haz)
St[1]<-1-haz[1]
for(i in 2:length(haz)){
      St[i]<-St[i-1]* (1-haz[i])
}
plot(y=St,x=time, type="l", main="Survival function for losing job")

Other time specifications

#Linear model
fit.lm<-glm(lj_event~duration,data=sub ,family="binomial")
summary(fit.lm)

## 
## Call:
## glm(formula = lj_event ~ duration, family = "binomial", data = sub)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.5021  -0.4417  -0.3402  -0.2981   2.6064  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.465297   0.020541  -71.34   <2e-16 ***
## duration    -0.271039   0.004828  -56.14   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 132656  on 266940  degrees of freedom
## Residual deviance: 129363  on 266939  degrees of freedom
## AIC: 129367
## 
## Number of Fisher Scoring iterations: 5

#Quadratic model
fit.s<-glm(lj_event~duration+I(duration^2),data=sub ,family="binomial")
summary(fit.s)

## 
## Call:
## glm(formula = lj_event ~ duration + I(duration^2), family = "binomial", 
##     data = sub)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.4739  -0.4647  -0.3873  -0.2727   2.9487  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -4.045197   0.063132  -64.08   <2e-16 ***
## duration       1.148185   0.032590   35.23   <2e-16 ***
## I(duration^2) -0.169928   0.003903  -43.54   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 132656  on 266940  degrees of freedom
## Residual deviance: 127215  on 266938  degrees of freedom
## AIC: 127221
## 
## Number of Fisher Scoring iterations: 6

#Cubic model
fit.c<-glm(lj_event~duration+I(duration^2)+I(duration^3),data=sub ,family="binomial")
summary(fit.c)

## 
## Call:
## glm(formula = lj_event ~ duration + I(duration^2) + I(duration^3), 
##     family = "binomial", data = sub)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.4927  -0.4702  -0.3726  -0.2642   2.8678  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -5.739485   0.176104  -32.59   <2e-16 ***
## duration       2.562051   0.140342   18.26   <2e-16 ***
## I(duration^2) -0.527305   0.034616  -15.23   <2e-16 ***
## I(duration^3)  0.027835   0.002669   10.43   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 132656  on 266940  degrees of freedom
## Residual deviance: 127109  on 266937  degrees of freedom
## AIC: 127117
## 
## Number of Fisher Scoring iterations: 6

#Quartic model
fit.q<-glm(lj_event~duration+I(duration^2)+I(duration^3)+I(duration^4),data=sub ,family="binomial")
summary(fit.q)

## 
## Call:
## glm(formula = lj_event ~ duration + I(duration^2) + I(duration^3) + 
##     I(duration^4), family = "binomial", data = sub)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.4828  -0.4775  -0.3770  -0.2497   2.8318  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -1.265156   0.584739  -2.164 0.030493 *  
## duration      -2.394018   0.633517  -3.779 0.000158 ***
## I(duration^2)  1.384031   0.240705   5.750 8.93e-09 ***
## I(duration^3) -0.279245   0.038354  -7.281 3.32e-13 ***
## I(duration^4)  0.017491   0.002179   8.029 9.84e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 132656  on 266940  degrees of freedom
## Residual deviance: 127043  on 266936  degrees of freedom
## AIC: 127053
## 
## Number of Fisher Scoring iterations: 6

#Spline model
fit.sp<-glm(lj_event~ns(duration,df=2),data=sub, family="binomial")
summary(fit.sp)

## 
## Call:
## glm(formula = lj_event ~ ns(duration, df = 2), family = "binomial", 
##     data = sub)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.4636  -0.4606  -0.3953  -0.2701   2.9492  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           -2.37825    0.01594 -149.24   <2e-16 ***
## ns(duration, df = 2)1 -0.82808    0.03858  -21.46   <2e-16 ***
## ns(duration, df = 2)2 -2.24703    0.03790  -59.29   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 132656  on 266940  degrees of freedom
## Residual deviance: 127310  on 266938  degrees of freedom
## AIC: 127316
## 
## Number of Fisher Scoring iterations: 6

#Hazards

dat<-expand.grid(duration=2:7)
dat$genmod<-predict(fit.gm, newdata=data.frame(duration=as.factor(2:7 )), type="response")
dat$lin<-predict(fit.lm, newdata=dat, type="response")
dat$sq<-predict(fit.s, newdata=dat, type="response")
dat$cub<-predict(fit.c, newdata=dat, type="response")
dat$quart<-predict(fit.q, newdata=dat, type="response")
dat$spline<-predict(fit.sp, newdata=expand.grid(duration=2:7), type="response")
dat

##   duration     genmod        lin         sq        cub      quart     spline
## 1        2 0.08012042 0.11843076 0.08102236 0.07572339 0.07790381 0.08484673
## 2        3 0.09734934 0.09292653 0.10621600 0.11430159 0.10776395 0.10062393
## 3        4 0.12972511 0.07246322 0.10235777 0.10463722 0.10999885 0.10190631
## 4        5 0.05042435 0.05622679 0.07226136 0.06707611 0.06860578 0.07517285
## 5        6 0.03889410 0.04345792 0.03649231 0.03429615 0.03068247 0.03582909
## 6        7 0.01591775 0.03348592 0.01294070 0.01637151 0.01814212 0.01292012

plot(genmod~duration, dat, type="l", ylab="h(t)", xlab="Time")
title(main="Hazard function from different time parameterizations")
lines(lin~duration, dat, col=2, lwd=2)
lines(sq~duration, dat, col=3, lwd=2)
lines(cub~duration, dat, col=4, lwd=2)
lines(quart~duration, dat, col=5, lwd=2)
lines(spline~duration, dat, col=6, lwd=2)
legend("topright",
legend=c("General Model","Linear", "Square", "Cubic", "Quartic", "Natural spline"),
col=1:6, lwd=1.5)

#AIC table

aic<-round(c(
fit.lm$deviance+2*length(fit.lm$coefficients),
fit.s$deviance+2*length(fit.s$coefficients),
fit.c$deviance+2*length(fit.c$coefficients),
fit.q$deviance+2*length(fit.q$coefficients),
fit.sp$deviance+2*length(fit.sp$coefficients),
fit.gm$deviance+2*length(fit.gm$coefficients)),2)

#compare all AICs to the one from the general model
dif.aic<-round(aic-aic[6],2)
data.frame(model =c("linear", "square", "cubic", "quartic","spline", "general"), aic=aic, aic_dif=dif.aic)

##     model      aic aic_dif
## 1  linear 129367.3 2927.50
## 2  square 127221.4  781.60
## 3   cubic 127116.6  676.82
## 4 quartic 127053.5  613.68
## 5  spline 127315.5  875.74
## 6 general 126439.8    0.00

#Minimal value gives general model and so it fits well, but we can see that quartic and cubic time specifications are the closest to general model. Linear model is the worst fit.

Homework 5

Asiya

10/22/2020

Other time specifications