#Libraries

 library(haven)
 library(ipumsr)
 library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

 library(car)

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

 library(ggplot2)
 library(survey)

## Loading required package: grid

## Loading required package: Matrix

## Loading required package: survival

## 
## Attaching package: 'survey'

## The following object is masked from 'package:graphics':
## 
##     dotchart

 library(survival)
 library(foreign)
 library(knitr)

#Load and recode data

## I am using IPUMS CPS data for the period of COVID19 pandemic (January through July 2020). Research question is the same as in my previous works
ddi<-read_ipums_ddi("/Users/asiyavalidova/Dropbox/Demography/Event history analysis/IPUMS CPS/cps_00003.xml") 
cpsdat<-read_ipums_micro(ddi)

## Use of data from IPUMS CPS is subject to conditions including that users should
## cite the data appropriately. Use command `ipums_conditions()` for more details.

cpsdat<-zap_labels(cpsdat)
variables<-c("YEAR" ,"SERIAL","MONTH" ,"HWTFINL" ,"CPSID" , "ASECFLAG" ,
              "MISH"  ,"STATEFIP" ,"PERNUM" , "WTFINL" ,"CPSIDP" , "AGE" ,     
              "SEX"  ,     "RACE"  ,    "MARST"  ,   "BPL"   ,    "YRIMMIG"  , "CITIZEN" , 
              "NATIVITY" , "EMPSTAT" ,  "LABFORCE" , "OCC"  ,     "IND"   ,    "CLASSWKR" ,
              "DURUNEM2" , "DURUNEMP" , "WHYUNEMP" , "WKSTAT" ,   "EDUC"   ,   "EDUC99"   ,
              "LNKFW1MWT")
 cpsdat<-cpsdat%>%
     select(all_of(variables))%>%
     filter(YEAR>2019, AGE>16,EMPSTAT%in%c(10,12,21,22)) 
 cntpep<-cpsdat%>%
   group_by(CPSIDP)%>%
   summarise(ntime=n())%>%
   arrange(ntime)

## `summarise()` ungrouping output (override with `.groups` argument)

cpsdat<-merge(cpsdat, cntpep, by="CPSIDP")
cpsdat2<-cpsdat%>%
   filter(ntime>1)
cpsdat2<-cpsdat2%>%
    mutate( curremp = ifelse(EMPSTAT%in%c(10,12) , 1, 0),
          nativity=Recode(NATIVITY,recodes="1='USA born';2:4='2nd gen immigrant';5='1st gen immigrant'; else=NA",as.factor=T),
          sex=Recode(SEX, recodes = "1='Male'; 2='Female'; else=NA", as.factor=T),
          educ=Recode(EDUC,recodes="1:70='Low';71:91='Middle';92:125='High';else=NA",as.factor=T),
          age=Recode(AGE,  recodes="17:24='Age 17 to 24';
                                    25:29='Age 25 to 29';
                                    30:34='Age 30 to 34';
                                    35:39='Age 35 to 39';
                                    40:44='Age 40 to 44';
                                    45:49='Age 45 to 49';
                                    50:54='Age 50 to 54';
                                    55:59='Age 55 to 59';
                                    60:64='Age 60 to 64';
                                    65:69='Age 65 to 69';                                                else=NA",as.factor=TRUE)                            
          )%>%
    arrange(CPSIDP, MONTH)
cpsdat2$educ<-relevel(cpsdat2$educ,ref="Low")
cpsdat2$nativity<-relevel(cpsdat2$nativity,ref="USA born")
cpsdat2<-cpsdat2%>%
    group_by(CPSIDP)%>%
    mutate(prevwork= lag(curremp, group_by = CPSIDP))%>%
    arrange(CPSIDP, MONTH)

#Event history analysis

#Time and status
# 1. Event variable: lost_job
cpsdat2$lost_job<-0
cpsdat2$lost_job[cpsdat2$curremp==0&cpsdat2$prevwork==1]<-1
 # 2. time: duration (in months)
cpsdat2$duration<-ifelse(cpsdat2$lost_job==1,cpsdat2$MONTH,8-cpsdat2$MONTH) # Month when a person lost job (1-7)
 # 3. Censoring indicator: d.event
 cpsdat2$d.event<-ifelse(cpsdat2$curremp==1 | cpsdat2$curremp==0 & is.na(cpsdat2$prevwork) | !(cpsdat2$curremp==0 & cpsdat2$prevwork==0),0,1)

#Variables for regression

sub<-data_frame(id=cpsdat2$CPSIDP,
                sex=cpsdat2$sex,
                month=cpsdat2$MONTH,
                lj=cpsdat2$lost_job,
                age=cpsdat2$age,
                educ=cpsdat2$educ,
                nativity=cpsdat2$nativity,
                duration=cpsdat2$duration,
                m_lj=ifelse(cpsdat2$lost_job==1,cpsdat2$MONTH,0),
                censor=cpsdat2$d.event,
                weight=cpsdat2$LNKFW1MWT)

## Warning: `data_frame()` is deprecated as of tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

sub<- sub[complete.cases(sub), ]
#The data in our case has long format
sub<-sub[order(sub$id,sub$month),]
head(sub)

## # A tibble: 6 x 11
##        id sex    month    lj age     educ  nativity duration  m_lj censor weight
##     <dbl> <fct>  <int> <dbl> <fct>   <fct> <fct>       <dbl> <dbl>  <dbl>  <dbl>
## 1 2.02e13 Male       1     0 Age 55… High  USA born        7     0      0  1845.
## 2 2.02e13 Male       2     0 Age 55… High  USA born        6     0      0     0 
## 3 2.02e13 Female     1     0 Age 50… High  USA born        7     0      0  1896.
## 4 2.02e13 Female     2     0 Age 50… High  USA born        6     0      0     0 
## 5 2.02e13 Male       1     0 Age 60… High  USA born        7     0      0  1804.
## 6 2.02e13 Male       2     0 Age 45… Midd… USA born        6     0      0     0

#Basic discrete time model

fit.cloglog<-glm(lj~duration+educ+age+nativity,data=sub,weights=weight,family=binomial(link="cloglog"))

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

summary(fit.cloglog)

## 
## Call:
## glm(formula = lj ~ duration + educ + age + nativity, family = binomial(link = "cloglog"), 
##     data = sub, weights = weight)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -39.93  -12.33   -8.37    0.00  355.39  
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -3.5054776  0.0012091 -2899.1   <2e-16 ***
## duration                   0.0714968  0.0001382   517.4   <2e-16 ***
## educHigh                  -0.6133520  0.0010040  -610.9   <2e-16 ***
## educMiddle                -0.1568924  0.0009535  -164.6   <2e-16 ***
## ageAge 25 to 29           -0.4401449  0.0009597  -458.6   <2e-16 ***
## ageAge 30 to 34           -0.5459739  0.0010031  -544.3   <2e-16 ***
## ageAge 35 to 39           -0.6088170  0.0010266  -593.0   <2e-16 ***
## ageAge 40 to 44           -0.6910088  0.0010805  -639.5   <2e-16 ***
## ageAge 45 to 49           -0.5788655  0.0010386  -557.3   <2e-16 ***
## ageAge 50 to 54           -0.6726755  0.0010745  -626.0   <2e-16 ***
## ageAge 55 to 59           -0.5564449  0.0010409  -534.6   <2e-16 ***
## ageAge 60 to 64           -0.5086986  0.0011290  -450.6   <2e-16 ***
## ageAge 65 to 69           -0.3917173  0.0014401  -272.0   <2e-16 ***
## nativity1st gen immigrant  0.3247732  0.0006530   497.4   <2e-16 ***
## nativity2nd gen immigrant  0.1346562  0.0008444   159.5   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 145644464  on 210435  degrees of freedom
## Residual deviance: 143251983  on 210421  degrees of freedom
## AIC: 143252038
## 
## Number of Fisher Scoring iterations: 7

#round(fit.cloglog$coefficients,3)

Compare with Cox model (Mills)

# Cox model                
#fit.Cox<-coxph(Surv(duration,censor)~educ+age+nativity,method="efron",data=sub)  
#summary(fit.Cox)
#tab<-data.frame(cloglog=fit.cloglog$coefficients,
              #  Cox=c(NA,fit.Cox$coefficients)
              # )
#kable(tab,digits=3)

#Interaction

#Is there interaction between educ and nativity?
fit.cloglog1<- glm(lj~educ*nativity+age+duration,data=sub,weights=weight,family=binomial(link="cloglog"))

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

summary(fit.cloglog1)

## 
## Call:
## glm(formula = lj ~ educ * nativity + age + duration, family = binomial(link = "cloglog"), 
##     data = sub, weights = weight)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -40.22  -12.29   -8.38    0.00  354.83  
## 
## Coefficients:
##                                        Estimate Std. Error  z value Pr(>|z|)
## (Intercept)                          -3.5284947  0.0015559 -2267.77   <2e-16
## educHigh                             -0.5689436  0.0015124  -376.18   <2e-16
## educMiddle                           -0.1339638  0.0014522   -92.25   <2e-16
## nativity1st gen immigrant             0.4215793  0.0018302   230.34   <2e-16
## nativity2nd gen immigrant            -0.1892180  0.0037963   -49.84   <2e-16
## ageAge 25 to 29                      -0.4468154  0.0009626  -464.18   <2e-16
## ageAge 30 to 34                      -0.5541252  0.0010068  -550.38   <2e-16
## ageAge 35 to 39                      -0.6162379  0.0010304  -598.08   <2e-16
## ageAge 40 to 44                      -0.6991641  0.0010848  -644.51   <2e-16
## ageAge 45 to 49                      -0.5867239  0.0010433  -562.38   <2e-16
## ageAge 50 to 54                      -0.6814479  0.0010790  -631.56   <2e-16
## ageAge 55 to 59                      -0.5637194  0.0010446  -539.67   <2e-16
## ageAge 60 to 64                      -0.5159551  0.0011322  -455.69   <2e-16
## ageAge 65 to 69                      -0.3988274  0.0014431  -276.37   <2e-16
## duration                              0.0714787  0.0001382   517.24   <2e-16
## educHigh:nativity1st gen immigrant   -0.1664764  0.0021394   -77.81   <2e-16
## educMiddle:nativity1st gen immigrant -0.0823420  0.0020243   -40.68   <2e-16
## educHigh:nativity2nd gen immigrant    0.3307658  0.0040616    81.44   <2e-16
## educMiddle:nativity2nd gen immigrant  0.3491669  0.0039430    88.55   <2e-16
##                                         
## (Intercept)                          ***
## educHigh                             ***
## educMiddle                           ***
## nativity1st gen immigrant            ***
## nativity2nd gen immigrant            ***
## ageAge 25 to 29                      ***
## ageAge 30 to 34                      ***
## ageAge 35 to 39                      ***
## ageAge 40 to 44                      ***
## ageAge 45 to 49                      ***
## ageAge 50 to 54                      ***
## ageAge 55 to 59                      ***
## ageAge 60 to 64                      ***
## ageAge 65 to 69                      ***
## duration                             ***
## educHigh:nativity1st gen immigrant   ***
## educMiddle:nativity1st gen immigrant ***
## educHigh:nativity2nd gen immigrant   ***
## educMiddle:nativity2nd gen immigrant ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 145644464  on 210435  degrees of freedom
## Residual deviance: 143232165  on 210417  degrees of freedom
## AIC: 143232227
## 
## Number of Fisher Scoring iterations: 7

#The p-values in the output shows that interaction term is statistically significant.

Comparing probabilities of losing job by periods

prob1<-function(x)
{
   beta_x<-sum(fit.cloglog1$coefficients*x)
   pr<-1-exp(-exp(beta_x))
   return (round(pr,3))
}
df<-data.frame(nativity=rep(c("USA born","1st gen","2nd gen"),each=6),
               duration=rep(c(2:7),3),
               prob=rep(0,18)
              )
df

##    nativity duration prob
## 1  USA born        2    0
## 2  USA born        3    0
## 3  USA born        4    0
## 4  USA born        5    0
## 5  USA born        6    0
## 6  USA born        7    0
## 7   1st gen        2    0
## 8   1st gen        3    0
## 9   1st gen        4    0
## 10  1st gen        5    0
## 11  1st gen        6    0
## 12  1st gen        7    0
## 13  2nd gen        2    0
## 14  2nd gen        3    0
## 15  2nd gen        4    0
## 16  2nd gen        5    0
## 17  2nd gen        6    0
## 18  2nd gen        7    0

### USA born
for (i in 2:7)
   df$prob[i-1]<-prob1(c(1,0,1,0,0,1,0,0,0,0,0,0,0,0,i))

## Warning in fit.cloglog1$coefficients * x: longer object length is not a multiple
## of shorter object length

## Warning in fit.cloglog1$coefficients * x: longer object length is not a multiple
## of shorter object length

## Warning in fit.cloglog1$coefficients * x: longer object length is not a multiple
## of shorter object length

## Warning in fit.cloglog1$coefficients * x: longer object length is not a multiple
## of shorter object length

## Warning in fit.cloglog1$coefficients * x: longer object length is not a multiple
## of shorter object length

## Warning in fit.cloglog1$coefficients * x: longer object length is not a multiple
## of shorter object length

### 1st gen
for (i in 2:7)
   df$prob[i+5]<-prob1(c(1,0,1,0,0,1,0,0,0,0,0,0,1,0,i))

## Warning in fit.cloglog1$coefficients * x: longer object length is not a multiple
## of shorter object length

## Warning in fit.cloglog1$coefficients * x: longer object length is not a multiple
## of shorter object length

## Warning in fit.cloglog1$coefficients * x: longer object length is not a multiple
## of shorter object length

## Warning in fit.cloglog1$coefficients * x: longer object length is not a multiple
## of shorter object length

## Warning in fit.cloglog1$coefficients * x: longer object length is not a multiple
## of shorter object length

## Warning in fit.cloglog1$coefficients * x: longer object length is not a multiple
## of shorter object length

### 2nd gen
for (i in 2:7)
   df$prob[i+11]<-prob1(c(1,0,1,0,0,1,0,0,0,0,0,0,0,1,i))

## Warning in fit.cloglog1$coefficients * x: longer object length is not a multiple
## of shorter object length

## Warning in fit.cloglog1$coefficients * x: longer object length is not a multiple
## of shorter object length

## Warning in fit.cloglog1$coefficients * x: longer object length is not a multiple
## of shorter object length

## Warning in fit.cloglog1$coefficients * x: longer object length is not a multiple
## of shorter object length

## Warning in fit.cloglog1$coefficients * x: longer object length is not a multiple
## of shorter object length

## Warning in fit.cloglog1$coefficients * x: longer object length is not a multiple
## of shorter object length

 df%>%
      ggplot()+
      geom_bar(aes(y=prob,x=nativity,fill=nativity, group=nativity),stat="identity", position="dodge")+
      facet_grid(~duration)+
   ggtitle(label="Probability of losing job among people with middle level of education and age 35-39")

Homework 4

Asiya

10/17/2020

Compare with Cox model (Mills)

Comparing probabilities of losing job by periods