#Libraries

 library(haven)
 library(ipumsr)
 library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

 library(car)

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

 library(ggplot2)
 library(survey)

## Loading required package: grid

## Loading required package: Matrix

## Loading required package: survival

## 
## Attaching package: 'survey'

## The following object is masked from 'package:graphics':
## 
##     dotchart

 library(survival)
 library(foreign)
 library(knitr)

#Load and recode data

## I am using IPUMS CPS data for the period of COVID19 pandemic (January through July 2020). Research question is the same as in my previous works
ddi<-read_ipums_ddi("/Users/asiyavalidova/Desktop/cps_00004.xml") 
cpsdat<-read_ipums_micro(ddi)

## Use of data from IPUMS CPS is subject to conditions including that users should
## cite the data appropriately. Use command `ipums_conditions()` for more details.

cpsdat<-zap_labels(cpsdat)
variables<-c("YEAR" ,"SERIAL","MONTH" ,"HWTFINL" ,"CPSID" , "ASECFLAG" ,
              "MISH"  ,"STATEFIP" ,"PERNUM" , "WTFINL" ,"CPSIDP" , "AGE" ,     
              "SEX"  ,     "RACE"  ,    "MARST"  ,   "BPL"   ,    "YRIMMIG"  , "CITIZEN" , 
              "NATIVITY" , "EMPSTAT" ,  "LABFORCE" , "OCC"  ,     "IND"   ,    "CLASSWKR" ,
              "DURUNEM2" , "DURUNEMP" , "WHYUNEMP" , "WKSTAT" ,   "EDUC"   ,   "EDUC99"   ,
              "LNKFW1MWT")
 cpsdat<-cpsdat%>%
     select(all_of(variables))%>%
     filter(YEAR>2019, MONTH<8, AGE>16,EMPSTAT%in%c(10,12,21,22)) 
 cntpep<-cpsdat%>%
   group_by(CPSIDP)%>%
   summarise(ntime=n())%>%
   arrange(ntime)

## `summarise()` ungrouping output (override with `.groups` argument)

cpsdat<-merge(cpsdat, cntpep, by="CPSIDP")
cpsdat2<-cpsdat%>%
   filter(ntime>1)
cpsdat2<-cpsdat2%>%
    mutate( curremp = ifelse(EMPSTAT%in%c(10,12) , 1, 0),
          nativity=Recode(NATIVITY,recodes="1='USA born';2:4='2nd gen immigrant';5='1st gen immigrant'; else=NA",as.factor=T),
          sex=Recode(SEX, recodes = "1='Male'; 2='Female'; else=NA", as.factor=T),
          educ=Recode(EDUC,recodes="1:70='Low';71:91='Middle';92:125='High';else=NA",as.factor=T),
          age=Recode(AGE,  recodes="17:24='Age 17 to 24';
                                    25:29='Age 25 to 29';
                                    30:34='Age 30 to 34';
                                    35:39='Age 35 to 39';
                                    40:44='Age 40 to 44';
                                    45:49='Age 45 to 49';
                                    50:54='Age 50 to 54';
                                    55:59='Age 55 to 59';
                                    60:64='Age 60 to 64';
                                    65:69='Age 65 to 69';                                                else=NA",as.factor=TRUE)                            
          )%>%
    arrange(CPSIDP, MONTH)
cpsdat2$educ<-relevel(cpsdat2$educ,ref="Low")
cpsdat2$nativity<-relevel(cpsdat2$nativity,ref="USA born")
cpsdat2<-cpsdat2%>%
    group_by(CPSIDP)%>%
    mutate(prevwork= lag(curremp, group_by = CPSIDP))%>%
    arrange(CPSIDP, MONTH)

#Event history analysis

#Time and status
# 1. Event variable: lost_job
cpsdat2$lost_job<-0
cpsdat2$lost_job[cpsdat2$curremp==0&cpsdat2$prevwork==1]<-1
 # 2. time: duration (in months)
cpsdat2$duration<-ifelse(cpsdat2$lost_job==1,cpsdat2$MONTH,8-cpsdat2$MONTH) # Month when a person lost job (1-7)
 # 3. Censoring indicator: d.event
cpsdat2$d.event<-ifelse(cpsdat2$curremp==0,ifelse(cpsdat2$MONTH!=1,1,0),0)

#Variables for regression

sub<-data_frame(id=cpsdat2$CPSIDP,
                sex=cpsdat2$sex,
                month=cpsdat2$MONTH,
                lj=cpsdat2$lost_job,
                age=cpsdat2$age,
                educ=cpsdat2$educ,
                nativity=cpsdat2$nativity,
                duration=cpsdat2$duration,
                censor=cpsdat2$d.event,
                weight=cpsdat2$LNKFW1MWT)

## Warning: `data_frame()` is deprecated as of tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

#The data in our case has long format
sub<-subset(sub,sub$duration>1)
sub<-sub[order(sub$id,sub$month),]

#Basic discrete time model

fit.cloglog<-glm(lj~educ+age+nativity+as.factor(duration),data=sub,family=binomial(link="cloglog"))
summary(fit.cloglog)

## 
## Call:
## glm(formula = lj ~ educ + age + nativity + as.factor(duration), 
##     family = binomial(link = "cloglog"), data = sub)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.7271  -0.2282  -0.1711  -0.1345   3.3082  
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -4.08702    0.07402 -55.212  < 2e-16 ***
## educHigh                  -0.71785    0.04678 -15.346  < 2e-16 ***
## educMiddle                -0.20004    0.04424  -4.521 6.14e-06 ***
## ageAge 25 to 29           -0.34260    0.04674  -7.329 2.32e-13 ***
## ageAge 30 to 34           -0.49855    0.04776 -10.439  < 2e-16 ***
## ageAge 35 to 39           -0.56185    0.04771 -11.776  < 2e-16 ***
## ageAge 40 to 44           -0.65666    0.05035 -13.042  < 2e-16 ***
## ageAge 45 to 49           -0.66508    0.05012 -13.270  < 2e-16 ***
## ageAge 50 to 54           -0.61771    0.04902 -12.601  < 2e-16 ***
## ageAge 55 to 59           -0.54897    0.04738 -11.586  < 2e-16 ***
## ageAge 60 to 64           -0.55935    0.05118 -10.930  < 2e-16 ***
## ageAge 65 to 69           -0.38841    0.06185  -6.280 3.40e-10 ***
## nativity1st gen immigrant  0.35657    0.03159  11.289  < 2e-16 ***
## nativity2nd gen immigrant  0.15672    0.04196   3.735 0.000188 ***
## as.factor(duration)3       0.54775    0.07188   7.621 2.52e-14 ***
## as.factor(duration)4       2.39986    0.06021  39.861  < 2e-16 ***
## as.factor(duration)5       1.19820    0.06456  18.558  < 2e-16 ***
## as.factor(duration)6       0.59828    0.06842   8.745  < 2e-16 ***
## as.factor(duration)7       0.72141    0.07030  10.262  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 64617  on 266811  degrees of freedom
## Residual deviance: 58609  on 266793  degrees of freedom
##   (10265 observations deleted due to missingness)
## AIC: 58647
## 
## Number of Fisher Scoring iterations: 7

#Results show that more educated respondents have lower probability of losing the job, older respondents also have lower probability of losing the job than the reference group. As for nativity groups, immigrants have higher probability of losing their jobs. Results for the time periods reveal that the probability to lose a job in April was the highest (reference month is January).

#Interaction

#Is there interaction between educ and nativity?
fit.cloglog1<- glm(lj~educ*nativity+age+as.factor(duration),data=sub,family=binomial(link="cloglog"))
summary(fit.cloglog1)

## 
## Call:
## glm(formula = lj ~ educ * nativity + age + as.factor(duration), 
##     family = binomial(link = "cloglog"), data = sub)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.7168  -0.2265  -0.1699  -0.1355   3.3018  
## 
## Coefficients:
##                                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                          -4.035268   0.082270 -49.049  < 2e-16 ***
## educHigh                             -0.752727   0.064132 -11.737  < 2e-16 ***
## educMiddle                           -0.271143   0.061128  -4.436 9.18e-06 ***
## nativity1st gen immigrant             0.276392   0.083619   3.305 0.000948 ***
## nativity2nd gen immigrant            -0.101974   0.174493  -0.584 0.558949    
## ageAge 25 to 29                      -0.339483   0.046932  -7.233 4.71e-13 ***
## ageAge 30 to 34                      -0.495048   0.047980 -10.318  < 2e-16 ***
## ageAge 35 to 39                      -0.558096   0.047974 -11.633  < 2e-16 ***
## ageAge 40 to 44                      -0.652781   0.050609 -12.899  < 2e-16 ***
## ageAge 45 to 49                      -0.660908   0.050384 -13.117  < 2e-16 ***
## ageAge 50 to 54                      -0.614706   0.049301 -12.469  < 2e-16 ***
## ageAge 55 to 59                      -0.544837   0.047604 -11.445  < 2e-16 ***
## ageAge 60 to 64                      -0.554934   0.051392 -10.798  < 2e-16 ***
## ageAge 65 to 69                      -0.384011   0.062059  -6.188 6.10e-10 ***
## as.factor(duration)3                  0.547725   0.071876   7.620 2.53e-14 ***
## as.factor(duration)4                  2.399911   0.060207  39.861  < 2e-16 ***
## as.factor(duration)5                  1.198039   0.064565  18.556  < 2e-16 ***
## as.factor(duration)6                  0.597917   0.068417   8.739  < 2e-16 ***
## as.factor(duration)7                  0.721163   0.070302  10.258  < 2e-16 ***
## educHigh:nativity1st gen immigrant    0.006929   0.099777   0.069 0.944632    
## educMiddle:nativity1st gen immigrant  0.142562   0.093584   1.523 0.127671    
## educHigh:nativity2nd gen immigrant    0.186533   0.189649   0.984 0.325327    
## educMiddle:nativity2nd gen immigrant  0.323429   0.182190   1.775 0.075861 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 64617  on 266811  degrees of freedom
## Residual deviance: 58601  on 266789  degrees of freedom
##   (10265 observations deleted due to missingness)
## AIC: 58647
## 
## Number of Fisher Scoring iterations: 7

#The p-values in the output shows that interaction term is not statistically significant.

Comparing probabilities of losing job by periods

prob<-function(x)
{
   beta_x<-sum(fit.cloglog$coefficients*x)
   pr<-1-exp(-exp(beta_x))
   return (round(pr,3))
}
df<-data.frame(nativity=rep(c("USA born","1st gen","2nd gen"),each=6),
               duration=rep(c(2:7),3),
               prob=rep(0,18)
              )
### USA born
x0<-c(1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0)
for (i in 2:7)
{
   x<-x0
   x[12+i]<-1
   df$prob[i-1]<-prob(x)
}   
### 1st gen
x0<-c(1,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0)
for (i in 2:7)
{
   x<-x0
   x[12+i]<-1
   df$prob[i+5]<-prob(x)
}   
### 2nd gen
x0<-c(1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0)
for (i in 2:7)
{
   x<-x0
   x[12+i]<-1
   df$prob[i+11]<-prob(x) 
}   
df%>%
      ggplot()+
      geom_bar(aes(y=prob,x=nativity,fill=nativity, group=nativity),stat="identity", position="dodge")+
      facet_grid(~duration)+
   ggtitle(label="Probability of losing job among people with middle education and age 35-39")

Homework 4

Asiya

10/17/2020

Comparing probabilities of losing job by periods