Now, the question is “does first birth affect women’s life satisfaction?” use th staggered difference in difference to estimate the effectl

Prepare the dataset

library(tidyverse) # Add the tidyverse package to my current library.
library(haven) # Handle labelled data.
library(texreg) #output result
library(splitstackshape) #transform wide data (with stacked variables) to long data
library(plm) #linear models for panel data
library(did) #for staggered difference in difference

##Import 6 waves of women data
women1 <- read_dta("wave1_women.dta")
women2 <- read_dta("wave2_women.dta")
women3 <- read_dta("wave3_women.dta")
women4 <- read_dta("wave4_women.dta")
women5 <- read_dta("wave5_women.dta")
women6 <- read_dta("wave6_women.dta")

##Clean 6 waves of women data
clean_fun <- function(df) {  df %>% 
  transmute(
    id, 
    age, 
    wave=as.numeric(wave),
    relstat=as_factor(relstat), #make relstat as a factor
    relstat=case_when(relstat== "-7 Incomplete data" ~ as.character(NA), #specify when is missing for relstat
                      TRUE ~ as.character(relstat))%>% as_factor(), #make relstat as a factor again
    health=case_when(hlt1<0 ~ as.numeric(NA),  #specify when hlt1 is missing 
                   TRUE ~ as.numeric(hlt1)),
    childno=case_when(nkidsbio==-7~ as.numeric(NA), #specify when is missing for relstat
                      TRUE ~ as.numeric(nkidsbio)), 
    sat=case_when(sat6<0 ~ as.numeric(NA), #specify when sat6 is missing
                   TRUE ~ as.numeric(sat6)),
  )%>% drop_na()  }

women1a <- clean_fun(women1)
women2a <- clean_fun(women2)
women3a <- clean_fun(women3)
women4a <- clean_fun(women4)
women5a <- clean_fun(women5)
women6a <- clean_fun(women6)

women1b <- women1a %>% filter(childno==0)%>% #keep individuals who are childless in the first wave
rename(wave.1=wave, age.1=age, relstat.1=relstat, health.1=health, childno.1=childno, sat.1=sat ) #rename variables
women2b <- women2a %>% 
rename(wave.2=wave, age.2=age, relstat.2=relstat, health.2=health, childno.2=childno, sat.2=sat )
women3b <- women3a %>% 
rename(wave.3=wave, age.3=age, relstat.3=relstat, health.3=health, childno.3=childno, sat.3=sat )
women4b <- women4a %>% 
rename(wave.4=wave, age.4=age, relstat.4=relstat, health.4=health, childno.4=childno, sat.4=sat )
women5b <- women5a %>% 
rename(wave.5=wave, age.5=age, relstat.5=relstat, health.5=health, childno.5=childno, sat.5=sat )
women6b <- women6a %>% 
rename(wave.6=wave, age.6=age, relstat.6=relstat, health.6=health, childno.6=childno, sat.6=sat )

###six waves of women data
women_6wave_wide <- left_join(women1b, women2b, by = "id") %>%  # left join women1b and women2b
  left_join(women3b, by = "id") %>% # left join with women3b
  left_join(women4b, by = "id") %>% # left join with women4b
  left_join(women5b, by = "id") %>% # left join with women5b
  left_join(women6b, by = "id") # left join with women6b
#by using left_join I keep those have no kids in the first wave and follow them

women_6wave_long<- merged.stack(women_6wave_wide, #dataset for transfrom
                            var.stubs = c("age", "wave", "relstat", "health","childno", "sat"), 
#var.stubs is to specify the prefixes of the variable groups
                            sep = ".") %>%  
#sep is to specify the character that separates the "variable name" from the "times" in the source
                            drop_na(wave)
#drop the observations which did not join the wave

women_6wave_long <- women_6wave_long %>% 
  group_by(id) %>% 
  mutate(
    firstkid=case_when( childno!=dplyr::lag(childno, 1) & dplyr::lag(childno, 1)==0 & childno>0 ~ 1,
                          TRUE ~ 0), 
    #when the person has 0 children at t-1 while has at least 1 child at t, define it first childbirth
    twin=case_when( childno!=dplyr::lag(childno, 1) & dplyr::lag(childno, 1)==0 & childno==1 ~ 1, #single birth
                    childno!=dplyr::lag(childno, 1) & dplyr::lag(childno, 1)==0 & childno==2 ~ 2, #twin birth
                    TRUE ~ 0)
    #when the person has 0 children at t-1 while has 1 child at t, define it a single birth, i.e. 1
    #when the person has 0 children at t-1 while has 2 children at t, define it a twin birth, i.e. 2
    ) 

#second, remove individuals who have twins
twinid <- women_6wave_long$id[women_6wave_long$twin==2] #the id of women who have twin for their first birth
women_6wave_long1 <- women_6wave_long[!(women_6wave_long$id %in% twinid),] #now the data does not have twin situations

#third, remove observations when people start to have a second or higher-order birth.
women_6wave_long2 <-  women_6wave_long1%>% 
  filter(childno<2)
##women_6wave_long2 is a cleaned data, which removes twin cases and observations when people start to have a second or higher-order birth

No. 1

Question

Use the staggered difference in difference method to estimate the group-period specific effect of first birth on women’s wellbeing and plot the result

  • Step 1: create the treat group variable, a variable that tell us at which wave a woman is treated.

  • Step 2: then estimate the group-period specific effect of first birth on women’s wellbeing

  • Step 3: plot the result

Answer for step 1

women_6wave_long3 <- women_6wave_long2 %>% 
  group_by(id) %>% 
  mutate(
     wave=as.numeric(wave),
     birthwave=case_when(firstkid==1 ~ wave,
                         TRUE ~ 99), #identify the timing of first birth
     anchorwave=min(birthwave), #generate the anchor wave
     treatgroup=case_when(anchorwave %in% c(2:6) ~ anchorwave,
                          anchorwave==99 ~0 )
       ) 

Answer for step 2

did1 <- att_gt(yname = "sat", #dependent variable
              tname = "wave", #time variable
              idname = "id", #id
              gname = "treatgroup", #the variable in data that contains the first period when a particular observation is treated.
              xformla = ~ health, #when you don't have any covariates to control, use "~ 1"; if yes, you can add covariates here by ~ x1+x2
              data = women_6wave_long3  #specify your data
              )
## Warning in pre_process_did(yname = yname, tname = tname, idname = idname, :
## Dropped 2346 observations while converting to balanced panel.
summary(did1)
## 
## Call:
## att_gt(yname = "sat", tname = "wave", idname = "id", gname = "treatgroup", 
##     xformla = ~health, data = women_6wave_long3)
## 
## Reference: Callaway, Brantly and Pedro H.C. Sant'Anna.  "Difference-in-Differences with Multiple Time Periods." Journal of Econometrics, Vol. 225, No. 2, pp. 200-230, 2021. <https://doi.org/10.1016/j.jeconom.2020.12.001>, <https://arxiv.org/abs/1803.09015> 
## 
## Group-Time Average Treatment Effects:
##  Group Time ATT(g,t) Std. Error [95% Simult.  Conf. Band]  
##      2    2   0.6364     0.3337       -0.2890      1.5618  
##      2    3  -0.1154     0.3966       -1.2152      0.9844  
##      2    4  -0.0283     0.2839       -0.8155      0.7590  
##      2    5   0.1268     0.2816       -0.6540      0.9077  
##      2    6  -0.1344     0.2812       -0.9142      0.6455  
##      3    2   0.1890     0.3822       -0.8710      1.2489  
##      3    3   0.4683     0.3607       -0.5318      1.4684  
##      3    4   0.2713     0.3614       -0.7309      1.2735  
##      3    5  -0.3048     0.4083       -1.4369      0.8273  
##      3    6  -0.0345     0.4100       -1.1713      1.1024  
##      4    2  -0.5418     0.3706       -1.5694      0.4857  
##      4    3   0.6849     0.2590       -0.0333      1.4031  
##      4    4   0.4158     0.2366       -0.2403      1.0720  
##      4    5  -1.0459     0.3530       -2.0249     -0.0670 *
##      4    6  -0.5254     0.2534       -1.2280      0.1772  
##      5    2   0.1981     0.3640       -0.8112      1.2074  
##      5    3   0.2336     0.3146       -0.6386      1.1059  
##      5    4   0.5715     0.2239       -0.0493      1.1923  
##      5    5   0.0140     0.2812       -0.7658      0.7938  
##      5    6  -0.8749     0.3730       -1.9091      0.1594  
##      6    2  -0.4040     0.3087       -1.2601      0.4520  
##      6    3   0.1991     0.3112       -0.6639      1.0620  
##      6    4  -0.2025     0.3146       -1.0749      0.6699  
##      6    5   0.9996     0.2974        0.1749      1.8243 *
##      6    6   0.2850     0.2154       -0.3123      0.8823  
## ---
## Signif. codes: `*' confidence band does not cover 0
## 
## P-value for pre-test of parallel trends assumption:  1e-05
## Control Group:  Never Treated,  Anticipation Periods:  0
## Estimation Method:  Doubly Robust

Answer for step 3

ggdid(did1)

No. 2

Question

Estimate the weighted average effect of all group-time specific treatment effects

Answer

agg1 <- aggte(did1, type = "simple")
summary(agg1)
## 
## Call:
## aggte(MP = did1, type = "simple")
## 
## Reference: Callaway, Brantly and Pedro H.C. Sant'Anna.  "Difference-in-Differences with Multiple Time Periods." Journal of Econometrics, Vol. 225, No. 2, pp. 200-230, 2021. <https://doi.org/10.1016/j.jeconom.2020.12.001>, <https://arxiv.org/abs/1803.09015> 
## 
## 
##     ATT    Std. Error     [ 95%  Conf. Int.] 
##  -0.076        0.1315    -0.3337      0.1817 
## 
## 
## ---
## Signif. codes: `*' confidence band does not cover 0
## 
## Control Group:  Never Treated,  Anticipation Periods:  0
## Estimation Method:  Doubly Robust

No. 3

Question

Estimate the average treatment effect by groups and plot the result

Answer

agg2 <- aggte(did1, type = "group")
summary(agg2)
## 
## Call:
## aggte(MP = did1, type = "group")
## 
## Reference: Callaway, Brantly and Pedro H.C. Sant'Anna.  "Difference-in-Differences with Multiple Time Periods." Journal of Econometrics, Vol. 225, No. 2, pp. 200-230, 2021. <https://doi.org/10.1016/j.jeconom.2020.12.001>, <https://arxiv.org/abs/1803.09015> 
## 
## 
## Overall summary of ATT's based on group/cohort aggregation:  
##      ATT    Std. Error     [ 95%  Conf. Int.] 
##  -0.0729        0.1107    -0.2898      0.1439 
## 
## 
## Group Effects:
##  Group Estimate Std. Error [95% Simult.  Conf. Band] 
##      2   0.0970     0.2284       -0.4438      0.6379 
##      3   0.1001     0.3331       -0.6889      0.8891 
##      4  -0.3852     0.1923       -0.8407      0.0704 
##      5  -0.4304     0.2583       -1.0423      0.1814 
##      6   0.2850     0.2359       -0.2736      0.8437 
## ---
## Signif. codes: `*' confidence band does not cover 0
## 
## Control Group:  Never Treated,  Anticipation Periods:  0
## Estimation Method:  Doubly Robust
ggdid(agg2)

No. 4

Question

Estimate the average time-dynamic effect and plot the result

Answer

agg3 <- aggte(did1, type = "dynamic")
summary(agg3)
## 
## Call:
## aggte(MP = did1, type = "dynamic")
## 
## Reference: Callaway, Brantly and Pedro H.C. Sant'Anna.  "Difference-in-Differences with Multiple Time Periods." Journal of Econometrics, Vol. 225, No. 2, pp. 200-230, 2021. <https://doi.org/10.1016/j.jeconom.2020.12.001>, <https://arxiv.org/abs/1803.09015> 
## 
## 
## Overall summary of ATT's based on event-study/dynamic aggregation:  
##      ATT    Std. Error     [ 95%  Conf. Int.] 
##  -0.1128        0.1414      -0.39      0.1644 
## 
## 
## Dynamic Effects:
##  Event time Estimate Std. Error [95% Simult.  Conf. Band]  
##          -4  -0.4040     0.3078       -1.2309      0.4228  
##          -3   0.1986     0.2347       -0.4317      0.8289  
##          -2  -0.1713     0.1892       -0.6794      0.3369  
##          -1   0.6238     0.1541        0.2100      1.0377 *
##           0   0.3410     0.1307       -0.0100      0.6921  
##           1  -0.4796     0.1785       -0.9592      0.0001  
##           2  -0.3210     0.1814       -0.8083      0.1663  
##           3   0.0301     0.2620       -0.6739      0.7340  
##           4  -0.1344     0.2773       -0.8792      0.6105  
## ---
## Signif. codes: `*' confidence band does not cover 0
## 
## Control Group:  Never Treated,  Anticipation Periods:  0
## Estimation Method:  Doubly Robust
ggdid(agg3)