Now, the question is “does first birth affect women’s life satisfaction?” use different models and select the approriate model Prepare the dataset

library(tidyverse) # Add the tidyverse package to my current library.
library(haven) # Handle labelled data.
library(splitstackshape) #transform wide data (with stacked variables) to long data
library(ggplot2)
library(plm)
library(lmtest)

##Import 6 waves of women data
for (i in 1:6) {
  assign(paste0("women", i), #assign is similar to <-; paste0 is to combine wave and i into a name, i ranges from 1 to 6. 
         read_dta(paste0("wave", i, "_women.dta"))
         )
} 

##Clean 6 waves of women data
clean_fun <- function(df) {  df %>% 
  transmute(
    id=zap_label(id), #remove label of id
    age=zap_label(age), #remove label of age
    wave=as.numeric(wave),
    relstat=as_factor(relstat), #make relstat as a factor
    relstat=case_when(relstat== "-7 Incomplete data" ~ as.character(NA), #specify when is missing for relstat
                      TRUE ~ as.character(relstat))%>% as_factor(), #make relstat as a factor again
    hlt=case_when(hlt1<0 ~ as.numeric(NA),  #specify when hlt1 is missing 
                   TRUE ~ as.numeric(hlt1)),
    nkidsbio=case_when(nkidsbio==-7~ as.numeric(NA), #specify when is missing for relstat
                      TRUE ~ as.numeric(nkidsbio)), 
    sat=case_when(sat6<0 ~ as.numeric(NA), #specify when sat6 is missing
                   TRUE ~ as.numeric(sat6)),
  )%>% drop_na()  }
women1a <- clean_fun(women1)
women2a <- clean_fun(women2)
women3a <- clean_fun(women3)
women4a <- clean_fun(women4)
women5a <- clean_fun(women5)
women6a <- clean_fun(women6)

women1b <- women1a %>% filter(nkidsbio==0)%>% #keep individuals who are childless in the first wave
rename(wave.1=wave, age.1=age, relstat.1=relstat, hlt.1=hlt, nkidsbio.1=nkidsbio, sat.1=sat ) #rename variables

women2b <- women2a %>% 
rename(wave.2=wave, age.2=age, relstat.2=relstat, hlt.2=hlt, nkidsbio.2=nkidsbio, sat.2=sat )

women3b <- women3a %>% 
rename(wave.3=wave, age.3=age, relstat.3=relstat, hlt.3=hlt, nkidsbio.3=nkidsbio, sat.3=sat )

women4b <- women4a %>% 
rename(wave.4=wave, age.4=age, relstat.4=relstat, hlt.4=hlt, nkidsbio.4=nkidsbio, sat.4=sat )

women5b <- women5a %>% 
rename(wave.5=wave, age.5=age, relstat.5=relstat, hlt.5=hlt, nkidsbio.5=nkidsbio, sat.5=sat )

women6b <- women6a %>% 
rename(wave.6=wave, age.6=age, relstat.6=relstat, hlt.6=hlt, nkidsbio.6=nkidsbio, sat.6=sat )


women_wide <- left_join(women1b, women2b, by = "id") %>%  # left join women1b and women2b
  left_join(women3b, by = "id") %>% # left join with women3b
  left_join(women4b, by = "id") %>% # left join with women4b
  left_join(women5b, by = "id") %>% # left join with women5b
  left_join(women6b, by = "id") # left join with women6b
#by using left_join I keep those have no kids in the first wave and follow them

women_wide$check <- paste(women_wide$wave.1, women_wide$wave.2, women_wide$wave.3,
                          women_wide$wave.4, women_wide$wave.5, women_wide$wave.6, sep='-')

women_long<- merged.stack(women_wide, #dataset for transfrom
                            var.stubs = c("age", "wave", "relstat", "hlt","nkidsbio", "sat"), 
#var.stubs is to specify the prefixes of the variable groups
                            sep = ".") %>%  
#sep is to specify the character that separates the "variable name" from the "times" in the source
                            drop_na(wave)
#drop the observations which did not join the wave


women_long <- women_long %>% 
  group_by(id) %>% 
  mutate(
    firstkid=case_when( nkidsbio!=dplyr::lag(nkidsbio, 1) & dplyr::lag(nkidsbio, 1)==0 & nkidsbio>0 ~ 1,
                          TRUE ~ 0),
    firstkid2=case_when( nkidsbio!=dplyr::lag(nkidsbio, 1) & dplyr::lag(nkidsbio, 1)==0 & nkidsbio==2 ~ 1,
                          TRUE ~ 0)
    ) #to identify individual whose first childbearing is twins

twinid <- women_long$id[women_long$firstkid2==1]

women_long <- women_long[!(women_long$id %in% twinid),] #remove respondents whose first childbearing is twins
women_long <-  filter(women_long, nkidsbio<2) # remove repeated event of childbearing, only focus on having first child

women_long <- women_long %>% 
  select(-firstkid2)%>%
  group_by(id) %>% 
  mutate(
     wave=as.numeric(wave),
     birthwave=case_when(firstkid==1 ~ wave), #identify the timing of first birth
     birthwave1=min(birthwave, na.rm = TRUE) #generate a variable for timing of first birth
  )
women_long$birthwave1[is.infinite(women_long$birthwave1)] <- 0 

No. 1

Question

Run a staggered difference in difference using Callaway and Sant’Anna (2021)’s method

  • Estimate the group-period specific effect of first birth on women’s wellbeing and plot the result

Answer

library(did)
## Warning: package 'did' was built under R version 4.2.2
did1 <- att_gt(yname = "sat", #dependent variable
              tname = "wave", #time variable
              idname = "id", #id
              gname = "birthwave1", #the variable in data that contains the first period when a particular observation is treated.
              xformla = ~ hlt, #when you don't have any covariates to control, use "~ 1"; if yes, you can add covariates here by ~ x1+x2
              data = women_long  #specify your data
              )
## Warning in pre_process_did(yname = yname, tname = tname, idname = idname, :
## Dropped 2346 observations while converting to balanced panel.
summary(did1)
## 
## Call:
## att_gt(yname = "sat", tname = "wave", idname = "id", gname = "birthwave1", 
##     xformla = ~hlt, data = women_long)
## 
## Reference: Callaway, Brantly and Pedro H.C. Sant'Anna.  "Difference-in-Differences with Multiple Time Periods." Journal of Econometrics, Vol. 225, No. 2, pp. 200-230, 2021. <https://doi.org/10.1016/j.jeconom.2020.12.001>, <https://arxiv.org/abs/1803.09015> 
## 
## Group-Time Average Treatment Effects:
##  Group Time ATT(g,t) Std. Error [95% Simult.  Conf. Band]  
##      2    2   0.6364     0.3153       -0.2086      1.4815  
##      2    3  -0.1154     0.3794       -1.1323      0.9015  
##      2    4  -0.0283     0.2884       -0.8013      0.7447  
##      2    5   0.1268     0.2719       -0.6019      0.8556  
##      2    6  -0.1344     0.2715       -0.8622      0.5934  
##      3    2   0.1890     0.3930       -0.8645      1.2424  
##      3    3   0.4683     0.3774       -0.5433      1.4798  
##      3    4   0.2713     0.3509       -0.6694      1.2120  
##      3    5  -0.3048     0.3874       -1.3432      0.7336  
##      3    6  -0.0345     0.4167       -1.1515      1.0826  
##      4    2  -0.5418     0.3488       -1.4769      0.3932  
##      4    3   0.6849     0.2496        0.0159      1.3539 *
##      4    4   0.4158     0.2634       -0.2903      1.1220  
##      4    5  -1.0459     0.3350       -1.9440     -0.1478 *
##      4    6  -0.5254     0.2539       -1.2061      0.1553  
##      5    2   0.1981     0.3542       -0.7515      1.1476  
##      5    3   0.2336     0.3214       -0.6280      1.0952  
##      5    4   0.5715     0.2117        0.0041      1.1390 *
##      5    5   0.0140     0.2890       -0.7608      0.7888  
##      5    6  -0.8749     0.3623       -1.8461      0.0964  
##      6    2  -0.4040     0.3088       -1.2319      0.4238  
##      6    3   0.1991     0.3238       -0.6690      1.0672  
##      6    4  -0.2025     0.3328       -1.0945      0.6895  
##      6    5   0.9996     0.2872        0.2297      1.7696 *
##      6    6   0.2850     0.2303       -0.3323      0.9024  
## ---
## Signif. codes: `*' confidence band does not cover 0
## 
## P-value for pre-test of parallel trends assumption:  1e-05
## Control Group:  Never Treated,  Anticipation Periods:  0
## Estimation Method:  Doubly Robust
ggdid(did1)

No. 2

Question

Run a staggered difference in difference using Callaway and Sant’Anna (2021)’s method

  • Estimate the weighted average effect of all group-time specific treatment effects

Answer

agg1 <- aggte(did1, type = "simple")
summary(agg1)
## 
## Call:
## aggte(MP = did1, type = "simple")
## 
## Reference: Callaway, Brantly and Pedro H.C. Sant'Anna.  "Difference-in-Differences with Multiple Time Periods." Journal of Econometrics, Vol. 225, No. 2, pp. 200-230, 2021. <https://doi.org/10.1016/j.jeconom.2020.12.001>, <https://arxiv.org/abs/1803.09015> 
## 
## 
##     ATT    Std. Error     [ 95%  Conf. Int.] 
##  -0.076        0.1376    -0.3458      0.1937 
## 
## 
## ---
## Signif. codes: `*' confidence band does not cover 0
## 
## Control Group:  Never Treated,  Anticipation Periods:  0
## Estimation Method:  Doubly Robust

No. 3

Question

Run a staggered difference in difference using Callaway and Sant’Anna (2021)’s method

  • Estimate the average treatment effect by groups and plot the result

Answer

agg2 <- aggte(did1, type = "group")
summary(agg2)
## 
## Call:
## aggte(MP = did1, type = "group")
## 
## Reference: Callaway, Brantly and Pedro H.C. Sant'Anna.  "Difference-in-Differences with Multiple Time Periods." Journal of Econometrics, Vol. 225, No. 2, pp. 200-230, 2021. <https://doi.org/10.1016/j.jeconom.2020.12.001>, <https://arxiv.org/abs/1803.09015> 
## 
## 
## Overall summary of ATT's based on group/cohort aggregation:  
##      ATT    Std. Error     [ 95%  Conf. Int.] 
##  -0.0729        0.1086    -0.2857      0.1398 
## 
## 
## Group Effects:
##  Group Estimate Std. Error [95% Simult.  Conf. Band] 
##      2   0.0970     0.2407       -0.4954      0.6895 
##      3   0.1001     0.3370       -0.7296      0.9298 
##      4  -0.3852     0.1906       -0.8544      0.0841 
##      5  -0.4304     0.2613       -1.0738      0.2129 
##      6   0.2850     0.2189       -0.2539      0.8239 
## ---
## Signif. codes: `*' confidence band does not cover 0
## 
## Control Group:  Never Treated,  Anticipation Periods:  0
## Estimation Method:  Doubly Robust
ggdid(agg2)

No. 4

Question

Run a staggered difference in difference using Callaway and Sant’Anna (2021)’s method

  • Estimate the average time-dynamic effect and plot the result

Answer

agg3 <- aggte(did1, type = "dynamic")
summary(agg3)
## 
## Call:
## aggte(MP = did1, type = "dynamic")
## 
## Reference: Callaway, Brantly and Pedro H.C. Sant'Anna.  "Difference-in-Differences with Multiple Time Periods." Journal of Econometrics, Vol. 225, No. 2, pp. 200-230, 2021. <https://doi.org/10.1016/j.jeconom.2020.12.001>, <https://arxiv.org/abs/1803.09015> 
## 
## 
## Overall summary of ATT's based on event-study/dynamic aggregation:  
##      ATT    Std. Error     [ 95%  Conf. Int.] 
##  -0.1128        0.1363    -0.3799      0.1543 
## 
## 
## Dynamic Effects:
##  Event time Estimate Std. Error [95% Simult.  Conf. Band]  
##          -4  -0.4040     0.3040       -1.1867      0.3786  
##          -3   0.1986     0.2420       -0.4244      0.8216  
##          -2  -0.1713     0.1995       -0.6849      0.3424  
##          -1   0.6238     0.1440        0.2531      0.9945 *
##           0   0.3410     0.1155        0.0437      0.6384 *
##           1  -0.4796     0.1840       -0.9534     -0.0058 *
##           2  -0.3210     0.1845       -0.7960      0.1541  
##           3   0.0301     0.2416       -0.5921      0.6522  
##           4  -0.1344     0.2748       -0.8418      0.5731  
## ---
## Signif. codes: `*' confidence band does not cover 0
## 
## Control Group:  Never Treated,  Anticipation Periods:  0
## Estimation Method:  Doubly Robust
ggdid(agg3)