Now, the question is “does first birth affect women’s life satisfaction?” use th staggered difference in difference to estimate the effectl
Prepare the dataset
library(tidyverse) # Add the tidyverse package to my current library.
library(haven) # Handle labelled data.
library(texreg) #output result
library(splitstackshape) #transform wide data (with stacked variables) to long data
library(plm) #linear models for panel data
library(did) #for staggered difference in difference
##Import 6 waves of women data
women1 <- read_dta("wave1_women.dta")
women2 <- read_dta("wave2_women.dta")
women3 <- read_dta("wave3_women.dta")
women4 <- read_dta("wave4_women.dta")
women5 <- read_dta("wave5_women.dta")
women6 <- read_dta("wave6_women.dta")
##Clean 6 waves of women data
clean_fun <- function(df) { df %>%
transmute(
id,
age,
wave=as.numeric(wave),
relstat=as_factor(relstat), #make relstat as a factor
relstat=case_when(relstat== "-7 Incomplete data" ~ as.character(NA), #specify when is missing for relstat
TRUE ~ as.character(relstat))%>% as_factor(), #make relstat as a factor again
health=case_when(hlt1<0 ~ as.numeric(NA), #specify when hlt1 is missing
TRUE ~ as.numeric(hlt1)),
childno=case_when(nkidsbio==-7~ as.numeric(NA), #specify when is missing for relstat
TRUE ~ as.numeric(nkidsbio)),
sat=case_when(sat6<0 ~ as.numeric(NA), #specify when sat6 is missing
TRUE ~ as.numeric(sat6)),
)%>% drop_na() }
women1a <- clean_fun(women1)
women2a <- clean_fun(women2)
women3a <- clean_fun(women3)
women4a <- clean_fun(women4)
women5a <- clean_fun(women5)
women6a <- clean_fun(women6)
women1b <- women1a %>% filter(childno==0)%>% #keep individuals who are childless in the first wave
rename(wave.1=wave, age.1=age, relstat.1=relstat, health.1=health, childno.1=childno, sat.1=sat ) #rename variables
women2b <- women2a %>%
rename(wave.2=wave, age.2=age, relstat.2=relstat, health.2=health, childno.2=childno, sat.2=sat )
women3b <- women3a %>%
rename(wave.3=wave, age.3=age, relstat.3=relstat, health.3=health, childno.3=childno, sat.3=sat )
women4b <- women4a %>%
rename(wave.4=wave, age.4=age, relstat.4=relstat, health.4=health, childno.4=childno, sat.4=sat )
women5b <- women5a %>%
rename(wave.5=wave, age.5=age, relstat.5=relstat, health.5=health, childno.5=childno, sat.5=sat )
women6b <- women6a %>%
rename(wave.6=wave, age.6=age, relstat.6=relstat, health.6=health, childno.6=childno, sat.6=sat )
###six waves of women data
women_6wave_wide <- left_join(women1b, women2b, by = "id") %>% # left join women1b and women2b
left_join(women3b, by = "id") %>% # left join with women3b
left_join(women4b, by = "id") %>% # left join with women4b
left_join(women5b, by = "id") %>% # left join with women5b
left_join(women6b, by = "id") # left join with women6b
#by using left_join I keep those have no kids in the first wave and follow them
women_6wave_long<- merged.stack(women_6wave_wide, #dataset for transfrom
var.stubs = c("age", "wave", "relstat", "health","childno", "sat"),
#var.stubs is to specify the prefixes of the variable groups
sep = ".") %>%
#sep is to specify the character that separates the "variable name" from the "times" in the source
drop_na(wave)
#drop the observations which did not join the wave
women_6wave_long <- women_6wave_long %>%
group_by(id) %>%
mutate(
firstkid=case_when( childno!=dplyr::lag(childno, 1) & dplyr::lag(childno, 1)==0 & childno>0 ~ 1,
TRUE ~ 0),
#when the person has 0 children at t-1 while has at least 1 child at t, define it first childbirth
twin=case_when( childno!=dplyr::lag(childno, 1) & dplyr::lag(childno, 1)==0 & childno==1 ~ 1, #single birth
childno!=dplyr::lag(childno, 1) & dplyr::lag(childno, 1)==0 & childno==2 ~ 2, #twin birth
TRUE ~ 0)
#when the person has 0 children at t-1 while has 1 child at t, define it a single birth, i.e. 1
#when the person has 0 children at t-1 while has 2 children at t, define it a twin birth, i.e. 2
)
#second, remove individuals who have twins
twinid <- women_6wave_long$id[women_6wave_long$twin==2] #the id of women who have twin for their first birth
women_6wave_long1 <- women_6wave_long[!(women_6wave_long$id %in% twinid),] #now the data does not have twin situations
#third, remove observations when people start to have a second or higher-order birth.
women_6wave_long2 <- women_6wave_long1%>%
filter(childno<2)
##women_6wave_long2 is a cleaned data, which removes twin cases and observations when people start to have a second or higher-order birth
Use the staggered difference in difference method to estimate the group-period specific effect of first birth on women’s wellbeing and plot the result
Step 1: create the treat group variable, a variable that tell us at which wave a woman is treated.
Step 2: then estimate the group-period specific effect of first birth on women’s wellbeing
Step 3: plot the result
women_6wave_long3 <- women_6wave_long2 %>%
group_by(id) %>%
mutate(
wave=as.numeric(wave),
birthwave=case_when(firstkid==1 ~ wave,
TRUE ~ 99), #identify the timing of first birth
anchorwave=min(birthwave), #generate the anchor wave
treatgroup=case_when(anchorwave %in% c(2:6) ~ anchorwave,
anchorwave==99 ~0 )
)
did1 <- att_gt(yname = "sat", #dependent variable
tname = "wave", #time variable
idname = "id", #id
gname = "treatgroup", #the variable in data that contains the first period when a particular observation is treated.
xformla = ~ health, #when you don't have any covariates to control, use "~ 1"; if yes, you can add covariates here by ~ x1+x2
data = women_6wave_long3 #specify your data
)
## Warning in pre_process_did(yname = yname, tname = tname, idname = idname, :
## Dropped 2346 observations while converting to balanced panel.
summary(did1)
##
## Call:
## att_gt(yname = "sat", tname = "wave", idname = "id", gname = "treatgroup",
## xformla = ~health, data = women_6wave_long3)
##
## Reference: Callaway, Brantly and Pedro H.C. Sant'Anna. "Difference-in-Differences with Multiple Time Periods." Journal of Econometrics, Vol. 225, No. 2, pp. 200-230, 2021. <https://doi.org/10.1016/j.jeconom.2020.12.001>, <https://arxiv.org/abs/1803.09015>
##
## Group-Time Average Treatment Effects:
## Group Time ATT(g,t) Std. Error [95% Simult. Conf. Band]
## 2 2 0.6364 0.3337 -0.2890 1.5618
## 2 3 -0.1154 0.3966 -1.2152 0.9844
## 2 4 -0.0283 0.2839 -0.8155 0.7590
## 2 5 0.1268 0.2816 -0.6540 0.9077
## 2 6 -0.1344 0.2812 -0.9142 0.6455
## 3 2 0.1890 0.3822 -0.8710 1.2489
## 3 3 0.4683 0.3607 -0.5318 1.4684
## 3 4 0.2713 0.3614 -0.7309 1.2735
## 3 5 -0.3048 0.4083 -1.4369 0.8273
## 3 6 -0.0345 0.4100 -1.1713 1.1024
## 4 2 -0.5418 0.3706 -1.5694 0.4857
## 4 3 0.6849 0.2590 -0.0333 1.4031
## 4 4 0.4158 0.2366 -0.2403 1.0720
## 4 5 -1.0459 0.3530 -2.0249 -0.0670 *
## 4 6 -0.5254 0.2534 -1.2280 0.1772
## 5 2 0.1981 0.3640 -0.8112 1.2074
## 5 3 0.2336 0.3146 -0.6386 1.1059
## 5 4 0.5715 0.2239 -0.0493 1.1923
## 5 5 0.0140 0.2812 -0.7658 0.7938
## 5 6 -0.8749 0.3730 -1.9091 0.1594
## 6 2 -0.4040 0.3087 -1.2601 0.4520
## 6 3 0.1991 0.3112 -0.6639 1.0620
## 6 4 -0.2025 0.3146 -1.0749 0.6699
## 6 5 0.9996 0.2974 0.1749 1.8243 *
## 6 6 0.2850 0.2154 -0.3123 0.8823
## ---
## Signif. codes: `*' confidence band does not cover 0
##
## P-value for pre-test of parallel trends assumption: 1e-05
## Control Group: Never Treated, Anticipation Periods: 0
## Estimation Method: Doubly Robust
ggdid(did1)
Estimate the weighted average effect of all group-time specific treatment effects
agg1 <- aggte(did1, type = "simple")
summary(agg1)
##
## Call:
## aggte(MP = did1, type = "simple")
##
## Reference: Callaway, Brantly and Pedro H.C. Sant'Anna. "Difference-in-Differences with Multiple Time Periods." Journal of Econometrics, Vol. 225, No. 2, pp. 200-230, 2021. <https://doi.org/10.1016/j.jeconom.2020.12.001>, <https://arxiv.org/abs/1803.09015>
##
##
## ATT Std. Error [ 95% Conf. Int.]
## -0.076 0.1315 -0.3337 0.1817
##
##
## ---
## Signif. codes: `*' confidence band does not cover 0
##
## Control Group: Never Treated, Anticipation Periods: 0
## Estimation Method: Doubly Robust
Estimate the average treatment effect by groups and plot the result
agg2 <- aggte(did1, type = "group")
summary(agg2)
##
## Call:
## aggte(MP = did1, type = "group")
##
## Reference: Callaway, Brantly and Pedro H.C. Sant'Anna. "Difference-in-Differences with Multiple Time Periods." Journal of Econometrics, Vol. 225, No. 2, pp. 200-230, 2021. <https://doi.org/10.1016/j.jeconom.2020.12.001>, <https://arxiv.org/abs/1803.09015>
##
##
## Overall summary of ATT's based on group/cohort aggregation:
## ATT Std. Error [ 95% Conf. Int.]
## -0.0729 0.1107 -0.2898 0.1439
##
##
## Group Effects:
## Group Estimate Std. Error [95% Simult. Conf. Band]
## 2 0.0970 0.2284 -0.4438 0.6379
## 3 0.1001 0.3331 -0.6889 0.8891
## 4 -0.3852 0.1923 -0.8407 0.0704
## 5 -0.4304 0.2583 -1.0423 0.1814
## 6 0.2850 0.2359 -0.2736 0.8437
## ---
## Signif. codes: `*' confidence band does not cover 0
##
## Control Group: Never Treated, Anticipation Periods: 0
## Estimation Method: Doubly Robust
ggdid(agg2)
Estimate the average time-dynamic effect and plot the result
agg3 <- aggte(did1, type = "dynamic")
summary(agg3)
##
## Call:
## aggte(MP = did1, type = "dynamic")
##
## Reference: Callaway, Brantly and Pedro H.C. Sant'Anna. "Difference-in-Differences with Multiple Time Periods." Journal of Econometrics, Vol. 225, No. 2, pp. 200-230, 2021. <https://doi.org/10.1016/j.jeconom.2020.12.001>, <https://arxiv.org/abs/1803.09015>
##
##
## Overall summary of ATT's based on event-study/dynamic aggregation:
## ATT Std. Error [ 95% Conf. Int.]
## -0.1128 0.1414 -0.39 0.1644
##
##
## Dynamic Effects:
## Event time Estimate Std. Error [95% Simult. Conf. Band]
## -4 -0.4040 0.3078 -1.2309 0.4228
## -3 0.1986 0.2347 -0.4317 0.8289
## -2 -0.1713 0.1892 -0.6794 0.3369
## -1 0.6238 0.1541 0.2100 1.0377 *
## 0 0.3410 0.1307 -0.0100 0.6921
## 1 -0.4796 0.1785 -0.9592 0.0001
## 2 -0.3210 0.1814 -0.8083 0.1663
## 3 0.0301 0.2620 -0.6739 0.7340
## 4 -0.1344 0.2773 -0.8792 0.6105
## ---
## Signif. codes: `*' confidence band does not cover 0
##
## Control Group: Never Treated, Anticipation Periods: 0
## Estimation Method: Doubly Robust
ggdid(agg3)