Now, the question is “does first birth affect women’s life satisfaction?” use different models and select the appropriate model
Prepare the dataset
library(tidyverse) # Add the tidyverse package to my current library.
library(haven) # Handle labelled data.
library(splitstackshape) #transform wide data (with stacked variables) to long data
library(ggplot2)
library(plm)
library(lmtest)
##Import 6 waves of women data
for (i in 1:6) {
assign(paste0("women", i), #assign is similar to <-; paste0 is to combine wave and i into a name, i ranges from 1 to 6.
read_dta(paste0("wave", i, "_women.dta"))
)
}
##Clean 6 waves of women data
clean_fun <- function(df) { df %>%
transmute(
id=zap_label(id), #remove label of id
age=zap_label(age), #remove label of age
wave=as.numeric(wave),
relstat=as_factor(relstat), #make relstat as a factor
relstat=case_when(relstat== "-7 Incomplete data" ~ as.character(NA), #specify when is missing for relstat
TRUE ~ as.character(relstat))%>% as_factor(), #make relstat as a factor again
hlt=case_when(hlt1<0 ~ as.numeric(NA), #specify when hlt1 is missing
TRUE ~ as.numeric(hlt1)),
nkidsbio=case_when(nkidsbio==-7~ as.numeric(NA), #specify when is missing for relstat
TRUE ~ as.numeric(nkidsbio)),
sat=case_when(sat6<0 ~ as.numeric(NA), #specify when sat6 is missing
TRUE ~ as.numeric(sat6)),
)%>% drop_na() }
women1a <- clean_fun(women1)
women2a <- clean_fun(women2)
women3a <- clean_fun(women3)
women4a <- clean_fun(women4)
women5a <- clean_fun(women5)
women6a <- clean_fun(women6)
women1b <- women1a %>% filter(nkidsbio==0)%>% #keep individuals who are childless in the first wave
rename(wave.1=wave, age.1=age, relstat.1=relstat, hlt.1=hlt, nkidsbio.1=nkidsbio, sat.1=sat ) #rename variables
women2b <- women2a %>%
rename(wave.2=wave, age.2=age, relstat.2=relstat, hlt.2=hlt, nkidsbio.2=nkidsbio, sat.2=sat )
women3b <- women3a %>%
rename(wave.3=wave, age.3=age, relstat.3=relstat, hlt.3=hlt, nkidsbio.3=nkidsbio, sat.3=sat )
women4b <- women4a %>%
rename(wave.4=wave, age.4=age, relstat.4=relstat, hlt.4=hlt, nkidsbio.4=nkidsbio, sat.4=sat )
women5b <- women5a %>%
rename(wave.5=wave, age.5=age, relstat.5=relstat, hlt.5=hlt, nkidsbio.5=nkidsbio, sat.5=sat )
women6b <- women6a %>%
rename(wave.6=wave, age.6=age, relstat.6=relstat, hlt.6=hlt, nkidsbio.6=nkidsbio, sat.6=sat )
women_wide <- left_join(women1b, women2b, by = "id") %>% # left join women1b and women2b
left_join(women3b, by = "id") %>% # left join with women3b
left_join(women4b, by = "id") %>% # left join with women4b
left_join(women5b, by = "id") %>% # left join with women5b
left_join(women6b, by = "id") # left join with women6b
#by using left_join I keep those have no kids in the first wave and follow them
women_wide$check <- paste(women_wide$wave.1, women_wide$wave.2, women_wide$wave.3,
women_wide$wave.4, women_wide$wave.5, women_wide$wave.6, sep='-')
women_long<- merged.stack(women_wide, #dataset for transfrom
var.stubs = c("age", "wave", "relstat", "hlt","nkidsbio", "sat"),
#var.stubs is to specify the prefixes of the variable groups
sep = ".") %>%
#sep is to specify the character that separates the "variable name" from the "times" in the source
drop_na(wave)
#drop the observations which did not join the wave
women_long <- women_long %>%
group_by(id) %>%
mutate(
firstkid=case_when( nkidsbio!=dplyr::lag(nkidsbio, 1) & dplyr::lag(nkidsbio, 1)==0 & nkidsbio>0 ~ 1,
TRUE ~ 0),
firstkid2=case_when( nkidsbio!=dplyr::lag(nkidsbio, 1) & dplyr::lag(nkidsbio, 1)==0 & nkidsbio==2 ~ 1,
TRUE ~ 0)
) #to identify individual whose first childbearing is twins
twinid <- women_long$id[women_long$firstkid2==1]
women_long <- women_long[!(women_long$id %in% twinid),] #remove respondents whose first childbearing is twins
women_long <- filter(women_long, nkidsbio<2) # remove repeated event of childbearing, only focus on having first child
women_long <- women_long %>%
select(-firstkid2)%>%
group_by(id) %>%
mutate(
wave=as.numeric(wave),
birthwave=case_when(firstkid==1 ~ wave), #identify the timing of first birth
birthwave1=min(birthwave, na.rm = TRUE) #generate a variable for timing of first birth
)
women_long$birthwave1[is.infinite(women_long$birthwave1)] <- 0 #those who are untreated the value is 0.
Run a staggered difference in difference using Callaway and Sant’Anna (2021)’s method
library(did)
## Warning: package 'did' was built under R version 4.3.2
did1 <- att_gt(yname = "sat", #dependent variable
tname = "wave", #time variable
idname = "id", #id
gname = "birthwave1", #the variable in data that contains the first period when a particular observation is treated.
xformla = ~ hlt, #when you don't have any covariates to control, use "~ 1"; if yes, you can add covariates here by ~ x1+x2
data = women_long #specify your data
)
## Warning in pre_process_did(yname = yname, tname = tname, idname = idname, :
## Dropped 2346 observations while converting to balanced panel.
summary(did1)
##
## Call:
## att_gt(yname = "sat", tname = "wave", idname = "id", gname = "birthwave1",
## xformla = ~hlt, data = women_long)
##
## Reference: Callaway, Brantly and Pedro H.C. Sant'Anna. "Difference-in-Differences with Multiple Time Periods." Journal of Econometrics, Vol. 225, No. 2, pp. 200-230, 2021. <https://doi.org/10.1016/j.jeconom.2020.12.001>, <https://arxiv.org/abs/1803.09015>
##
## Group-Time Average Treatment Effects:
## Group Time ATT(g,t) Std. Error [95% Simult. Conf. Band]
## 2 2 0.6364 0.3382 -0.2980 1.5709
## 2 3 -0.1154 0.4199 -1.2756 1.0448
## 2 4 -0.0283 0.2959 -0.8460 0.7895
## 2 5 0.1268 0.2929 -0.6826 0.9363
## 2 6 -0.1344 0.2728 -0.8881 0.6193
## 3 2 0.1890 0.3715 -0.8376 1.2155
## 3 3 0.4683 0.3571 -0.5184 1.4550
## 3 4 0.2713 0.3490 -0.6929 1.2356
## 3 5 -0.3048 0.4071 -1.4297 0.8200
## 3 6 -0.0345 0.4103 -1.1683 1.0994
## 4 2 -0.5418 0.3218 -1.4311 0.3475
## 4 3 0.6849 0.2517 -0.0105 1.3803
## 4 4 0.4158 0.2468 -0.2660 1.0977
## 4 5 -1.0459 0.3443 -1.9972 -0.0947 *
## 4 6 -0.5254 0.2615 -1.2479 0.1971
## 5 2 0.1981 0.3389 -0.7382 1.1344
## 5 3 0.2336 0.3242 -0.6623 1.1296
## 5 4 0.5715 0.2250 -0.0503 1.1933
## 5 5 0.0140 0.2897 -0.7866 0.8146
## 5 6 -0.8749 0.3582 -1.8647 0.1150
## 6 2 -0.4040 0.3032 -1.2417 0.4336
## 6 3 0.1991 0.3028 -0.6376 1.0357
## 6 4 -0.2025 0.3276 -1.1077 0.7026
## 6 5 0.9996 0.2869 0.2070 1.7923 *
## 6 6 0.2850 0.2189 -0.3199 0.8899
## ---
## Signif. codes: `*' confidence band does not cover 0
##
## P-value for pre-test of parallel trends assumption: 1e-05
## Control Group: Never Treated, Anticipation Periods: 0
## Estimation Method: Doubly Robust
ggdid(did1)
Run a staggered difference in difference using Callaway and Sant’Anna (2021)’s method
agg1 <- aggte(did1, type = "simple")
summary(agg1)
##
## Call:
## aggte(MP = did1, type = "simple")
##
## Reference: Callaway, Brantly and Pedro H.C. Sant'Anna. "Difference-in-Differences with Multiple Time Periods." Journal of Econometrics, Vol. 225, No. 2, pp. 200-230, 2021. <https://doi.org/10.1016/j.jeconom.2020.12.001>, <https://arxiv.org/abs/1803.09015>
##
##
## ATT Std. Error [ 95% Conf. Int.]
## -0.076 0.128 -0.3269 0.1748
##
##
## ---
## Signif. codes: `*' confidence band does not cover 0
##
## Control Group: Never Treated, Anticipation Periods: 0
## Estimation Method: Doubly Robust
Run a staggered difference in difference using Callaway and Sant’Anna (2021)’s method
agg2 <- aggte(did1, type = "group")
summary(agg2)
##
## Call:
## aggte(MP = did1, type = "group")
##
## Reference: Callaway, Brantly and Pedro H.C. Sant'Anna. "Difference-in-Differences with Multiple Time Periods." Journal of Econometrics, Vol. 225, No. 2, pp. 200-230, 2021. <https://doi.org/10.1016/j.jeconom.2020.12.001>, <https://arxiv.org/abs/1803.09015>
##
##
## Overall summary of ATT's based on group/cohort aggregation:
## ATT Std. Error [ 95% Conf. Int.]
## -0.0729 0.1067 -0.282 0.1362
##
##
## Group Effects:
## Group Estimate Std. Error [95% Simult. Conf. Band]
## 2 0.0970 0.2460 -0.4802 0.6743
## 3 0.1001 0.3303 -0.6749 0.8751
## 4 -0.3852 0.2029 -0.8613 0.0909
## 5 -0.4304 0.2566 -1.0326 0.1717
## 6 0.2850 0.2205 -0.2323 0.8024
## ---
## Signif. codes: `*' confidence band does not cover 0
##
## Control Group: Never Treated, Anticipation Periods: 0
## Estimation Method: Doubly Robust
ggdid(agg2)
Run a staggered difference in difference using Callaway and Sant’Anna (2021)’s method
agg3 <- aggte(did1, type = "dynamic")
summary(agg3)
##
## Call:
## aggte(MP = did1, type = "dynamic")
##
## Reference: Callaway, Brantly and Pedro H.C. Sant'Anna. "Difference-in-Differences with Multiple Time Periods." Journal of Econometrics, Vol. 225, No. 2, pp. 200-230, 2021. <https://doi.org/10.1016/j.jeconom.2020.12.001>, <https://arxiv.org/abs/1803.09015>
##
##
## Overall summary of ATT's based on event-study/dynamic aggregation:
## ATT Std. Error [ 95% Conf. Int.]
## -0.1128 0.1387 -0.3847 0.1591
##
##
## Dynamic Effects:
## Event time Estimate Std. Error [95% Simult. Conf. Band]
## -4 -0.4040 0.3097 -1.2360 0.4279
## -3 0.1986 0.2459 -0.4620 0.8592
## -2 -0.1713 0.1721 -0.6335 0.2910
## -1 0.6238 0.1402 0.2473 1.0004 *
## 0 0.3410 0.1270 -0.0001 0.6822
## 1 -0.4796 0.1787 -0.9596 0.0005
## 2 -0.3210 0.1726 -0.7847 0.1427
## 3 0.0301 0.2533 -0.6504 0.7105
## 4 -0.1344 0.2921 -0.9192 0.6505
## ---
## Signif. codes: `*' confidence band does not cover 0
##
## Control Group: Never Treated, Anticipation Periods: 0
## Estimation Method: Doubly Robust
ggdid(agg3)