Now, the question is “does first birth affect women’s life satisfaction?” use different models and select the approriate model

Prepare the dataset

library(tidyverse) # Add the tidyverse package to my current library.
library(haven) # Handle labelled data.
library(broom)
library(splitstackshape) #transform wide data (with stacked variables) to long data
library(plm) #linear models for panel data
library(did) #for difference in difference analysis


##Import 6 waves of women data
for (i in 1:2) {
  assign(paste0("women", i), #assign is similar to <-; paste0 is to combine wave and i into a name, i ranges from 1 to 6. 
         read_dta(paste0("wave", i, "_women.dta"))
         )
} 

##Clean 6 waves of women data
clean_fun <- function(df) {  df %>% 
  transmute(
    id=zap_label(id), #remove label of id
    age=zap_label(age), #remove label of age
    wave=as.numeric(wave),
    relstat=as_factor(relstat), #make relstat as a factor
    relstat=case_when(relstat== "-7 Incomplete data" ~ as.character(NA), #specify when is missing for relstat
                      TRUE ~ as.character(relstat))%>% as_factor(), #make relstat as a factor again
    hlt=case_when(hlt1<0 ~ as.numeric(NA),  #specify when hlt1 is missing 
                   TRUE ~ as.numeric(hlt1)),
    nkidsbio=case_when(nkidsbio==-7~ as.numeric(NA), #specify when is missing for relstat
                      TRUE ~ as.numeric(nkidsbio)), 
    sat=case_when(sat6<0 ~ as.numeric(NA), #specify when sat6 is missing
                   TRUE ~ as.numeric(sat6)),
  )%>% drop_na()  }
women1a <- clean_fun(women1)
women2a <- clean_fun(women2)
#women3a <- clean_fun(women3)
#women4a <- clean_fun(women4)
#women5a <- clean_fun(women5)
#women6a <- clean_fun(women6)

women1b <- women1a %>% filter(nkidsbio==0)%>% #keep individuals who are childless in the first wave
rename(wave.1=wave, age.1=age, relstat.1=relstat, hlt.1=hlt, nkidsbio.1=nkidsbio, sat.1=sat ) #rename variables

women2b <- women2a %>% 
rename(wave.2=wave, age.2=age, relstat.2=relstat, hlt.2=hlt, nkidsbio.2=nkidsbio, sat.2=sat )

#women3b <- women3a %>% 
#rename(wave.3=wave, age.3=age, relstat.3=relstat, hlt.3=hlt, nkidsbio.3=nkidsbio, sat.3=sat )

#women4b <- women4a %>% 
#rename(wave.4=wave, age.4=age, relstat.4=relstat, hlt.4=hlt, nkidsbio.4=nkidsbio, sat.4=sat )

#women5b <- women5a %>% 
#rename(wave.5=wave, age.5=age, relstat.5=relstat, hlt.5=hlt, nkidsbio.5=nkidsbio, sat.5=sat )

#women6b <- women6a %>% 
#rename(wave.6=wave, age.6=age, relstat.6=relstat, hlt.6=hlt, nkidsbio.6=nkidsbio, sat.6=sat )


women_wide <- left_join(women1b, women2b, by = "id") 
#%>%  # left join women1b and women2b
#  left_join(women3b, by = "id") %>% # left join with women3b
#  left_join(women4b, by = "id") %>% # left join with women4b
#  left_join(women5b, by = "id") %>% # left join with women5b
#  left_join(women6b, by = "id") # left join with women6b
#by using left_join I keep those have no kids in the first wave and follow them


## Check the participation over time
women_wide$check <- paste(women_wide$wave.1, women_wide$wave.2, sep='-')
#wide_data$check <- paste(wide_data$wave.1, wide_data$wave.2, wide_data$wave.3,
#                         wide_data$wave.4, wide_data$wave.5, wide_data$wave.6, sep='-')
table(women_wide$check)
## 
##  1-2 1-NA 
## 2821  949
women_long<- merged.stack(women_wide, #dataset for transfrom
                            var.stubs = c("age", "wave", "relstat", "hlt","nkidsbio", "sat"), 
#var.stubs is to specify the prefixes of the variable groups
                            sep = ".") %>%  
#sep is to specify the character that separates the "variable name" from the "times" in the source
                            drop_na(wave)
#drop the observations which did not join the wave


women_long <- women_long %>% 
  group_by(id) %>% 
  mutate(
    firstkid=case_when( nkidsbio!=dplyr::lag(nkidsbio, 1) & dplyr::lag(nkidsbio, 1)==0 & nkidsbio>0 ~ 1,
                          TRUE ~ 0),
    firstkid2=case_when( nkidsbio!=dplyr::lag(nkidsbio, 1) & dplyr::lag(nkidsbio, 1)==0 & nkidsbio==2 ~ 1,
                          TRUE ~ 0)
    ) #to identify individual whose first childbearing is twins

twinid <- women_long$id[women_long$firstkid2==1]

women_long <- women_long[!(women_long$id %in% twinid),] #remove respondents whose first childbearing is twins
women_long <-  filter(women_long, nkidsbio<2) # remove repeated event of childbearing, only focus on having first child


women_long <- women_long %>% 
  select(-firstkid2)%>%
  group_by(id) %>% 
  mutate(
     wave=as.numeric(wave),
     birthwave=case_when(firstkid==1 ~ wave), #identify the timing of first birth
     birthwave1=min(birthwave, na.rm = TRUE) #generate a variable for timing of first birth
  )
women_long$birthwave1[is.infinite(women_long$birthwave1)] <- 0 #those who are untreated the value is 0.

No. 1

Question

Generate a balanced data based on dataset “women_long”

Answer

table(women_wide$check)
## 
##  1-2 1-NA 
## 2821  949
#we find that 949 have only participate in wave 1
women_balance <- filter(women_long, check=="1-2")

No. 2

Question

Run an ols regression to do the difference-in-difference estimation

Answer

women_balance <- women_balance %>% 
  group_by(id) %>% 
  mutate(
    treatgroup=sum(firstkid), #create a variable to identify the treat group
    treattime=case_when(wave==1 ~ 0,
                        wave==2 ~ 1), #create a variable to identify the treatment time
    grouptime=treatgroup*treattime #create an interaction between the treat group and treatment time.
  ) %>% 
  filter(check=="1-2") #drop individuals who only participate in the first wave. Now the data is balanced.

didmodel1 <- lm(sat ~ treatgroup* treattime, data = women_balance)
summary(didmodel1)
## 
## Call:
## lm(formula = sat ~ treatgroup * treattime, data = women_balance)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.7775 -0.7775  0.2225  1.2225  2.3333 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           7.6675220  0.0314740 243.614   <2e-16 ***
## treatgroup           -0.0008553  0.1761170  -0.005   0.9961    
## treattime             0.1099707  0.0445110   2.471   0.0135 *  
## treatgroup:treattime  0.5122515  0.2490670   2.057   0.0398 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.644 on 5632 degrees of freedom
## Multiple R-squared:  0.002966,   Adjusted R-squared:  0.002435 
## F-statistic: 5.584 on 3 and 5632 DF,  p-value: 0.0008033

No. 3

Question

Run a twoway fixed regression to do the difference-in-difference estimation

Answer

women_balance1 <- pdata.frame(women_balance, index=c("id", "wave"))
didmodel2 <- plm(sat ~ grouptime, data=women_balance1, effect="twoway", model="within") ##
summary(didmodel2)
## Twoways effects Within Model
## 
## Call:
## plm(formula = sat ~ grouptime, data = women_balance1, effect = "twoway", 
##     model = "within")
## 
## Balanced Panel: n = 2818, T = 2, N = 5636
## 
## Residuals:
##        Min.     1st Qu.      Median     3rd Qu.        Max. 
## -4.5550e+00 -4.4501e-01 -4.4409e-16  4.4501e-01  4.5550e+00 
## 
## Coefficients:
##           Estimate Std. Error t-value Pr(>|t|)   
## grouptime  0.51225    0.18152   2.822 0.004806 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    4053.5
## Residual Sum of Squares: 4042.1
## R-Squared:      0.00282
## Adj. R-Squared: -0.99542
## F-statistic: 7.96361 on 1 and 2816 DF, p-value: 0.0048062

No. 4

Question

use att_gt() function under the “did” package to do the difference-in-difference estimation

Answer

didmodel3 <- att_gt(yname = "sat", #dependent variable
                        tname = "wave", #time
                        idname = "id", #id
                        gname = "birthwave1", #the variable in data that contains the first period when a particular observation is treated.
                        xformla = ~1, #when you don't have any covariates to control, use "~ 1"; if yes, you can add covariates here by ~ x1+x2
                        data = women_balance #specify your data
                        )
## No pre-treatment periods to test
summary(didmodel3)
## 
## Call:
## att_gt(yname = "sat", tname = "wave", idname = "id", gname = "birthwave1", 
##     xformla = ~1, data = women_balance)
## 
## Reference: Callaway, Brantly and Pedro H.C. Sant'Anna.  "Difference-in-Differences with Multiple Time Periods." Journal of Econometrics, Vol. 225, No. 2, pp. 200-230, 2021. <https://doi.org/10.1016/j.jeconom.2020.12.001>, <https://arxiv.org/abs/1803.09015> 
## 
## Group-Time Average Treatment Effects:
##  Group Time ATT(g,t) Std. Error [95% Pointwise  Conf. Band]  
##      2    2   0.5123     0.1631          0.1829      0.8416 *
## ---
## Signif. codes: `*' confidence band does not cover 0
## 
## Control Group:  Never Treated,  Anticipation Periods:  0
## Estimation Method:  Doubly Robust