Now, the question is “does first birth affect women’s life satisfaction?”. Try to estimate the effect using difference in difference method

Prepare the dataset

library(tidyverse) # Add the tidyverse package to my current library.
library(haven) # Handle labelled data.
library(texreg) #output result
library(splitstackshape) #transform wide data (with stacked variables) to long data
library(plm) #linear models for panel data

##Import 6 waves of women data
women1 <- read_dta("wave1_women.dta")
women2 <- read_dta("wave2_women.dta")
women3 <- read_dta("wave3_women.dta")
women4 <- read_dta("wave4_women.dta")
women5 <- read_dta("wave5_women.dta")
women6 <- read_dta("wave6_women.dta")

##Clean 6 waves of women data
clean_fun <- function(df) {  df %>% 
  transmute(
    id, 
    age, 
    wave=as.numeric(wave),
    relstat=as_factor(relstat), #make relstat as a factor
    relstat=case_when(relstat== "-7 Incomplete data" ~ as.character(NA), #specify when is missing for relstat
                      TRUE ~ as.character(relstat))%>% as_factor(), #make relstat as a factor again
    health=case_when(hlt1<0 ~ as.numeric(NA),  #specify when hlt1 is missing 
                   TRUE ~ as.numeric(hlt1)),
    childno=case_when(nkidsbio==-7~ as.numeric(NA), #specify when is missing for relstat
                      TRUE ~ as.numeric(nkidsbio)), 
    sat=case_when(sat6<0 ~ as.numeric(NA), #specify when sat6 is missing
                   TRUE ~ as.numeric(sat6)),
  )%>% drop_na()  }
women1a <- clean_fun(women1)
women2a <- clean_fun(women2)
women3a <- clean_fun(women3)
women4a <- clean_fun(women4)
women5a <- clean_fun(women5)
women6a <- clean_fun(women6)

women1b <- women1a %>% filter(childno==0)%>% #keep individuals who are childless in the first wave
rename(wave.1=wave, age.1=age, relstat.1=relstat, health.1=health, childno.1=childno, sat.1=sat ) #rename variables

women2b <- women2a %>% 
rename(wave.2=wave, age.2=age, relstat.2=relstat, health.2=health, childno.2=childno, sat.2=sat )

women3b <- women3a %>% 
rename(wave.3=wave, age.3=age, relstat.3=relstat, health.3=health, childno.3=childno, sat.3=sat )

women4b <- women4a %>% 
rename(wave.4=wave, age.4=age, relstat.4=relstat, health.4=health, childno.4=childno, sat.4=sat )

women5b <- women5a %>% 
rename(wave.5=wave, age.5=age, relstat.5=relstat, health.5=health, childno.5=childno, sat.5=sat )

women6b <- women6a %>% 
rename(wave.6=wave, age.6=age, relstat.6=relstat, health.6=health, childno.6=childno, sat.6=sat )

###two waves of women data
women_2wave_wide <- left_join(women1b, women2b, by = "id")  # left join women1b and women2b
#by using left_join I keep those have no kids in the first wave and follow them

women_2wave_wide$check <- paste(women_2wave_wide$wave.1, women_2wave_wide$wave.2,sep='-')

women_2wave_long<- merged.stack(women_2wave_wide, #dataset for transfrom
                            var.stubs = c("age", "wave", "relstat", "health","childno", "sat"), 
#var.stubs is to specify the prefixes of the variable groups
                            sep = ".") %>%  
#sep is to specify the character that separates the "variable name" from the "times" in the source
                            drop_na(wave)
#drop the observations which did not join the wave

women_2wave_long <- women_2wave_long %>% 
  group_by(id) %>% 
  mutate(
    firstkid=case_when( childno!=dplyr::lag(childno, 1) & dplyr::lag(childno, 1)==0 & childno>0 ~ 1,
                          TRUE ~ 0), 
    #when the person has 0 children at t-1 while has at least 1 child at t, define it first childbirth
    twin=case_when( childno!=dplyr::lag(childno, 1) & dplyr::lag(childno, 1)==0 & childno==1 ~ 1, #single birth
                    childno!=dplyr::lag(childno, 1) & dplyr::lag(childno, 1)==0 & childno==2 ~ 2, #twin birth
                    TRUE ~ 0)
    #when the person has 0 children at t-1 while has 1 child at t, define it a single birth, i.e. 1
    #when the person has 0 children at t-1 while has 2 children at t, define it a twin birth, i.e. 2
    ) 

#lastly, remove individuals who have twins
twinid <- women_2wave_long$id[women_2wave_long$twin==2] #the id of women who have twin for their first birth
women_2wave_long1 <- women_2wave_long[!(women_2wave_long$id %in% twinid),] #now the data does not have twin situations

No. 1

Question

women_2wave_long1 is a unbalanced data of two waves. Generate a balanced data based on dataset “women_2wave_long1”

Answer

table(women_2wave_wide$check)
## 
##  1-2 1-NA 
## 2821  949
#we find that 949 have only participate in wave 1

women_2wave_balance <- filter(women_2wave_long1, check=="1-2")
#now "women_2wave_balance is the balanced data of two waves."

No. 2

Question

Run an ols regression to do the difference-in-difference estimation, based on the data generated from Question 1

Answer

women_2wave_balance <- women_2wave_balance %>% 
  group_by(id) %>% 
  mutate(
    treatgroup=sum(firstkid), #create a variable to identify the treat group
    treattime=case_when(wave==1 ~ 0,
                        wave==2 ~ 1), #create a variable to identify the treatment time
    group_time=treatgroup*treattime #create an interaction between the treat group and treatment time.
  ) 

didmodel1 <- lm(sat ~ treatgroup + treattime + group_time, data = women_2wave_balance)
summary(didmodel1)
## 
## Call:
## lm(formula = sat ~ treatgroup + treattime + group_time, data = women_2wave_balance)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.7775 -0.7775  0.2225  1.2225  2.3333 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.6675220  0.0314740 243.614   <2e-16 ***
## treatgroup  -0.0008553  0.1761170  -0.005   0.9961    
## treattime    0.1099707  0.0445110   2.471   0.0135 *  
## group_time   0.5122515  0.2490670   2.057   0.0398 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.644 on 5632 degrees of freedom
## Multiple R-squared:  0.002966,   Adjusted R-squared:  0.002435 
## F-statistic: 5.584 on 3 and 5632 DF,  p-value: 0.0008033
#interpretation: having a first child will increase the life satisfaction of women by 0.51 points

No. 3

Question

Run a twoway fixed effect regression to do the difference-in-difference estimation, based on the data generated from Question 1

Answer

women_2wave_balance <- pdata.frame(women_2wave_balance, index=c("id", "wave"))
didmodel2 <- plm(sat ~ group_time, data=women_2wave_balance, effect="twoway", model="within") ##
summary(didmodel2)
## Twoways effects Within Model
## 
## Call:
## plm(formula = sat ~ group_time, data = women_2wave_balance, effect = "twoway", 
##     model = "within")
## 
## Balanced Panel: n = 2818, T = 2, N = 5636
## 
## Residuals:
##        Min.     1st Qu.      Median     3rd Qu.        Max. 
## -4.5550e+00 -4.4501e-01 -4.4409e-16  4.4501e-01  4.5550e+00 
## 
## Coefficients:
##            Estimate Std. Error t-value Pr(>|t|)   
## group_time  0.51225    0.18152   2.822 0.004806 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    4053.5
## Residual Sum of Squares: 4042.1
## R-Squared:      0.00282
## Adj. R-Squared: -0.99542
## F-statistic: 7.96361 on 1 and 2816 DF, p-value: 0.0048062
#interpretation: having a first child will increase the life satisfaction of women by 0.51 points

#Or you can 
didmodel3 <- plm(sat ~ childno, data=women_2wave_balance, effect="twoway", model="within") 
summary(didmodel3)
## Twoways effects Within Model
## 
## Call:
## plm(formula = sat ~ childno, data = women_2wave_balance, effect = "twoway", 
##     model = "within")
## 
## Balanced Panel: n = 2818, T = 2, N = 5636
## 
## Residuals:
##        Min.     1st Qu.      Median     3rd Qu.        Max. 
## -4.5550e+00 -4.4501e-01 -4.4409e-16  4.4501e-01  4.5550e+00 
## 
## Coefficients:
##         Estimate Std. Error t-value Pr(>|t|)   
## childno  0.51225    0.18152   2.822 0.004806 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    4053.5
## Residual Sum of Squares: 4042.1
## R-Squared:      0.00282
## Adj. R-Squared: -0.99542
## F-statistic: 7.96361 on 1 and 2816 DF, p-value: 0.0048062
#interpretation: having a first child will increase the life satisfaction of women by 0.51 points

No. 4

Question

Run a twoway fixed regression to do the difference-in-difference estimation, using the unbalanced data “women_2wave_long1”

Answer

women_2wave_long1 <- pdata.frame(women_2wave_long1, index=c("id", "wave"))
didmodel4 <- plm(sat ~ childno, data=women_2wave_long1, effect="twoway", model="within") 
summary(didmodel4)
## Twoways effects Within Model
## 
## Call:
## plm(formula = sat ~ childno, data = women_2wave_long1, effect = "twoway", 
##     model = "within")
## 
## Unbalanced Panel: n = 3767, T = 1-2, N = 6585
## 
## Residuals:
##     Min.  1st Qu.   Median  3rd Qu.     Max. 
## -4.55499 -0.44501  0.00000  0.44501  4.55499 
## 
## Coefficients:
##         Estimate Std. Error t-value Pr(>|t|)   
## childno  0.51225    0.18152   2.822 0.004806 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    4053.5
## Residual Sum of Squares: 4042.1
## R-Squared:      0.00282
## Adj. R-Squared: -1.3315
## F-statistic: 7.96361 on 1 and 2816 DF, p-value: 0.0048062
#interpretation: having a first child will increase the life satisfaction of women by 0.51 points

No. 5

Question 6

please run the following code, and then perform a difference in difference model using six waves of data

###six waves of women data
women_6wave_wide <- left_join(women1b, women2b, by = "id") %>%  # left join women1b and women2b
  left_join(women3b, by = "id") %>% # left join with women3b
  left_join(women4b, by = "id") %>% # left join with women4b
  left_join(women5b, by = "id") %>% # left join with women5b
  left_join(women6b, by = "id") # left join with women6b
#by using left_join I keep those have no kids in the first wave and follow them

women_6wave_long<- merged.stack(women_6wave_wide, #dataset for transfrom
                            var.stubs = c("age", "wave", "relstat", "health","childno", "sat"), 
#var.stubs is to specify the prefixes of the variable groups
                            sep = ".") %>%  
#sep is to specify the character that separates the "variable name" from the "times" in the source
                            drop_na(wave)
#drop the observations which did not join the wave

women_6wave_long <- women_6wave_long %>% 
  group_by(id) %>% 
  mutate(
    firstkid=case_when( childno!=dplyr::lag(childno, 1) & dplyr::lag(childno, 1)==0 & childno>0 ~ 1,
                          TRUE ~ 0), 
    #when the person has 0 children at t-1 while has at least 1 child at t, define it first childbirth
    twin=case_when( childno!=dplyr::lag(childno, 1) & dplyr::lag(childno, 1)==0 & childno==1 ~ 1, #single birth
                    childno!=dplyr::lag(childno, 1) & dplyr::lag(childno, 1)==0 & childno==2 ~ 2, #twin birth
                    TRUE ~ 0)
    #when the person has 0 children at t-1 while has 1 child at t, define it a single birth, i.e. 1
    #when the person has 0 children at t-1 while has 2 children at t, define it a twin birth, i.e. 2
    ) 

#second, remove individuals who have twins
twinid <- women_6wave_long$id[women_6wave_long$twin==2] #the id of women who have twin for their first birth
women_6wave_long1 <- women_6wave_long[!(women_6wave_long$id %in% twinid),] #now the data does not have twin situations

#third, remove observations when people start to have a second or higher-order birth.
women_6wave_long2 <-  women_6wave_long1%>% 
  filter(childno<2)

##women_6wave_long2 is a cleaned data, which removes twin cases and observations when people start to have a second or higher-order birth

Answer

women_6wave_long2 <- pdata.frame(women_6wave_long2, index=c("id", "wave"))
didmodel5 <- plm(sat ~ childno, data=women_6wave_long2, effect="twoway", model="within") 
summary(didmodel5)
## Twoways effects Within Model
## 
## Call:
## plm(formula = sat ~ childno, data = women_6wave_long2, effect = "twoway", 
##     model = "within")
## 
## Unbalanced Panel: n = 3753, T = 1-6, N = 14809
## 
## Residuals:
##      Min.   1st Qu.    Median   3rd Qu.      Max. 
## -7.949279 -0.470613  0.029387  0.558842  5.687805 
## 
## Coefficients:
##         Estimate Std. Error t-value  Pr(>|t|)    
## childno 0.257523   0.070628  3.6462 0.0002674 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    16905
## Residual Sum of Squares: 16885
## R-Squared:      0.0012017
## Adj. R-Squared: -0.33848
## F-statistic: 13.2948 on 1 and 11050 DF, p-value: 0.00026737
#interpretation: having a first child will increase the life satisfaction of women by 0.26 points