Now, the question is “does first birth affect women’s life satisfaction?”

Prepare the dataset

library(tidyverse) # recoding
library(haven) # import data.
library(janitor) #tabulation
library(texreg) # export result
library(splitstackshape) #transform wide data to long data
library(plm) # panel data analysis

##Import 6 waves of women data
women1 <- read_dta("wave1_women.dta")
women2 <- read_dta("wave2_women.dta")
women3 <- read_dta("wave3_women.dta")
women4 <- read_dta("wave4_women.dta")
women5 <- read_dta("wave5_women.dta")
women6 <- read_dta("wave6_women.dta")

##Clean 6 waves of women data
clean_fun <- function(df) {  df %>% 
  transmute(
    id, 
    age, 
    wave=as.numeric(wave),
    relstat=as_factor(relstat), #make relstat as a factor
    relstat=case_when(relstat== "-7 Incomplete data" ~ as.character(NA), #specify when is missing for relstat
                      TRUE ~ as.character(relstat))%>% as_factor(), #make relstat as a factor again
    health=case_when(hlt1<0 ~ as.numeric(NA),  #specify when hlt1 is missing 
                   TRUE ~ as.numeric(hlt1)),
    childno=case_when(nkidsbio==-7~ as.numeric(NA), #specify when is missing for relstat
                      TRUE ~ as.numeric(nkidsbio)), 
    sat=case_when(sat6<0 ~ as.numeric(NA), #specify when sat6 is missing
                   TRUE ~ as.numeric(sat6)),
  )%>% drop_na()  }
women1a <- clean_fun(women1)
women2a <- clean_fun(women2)
women3a <- clean_fun(women3)
women4a <- clean_fun(women4)
women5a <- clean_fun(women5)
women6a <- clean_fun(women6)

women1b <- women1a %>% filter(childno==0)%>% #keep individuals who are childless in the first wave
rename(wave_1=wave, age_1=age, relstat_1=relstat, health_1=health, childno_1=childno, sat_1=sat ) #rename variables

women2b <- women2a %>% 
rename(wave_2=wave, age_2=age, relstat_2=relstat, health_2=health, childno_2=childno, sat_2=sat )

women3b <- women3a %>% 
rename(wave_3=wave, age_3=age, relstat_3=relstat, health_3=health, childno_3=childno, sat_3=sat )

women4b <- women4a %>% 
rename(wave_4=wave, age_4=age, relstatv4=relstat, health_4=health, childno_4=childno, sat_4=sat )

women5b <- women5a %>% 
rename(wave_5=wave, age_5=age, relstat_5=relstat, health_5=health, childno_5=childno, sat_5=sat )

women6b <- women6a %>% 
rename(wave_6=wave, age_6=age, relstat_6=relstat, health_6=health, childno_6=childno, sat_6=sat )


women_wide <- left_join(women1b, women2b, by = "id") %>%  # left join women1b and women2b
  left_join(women3b, by = "id") %>% # left join with women3b
  left_join(women4b, by = "id") %>% # left join with women4b
  left_join(women5b, by = "id") %>% # left join with women5b
  left_join(women6b, by = "id") # left join with women6b
#by using left_join I keep those have no kids in the first wave and follow them

No. 1

Question

Transform the “women_wide” data to a long format, name it as “women_long”. Figure out the number of respondents and the number of total person-year observations.

Answer

women_long<- merged.stack(women_wide, #dataset for transfrom
                          var.stubs = c("wave", "age", "relstat", "health","childno", "sat"), 
#var.stubs is to specify the prefixes of the variable groups
                          sep = "_") %>%  
#sep is to specify the character that separates the "variable name" from the "times" in the source
                          drop_na(wave)
#drop the observations which did not join the wave

the number of respondents is 3770. the number of person-year observations is 15024.

No. 2

Question

Use pooled regression to estimate the effect of first births on women’s life satisfaction?

Answer

panel_data <- pdata.frame(women_long, index=c("id", "wave"))
pols <- plm(sat ~ childno, data=panel_data, model="pooling") 
summary(pols)
## Pooling Model
## 
## Call:
## plm(formula = sat ~ childno, data = panel_data, model = "pooling")
## 
## Unbalanced Panel: n = 3770, T = 1-6, N = 15024
## 
## Residuals:
##     Min.  1st Qu.   Median  3rd Qu.     Max. 
## -7.73697 -0.63976  0.36024  1.36024  2.36024 
## 
## Coefficients:
##             Estimate Std. Error  t-value Pr(>|t|)    
## (Intercept) 7.639758   0.013807 553.3134  < 2e-16 ***
## childno     0.097211   0.043509   2.2343  0.02548 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    40514
## Residual Sum of Squares: 40501
## R-Squared:      0.00033219
## Adj. R-Squared: 0.00026565
## F-statistic: 4.99188 on 1 and 15022 DF, p-value: 0.025481

No. 3

Question

Use fixed effect to estimate the effect of childbearing on women’s life satisfaction?

Answer

fixed <- plm(sat ~ childno, data=panel_data, model="within") 
summary(fixed)
## Oneway (individual) effect Within Model
## 
## Call:
## plm(formula = sat ~ childno, data = panel_data, model = "within")
## 
## Unbalanced Panel: n = 3770, T = 1-6, N = 15024
## 
## Residuals:
##    Min. 1st Qu.  Median 3rd Qu.    Max. 
## -8.0000 -0.5000  0.0000  0.6000  5.6667 
## 
## Coefficients:
##         Estimate Std. Error t-value Pr(>|t|)
## childno 0.037299   0.052198  0.7146   0.4749
## 
## Total Sum of Squares:    17346
## Residual Sum of Squares: 17345
## R-Squared:      4.5372e-05
## Adj. R-Squared: -0.33496
## F-statistic: 0.510598 on 1 and 11253 DF, p-value: 0.47489

No. 4

Question

Compare the results from the pooled OLS and fixed effect. Please interpret the result of fixed effect.

Answer

texreg::screenreg(list(pols, fixed), #get the result from regressions named "pols" and "fixed".
                custom.model.names=c("Pooled regression", #rename the regression 
                                     "Fixed effect(within)"), #rename the regression 
                include.ci = FALSE, #not including confidence interval.
                center=TRUE) 
## 
## ====================================================
##              Pooled regression  Fixed effect(within)
## ----------------------------------------------------
## (Intercept)      7.64 ***                           
##                 (0.01)                              
## childno          0.10 *             0.04            
##                 (0.04)             (0.05)           
## ----------------------------------------------------
## R^2              0.00               0.00            
## Adj. R^2         0.00              -0.33            
## Num. obs.    15024              15024               
## ====================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05

Interpretation: the coefficient of childno in the fixed effect model is 0.04 which is non-significant at 5% significance level. Literally it means, when a women’s number of children increases by 1, her life satisfaction would increase by 0.04 unit on average.

No. 5

Question

  • Step 1: Do you think the results from Question No.3 reflect the effect of first child births? please tabulate the “childno” variable to understand the results
  • Step 2: Remove observations when individuals have twins at first birth
  • Step 3: Remove observations when individuals start to have a second birth

Answer for Step 1

table(panel_data$childno)
## 
##     0     1     2     3 
## 14044   809   167     4
#there are many cases that contains second or highe order births.
#the result from fixed effect reflect when a woman's number of children increases by 1, how her life satisfaction would change on average.
#it combines the effect of number of children increasing from 0 to 1, 1 to 2, or 2 to 3.

Answer for Step 2

#first, identify individuals who have twins
panel_data <- panel_data %>% 
  group_by(id) %>% 
  mutate(
    firstkid=case_when( childno!=dplyr::lag(childno, 1) & dplyr::lag(childno, 1)==0 & childno>0 ~ 1,
                          TRUE ~ 0), 
    #when the person has 0 children at t-1 while has at least 1 child at t, define it first childbirth
    #it contains cases of 0 to 1, i.e. single birth, 0 to 2,i.e. twin birth, 0 to 3, i.e. triple birth
    twin=case_when( childno!=dplyr::lag(childno, 1) & dplyr::lag(childno, 1)==0 & childno==1 ~ 1, #single birth
                    childno!=dplyr::lag(childno, 1) & dplyr::lag(childno, 1)==0 & childno==2 ~ 2, #twin birth,
                    childno!=dplyr::lag(childno, 1) & dplyr::lag(childno, 1)==0 & childno==3 ~ 3, #triple birth
                          TRUE ~ 0)
    #when the person has 0 children at t-1 while has 1 child at t, define it a single birth, i.e. 1
    #when the person has 0 children at t-1 while has 2 children at t, define it a twin birth, i.e. 2
    #when the person has 0 children at t-1 while has 3 children at t, define it a triple birth, i.e. 3
    ) 

tabyl(panel_data, firstkid,twin)
##  firstkid     0   1  2
##         0 14674   0  0
##         1     0 333 17
#333 are single birth; 17 are likely to be twin births.

#second, remove individuals who have twins
twinid <- panel_data$id[panel_data$twin==2] #get the id of women who have twin for their first birth

panel_data_notwin <- panel_data%>% filter(!(id %in% twinid)) #remove all observations for women have twin cases

Answer for Step 3

tabyl(panel_data_notwin,childno)
##  childno     n      percent
##        0 14001 0.9375878926
##        1   808 0.0541083506
##        2   121 0.0081028594
##        3     3 0.0002008973
panel_data_final <-  filter(panel_data_notwin, childno<2) #keep observations right before women having second child

tabyl(panel_data_final,childno)
##  childno     n    percent
##        0 14001 0.94543858
##        1   808 0.05456142
tabyl(panel_data_final,firstkid, twin)
##  firstkid     0   1
##         0 14477   0
##         1     0 332
#now we are sure people here are those who only have one child

No. 6

Question

  • Step1: Now, please run fixed effect to find out the effect of first birth on women’s life satisfaction. Interpret the result.

  • Step2: Compare the result from Question 3, and is there any difference? Please interpret the result of this fixed effect model

Answer for Step 1

fixed2 <- plm(sat ~ childno, data=panel_data_final, model="within") 
summary(fixed2)
## Oneway (individual) effect Within Model
## 
## Call:
## plm(formula = sat ~ childno, data = panel_data_final, model = "within")
## 
## Unbalanced Panel: n = 3753, T = 1-6, N = 14809
## 
## Residuals:
##     Min.  1st Qu.   Median  3rd Qu.     Max. 
## -8.00000 -0.50000  0.00000  0.59092  5.66667 
## 
## Coefficients:
##         Estimate Std. Error t-value Pr(>|t|)   
## childno 0.181837   0.068196  2.6664 0.007679 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    16966
## Residual Sum of Squares: 16956
## R-Squared:      0.00064269
## Adj. R-Squared: -0.33862
## F-statistic: 7.10949 on 1 and 11055 DF, p-value: 0.0076788
##Interpretation:
#A within-person change from childless to having a first child is associated with 0.18 scale points increase in life satisfaction for women (always correct interpretation).
#When having a first child, life satisfaction is 0.18 points higher than when being childless (always correct interpretation).

Answer for Step 2

texreg::screenreg(list(fixed, fixed2), 
                custom.model.names=c("Fixed effect(within)", 
                                     "Fixed effect(within)2"),
                include.ci = FALSE, 
                center=TRUE) 
## 
## ======================================================
##            Fixed effect(within)  Fixed effect(within)2
## ------------------------------------------------------
## childno        0.04                  0.18 **          
##               (0.05)                (0.07)            
## ------------------------------------------------------
## R^2            0.00                  0.00             
## Adj. R^2      -0.33                 -0.34             
## Num. obs.  15024                 14809                
## ======================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05

Now the coefficient of childno in the fixed effect (within)2 is positive and statistically significant.