Now, the question is “does first birth affect women’s life satisfaction?”

No. 1

Question

1. Import 6 waves of women’s data

Answer

#or you use loop to import to avoid repetitive coding, similar to forvalues in stata
library(tidyverse) 
library(haven) 
library(splitstackshape) 
library(ggplot2)
library(janitor)

women1 <- read_dta("wave1_women.dta")
women2 <- read_dta("wave2_women.dta")
women3 <- read_dta("wave3_women.dta")
women4 <- read_dta("wave4_women.dta")
women5 <- read_dta("wave5_women.dta")
women6 <- read_dta("wave6_women.dta")

No. 2

Question

2.1: Keep only variables across the 6 waves: id, age, wave, relstat, hlt1, nkidsbio, sat

2.2: clean variables and drop observations when they have missing values

Answer

clean_fun <- function(df) {  df %>% 
  transmute(
    id, 
    age, 
    wave=as.numeric(wave),
    relstat=as_factor(relstat), #make relstat as a factor
    relstat=case_when(relstat== "-7 Incomplete data" ~ as.character(NA), #specify when is missing for relstat
                      TRUE ~ as.character(relstat))%>% as_factor(), #make relstat as a factor again
    health=case_when(hlt1<0 ~ as.numeric(NA),  #specify when hlt1 is missing 
                   TRUE ~ as.numeric(hlt1)),
    childno=case_when(nkidsbio==-7~ as.numeric(NA), #specify when is missing for relstat
                      TRUE ~ as.numeric(nkidsbio)), 
    sat=case_when(sat6<0 ~ as.numeric(NA), #specify when sat6 is missing
                   TRUE ~ as.numeric(sat6)),
  )%>% drop_na()  }
women1a <- clean_fun(women1)
women2a <- clean_fun(women2)
women3a <- clean_fun(women3)
women4a <- clean_fun(women4)
women5a <- clean_fun(women5)
women6a <- clean_fun(women6)

No. 3

Question

3. Keep those who have no kids initially and follow them across 6 waves, and generate a wide-formatted dataset for 6 waves

Answer

women1b <- women1a %>% filter(childno==0)%>% #keep individuals who are childless in the first wave
rename(wave_1=wave, age_1=age, relstat_1=relstat, health_1=health, childno_1=childno, sat_1=sat ) #rename variables

women2b <- women2a %>% 
rename(wave_2=wave, age_2=age, relstat_2=relstat, health_2=health, childno_2=childno, sat_2=sat )

women3b <- women3a %>% 
rename(wave_3=wave, age_3=age, relstat_3=relstat, health_3=health, childno_3=childno, sat_3=sat )

women4b <- women4a %>% 
rename(wave_4=wave, age_4=age, relstat_4=relstat, health_4=health, childno_4=childno, sat_4=sat )

women5b <- women5a %>% 
rename(wave_5=wave, age_5=age, relstat_5=relstat, health_5=health, childno_5=childno, sat_5=sat )

women6b <- women6a %>% 
rename(wave_6=wave, age_6=age, relstat_6=relstat, health_6=health, childno_6=childno, sat_6=sat )


women_wide <- left_join(women1b, women2b, by = "id") %>%  # left join women1b and women2b
  left_join(women3b, by = "id") %>% # left join with women3b
  left_join(women4b, by = "id") %>% # left join with women4b
  left_join(women5b, by = "id") %>% # left join with women5b
  left_join(women6b, by = "id") # left join with women6b
#by using left_join I keep those have no kids in the first wave and follow them

No. 4

Question

4. Find out how many women participate in all 6 waves? _____

Answer

women_wide$check <- paste(women_wide$wave_1, women_wide$wave_2, women_wide$wave_3,
                          women_wide$wave_4, women_wide$wave_5, women_wide$wave_6, sep='-')

tabyl(women_wide,check)
##             check    n     percent
##       1-2-3-4-5-6 1479 0.392307692
##      1-2-3-4-5-NA  204 0.054111406
##      1-2-3-4-NA-6   74 0.019628647
##     1-2-3-4-NA-NA  192 0.050928382
##      1-2-3-NA-5-6   72 0.019098143
##     1-2-3-NA-5-NA   27 0.007161804
##    1-2-3-NA-NA-NA  278 0.073740053
##      1-2-NA-4-5-6   58 0.015384615
##     1-2-NA-4-5-NA   14 0.003713528
##     1-2-NA-4-NA-6    2 0.000530504
##    1-2-NA-4-NA-NA   20 0.005305040
##   1-2-NA-NA-NA-NA  401 0.106366048
##      1-NA-3-4-5-6   90 0.023872679
##     1-NA-3-4-5-NA   10 0.002652520
##     1-NA-3-4-NA-6    5 0.001326260
##    1-NA-3-4-NA-NA   20 0.005305040
##     1-NA-3-NA-5-6    9 0.002387268
##    1-NA-3-NA-5-NA    6 0.001591512
##   1-NA-3-NA-NA-NA   35 0.009283820
##     1-NA-NA-4-5-6    1 0.000265252
##    1-NA-NA-4-5-NA    1 0.000265252
##   1-NA-NA-4-NA-NA    1 0.000265252
##  1-NA-NA-NA-NA-NA  771 0.204509284
#the answer is that 1479 women participated in all 6 waves

No. 5

Question

5. Find out how many childless women have their first child over the 6 waves? _____

Step1: first transform the data from wide to long

Step2: define the transition from childless to first child. Note that first childbearing could be a single birth or a twin

Step3: do tabulations to find out the number of first-childbearing event

Answer for Step 1

women_long<- merged.stack(women_wide, #dataset for transfrom
                            var.stubs = c("age", "wave", "relstat", "health","childno", "sat"), 
#var.stubs is to specify the prefixes of the variable groups
                            sep = "_") %>%  
#sep is to specify the character that separates the "variable name" from the "times" in the source
                            drop_na(wave)
#drop the observations which did not join the wave

Answer for Step 2

women_long <- women_long %>% 
  group_by(id) %>% 
  mutate(
    firstbirth=case_when( childno!=lag(childno, 1) & lag(childno, 1)==0 & childno>0 ~ 1,
                          TRUE ~ 0), 
    #when the person has 0 children at t-1 while has at least 1 child at t, define it first childbirth
    twin=case_when( childno!=lag(childno, 1) & lag(childno, 1)==0 & childno==1 ~ "single birth", #single birth
                    childno!=lag(childno, 1) & lag(childno, 1)==0 & childno==2 ~ "likely twin", #twin birth
                          TRUE ~ "No birth") %>% as_factor()
    #when the person has 0 children at t-1 while has 1 child at t, define it a single birth
    #when the person has 0 children at t-1 while has 2 children at t, define it a likely twin birth
    ) 

Answer for Step 3

tabyl(women_long,firstbirth)
##  firstbirth     n    percent
##           0 14674 0.97670394
##           1   350 0.02329606
tabyl(women_long,check,twin )
##             check No birth likely twin single birth
##       1-2-3-4-5-6     8656           9          209
##      1-2-3-4-5-NA     1001           0           19
##      1-2-3-4-NA-6      361           0            9
##     1-2-3-4-NA-NA      757           1           10
##      1-2-3-NA-5-6      352           1            7
##     1-2-3-NA-5-NA      103           1            4
##    1-2-3-NA-NA-NA      815           0           19
##      1-2-NA-4-5-6      280           2            8
##     1-2-NA-4-5-NA       55           0            1
##     1-2-NA-4-NA-6        8           0            0
##    1-2-NA-4-NA-NA       58           0            2
##   1-2-NA-NA-NA-NA      788           0           14
##      1-NA-3-4-5-6      427           2           21
##     1-NA-3-4-5-NA       39           0            1
##     1-NA-3-4-NA-6       20           0            0
##    1-NA-3-4-NA-NA       57           0            3
##     1-NA-3-NA-5-6       33           1            2
##    1-NA-3-NA-5-NA       17           0            1
##   1-NA-3-NA-NA-NA       67           0            3
##     1-NA-NA-4-5-6        4           0            0
##    1-NA-NA-4-5-NA        3           0            0
##   1-NA-NA-4-NA-NA        2           0            0
##  1-NA-NA-NA-NA-NA      771           0            0
#350 women become mothers over the 6 waves; 333 women have given a single birth;17 women have given a birth of twins

No. 6

Question

6. Randomly select 10 individuals who have their first child over the 6 waves, and plot the life satisfaction

Step1: please find out at which wave the first childbearing happened

Step2: please randomly select 10 individuals who have first child over the 6 waves

Step3: please plot the life satisfaction of these 10 individuals and also highlight the time of first childbirth in the graph

Answer for Step1

women_long <- women_long %>% 
  group_by(id) %>% 
  mutate(
    birthwave=case_when(firstbirth==1 ~ wave)
    #define the time when firstkid is 1, meaning that the person experience first childbearing event at this wave
  )

Answer for Step2

sample<- sample (women_long$id[women_long$firstbirth==1], size=10) #randomly select 10 individuals
#or you can just
birth_id <- women_long %>% 
              filter(firstbirth==1) %>%
              select(id)#restrict to individuals who have first birth
sample_id <- sample(birth_id$id, size=10) #randomly select 10 
sample <- filter(women_long, id%in%sample_id) #find the 10 individuals in the women_long data set by matching id

Answer for Step3

ggplot(data=sample )+ #use ggplot to see changes of sat over time
  geom_point(mapping=aes(x=wave,y=sat))+
  geom_vline(mapping=aes(xintercept = birthwave ))+
  facet_wrap(~ id, ncol=5) #this is new, to plot sat by id