Now, the question is “does first birth affect women’s life satisfaction?”

No. 1

Question

Step 1: Import 6 waves of women’s data

Answer

#or you use loop to import to avoid repetitive coding, similar to forvalues in stata
library(tidyverse) # Add the tidyverse package to my current library.
library(haven) # Handle labelled data.
library(splitstackshape) #transform wide data (with stacked variables) to long data
library(ggplot2)
for (i in 1:6) {
  assign(paste0("women", i), #assign is similar to <-; paste0 is to combine wave and i into a name, i ranges from 1 to 6. 
         read_dta(paste0("wave", i, "_women.dta"))
         )
} 

No. 2

Question

Step 2: Keep only variables across the 6 waves: id, age, wave, relstat, hlt, nkidsbio, sat

Step 3: clean variables and drop observations when they have missing values

Answer

clean_fun <- function(df) {  df %>% 
  transmute(
    id=zap_label(id), #remove label of id
    age=zap_label(age), #remove label of age
    wave=as.numeric(wave),
    relstat=as_factor(relstat), #make relstat as a factor
    relstat=case_when(relstat== "-7 Incomplete data" ~ as.character(NA), #specify when is missing for relstat
                      TRUE ~ as.character(relstat))%>% as_factor(), #make relstat as a factor again
    hlt=case_when(hlt1<0 ~ as.numeric(NA),  #specify when hlt1 is missing 
                   TRUE ~ as.numeric(hlt1)),
    nkidsbio=case_when(nkidsbio==-7~ as.numeric(NA), #specify when is missing for relstat
                      TRUE ~ as.numeric(nkidsbio)), 
    sat=case_when(sat6<0 ~ as.numeric(NA), #specify when sat6 is missing
                   TRUE ~ as.numeric(sat6)),
  )%>% drop_na()  }
women1a <- clean_fun(women1)
women2a <- clean_fun(women2)
women3a <- clean_fun(women3)
women4a <- clean_fun(women4)
women5a <- clean_fun(women5)
women6a <- clean_fun(women6)

No. 3

Question

Step 4: Keep those who have no kids initially and follow them across 6 waves, and generate a wide-formatted dataset for 6 waves

Answer

women1b <- women1a %>% filter(nkidsbio==0)%>% #keep individuals who are childless in the first wave
rename(wave.1=wave, age.1=age, relstat.1=relstat, hlt.1=hlt, nkidsbio.1=nkidsbio, sat.1=sat ) #rename variables

women2b <- women2a %>% 
rename(wave.2=wave, age.2=age, relstat.2=relstat, hlt.2=hlt, nkidsbio.2=nkidsbio, sat.2=sat )

women3b <- women3a %>% 
rename(wave.3=wave, age.3=age, relstat.3=relstat, hlt.3=hlt, nkidsbio.3=nkidsbio, sat.3=sat )

women4b <- women4a %>% 
rename(wave.4=wave, age.4=age, relstat.4=relstat, hlt.4=hlt, nkidsbio.4=nkidsbio, sat.4=sat )

women5b <- women5a %>% 
rename(wave.5=wave, age.5=age, relstat.5=relstat, hlt.5=hlt, nkidsbio.5=nkidsbio, sat.5=sat )

women6b <- women6a %>% 
rename(wave.6=wave, age.6=age, relstat.6=relstat, hlt.6=hlt, nkidsbio.6=nkidsbio, sat.6=sat )


women_wide <- left_join(women1b, women2b, by = "id") %>%  # left join women1b and women2b
  left_join(women3b, by = "id") %>% # left join with women3b
  left_join(women4b, by = "id") %>% # left join with women4b
  left_join(women5b, by = "id") %>% # left join with women5b
  left_join(women6b, by = "id") # left join with women6b
#by using left_join I keep those have no kids in the first wave and follow them

No. 4

Question

Find out how many women participate in all 6 waves? _____

Answer

women_wide$check <- paste(women_wide$wave.1, women_wide$wave.2, women_wide$wave.3,
                          women_wide$wave.4, women_wide$wave.5, women_wide$wave.6, sep='-')

table(women_wide$check)
## 
##      1-2-3-4-5-6     1-2-3-4-5-NA     1-2-3-4-NA-6    1-2-3-4-NA-NA 
##             1479              204               74              192 
##     1-2-3-NA-5-6    1-2-3-NA-5-NA   1-2-3-NA-NA-NA     1-2-NA-4-5-6 
##               72               27              278               58 
##    1-2-NA-4-5-NA    1-2-NA-4-NA-6   1-2-NA-4-NA-NA  1-2-NA-NA-NA-NA 
##               14                2               20              401 
##     1-NA-3-4-5-6    1-NA-3-4-5-NA    1-NA-3-4-NA-6   1-NA-3-4-NA-NA 
##               90               10                5               20 
##    1-NA-3-NA-5-6   1-NA-3-NA-5-NA  1-NA-3-NA-NA-NA    1-NA-NA-4-5-6 
##                9                6               35                1 
##   1-NA-NA-4-5-NA  1-NA-NA-4-NA-NA 1-NA-NA-NA-NA-NA 
##                1                1              771

No. 5

Question

Find out how many childless women have their first child over the 6 waves? _____

Answer

women_long<- merged.stack(women_wide, #dataset for transfrom
                            var.stubs = c("age", "wave", "relstat", "hlt","nkidsbio", "sat"), 
#var.stubs is to specify the prefixes of the variable groups
                            sep = ".") %>%  
#sep is to specify the character that separates the "variable name" from the "times" in the source
                            drop_na(wave)
#drop the observations which did not join the wave

women_long <- women_long %>% 
  group_by(id) %>% 
  mutate(
    firstkid=case_when( nkidsbio!=lag(nkidsbio, 1) & lag(nkidsbio, 1)==0 & nkidsbio>0 ~ 1,
                          TRUE ~ 0),
    firstkid2=case_when( nkidsbio!=lag(nkidsbio, 1) & lag(nkidsbio, 1)==0 & nkidsbio==2 ~ 1,
                          TRUE ~ 0)
    ) #to identify individual whose first childbearing is twins
table(women_long$firstkid)
## 
##     0     1 
## 14674   350
table(women_long$firstkid2)
## 
##     0     1 
## 15007    17

No. 6

Question

Randomly select 10 individuals who have their first child over the 6 waves, and plot the life satisfaction

Answer

women_long <- women_long %>% 
  group_by(id) %>% 
  mutate(
    kidwave=case_when(firstkid==1 ~ wave)
  )
sample_kid <- sample (women_long$id[women_long$firstkid==1], size=10) #randomly select 10 individuals

women_samplekid <- filter(women_long, id%in%sample_kid) #find the 10 individuals in the women_long data set by matching id
ggplot(data=women_samplekid )+ #use ggplot to see changes of sat over time
  geom_point(mapping=aes(x=wave,y=sat))+
  geom_vline(mapping=aes(xintercept = kidwave ))+
  facet_wrap(~ id, ncol=5) #this is new, to plot sat by id
## Warning: Removed 38 rows containing missing values (geom_vline).