Now, the question is “does first birth affect women’s life satisfaction?”
1. Import 6 waves of women’s data
# Alternatively, use a loop to import the files and avoid repetitive code (similar to forvalues in Stata)
library(tidyverse) # Add the tidyverse package to my current library.
library(haven) # Handle labelled data.
library(splitstackshape) #transform wide data (with stacked variables) to long data
library(ggplot2)
# Load the six waves of the women's data, one Stata file per wave.
women1 <- read_dta(file = "wave1_women.dta")
women2 <- read_dta(file = "wave2_women.dta")
women3 <- read_dta(file = "wave3_women.dta")
women4 <- read_dta(file = "wave4_women.dta")
women5 <- read_dta(file = "wave5_women.dta")
women6 <- read_dta(file = "wave6_women.dta")
2.1: Keep only the following variables in each of the 6 waves: id, age, wave, relstat, hlt1, nkidsbio, sat6 (life satisfaction, stored as sat)
2.2: clean variables and drop observations when they have missing values
# Reduce one wave of raw data to the analysis variables, recode the survey's
# missing-value codes to NA, and keep only rows with no missing values.
#
# df: one wave of raw data containing id, age, wave, relstat, hlt1, nkidsbio
#     and sat6.
# Returns a data frame with the columns id, age, wave, relstat, health,
# childno and sat.
clean_fun <- function(df) {
  # Negative codes mark a missing answer on the numeric items.
  negative_to_na <- function(x) if_else(x < 0, NA_real_, as.numeric(x))

  cleaned <- transmute(
    df,
    id,
    age,
    wave = as.numeric(wave),
    # Turn the labelled values into their text labels first ...
    relstat = as.character(as_factor(relstat)),
    # ... then treat "-7 Incomplete data" as missing and rebuild the factor.
    relstat = as_factor(
      if_else(relstat == "-7 Incomplete data", NA_character_, relstat)
    ),
    health = negative_to_na(hlt1),
    # For the number of biological children only the code -7 is set to missing.
    childno = if_else(nkidsbio == -7, NA_real_, as.numeric(nkidsbio)),
    sat = negative_to_na(sat6)
  )

  drop_na(cleaned)
}
# Clean every wave with the same recoding rules.
women1a <- women1 %>% clean_fun()
women2a <- women2 %>% clean_fun()
women3a <- women3 %>% clean_fun()
women4a <- women4 %>% clean_fun()
women5a <- women5 %>% clean_fun()
women6a <- women6 %>% clean_fun()
3. Keep those who have no kids initially and follow them across 6 waves, and generate a wide-formatted dataset for 6 waves
# Append ".<wave_no>" to the name of every wave-specific column so that the six
# waves can sit side by side in one wide data frame keyed by id.
#
# df:      one cleaned wave.
# wave_no: wave number used as the suffix (e.g. 2 turns "age" into "age.2").
# vars:    names of the columns to rename; all of them must exist in df.
# Returns df with the selected columns renamed; id and column order unchanged.
suffix_wave <- function(df, wave_no,
                        vars = c("wave", "age", "relstat", "health",
                                 "childno", "sat")) {
  stopifnot(all(vars %in% names(df)))
  names(df)[match(vars, names(df))] <- paste0(vars, ".", wave_no)
  df
}

women1b <- women1a %>%
  filter(childno == 0) %>% # keep individuals who are childless in the first wave
  suffix_wave(1)
women2b <- suffix_wave(women2a, 2)
women3b <- suffix_wave(women3a, 3)
women4b <- suffix_wave(women4a, 4)
women5b <- suffix_wave(women5a, 5)
women6b <- suffix_wave(women6a, 6)
# Start from the women who are childless in wave 1 and attach waves 2 to 6 one
# after another by id. Because every step is a left join, only these women are
# kept and followed; waves they did not take part in are filled with NA.
women_wide <- Reduce(
  function(joined, next_wave) left_join(joined, next_wave, by = "id"),
  list(women1b, women2b, women3b, women4b, women5b, women6b)
)
4. Find out how many women participate in all 6 waves? _____
# Build one participation pattern per woman, e.g. "1-2-NA-4-5-6": the wave
# number where she has data and NA where she does not. Then count the patterns.
women_wide$check <- with(
  women_wide,
  paste(wave.1, wave.2, wave.3, wave.4, wave.5, wave.6, sep = "-")
)
table(women_wide[["check"]])
##
## 1-2-3-4-5-6 1-2-3-4-5-NA 1-2-3-4-NA-6 1-2-3-4-NA-NA
## 1479 204 74 192
## 1-2-3-NA-5-6 1-2-3-NA-5-NA 1-2-3-NA-NA-NA 1-2-NA-4-5-6
## 72 27 278 58
## 1-2-NA-4-5-NA 1-2-NA-4-NA-6 1-2-NA-4-NA-NA 1-2-NA-NA-NA-NA
## 14 2 20 401
## 1-NA-3-4-5-6 1-NA-3-4-5-NA 1-NA-3-4-NA-6 1-NA-3-4-NA-NA
## 90 10 5 20
## 1-NA-3-NA-5-6 1-NA-3-NA-5-NA 1-NA-3-NA-NA-NA 1-NA-NA-4-5-6
## 9 6 35 1
## 1-NA-NA-4-5-NA 1-NA-NA-4-NA-NA 1-NA-NA-NA-NA-NA
## 1 1 771
#the answer is that 1479 women participated in all 6 waves
5. Find out how many childless women have their first child over the 6 waves? _____
Step1: first transform the data from wide to long
Step2: define the transition from childless to first child. Note that first childbearing could be a single birth or a twin
Step3: do tabulations to find out the number of first-childbearing events
# Reshape from wide (one row per woman) to long (one row per woman and wave).
# var.stubs gives the prefixes of the variable groups; sep is the character
# that separates each prefix from its wave number in the wide column names
# (e.g. "age.1"). Rows with a missing wave are waves the woman did not take
# part in, so they are dropped.
women_long <- drop_na(
  merged.stack(
    women_wide, # dataset to transform
    var.stubs = c("age", "wave", "relstat", "health", "childno", "sat"),
    sep = "."
  ),
  wave
)
# Flag the transition to motherhood within each woman's sequence of waves.
#   firstkid: 1 in the row where she had 0 children at her previous observed
#             wave and has at least 1 child now; 0 in every other row.
#   twin:     size of that first birth: 1 = single birth, 2 = twin birth,
#             0 otherwise (no first birth in this row, or more than 2 children).
women_long <- women_long %>%
  group_by(id) %>%
  mutate(
    # Child count at the previous observed wave of the same woman. order_by
    # makes the lag follow wave order even if the rows are not sorted; it is NA
    # for her first row, which then falls through to 0 below.
    prev_childno = lag(childno, order_by = wave),
    firstkid = case_when(
      prev_childno == 0 & childno > 0 ~ 1,
      TRUE ~ 0
    ),
    twin = case_when(
      firstkid == 1 & childno == 1 ~ 1, # single birth
      firstkid == 1 & childno == 2 ~ 2, # twin birth
      TRUE ~ 0
    ),
    prev_childno = NULL # helper column, not part of the result
  ) %>%
  ungroup() # do not leave the data grouped for the following steps
table(women_long$firstkid)
##
## 0 1
## 14674 350
# Tabulate first births by size (0 = none, 1 = single birth, 2 = twins).
table(women_long[["twin"]])
##
## 0 1 2
## 14674 333 17
# 350 women become mothers over the 6 waves: 333 had a single birth and 17 had twins.
6. Randomly select 10 individuals who have their first child over the 6 waves, and plot the life satisfaction
Step1: please find out at which wave the first childbearing happened
Step2: please randomly select 10 individuals who have first child over the 6 waves
Step3: please plot the life satisfaction of these 10 individuals and also highlight the time of first childbirth in the graph
# Record when the first birth happened: birthwave equals wave in the row where
# firstkid is 1 (the wave in which the first child is observed) and is NA in
# every other row. The data stays grouped by id, as before.
women_long <- mutate(
  group_by(women_long, id),
  birthwave = case_when(firstkid == 1 ~ wave)
)
# Restrict to the women who had a first child: the ids of all rows with
# firstkid == 1 (normally one row per mother).
birth_id <- women_long %>%
  filter(firstkid == 1) %>%
  select(id)
# Randomly select 10 of them. Base-R equivalent of the two steps above:
#   sample(women_long$id[women_long$firstkid == 1], size = 10)
sample_id <- sample(birth_id$id, size = 10)
# Find the 10 individuals in women_long by matching id, keeping all their waves.
# NOTE: the result is stored as `sample` because the plot below reads it; calls
# to the sample() function still work, but a more specific name would be safer.
sample <- filter(women_long, id %in% sample_id)
# Plot life satisfaction over the waves, one panel per sampled woman, with a
# vertical line at the wave of her first childbirth.
ggplot(sample) +
  geom_point(aes(x = wave, y = sat)) +
  geom_vline(aes(xintercept = birthwave)) +
  facet_wrap(vars(id), ncol = 5) # one panel per id