Data preparation

1.Libary the packages

library(tidyverse) # Add the tidyverse package to my current library.
library(haven) # Import data.
library(janitor)# Cleaning data
library(ggplot2) # Allows us to create nice figures.
library(skimr)
library(splitstackshape)

2.Import the six waves of data

wave1 <- read_dta("anchor1_50percent_Eng.dta")
wave2 <- read_dta("anchor2_50percent_Eng.dta")
wave3 <- read_dta("anchor3_50percent_Eng.dta")
wave4 <- read_dta("anchor4_50percent_Eng.dta")
wave5 <- read_dta("anchor5_50percent_Eng.dta")
wave6 <- read_dta("anchor6_50percent_Eng.dta")

3.Clean the data

## Import data
wave1 <- read_dta("anchor1_50percent_Eng.dta")
wave2 <- read_dta("anchor2_50percent_Eng.dta")
wave3 <- read_dta("anchor3_50percent_Eng.dta")
wave4 <- read_dta("anchor4_50percent_Eng.dta")
wave5 <- read_dta("anchor5_50percent_Eng.dta")
wave6 <- read_dta("anchor6_50percent_Eng.dta")

## Clean data
clean_fun <- function(df) {  df %>% 
  transmute(
    id, 
    age, 
    wave,
    sex=as_factor(sex_gen), #make sex_gen as a factor
    relstat=as_factor(relstat), #make relstat as a factor
    relstat=case_when(relstat== "-7 Incomplete data" ~ as.character(NA), #specify when is missing 
                      TRUE ~ as.character(relstat))%>% as_factor(), #make relstat as a factor again
    health=case_when(hlt1<0 ~ as.numeric(NA),  #specify when hlt1 is missing 
                   TRUE ~ as.numeric(hlt1)),
    sat=case_when(sat6<0 ~ as.numeric(NA), #specify when sat6 is missing
                   TRUE ~ as.numeric(sat6)),
    partner=case_when(relstat %in% c("1 Never married single","6 Divorced/separated single","9 Widowed single") ~ "No",
                    # when relstat has any of the situations, I assign "No"
                    relstat %in% c("2 Never married LAT","3 Never married COHAB",
                                   "4 Married COHAB","5 Married noncohabiting",
                                   "7 Divorced/separated LAT","8 Divorced/separated COHAB",
                                   "10 Widowed LAT", "11 Widowed COHAB") ~ 'Yes') %>% 
             as_factor() %>%
             fct_relevel("No", "Yes")
                    # when relstat has any of the situations, I assign "Yes"
  )%>% drop_na()  }
wave1a <- clean_fun(wave1)
wave2a <- clean_fun(wave2)
wave3a <- clean_fun(wave3)
wave4a <- clean_fun(wave4)
wave5a <- clean_fun(wave5)
wave6a <- clean_fun(wave6)

4.Sample selection

wave1b <- wave1a %>% filter(partner=="No")%>%
  rename(wave_1=wave, age_1=age, sex_1=sex, relstat_1=relstat, health_1=health, sat_1=sat, partner_1=partner) #rename variables
wave2b <- wave2a %>% 
  rename(wave_2=wave, age_2=age, sex_2=sex, relstat_2=relstat, health_2=health, sat_2=sat, partner_2=partner)
wave3b <- wave3a %>% 
  rename(wave_3=wave, age_3=age, sex_3=sex, relstat_3=relstat, health_3=health, sat_3=sat, partner_3=partner)
wave4b <- wave4a %>% 
  rename(wave_4=wave, age_4=age, sex_4=sex, relstat_4=relstat, health_4=health, sat_4=sat, partner_4=partner)
wave5b <- wave5a %>% 
  rename(wave_5=wave, age_5=age, sex_5=sex, relstat_5=relstat, health_5=health, sat_5=sat, partner_5=partner)
wave6b <- wave6a %>% 
  rename(wave_6=wave, age_6=age, sex_6=sex, relstat_6=relstat, health_6=health, sat_6=sat, partner_6=partner)

5.Create a simple dataset containing two waves

twowaves <- left_join(wave1b, wave2b, by = "id") 

## Check the participation over time
twowaves$check <- paste(twowaves$wave_1, twowaves$wave_2, sep='-')
table(twowaves$check)

## 
##  1-2 1-NA 
## 1971  620

6.Create a full dataset containing six waves

sixwaves_wide <- left_join(wave1b, wave2b, by = "id") %>%  # left join wave1b and wave2b
  left_join(wave3b, by = "id") %>% # left join with wave3b
  left_join(wave4b, by = "id") %>% # left join with wave4b
  left_join(wave5b, by = "id") %>% # left join with wave5b
  left_join(wave6b, by = "id") # left join with wave6b
#by using left_join I keep those single in wave1 and follow them


## Check the participation over time
sixwaves_wide$check <- paste(sixwaves_wide$wave_1, sixwaves_wide$wave_2, sixwaves_wide$wave_3,
                         sixwaves_wide$wave_4, sixwaves_wide$wave_5, sixwaves_wide$wave_6, sep='-')
table(sixwaves_wide$check)

## 
##      1-2-3-4-5-6     1-2-3-4-5-NA     1-2-3-4-NA-6    1-2-3-4-NA-NA 
##             1045              142               48              154 
##     1-2-3-NA-5-6    1-2-3-NA-5-NA   1-2-3-NA-NA-NA     1-2-NA-4-5-6 
##               45               16              202               27 
##    1-2-NA-4-5-NA    1-2-NA-4-NA-6   1-2-NA-4-NA-NA  1-2-NA-NA-NA-NA 
##               10                1               14              267 
##     1-NA-3-4-5-6    1-NA-3-4-5-NA    1-NA-3-4-NA-6   1-NA-3-4-NA-NA 
##               30               10                6               12 
##    1-NA-3-NA-5-6   1-NA-3-NA-5-NA  1-NA-3-NA-NA-NA    1-NA-NA-4-5-6 
##                4                1               30                9 
##  1-NA-NA-NA-5-NA 1-NA-NA-NA-NA-NA 
##                1              517

7.Generate a panel dataset for two waves

twowaves_long<- merged.stack(twowaves, #dataset for transfrom
                          var.stubs = c("age", "wave", "sex","relstat", "health", "sat", "partner"), 
                          #var.stubs is to specify the prefixes of the variable groups
                          sep = "_") %>%  
  #sep is to specify the character that separates the "variable name" from the "times" in the source
  drop_na(wave)

8.Generate a panel dataset for six waves

sixwaves_long<- merged.stack(sixwaves_wide, #dataset for transfrom
                          var.stubs = c("age", "wave", "sex","relstat", "health", "sat", "partner"), 
                          #var.stubs is to specify the prefixes of the variable groups
                          sep = "_") %>%  
  #sep is to specify the character that separates the "variable name" from the "times" in the source
  drop_na(wave)
#drop the observations which did not join the wave

Data preparation in Session 8

Mengni Chen

2024-11-13

Data preparation

1.Libary the packages

2.Import the six waves of data

3.Clean the data

4.Sample selection

5.Create a simple dataset containing two waves

6.Create a full dataset containing six waves

7.Generate a panel dataset for two waves

8.Generate a panel dataset for six waves