Data cleaning

Library and Import

library(tidyverse) # Add the tidyverse package to my current library.
library(haven) # Import data.
library(janitor)# Cleaning data
library(ggplot2) # Allows us to create nice figures.
library(estimatr) # Allows us to estimate (cluster-)robust standard errors.
library(texreg) # Allows us to make nicely-formatted Html & Latex regression tables.
library(broom) # Allows us to turn model objects into tibbles.

# Read the Stata (.dta) file with haven's read_dta(); the relative path is
# resolved against the current working directory.
wave1 <- read_dta("anchor1_50percent_Eng.dta")
# sample size = 6201

Recoding

# Build the recoded sample: keep age, clean sat6, and convert sex_gen and
# relstat into factors. Only the columns listed in transmute() are retained.
wave1b <- wave1 %>%
  transmute(
    age,
    # keep non-negative sat6 values as plain numbers; negative codes (and
    # values that are already missing) become NA
    sat6 = case_when(
      sat6 >= 0 ~ as.numeric(sat6),
      TRUE ~ NA_real_
    ),
    # treat sex_gen as categorical and drop levels that never occur
    sex_gen = fct_drop(as_factor(sex_gen)),
    # treat relationship status as categorical
    relstat = as_factor(relstat),
    # copy of relstat in which "-7 Incomplete data" is set to missing; the
    # result is converted back to a factor and unused levels are dropped
    relstat_new1 = fct_drop(
      as_factor(na_if(as.character(relstat), "-7 Incomplete data"))
    )
  ) %>%
  drop_na() # drop every observation that has a missing value in any column
# sample size changes from 6201 to 6162

Further selection

# Collapse the detailed relationship status (relstat_new1) into broad groups
# (relstat_new2) and keep only the single and partnered respondents.
wave1c <- wave1b %>%
  mutate(
    relstat_new2 = case_when(
      # treat "never married single" as "single"
      relstat_new1 %in% c("1 Never married single") ~ "single",
      # treat these 4 situations as "partnered"
      relstat_new1 %in% c(
        "2 Never married LAT",
        "3 Never married COHAB",
        "4 Married COHAB",
        "5 Married noncohabiting"
      ) ~ "partnered",
      # treat these 3 situations as "separated"
      relstat_new1 %in% c(
        "6 Divorced/separated single",
        "7 Divorced/separated LAT",
        "8 Divorced/separated COHAB"
      ) ~ "separated",
      # treat these 2 situations as "widowed"
      relstat_new1 %in% c(
        "9 Widowed single",
        "10 Widowed LAT"
      ) ~ "widowed"
      # any other value matches no condition and therefore becomes NA
    ) %>% as_factor() # make relstat_new2 a factor
  ) %>%
  # only 4 widowed and 284 separated, so both groups are dropped; rows whose
  # relstat_new2 is NA are excluded by this condition as well
  filter(relstat_new2 %in% c("single", "partnered")) %>%
  # remove the now-empty "widowed" and "separated" levels so they do not
  # appear as zero-count categories in later tables, plots or models
  mutate(relstat_new2 = fct_drop(relstat_new2))
# sample size changes to 5874 after dropping widowed and separated

Data cleaning

Mengni Chen

2024-10-17

Library and Import

Recoding

Further selection