1.Please copy the following code to clean wave1 data of pairfam. How many variables are there in "wave1b"? How many observations are removed after the cleaning.

library(tidyverse) #for recoding
library(haven) #for importing data
library(janitor) #for tabulation
library(estimatr) #robust standard error OLS
library(texreg) #export regression result
library(ggplot2) #for plotting 
library(oaxaca) #for Blinder-Oaxaca Decomposition

rm(list=ls())#clear all the dataset you created in your r project

wave1 <- read_dta("anchor1_50percent_Eng.dta")

wave1b <- wave1 %>% 
  transmute(
           age,
           highedu=case_when(isced<0 ~ as.character(NA),#isced<0 missing in highedu
                             isced %in% c(7,8) ~ "tertiary edu", #1 means attained teritary education
                             isced %in% c(0:6) ~ "below tertiary"  #0 means attained below teritary education
                            ) %>% as_factor(),
           #create highedu and treat it as a categorical
           
           gender=as_factor(sex_gen) %>% fct_drop(), 
           #treat sex as a categorical, and drop unused levels
           
           nkids=case_when(nkids<0 ~ as.numeric(NA),
                           TRUE ~ as.numeric(nkids)),
           #specify when nkids should be missing
           
           sat=case_when(sat6<0 ~ as.numeric(NA), 
                             TRUE ~ as.numeric(sat6)), 
           #specify when sat should be considered missing
          
           relstat=as_factor(relstat), #treat relationship status as categorical
           relstat1=case_when(relstat=="-7 Incomplete data" ~ as.character(NA), 
                              TRUE ~ as.character(relstat)) %>%  
           #specify when relstat1 should be missing
             as_factor() %>% fct_drop(),
           #make relstat1 as a factor again & drop unused levels
          
           health=case_when(
                  hlt1<0 ~ as.numeric(NA),#specify when it should be missing
                  TRUE ~ as.numeric(hlt1))
           )  %>%  
  drop_na() #remove all observations that are missing
#after cleaning, in wave1b, there are 8 var; 
#there are 6120 observations. 6201-6139=62 cases that are removed.

2.Please copy the following codes to create a variable named "marital". Then provide a frequency table for marital variable. How many respondents are never-married, how many are married, and how many are divorced?

wave1b <- wave1b %>% 
  mutate(
    marital=case_when(
      relstat1 %in% c("1 Never married single",
                     "2 Never married LAT",
                     "3 Never married COHAB") ~ "Nevermarried",
      # when relstat1 has any of the three situations, I assign "Nevermarried" to new variable "marital"
      relstat1 %in% c("4 Married COHAB",
                     "5 Married noncohabiting") ~ 'Married',
      # when relstat1 has any of the two situations, I assign "Married" to new variable "marital
      relstat1 %in% c("6 Divorced/separated single",
                     "7 Divorced/separated LAT",
                     "8 Divorced/separated COHAB") ~ 'Divorced',
      # when relstat1 has any of the three situations, I assign "Divorced" to new variable "marital"
      relstat1 %in% c("9 Widowed single","10 Widowed LAT") ~ 'Widowed'
      # when relstat1 has any of the two situations, I assign "Widow" to new variable "marital"
                     ) %>% as_factor()
        )
tabyl(wave1b$marital)
#  wave1b$marital    n percent
#    Nevermarried 4103 0.66835
#         Married 1749 0.28490
#        Divorced  283 0.04610
#         Widowed    4 0.00065
#there are 4103 never-married, 1749 married, and 283 divorced.

4.I want to compare the life satisfaction of never-married and married groups.

wave1c  <-  wave1b %>% 
  filter(marital=="Nevermarried" | marital== "Married")
#wave1c is restricted to only nevermarried and married, n=5852.

average <-  wave1c%>% 
  group_by(marital) %>% 
  summarise(
    ave_sat=mean(sat),
    ave_age=mean(age),
    ave_nkids=mean(nkids),
    ave_health=mean(health),
    prop_female=sum(gender=="2 Female")/n(),
    prop_highedu=sum(highedu=="tertiary edu")/n()
  )

average  
# # A tibble: 2 × 7
#   marital      ave_sat ave_age ave_nkids ave_health prop_female prop_highedu
#   <fct>          <dbl>   <dbl>     <dbl>      <dbl>       <dbl>        <dbl>
# 1 Nevermarried    7.60    22.0     0.129       3.87       0.466        0.120
# 2 Married         7.80    33.5     1.67        3.76       0.592        0.301
#compare to the never-married group, married group have higher satisfaction, older ages, more children,slightly less healthy, more female and more educated. 
OLS_nevermarried <- lm_robust(formula = sat ~ age + nkids + health + gender + highedu, subset=(marital=="Nevermarried"), data = wave1c)
OLS_married <- lm_robust(formula = sat ~ age + nkids + health + gender + highedu, subset=(marital=="Married"), data = wave1c)
#subset=() here is to specify which sample for the OLS
screenreg(list(OLS_nevermarried,OLS_married),include.ci = FALSE, digits = 3, 
                  custom.model.names = c("OLS for nevermarried", "OLS for married"),#rename the models
                  single.row =TRUE #to make the coef. and standard error in the same row
          )
# 
# ===============================================================
#                      OLS for nevermarried  OLS for married     
# ---------------------------------------------------------------
# (Intercept)             6.873 (0.165) ***     5.705 (0.366) ***
# age                    -0.055 (0.005) ***    -0.001 (0.010)    
# nkids                   0.096 (0.066)         0.038 (0.043)    
# health                  0.472 (0.031) ***     0.505 (0.045) ***
# gender2 Female          0.094 (0.051)         0.160 (0.080) *  
# highedutertiary edu     0.538 (0.083) ***     0.225 (0.082) ** 
# ---------------------------------------------------------------
# R^2                     0.122                 0.089            
# Adj. R^2                0.121                 0.086            
# Num. obs.            4103                  1749                
# RMSE                    1.615                 1.599            
# ===============================================================
# *** p < 0.001; ** p < 0.01; * p < 0.05
#the regression coef. of age, gender, and highedu looks very different, in terms of the direction and effect size.

5.Why the life satisfaction of the married is higher than that of the nevermarried? Is it mainly because of different characteristics in the two groups, in terms of age, number of children, gender, health or education attainment, or because these variables' association with life satisfaction are different for the nevermarried and married.

#first you have to specify the control and treated group
wave1c <- wave1c %>% 
  mutate(
    marital_new1=case_when(
      marital %in% c("Nevermarried") ~ TRUE, #oaxaca treat those TRUE as the control group,i.e. group B
      marital %in% c("Married") ~ FALSE) #oaxaca treat those FALSE as the treated group, i.e. group A
         )

result <- oaxaca(formula = sat ~ age +  nkids + health + gender + highedu | marital_new1 ,data = wave1c)


overall0 <- result$twofold$overall[1,]
variables0 <- result$twofold$variables[[1]]
result0 <- rbind(variables0,overall0)[,c(1:5)] 
result0
#                     group.weight coef(explained) se(explained) coef(unexplained) se(unexplained)
# (Intercept)                    0           0.000        0.0000            -1.167           0.395
# age                            0          -0.639        0.0564             1.827           0.405
# nkids                          0           0.148        0.0947            -0.096           0.117
# health                         0          -0.049        0.0127             0.122           0.218
# gender2 Female                 0           0.012        0.0067             0.039           0.056
# highedutertiary edu            0           0.097        0.0140            -0.094           0.038
# overall0                       0          -0.430        0.0845             0.629           0.102
#finding: the life satisfaction of married is higher than that of nevermarried is mainly due to coefficient effect (overall coefficient effect=0.629, overall composition effect=-0.430).
overall1 <- result$twofold$overall[2,]
variables1 <- result$twofold$variables[[2]]
result1 <- rbind(variables1,overall1)[,c(1:5)] 
result1
#                     group.weight coef(explained) se(explained) coef(unexplained) se(unexplained)
# (Intercept)                    1          0.0000        0.0000           -1.1674          0.3954
# age                            1         -0.0096        0.1240            1.1978          0.2659
# nkids                          1          0.0589        0.0648           -0.0075          0.0091
# health                         1         -0.0522        0.0143            0.1250          0.2239
# gender2 Female                 1          0.0200        0.0099            0.0304          0.0441
# highedutertiary edu            1          0.0407        0.0155           -0.0374          0.0153
# overall1                       1          0.0578        0.1191            0.1410          0.1224
#finding: the life satisfaction of married is higher than that of nevermarried is mainly due to coefficient effect (overall coefficient effect=0.1410, overall composition effect=0.0578).
overall_pooled <- result$twofold$overall[5,]
variables_pooled <- result$twofold$variables[[5]]
result_pooled <- rbind(variables_pooled,overall_pooled)[,c(1:5)] 
result_pooled
#                     group.weight coef(explained) se(explained) coef(unexplained) se(unexplained)
# (Intercept)                   -1           0.000        0.0000            -1.167           0.395
# age                           -1          -0.409        0.0455             1.597           0.382
# nkids                         -1           0.329        0.0513            -0.277           0.051
# health                        -1          -0.051        0.0129             0.124           0.220
# gender2 Female                -1           0.017        0.0058             0.034           0.052
# highedutertiary edu           -1           0.073        0.0096            -0.070           0.027
# overall_pooled                -1          -0.041        0.0365             0.240           0.034
#finding: the life satisfaction of married is higher than that of nevermarried is mainly due to coefficient effect (overall coefficient effect=0.240, overall composition effect=-0.041).
plot(result, decomposition = "twofold", type="overall", group.weight = -1, title = "Overall:ref=pooled")

plot(result, decomposition = "twofold",  type="variables", group.weight = -1, title = "variables:ref=pooled")