No. 1

Question

1.1 Based on the pairfam wave 1 dataset, compile a dataset consisting of the following variables:

age, sex(sex_gen), life satisfaction(sat6), relationship(relstat), no.of children(nkidsbio)

1.2 Drop observations with missing values.

1.3 What is the original sample size and what is the sample size after cleaning?

Answer

library(tidyverse) # Add the tidyverse package to my current library.
library(haven) # Import data.
library(janitor) #cleaning data
library(estimatr) # Allows us to estimate (cluster-)robust standard errors.
library(texreg) # Allows us to make nicely-formatted Html & Latex regression tables.
# Load the Stata file for this exercise.
dataset1 <- read_dta("anchor1_50percent_Eng.dta")

# Analysis data: age, sex, number of biological children, life satisfaction
# and relationship status. Missing codes are recoded to NA and every row
# with at least one NA is removed.
dataset2 <- dataset1 %>%
  transmute(
    age,
    # sex as a categorical variable without unused levels
    sex = fct_drop(as_factor(sex_gen)),
    # negative codes mark missing values
    nkidsbio = if_else(nkidsbio < 0, NA_real_, as.numeric(nkidsbio)),
    sat6 = if_else(sat6 < 0, NA_real_, as.numeric(sat6)),
    # relationship status as a categorical variable
    relstat = as_factor(relstat),
    # same variable with "-7 Incomplete data" set to missing, converted back
    # to a factor with unused levels dropped
    relstat1 = as.character(relstat) %>%
      na_if("-7 Incomplete data") %>%
      as_factor() %>%
      fct_drop()
  ) %>%
  drop_na() # keep complete cases only

#The original sample size is 6201; the sample size after cleaning is 6158.

No. 2

Question

2.1 Generate a new variable “marital” for relationship status with categories of “Never married”, “Married”, “Separate/Divorced”, “Widowed”.

2.2 Generate a new variable “parenthood” based on nkidsbio with categories of “Have kids”, “No kids”.

Answer

dataset2 %>% tabyl(relstat1) # inspect the distribution of relstat1 before recoding
##                     relstat1    n      percent
##       1 Never married single 2445 0.3970444950
##              4 Married COHAB 1733 0.2814225398
##          2 Never married LAT 1011 0.1641766807
##        3 Never married COHAB  658 0.1068528743
##   8 Divorced/separated COHAB   76 0.0123416694
##             9 Widowed single    3 0.0004871712
##  6 Divorced/separated single  145 0.0235466060
##     7 Divorced/separated LAT   63 0.0102305943
##      5 Married noncohabiting   23 0.0037349789
##               10 Widowed LAT    1 0.0001623904
# Derive marital status and parenthood from relstat1 and nkidsbio.
# Factor levels are set explicitly: as_factor() on a character vector orders
# levels by first appearance, which would make the reference category of the
# later regressions depend on the row order of the data.
dataset3 <- dataset2 %>%
  mutate(
    marital = case_when(
      # the three never-married categories
      relstat1 %in% c("1 Never married single",
                      "2 Never married LAT",
                      "3 Never married COHAB") ~ "Nevermarried",
      # the two married categories
      relstat1 %in% c("4 Married COHAB",
                      "5 Married noncohabiting") ~ "Married",
      # the three divorced/separated categories
      relstat1 %in% c("6 Divorced/separated single",
                      "7 Divorced/separated LAT",
                      "8 Divorced/separated COHAB") ~ "Divorced",
      # the two widowed categories
      relstat1 %in% c("9 Widowed single", "10 Widowed LAT") ~ "Widowed"
    ) %>%
      factor(levels = c("Nevermarried", "Married", "Divorced", "Widowed")),
    parenthood = case_when(
      nkidsbio > 0 ~ "Have kids", # at least one biological child
      nkidsbio == 0 ~ "No kids"   # no biological children
    ) %>%
      factor(levels = c("No kids", "Have kids")) # "No kids" is the reference
  )

No. 3

Question

3.1 Estimate the life satisfaction for the married and the divorced individuals

3.2 What is the average life satisfaction for the divorced individuals?

Answer

# Keep only married and divorced respondents: remove the widowed and the
# never-married.
dataset4 <- filter(
  dataset3,
  marital != "Widowed",
  marital != "Nevermarried"
)

# Mean life satisfaction (sat6) within each marital status group.
mar_sat <- dplyr::summarise(
  group_by(dataset4, marital),
  mean_sat6 = mean(sat6)
)
mar_sat
## # A tibble: 2 × 2
##   marital  mean_sat6
##   <fct>        <dbl>
## 1 Married       7.80
## 2 Divorced      6.68
# average life satisfaction for the divorced individuals is 6.676.

No. 4

Question

I want to understand whether divorced people have lower life satisfaction than married people. Please use OLS with robust standard errors for the modelling.

4.1 Model1: regress life satisfaction on age and marital status

4.2 What is the effect of age on the life satisfaction?

4.3 What is the effect of divorce on the life satisfaction, when age is controlled?

4.4 output the result to a html file.

Answer

# Model 1: life satisfaction regressed on age and marital status,
# estimated with lm_robust().
regression1 <- lm_robust(sat6 ~ age + marital, data = dataset4)
summary(regression1)
## 
## Call:
## lm_robust(formula = sat6 ~ age + marital, data = dataset4)
## 
## Standard error type:  HC2 
## 
## Coefficients:
##                  Estimate Std. Error t value   Pr(>|t|) CI Lower CI Upper   DF
## (Intercept)      7.896879   0.298478 26.4572 7.741e-133  7.31153  8.48223 2037
## age             -0.002768   0.008792 -0.3148  7.529e-01 -0.02001  0.01447 2037
## maritalDivorced -1.125749   0.134957 -8.3416  1.333e-16 -1.39042 -0.86108 2037
## 
## Multiple R-squared:  0.04751 ,   Adjusted R-squared:  0.04658 
## F-statistic: 35.23 on 2 and 2037 DF,  p-value: 9.053e-16
# Export the Model 1 table to an HTML file, with confidence intervals
# switched off.
texreg::htmlreg(
  regression1,
  file = "regression1.html",
  include.ci = FALSE
)
## The table was written to the file 'regression1.html'.
#4.2 : With marital status held constant, each additional year of age is associated with a 0.0028-point decrease in life satisfaction, but this age effect is not statistically significant (p = 0.75).
#4.3 : With age held constant, divorced individuals score 1.126 points lower on life satisfaction than married individuals. This effect of marital status is statistically significant (p < 0.001).

No. 5

Question

I want to understand whether divorced people have lower life satisfaction than married people. Please use OLS with robust standard errors for the modelling.

5.1 Please change the reference category of marital, using “divorced” as the reference.

5.2 Do a Model 2 where independent variables are age, marital, parenthood

Answer

# Move "Divorced" to the front of the marital levels so that it becomes the
# reference category.
dataset4 <- dataset4 %>%
  mutate(marital = fct_relevel(marital, "Divorced"))

# Model 2: life satisfaction regressed on age, marital status and parenthood.
regression2 <- lm_robust(sat6 ~ age + marital + parenthood, data = dataset4)
summary(regression2)
## 
## Call:
## lm_robust(formula = sat6 ~ age + marital + parenthood, data = dataset4)
## 
## Standard error type:  HC2 
## 
## Coefficients:
##                      Estimate Std. Error t value  Pr(>|t|) CI Lower CI Upper
## (Intercept)          6.770486    0.33181 20.4048 2.353e-84  6.11976  7.42121
## age                 -0.003605    0.00939 -0.3839 7.011e-01 -0.02202  0.01481
## maritalMarried       1.123965    0.13503  8.3237 1.542e-16  0.85915  1.38878
## parenthoodHave kids  0.036610    0.11227  0.3261 7.444e-01 -0.18356  0.25678
##                       DF
## (Intercept)         2036
## age                 2036
## maritalMarried      2036
## parenthoodHave kids 2036
## 
## Multiple R-squared:  0.04757 ,   Adjusted R-squared:  0.04616 
## F-statistic: 23.47 on 3 and 2036 DF,  p-value: 6.2e-15

No. 6

Question

Use OLS with robust standard errors to estimate the following models:

6.1 Model3: regress life satisfaction on age, marital status, parenthood status, and sex. Using “Married” as the reference group.

6.2 Model4: based on Model 3, add an interaction between marital status and sex.

6.3 Export the results of Model 3 and Model 4 to an HTML file and rename the coefficients.

6.4 Does the effect of marital status differ by gender?

Answer

# Move "Married" to the front of the marital levels so that it becomes the
# reference category.
dataset4 <- dataset4 %>%
  mutate(marital = fct_relevel(marital, "Married"))

# Model 3: life satisfaction regressed on age, marital status, parenthood
# and sex.
regression3 <- lm_robust(
  sat6 ~ age + marital + parenthood + sex,
  data = dataset4
)
summary(regression3)
## 
## Call:
## lm_robust(formula = sat6 ~ age + marital + parenthood + sex, 
##     data = dataset4)
## 
## Standard error type:  HC2 
## 
## Coefficients:
##                     Estimate Std. Error t value   Pr(>|t|) CI Lower CI Upper
## (Intercept)          7.81240    0.31294 24.9646 3.183e-120  7.19868  8.42611
## age                 -0.00242    0.00951 -0.2545  7.991e-01 -0.02107  0.01623
## maritalDivorced     -1.13145    0.13556 -8.3465  1.281e-16 -1.39729 -0.86560
## parenthoodHave kids  0.02751    0.11241  0.2447  8.067e-01 -0.19294  0.24796
## sex2 Female          0.08437    0.08063  1.0463  2.955e-01 -0.07376  0.24251
##                       DF
## (Intercept)         2035
## age                 2035
## maritalDivorced     2035
## parenthoodHave kids 2035
## sex2 Female         2035
## 
## Multiple R-squared:  0.04809 ,   Adjusted R-squared:  0.04622 
## F-statistic: 17.72 on 4 and 2035 DF,  p-value: 2.616e-14
# Model 4: Model 3 plus an interaction between marital status and sex.
regression4 <- lm_robust(
  sat6 ~ age + marital * sex + parenthood,
  data = dataset4
)
summary(regression4)
## 
## Call:
## lm_robust(formula = sat6 ~ age + marital * sex + parenthood, 
##     data = dataset4)
## 
## Standard error type:  HC2 
## 
## Coefficients:
##                              Estimate Std. Error t value   Pr(>|t|) CI Lower
## (Intercept)                  7.825246   0.314066 24.9159 8.218e-120  7.20932
## age                         -0.002425   0.009513 -0.2549  7.989e-01 -0.02108
## maritalDivorced             -1.209278   0.248957 -4.8574  1.280e-06 -1.69752
## sex2 Female                  0.069171   0.082399  0.8395  4.013e-01 -0.09242
## parenthoodHave kids          0.023058   0.112595  0.2048  8.378e-01 -0.19775
## maritalDivorced:sex2 Female  0.118446   0.296005  0.4001  6.891e-01 -0.46206
##                             CI Upper   DF
## (Intercept)                  8.44117 2034
## age                          0.01623 2034
## maritalDivorced             -0.72104 2034
## sex2 Female                  0.23077 2034
## parenthoodHave kids          0.24387 2034
## maritalDivorced:sex2 Female  0.69895 2034
## 
## Multiple R-squared:  0.0482 ,    Adjusted R-squared:  0.04586 
## F-statistic: 14.22 on 5 and 2034 DF,  p-value: 1.065e-13
# Export Models 3 and 4 side by side to an HTML file, three decimals,
# confidence intervals switched off.
texreg::htmlreg(
  list(regression3, regression4),
  file = "regression3&4.html",
  digits = 3,
  include.ci = FALSE
)
## The table was written to the file 'regression3&4.html'.
# Export Models 3 and 4 with readable coefficient labels.
# Labels are keyed by the original coefficient name (custom.coef.map)
# instead of by position, so a change in term order cannot silently attach a
# label to the wrong row. The map order also sets the row order of the table.
texreg::htmlreg(
  list(regression3, regression4),
  custom.coef.map = list(
    "(Intercept)" = "Intercept",
    "age" = "Age",
    "maritalDivorced" = "Divorced (Ref.=Married)",
    "parenthoodHave kids" = "Have kids (Ref.=no kid)",
    "sex2 Female" = "Female (Ref.=Male)",
    "maritalDivorced:sex2 Female" = "Divorced x Female"
  ),
  include.ci = FALSE,
  digits = 3,
  file = "regression3&4_renamed.html"
)
## The table was written to the file 'regression3&4_renamed.html'.
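#6.4: No. The interaction between marital status and sex (0.118) is not statistically significant (p = 0.69), so the effect of marital status does not differ significantly by gender.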

No. 7

Question

Use OLS with robust standard errors to estimate the following models:

7.1 Model5: regress life satisfaction on age, marital status, parenthood status, and sex. Add an interaction between marital status and age.

7.2 What is the difference between the age effect among the married and the age effect among the divorced?

Answer

# Model 5: life satisfaction regressed on marital status, age, sex and
# parenthood, with an interaction between marital status and age.
regression5 <- lm_robust(
  sat6 ~ marital * age + sex + parenthood,
  data = dataset4
)
summary(regression5)
## 
## Call:
## lm_robust(formula = sat6 ~ marital * age + sex + parenthood, 
##     data = dataset4)
## 
## Standard error type:  HC2 
## 
## Coefficients:
##                      Estimate Std. Error  t value   Pr(>|t|) CI Lower CI Upper
## (Intercept)          7.685949   0.319905 24.02570 1.720e-112  7.05857  8.31333
## maritalDivorced     -0.036013   1.178056 -0.03057  9.756e-01 -2.34633  2.27431
## age                  0.001368   0.009774  0.13991  8.887e-01 -0.01780  0.02054
## sex2 Female          0.086532   0.080706  1.07219  2.838e-01 -0.07174  0.24481
## parenthoodHave kids  0.025359   0.112806  0.22480  8.222e-01 -0.19587  0.24659
## maritalDivorced:age -0.031990   0.033903 -0.94357  3.455e-01 -0.09848  0.03450
##                       DF
## (Intercept)         2034
## maritalDivorced     2034
## age                 2034
## sex2 Female         2034
## parenthoodHave kids 2034
## maritalDivorced:age 2034
## 
## Multiple R-squared:  0.04874 ,   Adjusted R-squared:  0.0464 
## F-statistic: 14.56 on 5 and 2034 DF,  p-value: 4.757e-14
# Export Model 5 with readable coefficient labels.
# Labels are keyed by the original coefficient name (custom.coef.map)
# instead of by position, so a change in term order cannot silently attach a
# label to the wrong row. The map order also sets the row order of the table.
texreg::htmlreg(
  list(regression5),
  custom.coef.map = list(
    "(Intercept)" = "Intercept",
    "maritalDivorced" = "Divorced (Ref.=Married)",
    "age" = "Age",
    "sex2 Female" = "Female (Ref.=Male)",
    "parenthoodHave kids" = "Have kids (Ref.=no kid)",
    "maritalDivorced:age" = "Divorced x Age"
  ),
  include.ci = FALSE,
  digits = 3,
  file = "regression5.html"
)
## The table was written to the file 'regression5.html'.
#7.2: The age slope among the divorced is 0.03199 lower than among the married (about -0.0306 vs. 0.0014 per year of age), but this difference is not statistically significant (p = 0.35).