Exercise 4

Warming up

Use pairfam wave1 dataset, please check the following variables in the questionnaire and codebook:

age,
sex_gen,
sat6,
sat1i1,
relstat,
nkidsbio,
cohort,
val1i7

Question

Which variable measures satisfaction with family?
Which variable measures attitude towards marriage?

Answer for a.

sat1i1

Answer for b.

val1i7

No. 1

Question

1.1 Use pairfam wave1 dataset, compile a dataset consisting of variables stated in Warming up: 1) age, 2) sex_gen, 3) sat6, 4) sat1i1, 5) relstat, 6) nkidsbio, 7) cohort, 8) val1i7

1.2 Check the variables to see any of them having invalid answers.

1.3 Drop observations with missing values.

1.4 What is the original sample size and what is the sample size after cleaning?

Answer 1.1

library(tidyverse) # Add the tidyverse package to my current library.
library(haven) # Import data.
library(janitor) #cleaning data
library(estimatr) # Allows us to estimate (cluster-)robust standard errors.
library(texreg) # Allows us to make nicely-formatted Html & Latex regression tables.

pairfam <- read_dta("anchor1_50percent_Eng.dta")
dataset1 <- select(pairfam, 
                   age, 
                   sex_gen, 
                   sat6,
                   sat1i1, 
                   relstat, 
                   nkidsbio,
                   cohort,
                   val1i7)

Answer 1.2

tabyl(dataset1,age)

##  age   n     percent
##   14  41 0.006611837
##   15 708 0.114175133
##   16 722 0.116432833
##   17 667 0.107563296
##   18  35 0.005644251
##   24  24 0.003870343
##   25 577 0.093049508
##   26 678 0.109337204
##   27 647 0.104338010
##   28  87 0.014029995
##   34  22 0.003547815
##   35 502 0.080954685
##   36 618 0.099661345
##   37 772 0.124496049
##   38 101 0.016287696

tabyl(dataset1,sex_gen)

##  sex_gen    n   percent
##        1 3029 0.4884696
##        2 3172 0.5115304

tabyl(dataset1,sat6) # 5 cases with invalid answer

##  sat6    n      percent
##    -2    2 0.0003225286
##    -1    3 0.0004837929
##     0   26 0.0041928721
##     1   18 0.0029027576
##     2   45 0.0072568940
##     3  110 0.0177390743
##     4  133 0.0214481535
##     5  395 0.0636994033
##     6  508 0.0819222706
##     7 1178 0.1899693598
##     8 1877 0.3026931140
##     9 1157 0.1865828092
##    10  749 0.1207869698

tabyl(dataset1,sat1i1) #23 cases with invalid answer

##  sat1i1    n     percent
##      -2   13 0.002096436
##      -1   10 0.001612643
##       0  103 0.016610224
##       1   50 0.008063216
##       2   89 0.014352524
##       3  208 0.033542977
##       4  219 0.035316884
##       5  593 0.095629737
##       6  494 0.079664570
##       7 1154 0.186099016
##       8 1490 0.240283825
##       9  773 0.124657313
##      10 1005 0.162070634

tabyl(dataset1,relstat) #34 cases with invalid answer

##  relstat    n      percent
##       -7   34 0.0054829866
##        1 2448 0.3947750363
##        2 1012 0.1631994840
##        3  660 0.1064344461
##        4 1735 0.2797935817
##        5   23 0.0037090792
##        6  146 0.0235445896
##        7   63 0.0101596517
##        8   76 0.0122560877
##        9    3 0.0004837929
##       10    1 0.0001612643

#or
tabyl(as_factor(dataset1$relstat))

##  as_factor(dataset1$relstat)    n      percent
##           -7 Incomplete data   34 0.0054829866
##       1 Never married single 2448 0.3947750363
##          2 Never married LAT 1012 0.1631994840
##        3 Never married COHAB  660 0.1064344461
##              4 Married COHAB 1735 0.2797935817
##      5 Married noncohabiting   23 0.0037090792
##  6 Divorced/separated single  146 0.0235445896
##     7 Divorced/separated LAT   63 0.0101596517
##   8 Divorced/separated COHAB   76 0.0122560877
##             9 Widowed single    3 0.0004837929
##               10 Widowed LAT    1 0.0001612643
##             11 Widowed COHAB    0 0.0000000000

tabyl(dataset1,nkidsbio) #4 cases with invalid answer

##  nkidsbio    n      percent
##        -7    4 0.0006450572
##         0 4136 0.6669891953
##         1  839 0.1353007579
##         2  868 0.1399774230
##         3  277 0.0446702145
##         4   54 0.0087082729
##         5   17 0.0027414933
##         6    4 0.0006450572
##        10    2 0.0003225286

tabyl(dataset1,cohort)

##  cohort    n   percent
##       1 2173 0.3504274
##       2 2013 0.3246251
##       3 2015 0.3249476

tabyl(as_factor(dataset1$val1i7)) #63cases with invalid answer

##         as_factor(dataset1$val1i7)    n     percent
##              -5 Inconsistent value    0 0.000000000
##  -4 Filter error / Incorrect entry    0 0.000000000
##                  -3 Does not apply    0 0.000000000
##                       -2 No answer   10 0.001612643
##                      -1 Don't know   53 0.008547009
##              1 Disagree completely  913 0.147234317
##                                  2  800 0.129011450
##                                  3 1328 0.214159007
##                                  4 1152 0.185776488
##                 5 Agree completely 1945 0.313659087

Answer 1.3

dataset2 <- dataset1 %>% 
  transmute(
           age, 
           sex=as_factor(sex_gen) %>% fct_drop(), 
           #treat sex as a categorical, and drop unused levels
           
           sat6=case_when(sat6<0 ~ as.numeric(NA), 
                          TRUE ~ as.numeric(sat6)), 
           #specify when sat should be considered missing
           
           fam_sat=case_when(sat1i1<0 ~ as.numeric(NA),
                          TRUE ~ as.numeric(sat1i1)),
           #specify when fam_sat should be considered missing
           
           relstat=as_factor(relstat), #treat relationship status as categorical
           relstat1=case_when(relstat=="-7 Incomplete data" ~ as.character(NA), 
                              TRUE ~ as.character(relstat)) %>%  
           #specify when relstat1 should be missing
             as_factor() %>% fct_drop(),#make relstat1 as a factor again & drop unused levels
           
           nkidsbio=case_when(nkidsbio<0 ~ as.numeric(NA),
                              TRUE ~ as.numeric(nkidsbio)),
           #specify when nkidsbio should be missing
           
           cohort=as_factor(cohort) %>% fct_drop(),
           #treat sex as a categorical, and drop unused levels
           
           mar_att=as_factor(val1i7),
           mar_att1=case_when(mar_att %in% c("-2 No answer","-1 Don't know") ~ as.character(NA),
                              TRUE ~ as.character(mar_att)%>%  
           #specify when val1i5 should be missing
             as_factor() %>% fct_drop()#make mar_att1 as a factor again & drop unused levels
                              )
           )  %>%  
  drop_na() #remove all observations that are missing

Answer 1.4

count(dataset1)

## # A tibble: 1 × 1
##       n
##   <int>
## 1  6201

count(dataset2)

## # A tibble: 1 × 1
##       n
##   <int>
## 1  6074

original sample size is 6201; the clean sampled size is 6074.

No. 2

Question

Generate a new variable “marital” for relationship status with categories of “Never married”, “Married”, “Divorced”(including separated), “Widowed”, regardless of whether individuals are cohabiting, noncohabiting, LAT.
Generate a new variable “parenthood” based on nkidsbio with categories of “Have kids”, “No kids”.

Answer

tabyl(dataset2,relstat1)#check the distribution first.

##                     relstat1    n      percent
##       1 Never married single 2408 0.3964438591
##              4 Married COHAB 1705 0.2807046427
##          2 Never married LAT 1001 0.1648007903
##        3 Never married COHAB  654 0.1076720448
##   8 Divorced/separated COHAB   75 0.0123477116
##             9 Widowed single    3 0.0004939085
##  6 Divorced/separated single  142 0.0233783339
##     7 Divorced/separated LAT   62 0.0102074416
##      5 Married noncohabiting   23 0.0037866315
##               10 Widowed LAT    1 0.0001646362

dataset3 <- dataset2 %>% 
  mutate(
    marital=case_when(
      relstat1 %in% c("1 Never married single",
                     "2 Never married LAT",
                     "3 Never married COHAB") ~ "Nevermarried",
      # when relstat1 has any of the three situations, I assign "Nevermarried" to new variable "marital"
      relstat1 %in% c("4 Married COHAB",
                     "5 Married noncohabiting") ~ 'Married',
      # when relstat1 has any of the two situations, I assign "Married" to new variable "marital
      relstat1 %in% c("6 Divorced/separated single",
                     "7 Divorced/separated LAT",
                     "8 Divorced/separated COHAB") ~ 'Divorced',
      # when relstat1 has any of the three situations, I assign "Divorced" to new variable "marital"
      relstat1 %in% c("9 Widowed single","10 Widowed LAT") ~ 'Widowed'
      # when relstat1 has any of the two situations, I assign "Widow" to new variable "marital"
                     ) %>% as_factor(),#make marital a categorical variable
    parenthood=case_when(
      nkidsbio>0 ~ "Have kids",#when nkidsbio should be "Have kids"
      nkidsbio==0 ~ "No kids" #when nkidsbio should be "No kids"
                        ) %>%as_factor() #make parenthood a categorical variable 
    )

No. 3

Question

Estimate the average life satisfaction and average satifaction with family for the married and the divorced individuals.

Answer

dataset4 <- dataset3%>%
  filter(marital!= "Widowed" &  marital!= "Nevermarried")
#drop the widowed and never-married

mar_sat <- dataset4 %>%  
  group_by(marital) %>% 
  summarise(
    mean_sat6=mean(sat6), #calculate the mean of sat6 by marital1
    mean_famsat=mean(fam_sat)      ) 
mar_sat

## # A tibble: 2 × 3
##   marital  mean_sat6 mean_famsat
##   <fct>        <dbl>       <dbl>
## 1 Married       7.80        7.20
## 2 Divorced      6.67        6.54

# average life satisfaction for the divorced individuals is 6.676.

No. 4

Question

Generate a table to show the distribution of attitudes toward marriage for the married and the divorced individuals.
Then use a bar chart to show the distributions of attitudes the married and the divorced individuals.

Answer a.

dataset4$marital <- fct_drop(dataset4$marital) #drop unused levels
distribute_table<- tabyl(dataset4, mar_att1, marital) %>%
    adorn_totals("row") %>%#add row %
    adorn_percentages("col") %>% #add col
    adorn_pct_formatting()  #format the percentage
distribute_table

##               mar_att1 Married Divorced
##  1 Disagree completely   12.2%    40.1%
##                      2   12.0%    15.8%
##                      3   20.9%    22.6%
##                      4   16.9%     9.3%
##     5 Agree completely   38.1%    12.2%
##                  Total  100.0%   100.0%

Answer a.

plot1<- ggplot(data = dataset4,
                 mapping=aes(x = mar_att1,
                             y = ..prop..,# ask R to show % rather than absolute count
                             group = marital,#calculate the % of sat6 within each group of relstat_new
                             fill = marital)#color the bars different by relstat_new
                 )+
          geom_bar()+
          facet_wrap(~ marital)+
          coord_flip()+ #swap the coordinate system to make a horizontal bar, so that the labels are clear
          labs(title="Attitude: Marriage is a lifelong union that should not be broken")
  
plot1

No. 5

Question

I want to use OLS regression to examine whether the divorced people have lower life satisfaction than the married ones. Please use standard error robust OLS to do the modelling

5.1 Model1: regress life satisfaction on age and marital status.

5.2 What is your interpretation the coefficient of age and marital?

5.3 output the result to a html file.

Answer 5.1

regression1 <- lm_robust(data = dataset4,
                         formula = sat6 ~ age + marital )
summary(regression1)

## 
## Call:
## lm_robust(formula = sat6 ~ age + marital, data = dataset4)
## 
## Standard error type:  HC2 
## 
## Coefficients:
##                 Estimate Std. Error t value   Pr(>|t|) CI Lower CI Upper   DF
## (Intercept)      7.93862   0.298652  26.582 1.306e-133  7.35292  8.52432 2004
## age             -0.00409   0.008795  -0.465  6.420e-01 -0.02134  0.01316 2004
## maritalDivorced -1.12413   0.134097  -8.383  9.587e-17 -1.38711 -0.86114 2004
## 
## Multiple R-squared:  0.04766 ,   Adjusted R-squared:  0.04671 
## F-statistic: 35.57 on 2 and 2004 DF,  p-value: 6.616e-16

Answer 5.2

Interpreting the age coefficient: When the marital status is controlled, with one year increase in age, the life satisfaction decreased by 0.00409. But this age effect is not statistically significant at 5%.
Interpreting the marital coefficient: When the age is controlled, a divorced individuals has lower life satisfaction than a married individual by amount of 1.124 score. This effect of marital status is statistically significant.

Answer 5.3

# I export it into a html file. You can do whatever you want, say, word, or excel.
texreg::htmlreg(regression1, 
        include.ci = FALSE, 
        file = "output1.html") #html

## The table was written to the file 'output1.html'.

No. 6

Question

6.1 Please change the reference category of marital, using “divorced” as the reference in the regression conducted in Question 5.

6.2 Add a new variable, “parenthood” to your model in Question 6.1. Is there any change in the coefficents of age and marital?

Answer 6.1

dataset4$marital <- fct_relevel(dataset4$marital, "Divorced")

regression1a <- lm_robust(data = dataset4,
                         formula = sat6 ~ age + marital )
summary(regression1a)

## 
## Call:
## lm_robust(formula = sat6 ~ age + marital, data = dataset4)
## 
## Standard error type:  HC2 
## 
## Coefficients:
##                Estimate Std. Error t value  Pr(>|t|) CI Lower CI Upper   DF
## (Intercept)     6.81450   0.329093  20.707 1.794e-86  6.16910  7.45990 2004
## age            -0.00409   0.008795  -0.465 6.420e-01 -0.02134  0.01316 2004
## maritalMarried  1.12413   0.134097   8.383 9.587e-17  0.86114  1.38711 2004
## 
## Multiple R-squared:  0.04766 ,   Adjusted R-squared:  0.04671 
## F-statistic: 35.57 on 2 and 2004 DF,  p-value: 6.616e-16

Answer 6.2

regression2 <- lm_robust(data = dataset4,
                         formula = sat6 ~ age + marital + parenthood )
summary(regression2)

## 
## Call:
## lm_robust(formula = sat6 ~ age + marital + parenthood, data = dataset4)
## 
## Standard error type:  HC2 
## 
## Coefficients:
##                      Estimate Std. Error t value  Pr(>|t|) CI Lower CI Upper
## (Intercept)          6.814169   0.329195 20.6995 2.057e-86  6.16857  7.45977
## age                 -0.005173   0.009375 -0.5518 5.811e-01 -0.02356  0.01321
## maritalMarried       1.121696   0.134090  8.3653 1.109e-16  0.85873  1.38467
## parenthoodHave kids  0.047014   0.112394  0.4183 6.758e-01 -0.17341  0.26744
##                       DF
## (Intercept)         2003
## age                 2003
## maritalMarried      2003
## parenthoodHave kids 2003
## 
## Multiple R-squared:  0.04775 ,   Adjusted R-squared:  0.04632 
## F-statistic: 23.69 on 3 and 2003 DF,  p-value: 4.569e-15

The coefficient of “married” gets slightly smaller. The coefficient of age gets slightly more negative.

No. 7

Question

Using standard error robust OLS to do the following models

7.1 Model1: regress life satisfaction with family on age, marital status, parenthood status, and sex. Using “Married” as the reference group.

7.2 Model2: based on Model 1, add an interaction between marital status and sex.

7.3 Export results of Model 1 and Model 2 to a html file and rename the names of coefficients in a clearer way to understand. Does the association between marital status and family life satisfaction differ by gender?

Answer 7.1

dataset4$marital <- fct_relevel(dataset4$marital, "Married") 

model1 <- lm_robust(data = dataset4,
                         formula = fam_sat ~ age + marital + parenthood + sex )
summary(model1 )

## 
## Call:
## lm_robust(formula = fam_sat ~ age + marital + parenthood + sex, 
##     data = dataset4)
## 
## Standard error type:  HC2 
## 
## Coefficients:
##                     Estimate Std. Error t value  Pr(>|t|) CI Lower CI Upper
## (Intercept)          5.89003    0.45459  12.957 6.284e-37   4.9985  6.78155
## age                  0.05499    0.01335   4.118 3.981e-05   0.0288  0.08118
## maritalDivorced     -0.70494    0.16974  -4.153 3.420e-05  -1.0378 -0.37205
## parenthoodHave kids -0.50482    0.14393  -3.507 4.625e-04  -0.7871 -0.22255
## sex2 Female         -0.19946    0.10979  -1.817 6.941e-02  -0.4148  0.01586
##                       DF
## (Intercept)         2002
## age                 2002
## maritalDivorced     2002
## parenthoodHave kids 2002
## sex2 Female         2002
## 
## Multiple R-squared:  0.02361 ,   Adjusted R-squared:  0.02166 
## F-statistic: 11.49 on 4 and 2002 DF,  p-value: 3.208e-09

Answer 7.2

model2 <- lm_robust(data = dataset4,
                         formula = fam_sat ~ age + marital*sex + parenthood  )
summary(model2)

## 
## Call:
## lm_robust(formula = fam_sat ~ age + marital * sex + parenthood, 
##     data = dataset4)
## 
## Standard error type:  HC2 
## 
## Coefficients:
##                             Estimate Std. Error t value  Pr(>|t|) CI Lower
## (Intercept)                  5.85184    0.45541 12.8497 2.271e-36  4.95872
## age                          0.05505    0.01336  4.1201 3.941e-05  0.02885
## maritalDivorced             -0.48264    0.27038 -1.7850 7.441e-02 -1.01290
## sex2 Female                 -0.15543    0.11584 -1.3417 1.798e-01 -0.38261
## parenthoodHave kids         -0.49235    0.14393 -3.4208 6.368e-04 -0.77460
## maritalDivorced:sex2 Female -0.34132    0.34621 -0.9859 3.243e-01 -1.02030
##                             CI Upper   DF
## (Intercept)                  6.74497 2001
## age                          0.08125 2001
## maritalDivorced              0.04762 2001
## sex2 Female                  0.07176 2001
## parenthoodHave kids         -0.21009 2001
## maritalDivorced:sex2 Female  0.33765 2001
## 
## Multiple R-squared:  0.02414 ,   Adjusted R-squared:  0.0217 
## F-statistic: 9.218 on 5 and 2001 DF,  p-value: 1.096e-08

Answer 7.3

texreg::htmlreg(
          list(model1, model2),
          custom.coef.names = c("Intercept",
                                "Age",
                                "Divorced (Ref.=Married)",
                                "Have kids (Ref.=no kid)",
                                "Female (Ref.=Male)",
                                "Divorced x Female"
                               ),
                include.ci = FALSE, digits = 3,
                file = "models_rename.html") #html

## The table was written to the file 'models_rename.html'.

As the interaction between marital status and sex is not significant at 5% significance level, I can say the association between marital status and family life satisfaction does not differ by gender.

No. 8

Question

Using standard error robust OLS to do the following models

Add an interaction between marital status and and age in Model 1 (Question 7.1). Does the association between age and family life satisfaction differ by marital status?

Answer 8.1

model3 <- lm_robust(data = dataset4,
                         formula = fam_sat ~ marital*age + sex + parenthood  )
summary(model3)

## 
## Call:
## lm_robust(formula = fam_sat ~ marital * age + sex + parenthood, 
##     data = dataset4)
## 
## Standard error type:  HC2 
## 
## Coefficients:
##                     Estimate Std. Error t value  Pr(>|t|) CI Lower CI Upper
## (Intercept)          5.72962    0.46763 12.2524 2.472e-33  4.81253  6.64672
## maritalDivorced      0.69487    1.59051  0.4369 6.622e-01 -2.42437  3.81410
## age                  0.05977    0.01391  4.2968 1.816e-05  0.03249  0.08706
## sex2 Female         -0.19644    0.10977 -1.7896 7.367e-02 -0.41171  0.01883
## parenthoodHave kids -0.50691    0.14445 -3.5093 4.593e-04 -0.79020 -0.22363
## maritalDivorced:age -0.04083    0.04574 -0.8927 3.721e-01 -0.13053  0.04887
##                       DF
## (Intercept)         2001
## maritalDivorced     2001
## age                 2001
## sex2 Female         2001
## parenthoodHave kids 2001
## maritalDivorced:age 2001
## 
## Multiple R-squared:  0.02418 ,   Adjusted R-squared:  0.02174 
## F-statistic: 9.498 on 5 and 2001 DF,  p-value: 5.761e-09

texreg::htmlreg(list(model3),
                custom.coef.names = c("Intercept",
                                "Divorced (Ref.=Married)",
                                "Age",
                                "Female (Ref.=Male)",
                                "Have kids (Ref.=no kid)",
                                "Divorced x Age"
                               ),
                custom.model.names=c("Model3_age inter"),
                include.ci = FALSE, digits = 3, 
                file = "models3.html") #html

## The table was written to the file 'models3.html'.

As the interaction between marital status and age is not significant at 5% significance level, I can say the association between age and family life satisfaction does not differ by marital status.

Exercise 4

Mengni Chen

24-09-2025

Warming up

Question

Answer for a.

Answer for b.

No. 1

Question

Answer 1.1

Answer 1.2

Answer 1.3

Answer 1.4

No. 2

Question

Answer

No. 3

Question

Answer

No. 4

Question

Answer a.

Answer a.

No. 5

Question

Answer 5.1

Answer 5.2

Answer 5.3

No. 6

Question

Answer 6.1

Answer 6.2

No. 7

Question

Answer 7.1

Answer 7.2

Answer 7.3

No. 8

Question

Answer 8.1