No. 1

Question

With a dataset of age, sex, martial status, parenthood status, life satisfaction, and weight

  • Model 1: regress life satisfaction on age, sex, martial status, parenthood status
  • Model 2: regress life satisfaction on age, sex, martial status, parenthood status, interaction between sex and martial status
  • Model 3: regress life satisfaction on age, sex, martial status, parenthood status, interaction between sex and parenthood status
  • Model 4: regress life satisfaction on age, sex, martial status, parenthood status, interaction between martial and parenthood status
  • Model 5: regress life satisfaction on age, sex, martial status, parenthood, interaction between sex and martial status, interaction between sex and parenthood status, interaction between marital and parenthood status
#Run the following to prepare your dataset
library(tidyverse) # Add the tidyverse package to my current library.
library(haven) # Import data.
library(Hmisc) # Weighting
library(ggplot2) # Allows us to create nice figures.
library(estimatr) # Allows us to estimate (cluster-)robust standard errors.
library(texreg) # Allows us to make nicely-formatted Html & Latex regression tables.
dataset1 <- read_dta("anchor1_50percent_Eng.dta")
#a dataset of age, sex, life satisfaction, no.of children ever born, weight
dataset2 <- dataset1 %>% 
  transmute(
           age=zap_labels(age),
           sex_gen=as_factor(sex_gen) %>% fct_drop(), 
           nkidsbio=case_when(nkidsbio<0 ~ as.numeric(NA),TRUE ~ as.numeric(nkidsbio)) %>% 
             zap_label(), 
           sat6=case_when(sat6<0 ~ as.numeric(NA), 
                          TRUE ~ as.numeric(sat6)) %>% #specify when sat should be considered missing
             zap_label(),
           relstat=as_factor(relstat), 
           relstat=case_when(relstat=="-7 Incomplete data" ~ as.character(NA), 
                             TRUE ~ as.character(relstat)) %>%  
             as_factor() %>%  
             fct_drop(), 
           cdweight=zap_label(cdweight))  %>%  
  drop_na() 
dataset3 <- dataset2 %>% 
  mutate(
    marital=case_when(
      relstat %in% c("1 Never married single","2 Never married LAT","3 Never married COHAB") ~ "Nevermarried",
      relstat %in% c("4 Married COHAB","5 Married noncohabiting") ~ 'Married',
      relstat %in% c("6 Divorced/separated single","7 Divorced/separated LAT","8 Divorced/separated COHAB") ~ 'Divorced',
      relstat %in% c("9 Widowed single","10 Widowed LAT") ~ 'Widow'
    ) %>% as_factor(), 
    parenthood=case_when(
      nkidsbio>0 ~ "Have kids",
      nkidsbio==0 ~ "No kids") %>% 
      as_factor()  
    )%>% 
  filter(marital!= "Widow")

Answer

model1 <- lm_robust(formula = sat6 ~ age +  sex_gen + marital + parenthood, data = dataset3, weights = cdweight)
summary(model1)
## 
## Call:
## lm_robust(formula = sat6 ~ age + sex_gen + marital + parenthood, 
##     data = dataset3, weights = cdweight)
## 
## Weighted, Standard error type:  HC2 
## 
## Coefficients:
##                     Estimate Std. Error t value  Pr(>|t|) CI Lower CI Upper
## (Intercept)          8.53299   0.101602 83.9848 0.000e+00  8.33382  8.73217
## age                 -0.04473   0.004602 -9.7191 3.606e-22 -0.05375 -0.03571
## sex_gen2 Female      0.03982   0.051325  0.7759 4.379e-01 -0.06079  0.14044
## maritalMarried       0.70646   0.092586  7.6303 2.699e-14  0.52495  0.88796
## maritalDivorced     -0.55662   0.177203 -3.1411 1.691e-03 -0.90400 -0.20924
## parenthoodHave kids  0.04590   0.092878  0.4942 6.212e-01 -0.13617  0.22797
##                       DF
## (Intercept)         6148
## age                 6148
## sex_gen2 Female     6148
## maritalMarried      6148
## maritalDivorced     6148
## parenthoodHave kids 6148
## 
## Multiple R-squared:  0.04644 ,   Adjusted R-squared:  0.04567 
## F-statistic: 35.11 on 5 and 6148 DF,  p-value: < 2.2e-16
model2 <- lm_robust(formula = sat6 ~ age +  sex_gen*marital + parenthood, data = dataset3, weights = cdweight)
summary(model2)
## 
## Call:
## lm_robust(formula = sat6 ~ age + sex_gen * marital + parenthood, 
##     data = dataset3, weights = cdweight)
## 
## Weighted, Standard error type:  HC2 
## 
## Coefficients:
##                                 Estimate Std. Error t value  Pr(>|t|) CI Lower
## (Intercept)                      8.52739   0.102039 83.5695 0.000e+00  8.32736
## age                             -0.04470   0.004605 -9.7072 4.048e-22 -0.05373
## sex_gen2 Female                  0.05188   0.062022  0.8365 4.029e-01 -0.06970
## maritalMarried                   0.75108   0.113696  6.6060 4.279e-11  0.52820
## maritalDivorced                 -0.69430   0.296572 -2.3411 1.926e-02 -1.27569
## parenthoodHave kids              0.04023   0.092849  0.4333 6.648e-01 -0.14179
## sex_gen2 Female:maritalMarried  -0.07517   0.111234 -0.6758 4.992e-01 -0.29323
## sex_gen2 Female:maritalDivorced  0.23749   0.336909  0.7049 4.809e-01 -0.42297
##                                 CI Upper   DF
## (Intercept)                      8.72742 6146
## age                             -0.03567 6146
## sex_gen2 Female                  0.17347 6146
## maritalMarried                   0.97397 6146
## maritalDivorced                 -0.11292 6146
## parenthoodHave kids              0.22224 6146
## sex_gen2 Female:maritalMarried   0.14288 6146
## sex_gen2 Female:maritalDivorced  0.89795 6146
## 
## Multiple R-squared:  0.04672 ,   Adjusted R-squared:  0.04564 
## F-statistic: 25.49 on 7 and 6146 DF,  p-value: < 2.2e-16
model3 <- lm_robust(formula = sat6 ~ age +  marital + sex_gen*parenthood, data = dataset3, weights = cdweight)
summary(model3)
## 
## Call:
## lm_robust(formula = sat6 ~ age + marital + sex_gen * parenthood, 
##     data = dataset3, weights = cdweight)
## 
## Weighted, Standard error type:  HC2 
## 
## Coefficients:
##                                      Estimate Std. Error  t value  Pr(>|t|)
## (Intercept)                          8.545482   0.102278 83.55112 0.000e+00
## age                                 -0.044742   0.004603 -9.71935 3.599e-22
## maritalMarried                       0.710073   0.092718  7.65839 2.174e-14
## maritalDivorced                     -0.558290   0.177113 -3.15217 1.628e-03
## sex_gen2 Female                      0.012087   0.061776  0.19565 8.449e-01
## parenthoodHave kids                 -0.005611   0.115252 -0.04869 9.612e-01
## sex_gen2 Female:parenthoodHave kids  0.091956   0.110740  0.83038 4.064e-01
##                                     CI Lower CI Upper   DF
## (Intercept)                          8.34498  8.74598 6147
## age                                 -0.05377 -0.03572 6147
## maritalMarried                       0.52831  0.89183 6147
## maritalDivorced                     -0.90549 -0.21109 6147
## sex_gen2 Female                     -0.10902  0.13319 6147
## parenthoodHave kids                 -0.23155  0.22032 6147
## sex_gen2 Female:parenthoodHave kids -0.12513  0.30905 6147
## 
## Multiple R-squared:  0.04658 ,   Adjusted R-squared:  0.04565 
## F-statistic: 29.35 on 6 and 6147 DF,  p-value: < 2.2e-16
model4 <- lm_robust(formula = sat6 ~ age +  sex_gen + marital*parenthood, data = dataset3, weights = cdweight)
summary(model4)
## 
## Call:
## lm_robust(formula = sat6 ~ age + sex_gen + marital * parenthood, 
##     data = dataset3, weights = cdweight)
## 
## Weighted, Standard error type:  HC2 
## 
## Coefficients:
##                                     Estimate Std. Error t value  Pr(>|t|)
## (Intercept)                          8.51169   0.102365 83.1502 0.000e+00
## age                                 -0.04325   0.004679 -9.2430 3.238e-20
## sex_gen2 Female                      0.04928   0.051414  0.9585 3.379e-01
## maritalMarried                       0.52119   0.125159  4.1643 3.166e-05
## maritalDivorced                     -0.55506   0.386008 -1.4379 1.505e-01
## parenthoodHave kids                 -0.16787   0.135641 -1.2376 2.159e-01
## maritalMarried:parenthoodHave kids   0.41169   0.177384  2.3209 2.033e-02
## maritalDivorced:parenthoodHave kids  0.16529   0.434183  0.3807 7.035e-01
##                                     CI Lower CI Upper   DF
## (Intercept)                          8.31101  8.71236 6146
## age                                 -0.05243 -0.03408 6146
## sex_gen2 Female                     -0.05151  0.15007 6146
## maritalMarried                       0.27584  0.76655 6146
## maritalDivorced                     -1.31177  0.20166 6146
## parenthoodHave kids                 -0.43378  0.09803 6146
## maritalMarried:parenthoodHave kids   0.06395  0.75942 6146
## maritalDivorced:parenthoodHave kids -0.68586  1.01644 6146
## 
## Multiple R-squared:  0.04774 ,   Adjusted R-squared:  0.04665 
## F-statistic: 26.53 on 7 and 6146 DF,  p-value: < 2.2e-16
model5 <- lm_robust(formula = sat6 ~ age +  sex_gen*marital + sex_gen*parenthood + marital*parenthood, data = dataset3, weights = cdweight)
summary(model5)
## 
## Call:
## lm_robust(formula = sat6 ~ age + sex_gen * marital + sex_gen * 
##     parenthood + marital * parenthood, data = dataset3, weights = cdweight)
## 
## Weighted, Standard error type:  HC2 
## 
## Coefficients:
##                                     Estimate Std. Error t value  Pr(>|t|)
## (Intercept)                          8.51607   0.102971 82.7034 0.000e+00
## age                                 -0.04335   0.004681 -9.2610 2.743e-20
## sex_gen2 Female                      0.04442   0.063007  0.7050 4.808e-01
## maritalMarried                       0.66539   0.162694  4.0898 4.373e-05
## maritalDivorced                     -0.58942   0.436013 -1.3518 1.765e-01
## parenthoodHave kids                 -0.31184   0.175935 -1.7725 7.637e-02
## sex_gen2 Female:maritalMarried      -0.24832   0.175795 -1.4125 1.578e-01
## sex_gen2 Female:maritalDivorced      0.08666   0.365845  0.2369 8.128e-01
## sex_gen2 Female:parenthoodHave kids  0.24013   0.176603  1.3597 1.740e-01
## maritalMarried:parenthoodHave kids   0.41784   0.177308  2.3566 1.848e-02
## maritalDivorced:parenthoodHave kids  0.13458   0.438904  0.3066 7.591e-01
##                                     CI Lower CI Upper   DF
## (Intercept)                          8.31422  8.71793 6143
## age                                 -0.05253 -0.03418 6143
## sex_gen2 Female                     -0.07910  0.16794 6143
## maritalMarried                       0.34645  0.98433 6143
## maritalDivorced                     -1.44416  0.26532 6143
## parenthoodHave kids                 -0.65673  0.03306 6143
## sex_gen2 Female:maritalMarried      -0.59294  0.09630 6143
## sex_gen2 Female:maritalDivorced     -0.63052  0.80385 6143
## sex_gen2 Female:parenthoodHave kids -0.10608  0.58633 6143
## maritalMarried:parenthoodHave kids   0.07025  0.76543 6143
## maritalDivorced:parenthoodHave kids -0.72582  0.99499 6143
## 
## Multiple R-squared:  0.04852 ,   Adjusted R-squared:  0.04697 
## F-statistic:  18.9 on 10 and 6143 DF,  p-value: < 2.2e-16

No. 2

Question

Combine model 1 to model 5, output the results to a word file and label the names of independent variables to make readers easy to understand.

Answer

htmlreg(list(model1, model2, model3, model4, model5), include.ci = FALSE,
        custom.coef.names = c("Intercept", 
                              "Age", 
                              "Female",
                              "Married", 
                              "Divorced",
                              "Have kids", 
                              "Female x married",
                              "Female x divorced",
                              "Female x have kids",
                              "Married x have kids",
                              "Divorced x have kids"),
        caption = "My regression table", 
        caption.above = TRUE,
        single.row = TRUE,
        digits = 3,
        file = "Regression.doc")
## The table was written to the file 'Regression.doc'.

No. 3

Question

Save the result of Model 5 as a dataset

Answer

result_m5<- model5 %>% tidy()
result_m5
##                                   term    estimate   std.error  statistic
## 1                          (Intercept)  8.51607485 0.102971268 82.7034085
## 2                                  age -0.04335266 0.004681201 -9.2610107
## 3                      sex_gen2 Female  0.04441926 0.063007202  0.7049870
## 4                       maritalMarried  0.66538729 0.162694340  4.0897999
## 5                      maritalDivorced -0.58941660 0.436013369 -1.3518315
## 6                  parenthoodHave kids -0.31183846 0.175935007 -1.7724640
## 7       sex_gen2 Female:maritalMarried -0.24831929 0.175795053 -1.4125499
## 8      sex_gen2 Female:maritalDivorced  0.08666231 0.365844524  0.2368829
## 9  sex_gen2 Female:parenthoodHave kids  0.24012529 0.176602963  1.3596901
## 10  maritalMarried:parenthoodHave kids  0.41783963 0.177308234  2.3565720
## 11 maritalDivorced:parenthoodHave kids  0.13458419 0.438904344  0.3066367
##         p.value    conf.low   conf.high   df outcome
## 1  0.000000e+00  8.31421510  8.71793460 6143    sat6
## 2  2.743123e-20 -0.05252945 -0.03417586 6143    sat6
## 3  4.808450e-01 -0.07909692  0.16793544 6143    sat6
## 4  4.372627e-05  0.34644940  0.98432518 6143    sat6
## 5  1.764790e-01 -1.44415551  0.26532231 6143    sat6
## 6  7.636712e-02 -0.65673270  0.03305577 6143    sat6
## 7  1.578388e-01 -0.59293916  0.09630059 6143    sat6
## 8  8.127555e-01 -0.63052108  0.80384571 6143    sat6
## 9  1.739779e-01 -0.10607837  0.58632895 6143    sat6
## 10 1.847571e-02  0.07025339  0.76542587 6143    sat6
## 11 7.591303e-01 -0.72582205  0.99499042 6143    sat6
#or you can have the following code
result_m5<- tidy(model5)

No. 4

Question

Plot the result in a point chart, and also display the confidence interval of each coefficient

Answer

ggplot(data = result_m5,
       aes(x = term, y = estimate, ymin = conf.low, ymax = conf.high)) +
  geom_pointrange() + #this is the code to ploty point with confidence interval, but you have to specify ymin and ymax for the CI
  coord_flip() +
  geom_hline(yintercept = 0, color = "red", lty = "dashed") # yintercept is to make y=0 as the ref. line, lty is to specify the line is a dashed line  

to make your graph a nicer one

result_m5$predictor <- c("Intercept", 
                              "Age", 
                              "Female",
                              "Married", 
                              "Divorced",
                              "Have kids", 
                              "Female x married",
                              "Female x divorced",
                              "Female x have kids",
                              "Married x have kids",
                              "Divorced x have kids")
#generate a variable "predictor" in the dataset "result_m5" where the IVs are labelled in a way easy to understand
ggplot(data = result_m5,
       aes(x = predictor, y = estimate, ymin = conf.low, ymax = conf.high)) + #now i change x=term to x= predictor
  geom_pointrange() + #this is the code to ploty point with confidence interval, but you have to specify ymin and ymax for the CI
  coord_flip() +
  geom_hline(yintercept = 0, color = "red", lty = "dashed") # yintercept is to make y=0 as the ref. line, lty is to specify the line is a dahed line  

or you may not want to show the intercept

result_m5$predictor <- c("Intercept", 
                              "Age", 
                              "Female",
                              "Married", 
                              "Divorced",
                              "Have kids", 
                              "Female x married",
                              "Female x divorced",
                              "Female x have kids",
                              "Married x have kids",
                              "Divorced x have kids")
result_m5 <- filter(result_m5, predictor!="Intercept")

#generate a variable "predictor" in the dataset "result_m5" where the IVs are labelled in a way easy to understand
ggplot(data = result_m5,
       aes(x = predictor, y = estimate, ymin = conf.low, ymax = conf.high)) + #now i change x=term to x= predictor
  geom_pointrange() + #this is the code to plot point with confidence interval, but you have to specify ymin and ymax for the CI
  coord_flip() +
  geom_hline(yintercept = 0, color = "red", lty = "dashed") # yintercept is to make y=0 as the ref. line, lty is to specify the line is a dashed line