No. 1

Question

Copy the following code to obtain dataset3. With dataset3, please do the following regression.

  • Model 1: regress life satisfaction on age, sex, martial status, parenthood status
  • Model 2: regress life satisfaction on age, sex, martial status, parenthood status, interaction between sex and martial status
#Run the following to prepare your dataset
library(tidyverse) # Add the tidyverse package to my current library.
library(haven) # Import data.
library(ggplot2) # Allows us to create nice figures.
library(estimatr) # Allows us to estimate (cluster-)robust standard errors.
library(texreg) # Allows us to make nicely-formatted Html & Latex regression tables.

#import data
dataset1 <- read_dta("anchor1_50percent_Eng.dta")
#a dataset of age, sex, life satisfaction, no.of children ever born

#create dataset2
dataset2 <- dataset1 %>% 
  transmute(
           age,
           sex_gen=as_factor(sex_gen) %>% fct_drop(), 
           nkidsbio=case_when(nkidsbio<0 ~ as.numeric(NA),
                              TRUE ~ as.numeric(nkidsbio)), 
           sat6=case_when(sat6<0 ~ as.numeric(NA), 
                          TRUE ~ as.numeric(sat6)), #specify when sat should be considered missing
           relstat=as_factor(relstat), 
           relstat=case_when(relstat=="-7 Incomplete data" ~ as.character(NA), 
                             TRUE ~ as.character(relstat)) %>%  
             as_factor() %>%  
             fct_drop()
           )  %>%  
  drop_na() 

#create dataset3
dataset3 <- dataset2 %>% 
  mutate(
    marital=case_when(
      relstat %in% c("1 Never married single","2 Never married LAT","3 Never married COHAB") ~ "Nevermarried",
      relstat %in% c("4 Married COHAB","5 Married noncohabiting") ~ 'Married',
      relstat %in% c("6 Divorced/separated single","7 Divorced/separated LAT","8 Divorced/separated COHAB") ~ 'Divorced',
      relstat %in% c("9 Widowed single","10 Widowed LAT") ~ 'Widow'
    ) %>% as_factor(), 
    parenthood=case_when(
      nkidsbio>0 ~ "Have kids",
      nkidsbio==0 ~ "No kids") %>% 
      as_factor()  
    )%>% 
  filter(marital!= "Widow")

Answer

model1 <- lm_robust(formula = sat6 ~ age +  sex_gen + marital + parenthood, data = dataset3)
summary(model1)
## 
## Call:
## lm_robust(formula = sat6 ~ age + sex_gen + marital + parenthood, 
##     data = dataset3)
## 
## Standard error type:  HC2 
## 
## Coefficients:
##                     Estimate Std. Error  t value  Pr(>|t|) CI Lower CI Upper
## (Intercept)          8.52022   0.087294  97.6037 0.000e+00  8.34910  8.69135
## age                 -0.04259   0.003852 -11.0543 3.843e-28 -0.05014 -0.03503
## sex_gen2 Female      0.03697   0.044375   0.8332 4.048e-01 -0.05002  0.12396
## maritalMarried       0.67765   0.080028   8.4677 3.099e-17  0.52077  0.83453
## maritalDivorced     -0.41739   0.147118  -2.8371 4.568e-03 -0.70579 -0.12898
## parenthoodHave kids  0.01415   0.081160   0.1744 8.616e-01 -0.14495  0.17326
##                       DF
## (Intercept)         6148
## age                 6148
## sex_gen2 Female     6148
## maritalMarried      6148
## maritalDivorced     6148
## parenthoodHave kids 6148
## 
## Multiple R-squared:  0.04032 ,   Adjusted R-squared:  0.03954 
## F-statistic: 42.91 on 5 and 6148 DF,  p-value: < 2.2e-16
model2 <- lm_robust(formula = sat6 ~ age +  sex_gen*marital + parenthood, data = dataset3)
summary(model2)
## 
## Call:
## lm_robust(formula = sat6 ~ age + sex_gen * marital + parenthood, 
##     data = dataset3)
## 
## Standard error type:  HC2 
## 
## Coefficients:
##                                  Estimate Std. Error   t value  Pr(>|t|)
## (Intercept)                      8.521746   0.087984  96.85557 0.000e+00
## age                             -0.042555   0.003853 -11.04338 4.330e-28
## sex_gen2 Female                  0.032771   0.053094   0.61722 5.371e-01
## maritalMarried                   0.683116   0.098286   6.95026 4.023e-12
## maritalDivorced                 -0.509612   0.249568  -2.04198 4.120e-02
## parenthoodHave kids              0.011517   0.081296   0.14167 8.873e-01
## sex_gen2 Female:maritalMarried  -0.005617   0.097550  -0.05758 9.541e-01
## sex_gen2 Female:maritalDivorced  0.142108   0.288261   0.49298 6.220e-01
##                                 CI Lower CI Upper   DF
## (Intercept)                      8.34927  8.69423 6146
## age                             -0.05011 -0.03500 6146
## sex_gen2 Female                 -0.07131  0.13685 6146
## maritalMarried                   0.49044  0.87579 6146
## maritalDivorced                 -0.99885 -0.02037 6146
## parenthoodHave kids             -0.14785  0.17089 6146
## sex_gen2 Female:maritalMarried  -0.19685  0.18562 6146
## sex_gen2 Female:maritalDivorced -0.42298  0.70720 6146
## 
## Multiple R-squared:  0.04039 ,   Adjusted R-squared:  0.0393 
## F-statistic: 30.68 on 7 and 6146 DF,  p-value: < 2.2e-16

No. 2

Question

Combine model 1 to model 2, output the results to a html file and label the names of independent variables to make readers easy to understand.

Answer

#first create Regression.html to see the order of variables
htmlreg(list(model1, model2), include.ci = FALSE,
        caption = "My regression table", 
        caption.above = TRUE,
        single.row = TRUE,
        digits = 3,
        file = "Regression.html")
## The table was written to the file 'Regression.html'.
#first create Regression_readable names.html to change the variable names accroding to the order of the variables shown in Regression.html
htmlreg(list(model1, model2), include.ci = FALSE,
        custom.coef.names = c("Intercept", 
                              "Age", 
                              "Female",
                              "Married", 
                              "Divorced",
                              "Have kids", 
                              "Female x married",
                              "Female x divorced"),
        caption = "My regression table", 
        caption.above = TRUE,
        single.row = TRUE,
        digits = 3,
        file = "Regression_readable names.html")
## The table was written to the file 'Regression_readable names.html'.

No. 3

Question

Save the result of Model 2 as a dataset

Answer

result_m2<- model2 %>% broom::tidy()
result_m2
##                              term     estimate   std.error    statistic
## 1                     (Intercept)  8.521746003 0.087984058  96.85556900
## 2                             age -0.042555021 0.003853442 -11.04337806
## 3                 sex_gen2 Female  0.032770627 0.053094178   0.61721694
## 4                  maritalMarried  0.683116214 0.098286482   6.95025602
## 5                 maritalDivorced -0.509611544 0.249567598  -2.04197799
## 6             parenthoodHave kids  0.011517126 0.081296432   0.14166828
## 7  sex_gen2 Female:maritalMarried -0.005617192 0.097550190  -0.05758258
## 8 sex_gen2 Female:maritalDivorced  0.142108391 0.288261430   0.49298441
##        p.value    conf.low   conf.high   df outcome
## 1 0.000000e+00  8.34926645  8.69422556 6146    sat6
## 2 4.329806e-28 -0.05010912 -0.03500092 6146    sat6
## 3 5.371145e-01 -0.07131255  0.13685380 6146    sat6
## 4 4.022887e-12  0.49044030  0.87579212 6146    sat6
## 5 4.119631e-02 -0.99885140 -0.02037169 6146    sat6
## 6 8.873467e-01 -0.14785234  0.17088659 6146    sat6
## 7 9.540830e-01 -0.19684971  0.18561533 6146    sat6
## 8 6.220412e-01 -0.42298492  0.70720170 6146    sat6
#or you can have the following code
result_m2<- tidy(model2)

No. 4

Question

Plot the result in a point chart, and also display the confidence interval of each coefficient

Answer

ggplot(data = result_m2,
       aes(x = term, y = estimate, ymin = conf.low, ymax = conf.high)) +
  geom_pointrange() + #this is the code to ploty point with confidence interval, but you have to specify ymin and ymax for the CI
  coord_flip() +
  geom_hline(yintercept = 0, color = "red", lty = "dashed") # yintercept is to make y=0 as the ref. line, lty is to specify the line is a dashed line  

to make your graph a nicer one

result_m2$predictor <- c("Intercept", 
                              "Age", 
                              "Female",
                              "Married", 
                              "Divorced",
                              "Have kids", 
                              "Female x married",
                              "Female x divorced") #create a variable named "predictor" in the result_m2 dataset.
#generate a variable "predictor" in the dataset "result_m2" where the IVs are labelled in a way easy to understand
result_m2$predictor <- as_factor(result_m2$predictor)
ggplot(data = result_m2,
       aes(x = predictor, y = estimate, ymin = conf.low, ymax = conf.high)) + #now i change x=term to x= predictor
  geom_pointrange() + #this is the code to ploty point with confidence interval, but you have to specify ymin and ymax for the CI
  coord_flip() +
  geom_hline(yintercept = 0, color = "red", lty = "dashed")+ # yintercept is to make y=0 as the ref. line, lty is to specify the line is a dahed line  
  ylab("Regression coefficient with CI") #change the name of yaxis


#you can also test what if you don't make result_m2$predictor as a factor.

#if you don't want to include intercept
ggplot(data = result_m2[-1,], #remove row 1, as intercetp is located in row 1
       aes(x = predictor, y = estimate, ymin = conf.low, ymax = conf.high)) + #now i change x=term to x= predictor
  geom_pointrange() + #this is the code to ploty point with confidence interval, but you have to specify ymin and ymax for the CI
  coord_flip() +
  geom_hline(yintercept = 0, color = "red", lty = "dashed")+ # yintercept is to make y=0 as the ref. line, lty is to specify the line is a dahed line 
  ylab("Regression coefficient with CI") #change the name of yaxis

No. 5

Question

Now, you are going to compile a dataset that have variables: id, age, gender(sex_gen), labor force status(lfs), number of children(nkidsbio), and life satisfaction (sat6).

  • Step 1: please import data of wave1-6 (i.e.from “data anchor1_50percent_Eng.dta” to “data anchor6_50percent_Eng.dta”) and check whether the coding and levels are consistent across 6 waves

Answer

wave1 <- read_dta("anchor1_50percent_Eng.dta")
wave2 <- read_dta("anchor2_50percent_Eng.dta")
wave3 <- read_dta("anchor3_50percent_Eng.dta")
wave4 <- read_dta("anchor4_50percent_Eng.dta")
wave5 <- read_dta("anchor5_50percent_Eng.dta")
wave6 <- read_dta("anchor6_50percent_Eng.dta")

No. 6

Question

Now, you are going to compile a dataset that have variables: id, age, gender(sex_gen), labor force status(lfs), number of children(nkidsbio), and life satisfaction (sat6).

  • Step 2: check whether the coding and levels are consistent across 6 waves for these variables:
    • age,
    • gender(sex_gen),
    • labor force status(lfs),
    • number of children(nkidsbio),
    • life satisfaction (sat6)

Answer

#check coding across 6 waves

sex_fun <- function(df) {
  table(as_factor(df$sex_gen))
        }
sapply(mget(paste0("wave", 1:6)), sex_fun)
##                                   wave1 wave2 wave3 wave4 wave5 wave6
## -10 not in demodiff                   0     0     0     0     0     0
## -7 Incomplete data                    0     0     0     0     0     0
## -4 Filter error / Incorrect entry     0     0     0     0     0     0
## -3 Does not apply                     0     0     0     0     0     0
## 1 Male                             3029  2197  1905  1668  1493  1342
## 2 Female                           3172  2339  2050  1813  1626  1477
#same coding for gender

lfs_fun <- function(df) {
  table(as_factor(df$lfs))
        }
sapply(mget(paste0("wave", 1:6)), lfs_fun)
##                                                        wave1 wave2 wave3 wave4
## -7 Incomplete data                                        12    22    24     7
## -3 Does not apply                                          0     0     0     0
## 1 nw, education                                         2229  1441  1093   725
## 2 nw, parental leave                                     237   146   148   116
## 3 nw, homemaker                                          253   120    97    85
## 4 nw, unemployed                                         297   235   180   156
## 5 nw, military service                                     9     8    33    30
## 6 nw, retired                                             19    19    21    22
## 7 nw, other                                               33    15    21    26
## 8 w, vocational training                                 308   387   371   381
## 9 w, full-time employment                               1929  1337  1206  1159
## 10 w, part-time employment                               468   419   405   415
## 11 w, marginal employment (geringfügige Beschäftigung)   142   137   129   144
## 12 w, self-employed                                      202   164   159   153
## 13 w, other                                               63    86    68    62
##                                                        wave5 wave6
## -7 Incomplete data                                         4     1
## -3 Does not apply                                          0     0
## 1 nw, education                                          499   425
## 2 nw, parental leave                                      90    78
## 3 nw, homemaker                                           69    50
## 4 nw, unemployed                                         133   124
## 5 nw, military service                                    44    16
## 6 nw, retired                                             22    27
## 7 nw, other                                               28    19
## 8 w, vocational training                                 324   232
## 9 w, full-time employment                               1127  1109
## 10 w, part-time employment                               409   388
## 11 w, marginal employment (geringfügige Beschäftigung)   146   146
## 12 w, self-employed                                      167   156
## 13 w, other                                               57    48
#same coding for labor force participation

sat_fun <- function(df) {
  table(as_factor(df$sat6))
        }
sapply(mget(paste0("wave", 1:6)), sat_fun)
##                                   wave1 wave2 wave3 wave4 wave5 wave6
## -5 Inconsistent value                 0     0     0     0     0     0
## -4 Filter error / Incorrect entry     0     0     0     0     0     0
## -3 Does not apply                     0     0     0     0     0     0
## -2 No answer                          2     5     4     4     3     4
## -1 Don't know                         3     1     1     0     0     0
## 0 Very dissatisfied                  26    15     9    13     4     5
## 1                                    18     5    10    12     7     5
## 2                                    45    27    38    34    22    20
## 3                                   110    58    57    56    60    42
## 4                                   133    84    88    72    75    81
## 5                                   395   249   221   205   188   130
## 6                                   508   316   291   282   219   223
## 7                                  1178   898   863   726   691   592
## 8                                  1877  1417  1282  1138  1042   974
## 9                                  1157   933   701   631   563   492
## 10 Very satisfied                   749   528   390   308   245   251
#same coding for life satisfaction

age_fun <- function(df) {
  summary(df$age)
        }
sapply(mget(paste0("wave", 1:6)), age_fun)
##            wave1   wave2    wave3    wave4    wave5    wave6
## Min.    14.00000 15.0000 16.00000 17.00000 18.00000 19.00000
## 1st Qu. 17.00000 17.0000 18.00000 19.00000 20.00000 21.00000
## Median  26.00000 27.0000 28.00000 29.00000 30.00000 31.00000
## Mean    25.83728 26.4235 27.24526 28.46596 29.54569 30.66761
## 3rd Qu. 35.00000 36.0000 37.00000 38.00000 39.00000 40.00000
## Max.    38.00000 39.0000 40.00000 41.00000 42.00000 43.00000
# no meaningless age

kid_fun <- function(df) {
  summary(df$nkidsbio)
        }
sapply(mget(paste0("wave", 1:6)), kid_fun)
##             wave1  wave2      wave3      wave4      wave5      wave6
## Min.    -7.000000 -7.000  0.0000000  0.0000000  0.0000000  0.0000000
## 1st Qu.  0.000000  0.000  0.0000000  0.0000000  0.0000000  0.0000000
## Median   0.000000  0.000  0.0000000  0.0000000  0.0000000  0.0000000
## Mean     0.600387  0.625  0.6558786  0.7199081  0.7608208  0.8031217
## 3rd Qu.  1.000000  1.000  1.0000000  1.0000000  2.0000000  2.0000000
## Max.    10.000000 10.000 10.0000000 10.0000000 10.0000000 10.0000000
#in wave1 and 2, there are respondents who have no. of children <0.

No. 7

Question

Now, you are going to compile a dataset that have variables: id, age, gender(sex_gen), labor force status(lfs), number of children(nkidsbio), and life satisfaction (sat6).

  • Step 3: clean the variables across 6 waves and drop all missing values. For each wave, please keep id, wave, age, sex_gen, lfs, nkidsbio, and sat6. And make sure these variable are cleaned.

Answer

clean_fun <- function(df) {
df %>% 
  transmute(
    id, 
    wave,
    age, 
    sex=as_factor(sex_gen), #make sex_gen as a factor
    lfs=as_factor(lfs), #make lfs as a factor
    lfs=case_when(lfs== "-7 Incomplete data" ~ as.character(NA), #specify when is missing for lfs
                      TRUE ~ as.character(lfs))%>%  
      as_factor(), #make lfs as a factor again
    kidno=case_when(nkidsbio<0 ~ as.numeric(NA),  #specify when hlt1 is missing 
                   TRUE ~ as.numeric(nkidsbio)),
    sat=case_when(sat6<0 ~ as.numeric(NA), #specify when sat6 is missing
                   TRUE ~ as.numeric(sat6))
                   ) %>% drop_na()
}

wave1a <- clean_fun(wave1)
wave2a <- clean_fun(wave2)
wave3a <- clean_fun(wave3)
wave4a <- clean_fun(wave4)
wave5a <- clean_fun(wave5)
wave6a <- clean_fun(wave6)