Prepare the dataset for analysis
library(tidyverse) # Add the tidyverse package to my current library.
library(haven) # Import data.
library(Hmisc) # Weighting
library(estimatr) # Allows us to estimate (cluster-)robust standard errors.
library(texreg) # Allows us to make nicely-formatted Html & Latex regression tables.
dataset1 <- read_dta("anchor1_50percent_Eng.dta")
#a dataset of age, sex, life satisfaction, no.of children ever born, weight
dataset2 <- dataset1 %>%
transmute(
age=zap_labels(age), #remove labels of age
sex_gen=as_factor(sex_gen) %>% fct_drop(), #treat sex_gen as a categorical, and drop unused levels
nkidsbio=case_when(nkidsbio<0 ~ as.numeric(NA),TRUE ~ as.numeric(nkidsbio)) %>%
zap_label(), #specify when nkidsbio should be missing and then remove the label of nkidsbio
sat6=case_when(sat6<0 ~ as.numeric(NA), TRUE ~ as.numeric(sat6)) %>% #specify when sat should be considered missing
zap_label(), #remove labels of sat6
relstat=as_factor(relstat), #treat relationship status as categorical
relstat=case_when(relstat=="-7 Incomplete data" ~ as.character(NA), TRUE ~ as.character(relstat)) %>% #specify when relstat should be considered missing
as_factor() %>% #make relstat as a factor again, as the ~ as.character(NA) has made it a character variable
fct_drop(), #drop unused levels in relstat
cdweight=zap_label(cdweight)) %>% #remove labels of cdweight
drop_na() #remove all observations that are missing
dataset3 <- dataset2 %>%
mutate(
marital=case_when(
relstat %in% c("1 Never married single","2 Never married LAT","3 Never married COHAB") ~ "Nevermarried",
# when relstat has any of the three situations, I assign "Nevermarried" to new variable "marital1"
relstat %in% c("4 Married COHAB","5 Married noncohabiting") ~ 'Married',
# when relstat has any of the two situations, I assign "Married" to new variable "marital1
relstat %in% c("6 Divorced/separated single","7 Divorced/separated LAT","8 Divorced/separated COHAB") ~ 'Divorced',
# when relstat has any of the three situations, I assign "Divorced" to new variable "marital1"
relstat %in% c("9 Widowed single","10 Widowed LAT") ~ 'Widow'
# when relstat has any of the two situations, I assign "Widow" to new variable "marital1"
) %>% as_factor(),
#after case_when, marital is a character variable, so make it a facto nor
parenthood=case_when(
nkidsbio>0 ~ "Have kids",
# when nkidsbio should be "Have kids"
nkidsbio==0 ~ "No kids") %>% # when nkidsbio should be "No kids"
as_factor() #make parenthood a categorical variable
)
dataset4 <- filter(dataset3, marital!= "Widow") #exclude respondents who are widow
No. 1
Question
- 1) Based on the dataset cleaned above, please create model1:
Regress life satisfaction on marital status and parenthood while
considering the weight and robust standard error
- 2) Add age varialbe to Model1
Answer
#1):Create model1: Regress life satisfaction on marital status and parenthood while considering the weight and robust standard error
model1 <- lm_robust(data = dataset4,
formula = sat6 ~ marital + parenthood,
weights = cdweight)
#ols regression with considering weight and robust standard errors
summary(model1)
##
## Call:
## lm_robust(formula = sat6 ~ marital + parenthood, data = dataset4,
## weights = cdweight)
##
## Weighted, Standard error type: HC2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|) CI Lower CI Upper
## (Intercept) 7.5285 0.03204 234.992 0.000e+00 7.4657 7.59131
## maritalMarried 0.4271 0.08673 4.925 8.663e-07 0.2571 0.59715
## maritalDivorced -0.8738 0.17238 -5.069 4.116e-07 -1.2117 -0.53588
## parenthoodHave kids -0.2109 0.08680 -2.430 1.513e-02 -0.3811 -0.04076
## DF
## (Intercept) 6150
## maritalMarried 6150
## maritalDivorced 6150
## parenthoodHave kids 6150
##
## Multiple R-squared: 0.02103 , Adjusted R-squared: 0.02055
## F-statistic: 27.01 on 3 and 6150 DF, p-value: < 2.2e-16
#2) Add age varialbe to Model1
model2 <- lm_robust(data = dataset4,
formula = sat6 ~ marital + parenthood + age,
weights = cdweight)
#ols regression with considering weight and robust standard errors
summary(model2)
##
## Call:
## lm_robust(formula = sat6 ~ marital + parenthood + age, data = dataset4,
## weights = cdweight)
##
## Weighted, Standard error type: HC2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|) CI Lower CI Upper
## (Intercept) 8.55612 0.097218 88.0098 0.000e+00 8.36554 8.74670
## maritalMarried 0.71006 0.092423 7.6827 1.802e-14 0.52887 0.89124
## maritalDivorced -0.55182 0.176797 -3.1212 1.810e-03 -0.89840 -0.20523
## parenthoodHave kids 0.05082 0.092537 0.5492 5.829e-01 -0.13058 0.23223
## age -0.04497 0.004593 -9.7906 1.802e-22 -0.05397 -0.03596
## DF
## (Intercept) 6149
## maritalMarried 6149
## maritalDivorced 6149
## parenthoodHave kids 6149
## age 6149
##
## Multiple R-squared: 0.04632 , Adjusted R-squared: 0.0457
## F-statistic: 43.84 on 4 and 6149 DF, p-value: < 2.2e-16
No. 2
Question
- Export the results of the two models to a html format and
change the names of variables so that the output is easy to
read
Answer
#Export to html
texreg::htmlreg(list(model1, model2),
custom.coef.names=c(
"Intercept",
"Married",
"Divorced",
"Have kids",
"Age"),
include.ci = FALSE,
file = "MyOLSModels.html") #html
## The table was written to the file 'MyOLSModels.html'.
No. 3
Question
- Model3: Add an interaction between parenthood and gender to
Model 2
- Does the relationship between parenthood and life
satisfaction vary across gender?
- How to interpret the interaction”
Answer
model3 <- lm_robust(data = dataset4,
formula = sat6 ~ marital + parenthood*sex_gen + age,
weights = cdweight)
summary(model3)
##
## Call:
## lm_robust(formula = sat6 ~ marital + parenthood * sex_gen + age,
## data = dataset4, weights = cdweight)
##
## Weighted, Standard error type: HC2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.545482 0.102278 83.55112 0.000e+00
## maritalMarried 0.710073 0.092718 7.65839 2.174e-14
## maritalDivorced -0.558290 0.177113 -3.15217 1.628e-03
## parenthoodHave kids -0.005611 0.115252 -0.04869 9.612e-01
## sex_gen2 Female 0.012087 0.061776 0.19565 8.449e-01
## age -0.044742 0.004603 -9.71935 3.599e-22
## parenthoodHave kids:sex_gen2 Female 0.091956 0.110740 0.83038 4.064e-01
## CI Lower CI Upper DF
## (Intercept) 8.34498 8.74598 6147
## maritalMarried 0.52831 0.89183 6147
## maritalDivorced -0.90549 -0.21109 6147
## parenthoodHave kids -0.23155 0.22032 6147
## sex_gen2 Female -0.10902 0.13319 6147
## age -0.05377 -0.03572 6147
## parenthoodHave kids:sex_gen2 Female -0.12513 0.30905 6147
##
## Multiple R-squared: 0.04658 , Adjusted R-squared: 0.04565
## F-statistic: 29.35 on 6 and 6147 DF, p-value: < 2.2e-16
texreg::screenreg(list(model3),
custom.model.names="OLS", #change the model name
custom.coef.names=c( #change the variable name
"Intercept",
"Married",
"Divorced",
"Have kids",
"Female",
"Age",
"Have kids X Female"
),
include.ci = FALSE, digits = 3)
##
## ================================
## OLS
## --------------------------------
## Intercept 8.545 ***
## (0.102)
## Married 0.710 ***
## (0.093)
## Divorced -0.558 **
## (0.177)
## Have kids -0.006
## (0.115)
## Female 0.012
## (0.062)
## Age -0.045 ***
## (0.005)
## Have kids X Female 0.092
## (0.111)
## --------------------------------
## R^2 0.047
## Adj. R^2 0.046
## Num. obs. 6154
## RMSE 1.758
## ================================
## *** p < 0.001; ** p < 0.01; * p < 0.05
#Interpretation: A male respondent who has kids has lower life satisfaction than a male respondents who has no kids by about 0.006. This association is reversed for female. A female respondent who has kids has higher life satisfaction than a female respondents who has no kids by about (-0.006+0.092). Nevertheless, the interaction is not signicant at 5%.
No. 4
Question
- Model4: Add an interaction between age and gender to Model
2
- Does the relationship between age and life satisfaction vary
across gender?
- How to interpret the interaction
Answer
model4 <- lm_robust(data = dataset4,
formula = sat6 ~ marital + parenthood + age*sex_gen,
weights = cdweight)
summary(model4)
##
## Call:
## lm_robust(formula = sat6 ~ marital + parenthood + age * sex_gen,
## data = dataset4, weights = cdweight)
##
## Weighted, Standard error type: HC2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|) CI Lower CI Upper
## (Intercept) 8.71844 0.124333 70.1218 0.000e+00 8.474707 8.96218
## maritalMarried 0.70530 0.092665 7.6113 3.122e-14 0.523644 0.88696
## maritalDivorced -0.56368 0.177008 -3.1845 1.457e-03 -0.910673 -0.21668
## parenthoodHave kids 0.03547 0.093082 0.3811 7.032e-01 -0.147000 0.21795
## age -0.05159 0.005477 -9.4208 6.195e-21 -0.062330 -0.04086
## sex_gen2 Female -0.35978 0.151285 -2.3782 1.743e-02 -0.656356 -0.06321
## age:sex_gen2 Female 0.01500 0.005912 2.5377 1.118e-02 0.003414 0.02659
## DF
## (Intercept) 6147
## maritalMarried 6147
## maritalDivorced 6147
## parenthoodHave kids 6147
## age 6147
## sex_gen2 Female 6147
## age:sex_gen2 Female 6147
##
## Multiple R-squared: 0.04761 , Adjusted R-squared: 0.04668
## F-statistic: 29.57 on 6 and 6147 DF, p-value: < 2.2e-16
texreg::screenreg(list(model4),
custom.model.names="OLS", #change the model name
custom.coef.names=c( #change the variable name
"Intercept",
"Married",
"Divorced",
"Have kids",
"Age",
"Female",
"Age X Female"
),
include.ci = FALSE, digits = 3)
##
## ==========================
## OLS
## --------------------------
## Intercept 8.718 ***
## (0.124)
## Married 0.705 ***
## (0.093)
## Divorced -0.564 **
## (0.177)
## Have kids 0.035
## (0.093)
## Age -0.052 ***
## (0.005)
## Female -0.360 *
## (0.151)
## Age X Female 0.015 *
## (0.006)
## --------------------------
## R^2 0.048
## Adj. R^2 0.047
## Num. obs. 6154
## RMSE 1.757
## ==========================
## *** p < 0.001; ** p < 0.01; * p < 0.05
#Interpretation: The relationship between age and life satisfaction varies across gender.
#As the age of the male respondent increases by 1 year, the life satisfaction decreases by 0.052.
#For females, such negative association is weaker. As the age of the female respondent increases by 1 year, the life satisfaction decreases by (-0.052+0.015).