1.1 Based on the pairfam wave 1 dataset, compile a dataset consisting of the following variables:
age, sex (sex_gen), life satisfaction (sat6), relationship status (relstat), number of children (nkidsbio)
1.2 Drop observations with missing values.
1.3 What is the original sample size and what is the sample size after cleaning?
library(tidyverse) # Add the tidyverse package to my current library.
library(haven) # Import data.
library(janitor) #cleaning data
library(estimatr) # Allows us to estimate (cluster-)robust standard errors.
library(texreg) # Allows us to make nicely-formatted Html & Latex regression tables.
# Load the pairfam wave 1 anchor data (Stata format).
dataset1 <- read_dta("anchor1_50percent_Eng.dta")
# Task 1.3: original sample size, computed from the data instead of being
# hard-coded in a comment.
nrow(dataset1)
# Build the analysis dataset with age, sex, number of children, life
# satisfaction and relationship status. Values flagged below are recoded to
# NA so that drop_na() removes those observations at the end.
dataset2 <- dataset1 %>%
  transmute(
    age,
    # Treat sex as categorical and drop unused levels.
    sex = as_factor(sex_gen) %>% fct_drop(),
    # Negative values of nkidsbio are treated as missing.
    nkidsbio = case_when(
      nkidsbio < 0 ~ NA_real_,
      TRUE ~ as.numeric(nkidsbio)
    ),
    # Negative values of sat6 are treated as missing.
    sat6 = case_when(
      sat6 < 0 ~ NA_real_,
      TRUE ~ as.numeric(sat6)
    ),
    # Treat relationship status as categorical.
    relstat = as_factor(relstat),
    # relstat1 is relstat with "-7 Incomplete data" set to missing, turned
    # back into a factor with unused levels dropped.
    relstat1 = case_when(
      relstat == "-7 Incomplete data" ~ NA_character_,
      TRUE ~ as.character(relstat)
    ) %>%
      as_factor() %>%
      fct_drop()
  ) %>%
  drop_na() # Remove every observation with a missing value in any column.
# Task 1.3: sample size after cleaning.
nrow(dataset2)
# The original sample size is 6201; the sample size after cleaning is 6158.
2.1 Generate a new variable “marital” for relationship status with categories of “Never married”, “Married”, “Separate/Divorced”, “Widowed”.
2.2 Generate a new variable “parenthood” based on nkidsbio with categories of “Have kids”, “No kids”.
# Inspect the distribution of the cleaned relationship-status categories
# before recoding them.
dataset2 %>% tabyl(relstat1)
## relstat1 n percent
## 1 Never married single 2445 0.3970444950
## 4 Married COHAB 1733 0.2814225398
## 2 Never married LAT 1011 0.1641766807
## 3 Never married COHAB 658 0.1068528743
## 8 Divorced/separated COHAB 76 0.0123416694
## 9 Widowed single 3 0.0004871712
## 6 Divorced/separated single 145 0.0235466060
## 7 Divorced/separated LAT 63 0.0102305943
## 5 Married noncohabiting 23 0.0037349789
## 10 Widowed LAT 1 0.0001623904
# Derive two categorical variables from the cleaned data:
#   marital    - relstat1 collapsed into four groups
#   parenthood - "Have kids" when nkidsbio is positive, "No kids" when zero
dataset3 <- mutate(
  dataset2,
  marital = as_factor(case_when(
    # The three never-married statuses form one group.
    relstat1 %in% c(
      "1 Never married single",
      "2 Never married LAT",
      "3 Never married COHAB"
    ) ~ "Nevermarried",
    # Both married statuses form one group.
    relstat1 %in% c(
      "4 Married COHAB",
      "5 Married noncohabiting"
    ) ~ "Married",
    # The three divorced/separated statuses form one group.
    relstat1 %in% c(
      "6 Divorced/separated single",
      "7 Divorced/separated LAT",
      "8 Divorced/separated COHAB"
    ) ~ "Divorced",
    # Both widowed statuses form one group.
    relstat1 %in% c(
      "9 Widowed single",
      "10 Widowed LAT"
    ) ~ "Widowed"
  )),
  parenthood = as_factor(case_when(
    nkidsbio > 0 ~ "Have kids",
    nkidsbio == 0 ~ "No kids"
  ))
)
3.1 Estimate the life satisfaction for the married and the divorced individuals
3.2 What is the average life satisfaction for the divorced individuals?
# Exclude the widowed and the never-married, leaving the married and the
# divorced for the comparison.
dataset4 <- filter(dataset3, marital != "Widowed", marital != "Nevermarried")
# Mean life satisfaction (sat6) within each remaining marital group.
mar_sat <- dataset4 %>%
  group_by(marital) %>%
  dplyr::summarise(mean_sat6 = mean(sat6))
mar_sat
## # A tibble: 2 × 2
## marital mean_sat6
## <fct> <dbl>
## 1 Married 7.80
## 2 Divorced 6.68
# average life satisfaction for the divorced individuals is 6.676.
I want to understand whether divorced people have lower life satisfaction than married people. Please use OLS with robust standard errors for the modelling.
4.1 Model1: regress life satisfaction on age and marital status
4.2 What is the effect of age on the life satisfaction?
4.3 What is the effect of divorce on the life satisfaction, when age is controlled?
4.4 output the result to a html file.
# Model 1: regress life satisfaction on age and marital status, estimated
# with lm_robust().
regression1 <- lm_robust(sat6 ~ age + marital, data = dataset4)
summary(regression1)
##
## Call:
## lm_robust(formula = sat6 ~ age + marital, data = dataset4)
##
## Standard error type: HC2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|) CI Lower CI Upper DF
## (Intercept) 7.896879 0.298478 26.4572 7.741e-133 7.31153 8.48223 2037
## age -0.002768 0.008792 -0.3148 7.529e-01 -0.02001 0.01447 2037
## maritalDivorced -1.125749 0.134957 -8.3416 1.333e-16 -1.39042 -0.86108 2037
##
## Multiple R-squared: 0.04751 , Adjusted R-squared: 0.04658
## F-statistic: 35.23 on 2 and 2037 DF, p-value: 9.053e-16
# Write the Model 1 table to an HTML file, with confidence intervals
# switched off (include.ci = FALSE).
texreg::htmlreg(
  regression1,
  file = "regression1.html",
  include.ci = FALSE
)
## The table was written to the file 'regression1.html'.
# 4.2: Controlling for marital status, each additional year of age is associated with a 0.0028-point decrease in life satisfaction, but this age effect is not statistically significant (p = 0.75).
# 4.3: Controlling for age, divorced individuals score 1.126 points lower on life satisfaction than married individuals. This effect of marital status is statistically significant (p < 0.001).
I want to understand whether divorced people have lower life satisfaction than married people. Please use OLS with robust standard errors for the modelling.
5.1 Please change the reference category of marital, using “divorced” as the reference.
5.2 Do a Model 2 where independent variables are age, marital, parenthood
# Move "Divorced" to the front of the marital levels so it becomes the
# reference category, then fit Model 2 with age, marital and parenthood.
dataset4 <- dataset4 %>%
  mutate(marital = fct_relevel(marital, "Divorced"))
regression2 <- lm_robust(sat6 ~ age + marital + parenthood, data = dataset4)
summary(regression2)
##
## Call:
## lm_robust(formula = sat6 ~ age + marital + parenthood, data = dataset4)
##
## Standard error type: HC2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|) CI Lower CI Upper
## (Intercept) 6.770486 0.33181 20.4048 2.353e-84 6.11976 7.42121
## age -0.003605 0.00939 -0.3839 7.011e-01 -0.02202 0.01481
## maritalMarried 1.123965 0.13503 8.3237 1.542e-16 0.85915 1.38878
## parenthoodHave kids 0.036610 0.11227 0.3261 7.444e-01 -0.18356 0.25678
## DF
## (Intercept) 2036
## age 2036
## maritalMarried 2036
## parenthoodHave kids 2036
##
## Multiple R-squared: 0.04757 , Adjusted R-squared: 0.04616
## F-statistic: 23.47 on 3 and 2036 DF, p-value: 6.2e-15
Using standard error robust OLS to do the following models
6.1 Model3: regress life satisfaction on age, marital status, parenthood status, and sex. Using “Married” as the reference group.
6.2 Model4: based on Model 3, add an interaction between marital status and sex.
6.3 Export the results of Model 3 and Model 4 to an HTML file and rename the coefficients.
6.4 Does the effect of marital status differ by gender?
# Move "Married" to the front of the marital levels so it becomes the
# reference category, then fit Model 3 with age, marital, parenthood and sex.
dataset4 <- dataset4 %>%
  mutate(marital = fct_relevel(marital, "Married"))
regression3 <- lm_robust(
  sat6 ~ age + marital + parenthood + sex,
  data = dataset4
)
summary(regression3)
##
## Call:
## lm_robust(formula = sat6 ~ age + marital + parenthood + sex,
## data = dataset4)
##
## Standard error type: HC2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|) CI Lower CI Upper
## (Intercept) 7.81240 0.31294 24.9646 3.183e-120 7.19868 8.42611
## age -0.00242 0.00951 -0.2545 7.991e-01 -0.02107 0.01623
## maritalDivorced -1.13145 0.13556 -8.3465 1.281e-16 -1.39729 -0.86560
## parenthoodHave kids 0.02751 0.11241 0.2447 8.067e-01 -0.19294 0.24796
## sex2 Female 0.08437 0.08063 1.0463 2.955e-01 -0.07376 0.24251
## DF
## (Intercept) 2035
## age 2035
## maritalDivorced 2035
## parenthoodHave kids 2035
## sex2 Female 2035
##
## Multiple R-squared: 0.04809 , Adjusted R-squared: 0.04622
## F-statistic: 17.72 on 4 and 2035 DF, p-value: 2.616e-14
# Model 4: same predictors as Model 3, plus an interaction between marital
# status and sex (marital * sex expands to both main effects and their
# product).
regression4 <- lm_robust(
  sat6 ~ age + marital * sex + parenthood,
  data = dataset4
)
summary(regression4)
##
## Call:
## lm_robust(formula = sat6 ~ age + marital * sex + parenthood,
## data = dataset4)
##
## Standard error type: HC2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|) CI Lower
## (Intercept) 7.825246 0.314066 24.9159 8.218e-120 7.20932
## age -0.002425 0.009513 -0.2549 7.989e-01 -0.02108
## maritalDivorced -1.209278 0.248957 -4.8574 1.280e-06 -1.69752
## sex2 Female 0.069171 0.082399 0.8395 4.013e-01 -0.09242
## parenthoodHave kids 0.023058 0.112595 0.2048 8.378e-01 -0.19775
## maritalDivorced:sex2 Female 0.118446 0.296005 0.4001 6.891e-01 -0.46206
## CI Upper DF
## (Intercept) 8.44117 2034
## age 0.01623 2034
## maritalDivorced -0.72104 2034
## sex2 Female 0.23077 2034
## parenthoodHave kids 0.24387 2034
## maritalDivorced:sex2 Female 0.69895 2034
##
## Multiple R-squared: 0.0482 , Adjusted R-squared: 0.04586
## F-statistic: 14.22 on 5 and 2034 DF, p-value: 1.065e-13
# Write Models 3 and 4 side by side to an HTML file, keeping the default
# coefficient names, with three decimal places and no confidence intervals.
texreg::htmlreg(
  list(regression3, regression4),
  file = "regression3&4.html",
  digits = 3,
  include.ci = FALSE
)
## The table was written to the file 'regression3&4.html'.
# Write Models 3 and 4 again, this time with readable coefficient labels.
# The six labels are listed in the order the coefficients appear: the five
# Model 3 terms first, then the interaction that only Model 4 contains.
texreg::htmlreg(
  list(regression3, regression4),
  file = "regression3&4_renamed.html",
  digits = 3,
  include.ci = FALSE,
  custom.coef.names = c(
    "Intercept",
    "Age",
    "Divorced (Ref.=Married)",
    "Have kids (Ref.=no kid)",
    "Female (Ref.=Male)",
    "Divorced x Female"
  )
)
## The table was written to the file 'regression3&4_renamed.html'.
# 6.4: The effect of marital status does not differ significantly by gender (interaction coefficient 0.118, p = 0.69).
Using standard error robust OLS to do the following models
7.1 Model5: regress life satisfaction on age, marital status, parenthood status, and sex. Add an interaction between marital status and age.
7.2 What is the difference between the age effect among the married and the age effect among the divorced?
# Model 5: life satisfaction on marital status, age, sex and parenthood,
# with an interaction between marital status and age (marital * age expands
# to both main effects and their product).
regression5 <- lm_robust(
  sat6 ~ marital * age + sex + parenthood,
  data = dataset4
)
summary(regression5)
##
## Call:
## lm_robust(formula = sat6 ~ marital * age + sex + parenthood,
## data = dataset4)
##
## Standard error type: HC2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|) CI Lower CI Upper
## (Intercept) 7.685949 0.319905 24.02570 1.720e-112 7.05857 8.31333
## maritalDivorced -0.036013 1.178056 -0.03057 9.756e-01 -2.34633 2.27431
## age 0.001368 0.009774 0.13991 8.887e-01 -0.01780 0.02054
## sex2 Female 0.086532 0.080706 1.07219 2.838e-01 -0.07174 0.24481
## parenthoodHave kids 0.025359 0.112806 0.22480 8.222e-01 -0.19587 0.24659
## maritalDivorced:age -0.031990 0.033903 -0.94357 3.455e-01 -0.09848 0.03450
## DF
## (Intercept) 2034
## maritalDivorced 2034
## age 2034
## sex2 Female 2034
## parenthoodHave kids 2034
## maritalDivorced:age 2034
##
## Multiple R-squared: 0.04874 , Adjusted R-squared: 0.0464
## F-statistic: 14.56 on 5 and 2034 DF, p-value: 4.757e-14
# Write the Model 5 table to an HTML file with readable coefficient labels,
# listed in the same order as the coefficients in the model summary.
texreg::htmlreg(
  list(regression5),
  file = "regression5.html",
  digits = 3,
  include.ci = FALSE,
  custom.coef.names = c(
    "Intercept",
    "Divorced (Ref.=Married)",
    "Age",
    "Female (Ref.=Male)",
    "Have kids (Ref.=no kid)",
    "Divorced x Age"
  )
)
## The table was written to the file 'regression5.html'.
# 7.2: The age effect is 0.001 per year among the married and 0.001 - 0.032 = -0.031 per year among the divorced, i.e. the age slope is 0.032 lower for the divorced. This difference is not statistically significant (p = 0.35).