Use pairfam wave1 dataset, please check the following variables in the questionnaire and codebook:
sat1i1
val1i7
1.1 Use pairfam wave1 dataset, compile a dataset consisting of variables stated in Warming up: 1) age, 2) sex_gen, 3) sat6, 4) sat1i1, 5) relstat, 6) nkidsbio, 7) cohort, 8) val1i7
1.2 Check the variables to see any of them having invalid answers.
1.3 Drop observations with missing values.
1.4 What is the original sample size and what is the sample size after cleaning?
library(tidyverse) # Add the tidyverse package to my current library.
library(haven) # Import data.
library(janitor) #cleaning data
library(estimatr) # Allows us to estimate (cluster-)robust standard errors.
library(texreg) # Allows us to make nicely-formatted Html & Latex regression tables.
pairfam <- read_dta("anchor1_50percent_Eng.dta")
dataset1 <- select(pairfam,
age,
sex_gen,
sat6,
sat1i1,
relstat,
nkidsbio,
cohort,
val1i7)
tabyl(dataset1,age)
## age n percent
## 14 41 0.006611837
## 15 708 0.114175133
## 16 722 0.116432833
## 17 667 0.107563296
## 18 35 0.005644251
## 24 24 0.003870343
## 25 577 0.093049508
## 26 678 0.109337204
## 27 647 0.104338010
## 28 87 0.014029995
## 34 22 0.003547815
## 35 502 0.080954685
## 36 618 0.099661345
## 37 772 0.124496049
## 38 101 0.016287696
tabyl(dataset1,sex_gen)
## sex_gen n percent
## 1 3029 0.4884696
## 2 3172 0.5115304
tabyl(dataset1,sat6) # 5 cases with invalid answer
## sat6 n percent
## -2 2 0.0003225286
## -1 3 0.0004837929
## 0 26 0.0041928721
## 1 18 0.0029027576
## 2 45 0.0072568940
## 3 110 0.0177390743
## 4 133 0.0214481535
## 5 395 0.0636994033
## 6 508 0.0819222706
## 7 1178 0.1899693598
## 8 1877 0.3026931140
## 9 1157 0.1865828092
## 10 749 0.1207869698
tabyl(dataset1,sat1i1) #23 cases with invalid answer
## sat1i1 n percent
## -2 13 0.002096436
## -1 10 0.001612643
## 0 103 0.016610224
## 1 50 0.008063216
## 2 89 0.014352524
## 3 208 0.033542977
## 4 219 0.035316884
## 5 593 0.095629737
## 6 494 0.079664570
## 7 1154 0.186099016
## 8 1490 0.240283825
## 9 773 0.124657313
## 10 1005 0.162070634
tabyl(dataset1,relstat) #34 cases with invalid answer
## relstat n percent
## -7 34 0.0054829866
## 1 2448 0.3947750363
## 2 1012 0.1631994840
## 3 660 0.1064344461
## 4 1735 0.2797935817
## 5 23 0.0037090792
## 6 146 0.0235445896
## 7 63 0.0101596517
## 8 76 0.0122560877
## 9 3 0.0004837929
## 10 1 0.0001612643
#or
tabyl(as_factor(dataset1$relstat))
## as_factor(dataset1$relstat) n percent
## -7 Incomplete data 34 0.0054829866
## 1 Never married single 2448 0.3947750363
## 2 Never married LAT 1012 0.1631994840
## 3 Never married COHAB 660 0.1064344461
## 4 Married COHAB 1735 0.2797935817
## 5 Married noncohabiting 23 0.0037090792
## 6 Divorced/separated single 146 0.0235445896
## 7 Divorced/separated LAT 63 0.0101596517
## 8 Divorced/separated COHAB 76 0.0122560877
## 9 Widowed single 3 0.0004837929
## 10 Widowed LAT 1 0.0001612643
## 11 Widowed COHAB 0 0.0000000000
tabyl(dataset1,nkidsbio) #4 cases with invalid answer
## nkidsbio n percent
## -7 4 0.0006450572
## 0 4136 0.6669891953
## 1 839 0.1353007579
## 2 868 0.1399774230
## 3 277 0.0446702145
## 4 54 0.0087082729
## 5 17 0.0027414933
## 6 4 0.0006450572
## 10 2 0.0003225286
tabyl(dataset1,cohort)
## cohort n percent
## 1 2173 0.3504274
## 2 2013 0.3246251
## 3 2015 0.3249476
tabyl(as_factor(dataset1$val1i7)) #63cases with invalid answer
## as_factor(dataset1$val1i7) n percent
## -5 Inconsistent value 0 0.000000000
## -4 Filter error / Incorrect entry 0 0.000000000
## -3 Does not apply 0 0.000000000
## -2 No answer 10 0.001612643
## -1 Don't know 53 0.008547009
## 1 Disagree completely 913 0.147234317
## 2 800 0.129011450
## 3 1328 0.214159007
## 4 1152 0.185776488
## 5 Agree completely 1945 0.313659087
dataset2 <- dataset1 %>%
transmute(
age,
sex=as_factor(sex_gen) %>% fct_drop(),
#treat sex as a categorical, and drop unused levels
sat6=case_when(sat6<0 ~ as.numeric(NA),
TRUE ~ as.numeric(sat6)),
#specify when sat should be considered missing
fam_sat=case_when(sat1i1<0 ~ as.numeric(NA),
TRUE ~ as.numeric(sat1i1)),
#specify when fam_sat should be considered missing
relstat=as_factor(relstat), #treat relationship status as categorical
relstat1=case_when(relstat=="-7 Incomplete data" ~ as.character(NA),
TRUE ~ as.character(relstat)) %>%
#specify when relstat1 should be missing
as_factor() %>% fct_drop(),#make relstat1 as a factor again & drop unused levels
nkidsbio=case_when(nkidsbio<0 ~ as.numeric(NA),
TRUE ~ as.numeric(nkidsbio)),
#specify when nkidsbio should be missing
cohort=as_factor(cohort) %>% fct_drop(),
#treat sex as a categorical, and drop unused levels
mar_att=as_factor(val1i7),
mar_att1=case_when(mar_att %in% c("-2 No answer","-1 Don't know") ~ as.character(NA),
TRUE ~ as.character(mar_att)%>%
#specify when val1i5 should be missing
as_factor() %>% fct_drop()#make mar_att1 as a factor again & drop unused levels
)
) %>%
drop_na() #remove all observations that are missing
count(dataset1)
## # A tibble: 1 × 1
## n
## <int>
## 1 6201
count(dataset2)
## # A tibble: 1 × 1
## n
## <int>
## 1 6074
original sample size is 6201; the clean sampled size is 6074.
Generate a new variable “marital” for relationship status with categories of “Never married”, “Married”, “Divorced”(including separated), “Widowed”, regardless of whether individuals are cohabiting, noncohabiting, LAT.
Generate a new variable “parenthood” based on nkidsbio with categories of “Have kids”, “No kids”.
tabyl(dataset2,relstat1)#check the distribution first.
## relstat1 n percent
## 1 Never married single 2408 0.3964438591
## 4 Married COHAB 1705 0.2807046427
## 2 Never married LAT 1001 0.1648007903
## 3 Never married COHAB 654 0.1076720448
## 8 Divorced/separated COHAB 75 0.0123477116
## 9 Widowed single 3 0.0004939085
## 6 Divorced/separated single 142 0.0233783339
## 7 Divorced/separated LAT 62 0.0102074416
## 5 Married noncohabiting 23 0.0037866315
## 10 Widowed LAT 1 0.0001646362
dataset3 <- dataset2 %>%
mutate(
marital=case_when(
relstat1 %in% c("1 Never married single",
"2 Never married LAT",
"3 Never married COHAB") ~ "Nevermarried",
# when relstat1 has any of the three situations, I assign "Nevermarried" to new variable "marital"
relstat1 %in% c("4 Married COHAB",
"5 Married noncohabiting") ~ 'Married',
# when relstat1 has any of the two situations, I assign "Married" to new variable "marital
relstat1 %in% c("6 Divorced/separated single",
"7 Divorced/separated LAT",
"8 Divorced/separated COHAB") ~ 'Divorced',
# when relstat1 has any of the three situations, I assign "Divorced" to new variable "marital"
relstat1 %in% c("9 Widowed single","10 Widowed LAT") ~ 'Widowed'
# when relstat1 has any of the two situations, I assign "Widow" to new variable "marital"
) %>% as_factor(),#make marital a categorical variable
parenthood=case_when(
nkidsbio>0 ~ "Have kids",#when nkidsbio should be "Have kids"
nkidsbio==0 ~ "No kids" #when nkidsbio should be "No kids"
) %>%as_factor() #make parenthood a categorical variable
)
Estimate the average life satisfaction and average satifaction with family for the married and the divorced individuals.
dataset4 <- dataset3%>%
filter(marital!= "Widowed" & marital!= "Nevermarried")
#drop the widowed and never-married
mar_sat <- dataset4 %>%
group_by(marital) %>%
summarise(
mean_sat6=mean(sat6), #calculate the mean of sat6 by marital1
mean_famsat=mean(fam_sat) )
mar_sat
## # A tibble: 2 × 3
## marital mean_sat6 mean_famsat
## <fct> <dbl> <dbl>
## 1 Married 7.80 7.20
## 2 Divorced 6.67 6.54
# average life satisfaction for the divorced individuals is 6.676.
Generate a table to show the distribution of attitudes toward marriage for the married and the divorced individuals.
Then use a bar chart to show the distributions of attitudes the married and the divorced individuals.
dataset4$marital <- fct_drop(dataset4$marital) #drop unused levels
distribute_table<- tabyl(dataset4, mar_att1, marital) %>%
adorn_totals("row") %>%#add row %
adorn_percentages("col") %>% #add col
adorn_pct_formatting() #format the percentage
distribute_table
## mar_att1 Married Divorced
## 1 Disagree completely 12.2% 40.1%
## 2 12.0% 15.8%
## 3 20.9% 22.6%
## 4 16.9% 9.3%
## 5 Agree completely 38.1% 12.2%
## Total 100.0% 100.0%
plot1<- ggplot(data = dataset4,
mapping=aes(x = mar_att1,
y = ..prop..,# ask R to show % rather than absolute count
group = marital,#calculate the % of sat6 within each group of relstat_new
fill = marital)#color the bars different by relstat_new
)+
geom_bar()+
facet_wrap(~ marital)+
coord_flip()+ #swap the coordinate system to make a horizontal bar, so that the labels are clear
labs(title="Attitude: Marriage is a lifelong union that should not be broken")
plot1
I want to use OLS regression to examine whether the divorced people have lower life satisfaction than the married ones. Please use standard error robust OLS to do the modelling
5.1 Model1: regress life satisfaction on age and marital status.
5.2 What is your interpretation the coefficient of age and marital?
5.3 output the result to a html file.
regression1 <- lm_robust(data = dataset4,
formula = sat6 ~ age + marital )
summary(regression1)
##
## Call:
## lm_robust(formula = sat6 ~ age + marital, data = dataset4)
##
## Standard error type: HC2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|) CI Lower CI Upper DF
## (Intercept) 7.93862 0.298652 26.582 1.306e-133 7.35292 8.52432 2004
## age -0.00409 0.008795 -0.465 6.420e-01 -0.02134 0.01316 2004
## maritalDivorced -1.12413 0.134097 -8.383 9.587e-17 -1.38711 -0.86114 2004
##
## Multiple R-squared: 0.04766 , Adjusted R-squared: 0.04671
## F-statistic: 35.57 on 2 and 2004 DF, p-value: 6.616e-16
Interpreting the age coefficient: When the marital status is controlled, with one year increase in age, the life satisfaction decreased by 0.00409. But this age effect is not statistically significant at 5%.
Interpreting the marital coefficient: When the age is controlled, a divorced individuals has lower life satisfaction than a married individual by amount of 1.124 score. This effect of marital status is statistically significant.
# I export it into a html file. You can do whatever you want, say, word, or excel.
texreg::htmlreg(regression1,
include.ci = FALSE,
file = "output1.html") #html
## The table was written to the file 'output1.html'.
6.1 Please change the reference category of marital, using “divorced” as the reference in the regression conducted in Question 5.
6.2 Add a new variable, “parenthood” to your model in Question 6.1. Is there any change in the coefficents of age and marital?
dataset4$marital <- fct_relevel(dataset4$marital, "Divorced")
regression1a <- lm_robust(data = dataset4,
formula = sat6 ~ age + marital )
summary(regression1a)
##
## Call:
## lm_robust(formula = sat6 ~ age + marital, data = dataset4)
##
## Standard error type: HC2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|) CI Lower CI Upper DF
## (Intercept) 6.81450 0.329093 20.707 1.794e-86 6.16910 7.45990 2004
## age -0.00409 0.008795 -0.465 6.420e-01 -0.02134 0.01316 2004
## maritalMarried 1.12413 0.134097 8.383 9.587e-17 0.86114 1.38711 2004
##
## Multiple R-squared: 0.04766 , Adjusted R-squared: 0.04671
## F-statistic: 35.57 on 2 and 2004 DF, p-value: 6.616e-16
regression2 <- lm_robust(data = dataset4,
formula = sat6 ~ age + marital + parenthood )
summary(regression2)
##
## Call:
## lm_robust(formula = sat6 ~ age + marital + parenthood, data = dataset4)
##
## Standard error type: HC2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|) CI Lower CI Upper
## (Intercept) 6.814169 0.329195 20.6995 2.057e-86 6.16857 7.45977
## age -0.005173 0.009375 -0.5518 5.811e-01 -0.02356 0.01321
## maritalMarried 1.121696 0.134090 8.3653 1.109e-16 0.85873 1.38467
## parenthoodHave kids 0.047014 0.112394 0.4183 6.758e-01 -0.17341 0.26744
## DF
## (Intercept) 2003
## age 2003
## maritalMarried 2003
## parenthoodHave kids 2003
##
## Multiple R-squared: 0.04775 , Adjusted R-squared: 0.04632
## F-statistic: 23.69 on 3 and 2003 DF, p-value: 4.569e-15
The coefficient of “married” gets slightly smaller. The coefficient of age gets slightly more negative.
Using standard error robust OLS to do the following models
7.1 Model1: regress life satisfaction with family on age, marital status, parenthood status, and sex. Using “Married” as the reference group.
7.2 Model2: based on Model 1, add an interaction between marital status and sex.
7.3 Export results of Model 1 and Model 2 to a html file and rename the names of coefficients in a clearer way to understand. Does the association between marital status and family life satisfaction differ by gender?
dataset4$marital <- fct_relevel(dataset4$marital, "Married")
model1 <- lm_robust(data = dataset4,
formula = fam_sat ~ age + marital + parenthood + sex )
summary(model1 )
##
## Call:
## lm_robust(formula = fam_sat ~ age + marital + parenthood + sex,
## data = dataset4)
##
## Standard error type: HC2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|) CI Lower CI Upper
## (Intercept) 5.89003 0.45459 12.957 6.284e-37 4.9985 6.78155
## age 0.05499 0.01335 4.118 3.981e-05 0.0288 0.08118
## maritalDivorced -0.70494 0.16974 -4.153 3.420e-05 -1.0378 -0.37205
## parenthoodHave kids -0.50482 0.14393 -3.507 4.625e-04 -0.7871 -0.22255
## sex2 Female -0.19946 0.10979 -1.817 6.941e-02 -0.4148 0.01586
## DF
## (Intercept) 2002
## age 2002
## maritalDivorced 2002
## parenthoodHave kids 2002
## sex2 Female 2002
##
## Multiple R-squared: 0.02361 , Adjusted R-squared: 0.02166
## F-statistic: 11.49 on 4 and 2002 DF, p-value: 3.208e-09
model2 <- lm_robust(data = dataset4,
formula = fam_sat ~ age + marital*sex + parenthood )
summary(model2)
##
## Call:
## lm_robust(formula = fam_sat ~ age + marital * sex + parenthood,
## data = dataset4)
##
## Standard error type: HC2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|) CI Lower
## (Intercept) 5.85184 0.45541 12.8497 2.271e-36 4.95872
## age 0.05505 0.01336 4.1201 3.941e-05 0.02885
## maritalDivorced -0.48264 0.27038 -1.7850 7.441e-02 -1.01290
## sex2 Female -0.15543 0.11584 -1.3417 1.798e-01 -0.38261
## parenthoodHave kids -0.49235 0.14393 -3.4208 6.368e-04 -0.77460
## maritalDivorced:sex2 Female -0.34132 0.34621 -0.9859 3.243e-01 -1.02030
## CI Upper DF
## (Intercept) 6.74497 2001
## age 0.08125 2001
## maritalDivorced 0.04762 2001
## sex2 Female 0.07176 2001
## parenthoodHave kids -0.21009 2001
## maritalDivorced:sex2 Female 0.33765 2001
##
## Multiple R-squared: 0.02414 , Adjusted R-squared: 0.0217
## F-statistic: 9.218 on 5 and 2001 DF, p-value: 1.096e-08
texreg::htmlreg(
list(model1, model2),
custom.coef.names = c("Intercept",
"Age",
"Divorced (Ref.=Married)",
"Have kids (Ref.=no kid)",
"Female (Ref.=Male)",
"Divorced x Female"
),
include.ci = FALSE, digits = 3,
file = "models_rename.html") #html
## The table was written to the file 'models_rename.html'.
As the interaction between marital status and sex is not significant at 5% significance level, I can say the association between marital status and family life satisfaction does not differ by gender.
Using standard error robust OLS to do the following models
Add an interaction between marital status and and age in Model 1 (Question 7.1). Does the association between age and family life satisfaction differ by marital status?
model3 <- lm_robust(data = dataset4,
formula = fam_sat ~ marital*age + sex + parenthood )
summary(model3)
##
## Call:
## lm_robust(formula = fam_sat ~ marital * age + sex + parenthood,
## data = dataset4)
##
## Standard error type: HC2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|) CI Lower CI Upper
## (Intercept) 5.72962 0.46763 12.2524 2.472e-33 4.81253 6.64672
## maritalDivorced 0.69487 1.59051 0.4369 6.622e-01 -2.42437 3.81410
## age 0.05977 0.01391 4.2968 1.816e-05 0.03249 0.08706
## sex2 Female -0.19644 0.10977 -1.7896 7.367e-02 -0.41171 0.01883
## parenthoodHave kids -0.50691 0.14445 -3.5093 4.593e-04 -0.79020 -0.22363
## maritalDivorced:age -0.04083 0.04574 -0.8927 3.721e-01 -0.13053 0.04887
## DF
## (Intercept) 2001
## maritalDivorced 2001
## age 2001
## sex2 Female 2001
## parenthoodHave kids 2001
## maritalDivorced:age 2001
##
## Multiple R-squared: 0.02418 , Adjusted R-squared: 0.02174
## F-statistic: 9.498 on 5 and 2001 DF, p-value: 5.761e-09
texreg::htmlreg(list(model3),
custom.coef.names = c("Intercept",
"Divorced (Ref.=Married)",
"Age",
"Female (Ref.=Male)",
"Have kids (Ref.=no kid)",
"Divorced x Age"
),
custom.model.names=c("Model3_age inter"),
include.ci = FALSE, digits = 3,
file = "models3.html") #html
## The table was written to the file 'models3.html'.
As the interaction between marital status and age is not significant at 5% significance level, I can say the association between age and family life satisfaction does not differ by marital status.