# center age since it is a large continuous interaction term
# so the regression coefficients can be interpreted more intuitively
ess_sub$agea_centered <- ess_sub$agea - min(ess_sub$agea)
ess_sub_1$age_centered <- ess_sub_1$age - min(ess_sub_1$age)
# age_centered is essentially "years after 18"
ess_sub$gndr <- ess_sub$gndr - min(ess_sub$gndr)
ess_sub_1$gender <- ess_sub_1$gender - min(ess_sub_1$gender)
# gender was changed to 0 = male and 1 = female, by the same logic
# initial analysis of the educational returns of people engaged in different marriages
# create subsets based on type of marriage
edu_homog_data <- ess_sub %>% filter(edu_homo == 1)
occu_homog_data <- ess_sub %>% filter(occu_homo == 1)
age_homog_data <- ess_sub %>% filter(age_homo == 1)
all_homog_data <- ess_sub %>% filter(all_homo == 1)
non_homog_data <- ess_sub %>% filter(edu_homo == 0 & occu_homo == 0 & age_homo == 0)
# get mean level of income for each education level separately for different types of marriages
edu_homog_mean <- edu_homog_data %>%
group_by(eisced) %>%
summarize(mean_income = mean(hinctnta))
occu_homog_mean <- occu_homog_data %>%
group_by(eisced) %>%
summarize(mean_income = mean(hinctnta))
age_homog_mean <- age_homog_data %>%
group_by(eisced) %>%
summarize(mean_income = mean(hinctnta))
all_homog_mean <- all_homog_data %>%
group_by(eisced) %>%
summarize(mean_income = mean(hinctnta))
non_homog_mean <- non_homog_data %>%
group_by(eisced) %>%
summarize(mean_income = mean(hinctnta))
edu_returns_plot <- ggplot() + geom_line(data = edu_homog_mean, aes(x = eisced,
y = mean_income), color = 'red') + geom_point(data = edu_homog_mean, aes(x = eisced,
y = mean_income), color = 'red') + geom_line(data = occu_homog_mean, aes(x = eisced,
y = mean_income), color = 'blue') + geom_point(data = occu_homog_mean, aes(x = eisced,
y = mean_income), color = 'blue') + geom_line(data = all_homog_mean, aes(x = eisced,
y = mean_income), color = 'purple') + geom_point(data = all_homog_mean, aes(x = eisced,
y = mean_income), color = 'purple') + geom_line(data = non_homog_mean, aes(x = eisced,
y = mean_income), color = 'black') + geom_point(data = non_homog_mean, aes(x = eisced,
y = mean_income), color = 'black') + geom_line(data = age_homog_mean, aes(x = eisced,
y = mean_income), color = 'green') + geom_point(data = age_homog_mean, aes(x = eisced,
y = mean_income), color = 'green') + labs(x = "Highest Level of Education ES - ISCED",
y = "Total Household Income Decile") + scale_x_continuous(breaks = c(0:8)) +
theme(text = element_text(family = "Times New Roman")) +
annotate("text", x = 6, y = 6, label = "Educational Homogamy", color = 'red') +
annotate("text", x = 6, y = 5.8, label = "Occupational Homogamy", color = 'blue') +
annotate("text", x = 6, y = 5.6, label = "Age Homogamy", color = 'green') +
annotate("text", x = 6, y = 5.4, label = "All", color = 'purple') +
annotate("text", x = 6, y = 5.2, label = "None", color = 'black')
ggsave("homogamy compare graph.png", plot = edu_returns_plot, width = 9, height = 6)
print(edu_returns_plot)
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)):
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)):
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
# time trend of general returns to education
# split dataset by years and find regression coefficients
# to identify the returns to education in each year
ess_2010 <- ess_sub %>% filter(year == 2010)
ess_2012 <- ess_sub %>% filter(year == 2012)
ess_2014 <- ess_sub %>% filter(year == 2014)
ess_2016 <- ess_sub %>% filter(year == 2016)
ess_2018 <- ess_sub %>% filter(year == 2018)
ess_2020 <- ess_sub %>% filter(year == 2020)
summary(lm(hinctnta ~ eisced, data = ess_2010))
##
## Call:
## lm(formula = hinctnta ~ eisced, data = ess_2010)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9489 -1.4042 0.3274 1.5958 4.3274
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.21737 0.21421 24.356 <2e-16 ***
## eisced 0.45526 0.04691 9.704 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.154 on 586 degrees of freedom
## Multiple R-squared: 0.1384, Adjusted R-squared: 0.137
## F-statistic: 94.17 on 1 and 586 DF, p-value: < 2.2e-16
summary(lm(hinctnta ~ eisced, data = ess_2012))
##
## Call:
## lm(formula = hinctnta ~ eisced, data = ess_2012)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.0455 -1.2144 0.4967 1.4967 4.2434
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.29878 0.26330 20.125 < 2e-16 ***
## eisced 0.45779 0.05551 8.247 1.5e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.293 on 491 degrees of freedom
## Multiple R-squared: 0.1217, Adjusted R-squared: 0.1199
## F-statistic: 68.01 on 1 and 491 DF, p-value: 1.502e-15
summary(lm(hinctnta ~ eisced, data = ess_2014))
##
## Call:
## lm(formula = hinctnta ~ eisced, data = ess_2014)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.2200 -1.3601 0.6866 1.7800 4.6399
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.88351 0.26560 18.387 <2e-16 ***
## eisced 0.47664 0.05354 8.902 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.303 on 551 degrees of freedom
## Multiple R-squared: 0.1257, Adjusted R-squared: 0.1242
## F-statistic: 79.25 on 1 and 551 DF, p-value: < 2.2e-16
summary(lm(hinctnta ~ eisced, data = ess_2016))
##
## Call:
## lm(formula = hinctnta ~ eisced, data = ess_2016)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9749 -1.5809 0.2069 1.8130 4.3887
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.21741 0.26663 19.568 < 2e-16 ***
## eisced 0.39392 0.05303 7.428 5.16e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.191 on 475 degrees of freedom
## Multiple R-squared: 0.1041, Adjusted R-squared: 0.1022
## F-statistic: 55.18 on 1 and 475 DF, p-value: 5.156e-13
summary(lm(hinctnta ~ eisced, data = ess_2018))
##
## Call:
## lm(formula = hinctnta ~ eisced, data = ess_2018)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.8864 -1.2994 0.2876 1.8746 4.3526
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.23442 0.25893 20.215 < 2e-16 ***
## eisced 0.41300 0.05006 8.251 9.82e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.203 on 607 degrees of freedom
## Multiple R-squared: 0.1008, Adjusted R-squared: 0.09935
## F-statistic: 68.07 on 1 and 607 DF, p-value: 9.825e-16
summary(lm(hinctnta ~ eisced, data = ess_2020))
##
## Call:
## lm(formula = hinctnta ~ eisced, data = ess_2020)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.7268 -1.3917 0.2786 1.9381 3.9488
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.71606 0.40124 14.246 < 2e-16 ***
## eisced 0.33512 0.07569 4.427 1.4e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.2 on 261 degrees of freedom
## Multiple R-squared: 0.06985, Adjusted R-squared: 0.06629
## F-statistic: 19.6 on 1 and 261 DF, p-value: 1.404e-05
# record the coefficients and corresponding year to make a graph
returns <- c(0.4553, 0.4578, 0.4766, 0.3940, 0.4130, 0.3351)
years <- c(2010, 2012, 2014, 2016, 2018, 2020)
# make the plot and save it as image
# r runs into an error with times new roman, I don't know why. It works
# for the plot saved with ggsave though
returns_time <- ggplot(data = NULL, aes(x = years, y = returns)) +
geom_line() + geom_point() + labs(x = "Year", y = "Returns to Education") +
scale_x_continuous(breaks = c(2010, 2012, 2014, 2016, 2018, 2020)) +
theme(text = element_text(family = "Times New Roman")) +
geom_text(aes(label = returns), vjust = -1)
ggsave("outcome temporal graph.png", plot = returns_time, width = 9, height = 6)
print(returns_time)
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
# regressions to get returns to education of different marriages
# order: educational, occupational, all, none
# returns to education for each group = regression coefficient for eisced
model_edu <- lm(hinctnta ~ eisced + agea_centered + gndr + health
+ eisced:agea_centered + eisced:gndr, data = edu_homog_data)
model_occu <- lm(hinctnta ~ eisced + agea_centered + gndr + health
+ eisced:agea_centered + eisced:gndr, data = occu_homog_data)
model_age <- lm(hinctnta ~ eisced + agea_centered + gndr + health
+ eisced:agea_centered + eisced:gndr, data = age_homog_data)
model_all <- lm(hinctnta ~ eisced + agea_centered + gndr + health
+ eisced:agea_centered + eisced:gndr, data = all_homog_data)
model_none <- lm(hinctnta ~ eisced + agea_centered + gndr + health
+ eisced:agea_centered + eisced:gndr, data = non_homog_data)
model_base <- lm(household_income ~ highest_edu + age_centered + gender + health
+ highest_edu:age_centered + highest_edu:gender, data = ess_sub_1)
# assumptions test on the base model
# Normal Distribution of Errors
qqnorm(residuals(model_base))
qqline(residuals(model_base))
# Full Rank / No Perfect Multicollinearity
print(vif(model_base, type = 'predictor'))
## GVIFs computed for predictors
## GVIF Df GVIF^(1/(2*Df)) Interacts With
## highest_edu 1.046148 5 1.004522 age_centered, gender
## age_centered 2.353206 3 1.153303 highest_edu
## gender 6.618458 3 1.370228 highest_edu
## health 1.046148 1 1.022814 --
## Other Predictors
## highest_edu health
## age_centered gender, health
## gender age_centered, health
## health highest_edu, age_centered, gender
# Zero Expected Error
print(mean(residuals(model_base)))
## [1] -1.466815e-16
# verify the significance of educational returns in general by doing the
# 4 step infer analysis
test_stat <- ess_sub_1 %>%
specify(explanatory = highest_edu,
response = household_income) %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "slope")
observed_slope <- coef(model_base)[["highest_edu"]]
p_value <- test_stat %>%
get_p_value(obs_stat = observed_slope, direction = "two-sided")
## Warning: Please be cautious in reporting a p-value of 0. This result is an
## approximation based on the number of `reps` chosen in the `generate()` step.
## See `?get_p_value()` for more information.
# p value = 0
infer_plot <- ggplot(test_stat, aes(x = stat)) +
geom_histogram(binwidth = .025, color = "black", fill = "white") +
geom_vline(xintercept = observed_slope, color = "red", size = 1) +
theme_minimal() +
theme( plot.background = element_rect(fill = "white", colour = NA))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# does not overlap
# model summary
# 2 other models to compare: model with only education, model with
# additional interaction term
model_general <- lm(household_income ~ highest_edu, data = ess_sub_1)
model_no_inter <- lm(household_income ~ highest_edu + age_centered + gender
+ health, data = ess_sub_1)
modelsummary(
list(model_general, model_base, model_no_inter),
fmt = 1,
estimate = c( "{estimate} ({std.error}){stars}",
"{estimate} ({std.error}){stars}",
"{estimate} ({std.error}){stars}"),
statistic = NULL,
coef_omit = "Intercept")
(1)
(2)
(3)
highest_edu
0.4 (0.0)***
0.3 (0.1)***
0.4 (0.0)***
age_centered
0.0 (0.0)
0.0 (0.0)***
gender
-0.7 (0.2)**
-0.3 (0.1)***
health
-0.4 (0.0)***
-0.4 (0.0)***
highest_edu × age_centered
0.0 (0.0)
highest_edu × gender
0.1 (0.0)+
Num.Obs.
2983
2983
2983
R2
0.113
0.140
0.139
R2 Adj.
0.112
0.138
0.138
AIC
13245.4
13160.9
13161.2
BIC
13263.4
13209.0
13197.2
Log.Lik.
-6619.681
-6572.473
-6574.599
RMSE
2.23
2.19
2.19
# now compare regression coefficients of different marriage types
# to answer the research question
# first identify coefficients of all regression models
summary(model_none)