data cleaning
# read the ESS File
ess <- read.fst("All-ESS-Data.fst")
# filter for gb
ess_gb <- ess %>% filter(cntry == "GB")
# subset with variables of interest and data cleaning
# split occupations by their general categories
ess_sub <- ess_gb %>% select(eisced, hinctnta, eiscedp, isco08, isco08p, agea,
gndr, health, iscoco, iscocop, essround)
ess_sub <- ess_sub %>% mutate(eisced = if_else(eisced %in% c(1:7), eisced, NA),
hinctnta = if_else(hinctnta %in% c(1:10), hinctnta, NA),
eiscedp = if_else(eiscedp %in% c(1:7), eiscedp, NA),
isco08 = case_when(isco08 %in% c(0:999) ~ 0,
isco08 %in% c(1000:1999) ~ 1,
isco08 %in% c(2000:2999) ~ 2,
isco08 %in% c(3000:3999) ~ 3,
isco08 %in% c(4000:4999) ~ 4,
isco08 %in% c(5000:5999) ~ 5,
isco08 %in% c(6000:6999) ~ 6,
isco08 %in% c(7000:7999) ~ 7,
isco08 %in% c(8000:8999) ~ 8,
isco08 %in% c(9000:9999) ~ 9),
isco08p = case_when(isco08p %in% c(0:999) ~ 0,
isco08p %in% c(1000:1999) ~ 1,
isco08p %in% c(2000:2999) ~ 2,
isco08p %in% c(3000:3999) ~ 3,
isco08p %in% c(4000:4999) ~ 4,
isco08p %in% c(5000:5999) ~ 5,
isco08p %in% c(6000:6999) ~ 6,
isco08p %in% c(7000:7999) ~ 7,
isco08p %in% c(8000:8999) ~ 8,
isco08p %in% c(9000:9999) ~ 9),
iscoco = case_when(iscoco %in% c(0:999) ~ 0,
iscoco %in% c(1000:1999) ~ 1,
iscoco %in% c(2000:2999) ~ 2,
iscoco %in% c(3000:3999) ~ 3,
iscoco %in% c(4000:4999) ~ 4,
iscoco %in% c(5000:5999) ~ 5,
iscoco %in% c(6000:6999) ~ 6,
iscoco %in% c(7000:7999) ~ 7,
iscoco %in% c(8000:8999) ~ 8,
iscoco %in% c(9000:9999) ~ 9),
iscocop = case_when(iscocop %in% c(0:999) ~ 0,
iscocop %in% c(1000:1999) ~ 1,
iscocop %in% c(2000:2999) ~ 2,
iscocop %in% c(3000:3999) ~ 3,
iscocop %in% c(4000:4999) ~ 4,
iscocop %in% c(5000:5999) ~ 5,
iscocop %in% c(6000:6999) ~ 6,
iscocop %in% c(7000:7999) ~ 7,
iscocop %in% c(8000:8999) ~ 8,
iscocop %in% c(9000:9999) ~ 9),
agea = if_else(agea < 999, agea, NA),
gndr = if_else(gndr %in% c(1:2), gndr, NA),
health = if_else(health %in% c(1:5), health, NA),
year = case_when(essround == 1 ~ 2002,
essround == 2 ~ 2004,
essround == 3 ~ 2006,
essround == 4 ~ 2008,
essround == 5 ~ 2010,
essround == 6 ~ 2012,
essround == 7 ~ 2014,
essround == 8 ~ 2016,
essround == 9 ~ 2018,
essround == 10 ~ 2020))
ess_sub <- ess_sub %>% mutate(isco = case_when(!is.na(isco08) ~ isco08,
!is.na(iscoco) ~ iscoco),
iscop = case_when(!is.na(isco08p) ~ isco08p,
!is.na(iscocop) ~ iscocop))
# select relevant variables
ess_sub <- na.omit(ess_sub %>% select(eisced, hinctnta, eiscedp, isco,
iscop, agea, gndr, health, year))
# mutate new variables to represent different homogamies
ess_sub <- ess_sub %>% mutate(edu_homo = if_else(eisced == eiscedp, 1, 0),
occu_homo = if_else(isco == iscop, 1, 0))
ess_sub <- ess_sub %>% mutate(both_homo = if_else(edu_homo == 1 & occu_homo == 1,
1, 0))
# skim
datasummary_skim(ess_sub %>% select(hinctnta, eisced, agea, gndr, health, edu_homo,
occu_homo, both_homo))
|
Unique (#)
|
Missing (%)
|
Mean
|
SD
|
Min
|
Median
|
Max
|
|
hinctnta
|
10
|
0
|
7.2
|
2.4
|
1.0
|
8.0
|
10.0
|
|
eisced
|
7
|
0
|
4.6
|
1.9
|
1.0
|
5.0
|
7.0
|
|
agea
|
65
|
0
|
45.2
|
12.1
|
18.0
|
45.0
|
82.0
|
|
gndr
|
2
|
0
|
1.6
|
0.5
|
1.0
|
2.0
|
2.0
|
|
health
|
5
|
0
|
1.9
|
0.8
|
1.0
|
2.0
|
5.0
|
|
edu_homo
|
2
|
0
|
0.3
|
0.5
|
0.0
|
0.0
|
1.0
|
|
occu_homo
|
2
|
0
|
0.2
|
0.4
|
0.0
|
0.0
|
1.0
|
|
both_homo
|
2
|
0
|
0.1
|
0.3
|
0.0
|
0.0
|
1.0
|
|
Initial Analysis
# initial analysis of the educational returns of people engaged in different marriages
# create subsets based on type of marriage
edu_homog_data <- ess_sub %>% filter(edu_homo == 1)
occu_homog_data <- ess_sub %>% filter(occu_homo == 1)
both_homog_data <- ess_sub %>% filter(both_homo == 1)
non_homog_data <- ess_sub %>% filter(edu_homo == 0 & occu_homo == 0)
# get mean level of income for each education level separately for different types of marriages
edu_homog_mean <- edu_homog_data %>%
group_by(eisced) %>%
summarize(mean_income = mean(hinctnta))
occu_homog_mean <- occu_homog_data %>%
group_by(eisced) %>%
summarize(mean_income = mean(hinctnta))
both_homog_mean <- both_homog_data %>%
group_by(eisced) %>%
summarize(mean_income = mean(hinctnta))
non_homog_mean <- non_homog_data %>%
group_by(eisced) %>%
summarize(mean_income = mean(hinctnta))
edu_returns_plot <- ggplot() + geom_line(data = edu_homog_mean, aes(x = eisced,
y = mean_income), color = 'red') + geom_point(data = edu_homog_mean, aes(x = eisced,
y = mean_income), color = 'red') + geom_line(data = occu_homog_mean, aes(x = eisced,
y = mean_income), color = 'blue') + geom_point(data = occu_homog_mean, aes(x = eisced,
y = mean_income), color = 'blue') + geom_line(data = both_homog_mean, aes(x = eisced,
y = mean_income), color = 'purple') + geom_point(data = both_homog_mean, aes(x = eisced,
y = mean_income), color = 'purple') + geom_line(data = non_homog_mean, aes(x = eisced,
y = mean_income), color = 'black') + geom_point(data = non_homog_mean, aes(x = eisced,
y = mean_income), color = 'black') + labs(x = "Highest Level of Education ES - ISCED",
y = "Total Household Income Decile") + scale_x_continuous(breaks = c(0:8)) +
theme(text = element_text(family = "Times New Roman")) +
annotate("text", x = 6, y = 6, label = "Educational Homogamy", color = 'red') +
annotate("text", x = 6, y = 5.8, label = "Occupational Homogamy", color = 'blue') +
annotate("text", x = 6, y = 5.6, label = "Both", color = 'purple') +
annotate("text", x = 6, y = 5.4, label = "None", color = 'black')
ggsave("homogamy compare graph.png", plot = edu_returns_plot, width = 9, height = 6)
print(edu_returns_plot)
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)):
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)):
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

# Additions for SS3 starts here
# regressions to get returns to education of different marriages
# order: educational, occupational, both, none
# returns to education for each group = regression coefficient for eisced
model_edu <- lm(hinctnta ~ eisced + agea + gndr + health, data = edu_homog_data)
model_occu <- lm(hinctnta ~ eisced + agea + gndr + health, data = occu_homog_data)
model_both <- lm(hinctnta ~ eisced + agea + gndr + health, data = both_homog_data)
model_none <- lm(hinctnta ~ eisced + agea + gndr + health, data = non_homog_data)
model_base <- lm(hinctnta ~ eisced + agea + gndr + health, data = ess_sub)
# run the 4 step analysis using infer package
# test whether 2 coefficients are significantly different to check whether
# homogamies increase returns to education
# compare regression coefficient of both homogamies vs non-homogamies
coef1 <- summary(model_both)$coefficients["eisced", ]
coef2 <- summary(model_none)$coefficients["eisced", ]
# Calculating the standard error of the difference
SE_difference <- sqrt(coef1["Std. Error"]^2 + coef2["Std. Error"]^2)
# Calculating the t-statistic
t_value <- (coef1["Estimate"] - coef2["Estimate"]) / SE_difference
# Calculating degrees of freedom
df <- min(nrow(model_both$model) - 2, nrow(model_none$model) - 2)
# Getting the p-value
p_value <- 2 * pt(-abs(t_value), df)
# Print results
print(paste("t-value:", t_value, "p-value:", p_value))
## [1] "t-value: 3.99287696803136 p-value: 8.29823493579759e-05"
# sample the null condition using bootstrapping
# randomly sample subsets from the large dataset and compute differences of
# their regression coefficient
n_bootstraps <- 1000
coef_diffs <- numeric(n_bootstraps)
for(i in 1:n_bootstraps) {
# Bootstrap sample for each dataset
sample1 <- ess_sub[sample(nrow(both_homog_data), replace = TRUE), ]
sample2 <- ess_sub[sample(nrow(non_homog_data), replace = TRUE), ]
# Fit models to bootstrap samples
boot_model1 <- lm(hinctnta ~ eisced + agea + gndr + health, data = sample1)
boot_model2 <- lm(hinctnta ~ eisced + agea + gndr + health, data = sample2)
# Calculate difference in coefficients
coef_diffs[i] <- coef(boot_model1)["eisced"] - coef(boot_model2)["eisced"]
}
# Compare with observed difference
observed_diff <- coef(model_both)["eisced"] - coef(model_none)["eisced"]
# Visualize
hist(coef_diffs, xlim = range(-0.3, 0.3), col = "lightblue", border = "blue",
xlab = "Coefficient Differences",
main = "Bootstrap Distribution of Coefficient Differences")
abline(v = observed_diff, col = "red", lwd = 4)

# model summary
# 2 other models to compare: model with only education, model with
# additional interaction term
model_general <- lm(hinctnta ~ eisced, data = ess_sub)
model_inter <- lm(hinctnta ~ eisced + agea + gndr + health
+ gndr*eisced, data = ess_sub)
modelsummary(
list(model_general, model_base, model_inter),
fmt = 1,
estimate = c( "{estimate} ({std.error}){stars}",
"{estimate} ({std.error}){stars}",
"{estimate} ({std.error}){stars}"),
statistic = NULL,
coef_omit = "Intercept")
|
(1)
|
(2)
|
(3)
|
eisced
|
0.4 (0.0)***
|
0.4 (0.0)***
|
0.3 (0.1)***
|
agea
|
|
0.0 (0.0)***
|
0.0 (0.0)***
|
gndr
|
|
-0.3 (0.1)***
|
-0.7 (0.2)**
|
health
|
|
-0.4 (0.0)***
|
-0.4 (0.0)***
|
eisced × gndr
|
|
|
0.1 (0.0)+
|
Num.Obs.
|
2987
|
2987
|
2987
|
R2
|
0.113
|
0.140
|
0.141
|
R2 Adj.
|
0.113
|
0.139
|
0.140
|
AIC
|
13270.5
|
13182.6
|
13181.5
|
BIC
|
13288.6
|
13218.6
|
13223.5
|
Log.Lik.
|
-6632.272
|
-6585.294
|
-6583.734
|
RMSE
|
2.23
|
2.19
|
2.19
|
# assumptions test on the base model
# Normal Distribution of Errors
qqnorm(residuals(model_base))
qqline(residuals(model_base))

print(shapiro.test(residuals(model_base)))
##
## Shapiro-Wilk normality test
##
## data: residuals(model_base)
## W = 0.96564, p-value < 2.2e-16
# Linearity
plot(model_base$fitted.values, residuals(model_base))

# Full Rank / No Perfect Multicollinearity
print(vif(model_base))
## eisced agea gndr health
## 1.038653 1.071263 1.017272 1.045069
# Zero Expected Error
print(mean(residuals(model_base)))
## [1] -2.270374e-17