Stepping Stone 3

data cleaning

# read the ESS File
ess <- read.fst("All-ESS-Data.fst")

# filter for gb
ess_gb <- ess %>% filter(cntry == "GB")

# subset with variables of interest and data cleaning
# split occupations by their general categories
ess_sub <- ess_gb %>% select(eisced, hinctnta, eiscedp, isco08, isco08p, agea,
      gndr, health, iscoco, iscocop, essround)
ess_sub <- ess_sub %>% mutate(eisced = if_else(eisced %in% c(1:7), eisced, NA),
        hinctnta = if_else(hinctnta %in% c(1:10), hinctnta, NA),
        eiscedp = if_else(eiscedp %in% c(1:7), eiscedp, NA),
        isco08 = case_when(isco08 %in% c(0:999) ~ 0,
                  isco08 %in% c(1000:1999) ~ 1,
                  isco08 %in% c(2000:2999) ~ 2,
                  isco08 %in% c(3000:3999) ~ 3,
                  isco08 %in% c(4000:4999) ~ 4,
                  isco08 %in% c(5000:5999) ~ 5,
                  isco08 %in% c(6000:6999) ~ 6,
                  isco08 %in% c(7000:7999) ~ 7,
                  isco08 %in% c(8000:8999) ~ 8,
                  isco08 %in% c(9000:9999) ~ 9),
        isco08p = case_when(isco08p %in% c(0:999) ~ 0,
                  isco08p %in% c(1000:1999) ~ 1,
                  isco08p %in% c(2000:2999) ~ 2,
                  isco08p %in% c(3000:3999) ~ 3,
                  isco08p %in% c(4000:4999) ~ 4,
                  isco08p %in% c(5000:5999) ~ 5,
                  isco08p %in% c(6000:6999) ~ 6,
                  isco08p %in% c(7000:7999) ~ 7,
                  isco08p %in% c(8000:8999) ~ 8,
                  isco08p %in% c(9000:9999) ~ 9),
        iscoco = case_when(iscoco %in% c(0:999) ~ 0,
                           iscoco %in% c(1000:1999) ~ 1,
                           iscoco %in% c(2000:2999) ~ 2,
                           iscoco %in% c(3000:3999) ~ 3,
                           iscoco %in% c(4000:4999) ~ 4,
                           iscoco %in% c(5000:5999) ~ 5,
                           iscoco %in% c(6000:6999) ~ 6,
                           iscoco %in% c(7000:7999) ~ 7,
                           iscoco %in% c(8000:8999) ~ 8,
                           iscoco %in% c(9000:9999) ~ 9),
        iscocop = case_when(iscocop %in% c(0:999) ~ 0,
                           iscocop %in% c(1000:1999) ~ 1,
                           iscocop %in% c(2000:2999) ~ 2,
                           iscocop %in% c(3000:3999) ~ 3,
                           iscocop %in% c(4000:4999) ~ 4,
                           iscocop %in% c(5000:5999) ~ 5,
                           iscocop %in% c(6000:6999) ~ 6,
                           iscocop %in% c(7000:7999) ~ 7,
                           iscocop %in% c(8000:8999) ~ 8,
                           iscocop %in% c(9000:9999) ~ 9),
        agea = if_else(agea < 999, agea, NA),
        gndr = if_else(gndr %in% c(1:2), gndr, NA),
        health = if_else(health %in% c(1:5), health, NA),
        year = case_when(essround == 1 ~ 2002,
                         essround == 2 ~ 2004,
                         essround == 3 ~ 2006,
                         essround == 4 ~ 2008,
                         essround == 5 ~ 2010,
                         essround == 6 ~ 2012,
                         essround == 7 ~ 2014,
                         essround == 8 ~ 2016,
                         essround == 9 ~ 2018,
                         essround == 10 ~ 2020))

ess_sub <- ess_sub %>% mutate(isco = case_when(!is.na(isco08) ~ isco08,
                                               !is.na(iscoco) ~ iscoco),
                              iscop = case_when(!is.na(isco08p) ~ isco08p,
                                                !is.na(iscocop) ~ iscocop))

# select relevant variables
ess_sub <- na.omit(ess_sub %>% select(eisced, hinctnta, eiscedp, isco, 
          iscop, agea, gndr, health, year))

# mutate new variables to represent different homogamies
ess_sub <- ess_sub %>% mutate(edu_homo = if_else(eisced == eiscedp, 1, 0),
            occu_homo = if_else(isco == iscop, 1, 0))
ess_sub <- ess_sub %>% mutate(both_homo = if_else(edu_homo == 1 & occu_homo == 1,
            1, 0))

# skim
datasummary_skim(ess_sub %>% select(hinctnta, eisced, agea, gndr, health, edu_homo,
        occu_homo, both_homo))

	Unique (#)	Mean	SD	Min	Median	Max
hinctnta	10	7.2	2.4	1.0	8.0	10.0
eisced	7	4.6	1.9	1.0	5.0	7.0
agea	65	45.2	12.1	18.0	45.0	82.0
gndr	2	1.6	0.5	1.0	2.0	2.0
health	5	1.9	0.8	1.0	2.0	5.0
edu_homo	2	0.3	0.5	0.0	0.0	1.0
occu_homo	2	0.2	0.4	0.0	0.0	1.0
both_homo	2	0.1	0.3	0.0	0.0	1.0

Initial Analysis

# initial analysis of the educational returns of people engaged in different marriages
# create subsets based on type of marriage
edu_homog_data <- ess_sub %>% filter(edu_homo == 1)
occu_homog_data <- ess_sub %>% filter(occu_homo == 1)
both_homog_data <- ess_sub %>% filter(both_homo == 1)
non_homog_data <- ess_sub %>% filter(edu_homo == 0 & occu_homo == 0)

# get mean level of income for each education level separately for different types of marriages
edu_homog_mean <- edu_homog_data %>%
  group_by(eisced) %>%
  summarize(mean_income = mean(hinctnta))
occu_homog_mean <- occu_homog_data %>%
  group_by(eisced) %>%
  summarize(mean_income = mean(hinctnta))
both_homog_mean <- both_homog_data %>%
  group_by(eisced) %>%
  summarize(mean_income = mean(hinctnta))
non_homog_mean <- non_homog_data %>%
  group_by(eisced) %>%
  summarize(mean_income = mean(hinctnta))

edu_returns_plot <- ggplot() + geom_line(data = edu_homog_mean, aes(x = eisced,
    y = mean_income), color = 'red') + geom_point(data = edu_homog_mean, aes(x = eisced,
    y = mean_income), color = 'red') + geom_line(data = occu_homog_mean, aes(x = eisced,
    y = mean_income), color = 'blue') + geom_point(data = occu_homog_mean, aes(x = eisced,
    y = mean_income), color = 'blue') + geom_line(data = both_homog_mean, aes(x = eisced,
    y = mean_income), color = 'purple') + geom_point(data = both_homog_mean, aes(x = eisced,
    y = mean_income), color = 'purple') + geom_line(data = non_homog_mean, aes(x = eisced,
    y = mean_income), color = 'black') + geom_point(data = non_homog_mean, aes(x = eisced,
    y = mean_income), color = 'black') + labs(x = "Highest Level of Education ES - ISCED",
    y = "Total Household Income Decile") + scale_x_continuous(breaks = c(0:8)) +
    theme(text = element_text(family = "Times New Roman")) +
    annotate("text", x = 6, y = 6, label = "Educational Homogamy", color = 'red') +
      annotate("text", x = 6, y = 5.8, label = "Occupational Homogamy", color = 'blue') +
      annotate("text", x = 6, y = 5.6, label = "Both", color = 'purple') +
      annotate("text", x = 6, y = 5.4, label = "None", color = 'black')

ggsave("homogamy compare graph.png", plot = edu_returns_plot, width = 9, height = 6)
print(edu_returns_plot)

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)):
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)):
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

# Additions for SS3 starts here

# regressions to get returns to education of different marriages
# order: educational, occupational, both, none
# returns to education for each group = regression coefficient for eisced
model_edu <- lm(hinctnta ~ eisced + agea + gndr + health, data = edu_homog_data)
model_occu <- lm(hinctnta ~ eisced + agea + gndr + health, data = occu_homog_data)
model_both <- lm(hinctnta ~ eisced + agea + gndr + health, data = both_homog_data)
model_none <- lm(hinctnta ~ eisced + agea + gndr + health, data = non_homog_data)
model_base <- lm(hinctnta ~ eisced + agea + gndr + health, data = ess_sub)


# run the 4 step analysis using infer package
# test whether 2 coefficients are significantly different to check whether
# homogamies increase returns to education

# compare regression coefficient of both homogamies vs non-homogamies
coef1 <- summary(model_both)$coefficients["eisced", ]
coef2 <- summary(model_none)$coefficients["eisced", ]

# Calculating the standard error of the difference
SE_difference <- sqrt(coef1["Std. Error"]^2 + coef2["Std. Error"]^2)

# Calculating the t-statistic
t_value <- (coef1["Estimate"] - coef2["Estimate"]) / SE_difference

# Calculating degrees of freedom
df <- min(nrow(model_both$model) - 2, nrow(model_none$model) - 2)

# Getting the p-value
p_value <- 2 * pt(-abs(t_value), df)

# Print results
print(paste("t-value:", t_value, "p-value:", p_value))

## [1] "t-value: 3.99287696803136 p-value: 8.29823493579759e-05"

# sample the null condition using bootstrapping
# randomly sample subsets from the large dataset and compute differences of 
# their regression coefficient

n_bootstraps <- 1000
coef_diffs <- numeric(n_bootstraps)

for(i in 1:n_bootstraps) {
  # Bootstrap sample for each dataset
  sample1 <- ess_sub[sample(nrow(both_homog_data), replace = TRUE), ]
  sample2 <- ess_sub[sample(nrow(non_homog_data), replace = TRUE), ]

  # Fit models to bootstrap samples
  boot_model1 <- lm(hinctnta ~ eisced + agea + gndr + health, data = sample1)
  boot_model2 <- lm(hinctnta ~ eisced + agea + gndr + health, data = sample2)

  # Calculate difference in coefficients
  coef_diffs[i] <- coef(boot_model1)["eisced"] - coef(boot_model2)["eisced"]
}

# Compare with observed difference
observed_diff <- coef(model_both)["eisced"] - coef(model_none)["eisced"]

# Visualize
hist(coef_diffs, xlim = range(-0.3, 0.3), col = "lightblue", border = "blue", 
     xlab = "Coefficient Differences", 
     main = "Bootstrap Distribution of Coefficient Differences")
abline(v = observed_diff, col = "red", lwd = 4)

# model summary
# 2 other models to compare: model with only education, model with
# additional interaction term
model_general <- lm(hinctnta ~ eisced, data = ess_sub)
model_inter <- lm(hinctnta ~ eisced + agea + gndr + health 
                  + gndr*eisced, data = ess_sub)

modelsummary(
  list(model_general, model_base, model_inter),
  fmt = 1,
  estimate  = c( "{estimate} ({std.error}){stars}",
                "{estimate} ({std.error}){stars}",
                "{estimate} ({std.error}){stars}"),
  statistic = NULL,
  coef_omit = "Intercept")

	(1)	(2)	(3)
eisced	0.4 (0.0)***	0.4 (0.0)***	0.3 (0.1)***
agea		0.0 (0.0)***	0.0 (0.0)***
gndr		-0.3 (0.1)***	-0.7 (0.2)**
health		-0.4 (0.0)***	-0.4 (0.0)***
eisced × gndr			0.1 (0.0)+
Num.Obs.	2987	2987	2987
R2	0.113	0.140	0.141
R2 Adj.	0.113	0.139	0.140
AIC	13270.5	13182.6	13181.5
BIC	13288.6	13218.6	13223.5
Log.Lik.	-6632.272	-6585.294	-6583.734
RMSE	2.23	2.19	2.19

# assumptions test on the base model

# Normal Distribution of Errors
qqnorm(residuals(model_base))
qqline(residuals(model_base))

print(shapiro.test(residuals(model_base)))

## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(model_base)
## W = 0.96564, p-value < 2.2e-16

# Linearity
plot(model_base$fitted.values, residuals(model_base))

# Full Rank / No Perfect Multicollinearity
print(vif(model_base))

##   eisced     agea     gndr   health 
## 1.038653 1.071263 1.017272 1.045069

# Zero Expected Error
print(mean(residuals(model_base)))

## [1] -2.270374e-17

Stepping Stone 3

Kelvin Li, Yisu Hou

2023-12-01

data cleaning

Initial Analysis