The Code

# read the ESS File
ess <- read.fst("All-ESS-Data.fst")

# filter for gb
ess_gb <- ess %>% filter(cntry == "GB")

# subset with variables of interest and data cleaning
# split occupations by their general categories
ess_sub <- ess_gb %>% select(eisced, hinctnta, eiscedp, isco08, isco08p, agea,
      gndr, health, iscoco, iscocop, essround)
ess_sub <- ess_sub %>% mutate(eisced = if_else(eisced %in% c(1:7), eisced, NA),
        hinctnta = if_else(hinctnta %in% c(1:10), hinctnta, NA),
        eiscedp = if_else(eiscedp %in% c(1:7), eiscedp, NA),
        isco08 = case_when(isco08 %in% c(0:999) ~ 0,
                  isco08 %in% c(1000:1999) ~ 1,
                  isco08 %in% c(2000:2999) ~ 2,
                  isco08 %in% c(3000:3999) ~ 3,
                  isco08 %in% c(4000:4999) ~ 4,
                  isco08 %in% c(5000:5999) ~ 5,
                  isco08 %in% c(6000:6999) ~ 6,
                  isco08 %in% c(7000:7999) ~ 7,
                  isco08 %in% c(8000:8999) ~ 8,
                  isco08 %in% c(9000:9999) ~ 9),
        isco08p = case_when(isco08p %in% c(0:999) ~ 0,
                  isco08p %in% c(1000:1999) ~ 1,
                  isco08p %in% c(2000:2999) ~ 2,
                  isco08p %in% c(3000:3999) ~ 3,
                  isco08p %in% c(4000:4999) ~ 4,
                  isco08p %in% c(5000:5999) ~ 5,
                  isco08p %in% c(6000:6999) ~ 6,
                  isco08p %in% c(7000:7999) ~ 7,
                  isco08p %in% c(8000:8999) ~ 8,
                  isco08p %in% c(9000:9999) ~ 9),
        iscoco = case_when(iscoco %in% c(0:999) ~ 0,
                           iscoco %in% c(1000:1999) ~ 1,
                           iscoco %in% c(2000:2999) ~ 2,
                           iscoco %in% c(3000:3999) ~ 3,
                           iscoco %in% c(4000:4999) ~ 4,
                           iscoco %in% c(5000:5999) ~ 5,
                           iscoco %in% c(6000:6999) ~ 6,
                           iscoco %in% c(7000:7999) ~ 7,
                           iscoco %in% c(8000:8999) ~ 8,
                           iscoco %in% c(9000:9999) ~ 9),
        iscocop = case_when(iscocop %in% c(0:999) ~ 0,
                           iscocop %in% c(1000:1999) ~ 1,
                           iscocop %in% c(2000:2999) ~ 2,
                           iscocop %in% c(3000:3999) ~ 3,
                           iscocop %in% c(4000:4999) ~ 4,
                           iscocop %in% c(5000:5999) ~ 5,
                           iscocop %in% c(6000:6999) ~ 6,
                           iscocop %in% c(7000:7999) ~ 7,
                           iscocop %in% c(8000:8999) ~ 8,
                           iscocop %in% c(9000:9999) ~ 9),
        agea = if_else(agea < 999, agea, NA),
        gndr = if_else(gndr %in% c(1:2), gndr, NA),
        health = if_else(health %in% c(1:5), health, NA),
        year = case_when(essround == 1 ~ 2002,
                         essround == 2 ~ 2004,
                         essround == 3 ~ 2006,
                         essround == 4 ~ 2008,
                         essround == 5 ~ 2010,
                         essround == 6 ~ 2012,
                         essround == 7 ~ 2014,
                         essround == 8 ~ 2016,
                         essround == 9 ~ 2018,
                         essround == 10 ~ 2020))

ess_sub <- ess_sub %>% mutate(isco = case_when(!is.na(isco08) ~ isco08,
                                               !is.na(iscoco) ~ iscoco),
                              iscop = case_when(!is.na(isco08p) ~ isco08p,
                                                !is.na(iscocop) ~ iscocop))

# select relevant variables
ess_sub <- na.omit(ess_sub %>% select(eisced, hinctnta, eiscedp, isco, 
          iscop, agea, gndr, health, year))

# mutate new variables to represent different homogamies
ess_sub <- ess_sub %>% mutate(edu_homo = if_else(eisced == eiscedp, 1, 0),
            occu_homo = if_else(isco == iscop, 1, 0))
ess_sub <- ess_sub %>% mutate(both_homo = if_else(edu_homo == 1 & occu_homo == 1,
            1, 0))

# skim
datasummary_skim(ess_sub %>% select(hinctnta, eisced, agea, gndr, health, edu_homo,
        occu_homo, both_homo))
Unique (#) Missing (%) Mean SD Min Median Max
hinctnta 10 0 7.2 2.4 1.0 8.0 10.0
eisced 7 0 4.6 1.9 1.0 5.0 7.0
agea 65 0 45.2 12.1 18.0 45.0 82.0
gndr 2 0 1.6 0.5 1.0 2.0 2.0
health 5 0 1.9 0.8 1.0 2.0 5.0
edu_homo 2 0 0.3 0.5 0.0 0.0 1.0
occu_homo 2 0 0.2 0.4 0.0 0.0 1.0
both_homo 2 0 0.1 0.3 0.0 0.0 1.0
# time trend: split dataset by years and find regression coefficients
# to identify the returns to education in each year
ess_2010 <- ess_sub %>% filter(year == 2010)
ess_2012 <- ess_sub %>% filter(year == 2012)
ess_2014 <- ess_sub %>% filter(year == 2014)
ess_2016 <- ess_sub %>% filter(year == 2016)
ess_2018 <- ess_sub %>% filter(year == 2018)
ess_2020 <- ess_sub %>% filter(year == 2020)
summary(lm(hinctnta ~ eisced + agea + gndr + health, data = ess_2010))
## 
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = ess_2010)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.2350 -1.2652  0.2708  1.5509  4.8340 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.487877   0.560722   9.787   <2e-16 ***
## eisced       0.470275   0.048036   9.790   <2e-16 ***
## agea         0.016031   0.007877   2.035   0.0423 *  
## gndr        -0.381871   0.179130  -2.132   0.0334 *  
## health      -0.223518   0.105032  -2.128   0.0337 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.137 on 583 degrees of freedom
## Multiple R-squared:  0.1564, Adjusted R-squared:  0.1506 
## F-statistic: 27.02 on 4 and 583 DF,  p-value: < 2.2e-16
summary(lm(hinctnta ~ eisced + agea + gndr + health, data = ess_2012))
## 
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = ess_2012)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.5570 -1.1784  0.4484  1.6815  4.2529 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.939876   0.679211   7.273 1.41e-12 ***
## eisced       0.467098   0.056604   8.252 1.46e-15 ***
## agea         0.017342   0.009131   1.899   0.0581 .  
## gndr         0.092345   0.210351   0.439   0.6609    
## health      -0.335668   0.129884  -2.584   0.0100 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.278 on 489 degrees of freedom
## Multiple R-squared:  0.1372, Adjusted R-squared:  0.1302 
## F-statistic: 19.45 on 4 and 489 DF,  p-value: 7.306e-15
summary(lm(hinctnta ~ eisced + agea + gndr + health, data = ess_2014))
## 
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = ess_2014)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.8590 -1.6010  0.4245  1.6909  4.8237 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.849881   0.634111   9.225  < 2e-16 ***
## eisced       0.437981   0.053978   8.114 3.23e-15 ***
## agea         0.014243   0.008334   1.709    0.088 .  
## gndr        -0.306184   0.196494  -1.558    0.120    
## health      -0.507060   0.117956  -4.299 2.03e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.262 on 548 degrees of freedom
## Multiple R-squared:  0.1612, Adjusted R-squared:  0.1551 
## F-statistic: 26.32 on 4 and 548 DF,  p-value: < 2.2e-16
summary(lm(hinctnta ~ eisced + agea + gndr + health, data = ess_2016))
## 
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = ess_2016)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.1831 -1.4865  0.2887  1.6547  4.7182 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.175766   0.626074   8.267  1.4e-15 ***
## eisced       0.388745   0.053475   7.270  1.5e-12 ***
## agea         0.020016   0.008347   2.398  0.01688 *  
## gndr        -0.065814   0.202177  -0.326  0.74493    
## health      -0.408678   0.129377  -3.159  0.00169 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.178 on 473 degrees of freedom
## Multiple R-squared:  0.1313, Adjusted R-squared:  0.124 
## F-statistic: 17.88 on 4 and 473 DF,  p-value: 1.114e-13
summary(lm(hinctnta ~ eisced + agea + gndr + health, data = ess_2018))
## 
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = ess_2018)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.1638 -1.2233  0.2239  1.6766  4.4294 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.273342   0.571796  10.971  < 2e-16 ***
## eisced       0.420126   0.049658   8.460  < 2e-16 ***
## agea         0.011605   0.007366   1.575  0.11566    
## gndr        -0.540403   0.179206  -3.016  0.00267 ** 
## health      -0.417197   0.111117  -3.755  0.00019 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.169 on 606 degrees of freedom
## Multiple R-squared:  0.1381, Adjusted R-squared:  0.1325 
## F-statistic: 24.28 on 4 and 606 DF,  p-value: < 2.2e-16
summary(lm(hinctnta ~ eisced + agea + gndr + health, data = ess_2020))
## 
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = ess_2020)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.0464 -1.5174  0.2731  1.7145  3.9163 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.15039    0.87637   8.159 1.49e-14 ***
## eisced       0.38111    0.07539   5.055 8.16e-07 ***
## agea         0.02191    0.01098   1.996 0.047009 *  
## gndr        -1.09588    0.26001  -4.215 3.46e-05 ***
## health      -0.50474    0.14407  -3.503 0.000541 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.064 on 258 degrees of freedom
## Multiple R-squared:  0.1911, Adjusted R-squared:  0.1785 
## F-statistic: 15.23 on 4 and 258 DF,  p-value: 3.388e-11
# record the coefficients and corresponding year to make a graph
returns <- c(0.470275, 0.467098, 0.437981, 0.388745, 0.420126, 0.38111)
years <- c(2010, 2012, 2014, 2016, 2018, 2020)

# make the plot and save it as image
# r runs into an error with times new roman, I don't know why. It works
# for the plot saved with ggsave though
returns_time <- ggplot(data = NULL, aes(x = years, y = returns)) +
  geom_line() + geom_point() + labs(x = "Year", y = "Returns to Education") +
  scale_x_continuous(breaks = c(2010, 2012, 2014, 2016, 2018, 2020)) +
  theme(text = element_text(family = "Times New Roman")) +
  geom_text(aes(label = returns), vjust = -1)
ggsave("outcome temporal graph.png", plot = returns_time, width = 9, height = 6)
print(returns_time)
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)):
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)):
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

# comparing the trend with the ESS baseline
# clean the ESS baseline dataset for variables used in regression
ess <- ess %>% mutate(hinctnta = if_else(hinctnta %in% c(1:10), hinctnta, NA),
                      eisced = if_else(eisced %in% c(1:7), eisced, NA),
                      eiscedp = if_else(eiscedp %in% c(1:7), eiscedp, NA),
                      agea = if_else(agea < 999, agea, NA),
                      gndr = if_else(gndr %in% c(1:2), gndr, NA),
                      health = if_else(health %in% c(1:5), health, NA))

# split by years using ess rounds
ess_base2010 <- na.omit(ess %>% filter(essround == 5) %>% 
                          select(hinctnta, eisced, agea, gndr, health))
ess_base2012 <- na.omit(ess %>% filter(essround == 6) %>% 
                          select(hinctnta, eisced, agea, gndr, health))
ess_base2014 <- na.omit(ess %>% filter(essround == 7) %>% 
                          select(hinctnta, eisced, agea, gndr, health))
ess_base2016 <- na.omit(ess %>% filter(essround == 8) %>% 
                          select(hinctnta, eisced, agea, gndr, health))
ess_base2018 <- na.omit(ess %>% filter(essround == 9) %>% 
                          select(hinctnta, eisced, agea, gndr, health))
ess_base2020 <- na.omit(ess %>% filter(essround == 10) %>% 
                          select(hinctnta, eisced, agea, gndr, health))
summary(lm(hinctnta ~ eisced + agea + gndr + health, data = ess_base2010))
## 
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = ess_base2010)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9294 -1.8985 -0.0996  1.9144  7.8820 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.7143383  0.0648467   88.12   <2e-16 ***
## eisced       0.4959419  0.0070479   70.37   <2e-16 ***
## agea        -0.0163674  0.0007709  -21.23   <2e-16 ***
## gndr        -0.4996855  0.0252913  -19.76   <2e-16 ***
## health      -0.4295232  0.0144874  -29.65   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.498 on 39549 degrees of freedom
## Multiple R-squared:  0.1975, Adjusted R-squared:  0.1974 
## F-statistic:  2433 on 4 and 39549 DF,  p-value: < 2.2e-16
summary(lm(hinctnta ~ eisced + agea + gndr + health, data = ess_base2012))
## 
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = ess_base2012)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.8566 -1.9447 -0.1326  1.9579  7.8261 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.223105   0.062358   83.76   <2e-16 ***
## eisced       0.537671   0.006709   80.14   <2e-16 ***
## agea        -0.012328   0.000738  -16.70   <2e-16 ***
## gndr        -0.476018   0.024317  -19.58   <2e-16 ***
## health      -0.407658   0.014254  -28.60   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.524 on 43640 degrees of freedom
## Multiple R-squared:  0.1959, Adjusted R-squared:  0.1958 
## F-statistic:  2658 on 4 and 43640 DF,  p-value: < 2.2e-16
summary(lm(hinctnta ~ eisced + agea + gndr + health, data = ess_base2014))
## 
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = ess_base2014)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9167 -1.9320 -0.0266  1.9771  6.9672 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.3681139  0.0742571   72.29   <2e-16 ***
## eisced       0.5217912  0.0078581   66.40   <2e-16 ***
## agea        -0.0101834  0.0008416  -12.10   <2e-16 ***
## gndr        -0.4954921  0.0282713  -17.53   <2e-16 ***
## health      -0.3742457  0.0169125  -22.13   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.507 on 31658 degrees of freedom
## Multiple R-squared:  0.1848, Adjusted R-squared:  0.1847 
## F-statistic:  1795 on 4 and 31658 DF,  p-value: < 2.2e-16
summary(lm(hinctnta ~ eisced + agea + gndr + health, data = ess_base2016))
## 
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = ess_base2016)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.8387 -1.8964 -0.0323  1.9054  7.7979 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.6092666  0.0672528   83.41   <2e-16 ***
## eisced       0.4879677  0.0071739   68.02   <2e-16 ***
## agea        -0.0148867  0.0007742  -19.23   <2e-16 ***
## gndr        -0.4554674  0.0259549  -17.55   <2e-16 ***
## health      -0.4330932  0.0153287  -28.25   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.462 on 36261 degrees of freedom
## Multiple R-squared:  0.1887, Adjusted R-squared:  0.1886 
## F-statistic:  2108 on 4 and 36261 DF,  p-value: < 2.2e-16
summary(lm(hinctnta ~ eisced + agea + gndr + health, data = ess_base2018))
## 
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = ess_base2018)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.1223 -1.8986 -0.0627  1.9147  7.4769 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.2568298  0.0666362   93.89   <2e-16 ***
## eisced       0.4902947  0.0071003   69.05   <2e-16 ***
## agea        -0.0207870  0.0007482  -27.78   <2e-16 ***
## gndr        -0.5796486  0.0249344  -23.25   <2e-16 ***
## health      -0.4672702  0.0148255  -31.52   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.471 on 39590 degrees of freedom
## Multiple R-squared:  0.2116, Adjusted R-squared:  0.2115 
## F-statistic:  2657 on 4 and 39590 DF,  p-value: < 2.2e-16
summary(lm(hinctnta ~ eisced + agea + gndr + health, data = ess_base2020))
## 
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = ess_base2020)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.0743 -1.8877  0.0199  1.9495  7.6590 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.0999608  0.0632803   96.40   <2e-16 ***
## eisced       0.4795954  0.0066156   72.50   <2e-16 ***
## agea        -0.0178978  0.0006912  -25.89   <2e-16 ***
## gndr        -0.5637844  0.0230659  -24.44   <2e-16 ***
## health      -0.3895181  0.0139879  -27.85   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.461 on 45748 degrees of freedom
## Multiple R-squared:  0.1844, Adjusted R-squared:  0.1844 
## F-statistic:  2586 on 4 and 45748 DF,  p-value: < 2.2e-16
# record regression coefficients that represent returns to education
base_returns <- c(0.495942, 0.537671, 0.521791, 0.487968, 0.490295, 0.479595)

# make a plot with both the baseline and GB trends
plot2 <- ggplot() + geom_line(data = NULL, aes(x = years, y = returns), color = 'red') +
  geom_line(data = NULL, aes(x = years, y = base_returns), color = 'blue') +
  geom_point(data = NULL, aes(x = years, y = returns), color = 'red') +
  geom_point(data = NULL, aes(x = years, y = base_returns), color = 'blue') +
  geom_text(data = NULL, aes(x = years, y = returns, label = returns), 
            vjust = -1, color = 'red') +
  geom_text(data = NULL, aes(x = years, y = base_returns, label = base_returns), 
            vjust = 2, color = 'blue') + 
  labs(x = "Year", y = "Returns to Education") +
  scale_x_continuous(breaks = c(2010, 2012, 2014, 2016, 2018, 2020)) +
  theme(text = element_text(family = "Times New Roman")) +
  annotate("text", x = 2010.5, y = 0.53, label = "ESS Baseline", color = 'blue') +
  annotate("text", x = 2010.5, y = 0.46, label = "GB Trend", color = 'red')

ggsave("ess baseline graph.png", plot = plot2, width = 9, height = 6)
print(plot2)
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

# column percentages
edu_percentage <- cprop(table(ess_sub %>% select(eisced, hinctnta)))
gender_percentage <- cprop(table(ess_sub %>% select(gndr, hinctnta)))
health_percentage <- cprop(table(ess_sub %>% select(health, hinctnta)))


# mean and sd
edu_stats <- ess_sub %>%
  group_by(eisced) %>%
  summarize(mean_income = mean(hinctnta), std_income = sd(hinctnta))
gender_stats <- ess_sub %>%
  group_by(gndr) %>%
  summarize(mean_income = mean(hinctnta), std_income = sd(hinctnta))
health_stats <- ess_sub %>%
  group_by(health) %>%
  summarize(mean_income = mean(hinctnta), std_income = sd(hinctnta))

Visuals for column Percentages, Mean, and SD

Edu Level 1 2 3 4 5 6 7 8 9 10 All
1 22.2 24.8 19.8 16.1 11.5 8.4 5.4 3.2 2.6 2.7 7.1
2 20.0 13.9 21.4 14.0 17.0 15.7 14.9 12.2 8.8 6.7 12.4
3 11.1 8.9 8.7 10.8 12.6 15.4 9.7 9.4 7.1 3.2 8.9
4 13.3 19.8 19.0 18.8 17.8 16.3 15.6 17.8 11.8 10.4 15.1
5 11.1 18.8 15.1 17.7 18.2 20.8 22.6 20.4 19.9 16.1 19.0
6 15.6 6.9 10.3 9.7 12.6 13.9 17.4 20.4 26.5 29.6 19.8
7 6.7 6.9 5.6 12.9 10.3 9.6 14.4 16.8 23.5 31.3 17.8
Total 100 100 100 100 100 100 100 100 100 100 100
Gender 1 2 3 4 5 6 7 8 9 10 All
1 40.0 35.6 34.1 34.9 44.3 41.9 44.4 44.9 47.2 46.8 43.7
2 60.0 64.4 65.9 65.1 55.7 58.1 55.6 55.1 52.8 53.2 56.3
Total 100.0 100.0 100.0 100.0 100.0 100.0 100.0 100.0 100.0 100.0 100.0
Health 1 2 3 4 5 6 7 8 9 10 All
1 22.2 20.8 30.2 34.4 33.2 28.6 32.8 38.7 41.5 43.8 36.3
2 35.6 50.5 37.3 43.0 47.0 49.4 44.9 42.9 41.7 45.3 44.4
3 28.9 21.8 21.4 16.1 14.6 18.1 18.7 16.2 14.3 8.9 15.5
4 8.9 5.0 7.1 4.8 4.3 3.0 3.1 2.0 1.9 2.1 3.0
5 4.4 2.0 4.0 1.6 0.8 0.9 0.5 0.2 0.6 0.0 0.8
Total 100.0 100.0 100.0 100.0 100.0 100.0 100.0 100.0 100.0 100.0 100.0
eisced mean_income std_income
1 5.24 2.53
2 6.49 2.35
3 6.53 2.16
4 6.78 2.35
5 7.19 2.23
6 7.98 2.09
7 8.16 2.04
gndr mean_income std_income
1 7.36 2.28
2 7.03 2.42
health mean_income std_income
1 7.53 2.25
2 7.17 2.35
3 6.68 2.42
4 6.12 2.65
5 4.78 2.52

Initial Analysis

# initial analysis of the educational returns of people engaged in different marriages
# create subsets based on type of marriage
edu_homog_data <- ess_sub %>% filter(edu_homo == 1)
occu_homog_data <- ess_sub %>% filter(occu_homo == 1)
both_homog_data <- ess_sub %>% filter(both_homo == 1)
non_homog_data <- ess_sub %>% filter(edu_homo == 0 & occu_homo == 0)

# get mean level of income for each education level separately for different types of marriages
edu_homog_mean <- edu_homog_data %>%
  group_by(eisced) %>%
  summarize(mean_income = mean(hinctnta))
occu_homog_mean <- occu_homog_data %>%
  group_by(eisced) %>%
  summarize(mean_income = mean(hinctnta))
both_homog_mean <- both_homog_data %>%
  group_by(eisced) %>%
  summarize(mean_income = mean(hinctnta))
non_homog_mean <- non_homog_data %>%
  group_by(eisced) %>%
  summarize(mean_income = mean(hinctnta))

edu_returns_plot <- ggplot() + geom_line(data = edu_homog_mean, aes(x = eisced,
    y = mean_income), color = 'red') + geom_point(data = edu_homog_mean, aes(x = eisced,
    y = mean_income), color = 'red') + geom_line(data = occu_homog_mean, aes(x = eisced,
    y = mean_income), color = 'blue') + geom_point(data = occu_homog_mean, aes(x = eisced,
    y = mean_income), color = 'blue') + geom_line(data = both_homog_mean, aes(x = eisced,
    y = mean_income), color = 'purple') + geom_point(data = both_homog_mean, aes(x = eisced,
    y = mean_income), color = 'purple') + geom_line(data = non_homog_mean, aes(x = eisced,
    y = mean_income), color = 'black') + geom_point(data = non_homog_mean, aes(x = eisced,
    y = mean_income), color = 'black') + labs(x = "Highest Level of Education ES - ISCED",
    y = "Total Household Income Decile") + scale_x_continuous(breaks = c(0:8)) +
    theme(text = element_text(family = "Times New Roman")) +
    annotate("text", x = 6, y = 6, label = "Educational Homogamy", color = 'red') +
      annotate("text", x = 6, y = 5.8, label = "Occupational Homogamy", color = 'blue') +
      annotate("text", x = 6, y = 5.6, label = "Both", color = 'purple') +
      annotate("text", x = 6, y = 5.4, label = "None", color = 'black')

ggsave("homogamy compare graph.png", plot = edu_returns_plot, width = 9, height = 6)
print(edu_returns_plot)
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

# regressions to get returns to education of different marriages
# order: educational, occupational, both, none
# returns to education for each group = regression coefficient for eisced
summary(lm(hinctnta ~ eisced + agea + gndr + health, data = edu_homog_data))
## 
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = edu_homog_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.7501 -1.3105  0.4156  1.5521  4.5410 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.884271   0.436940  11.178  < 2e-16 ***
## eisced       0.536603   0.033871  15.843  < 2e-16 ***
## agea         0.017328   0.005764   3.006 0.002712 ** 
## gndr        -0.262084   0.135563  -1.933 0.053482 .  
## health      -0.286760   0.084890  -3.378 0.000758 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.119 on 997 degrees of freedom
## Multiple R-squared:  0.2236, Adjusted R-squared:  0.2205 
## F-statistic:  71.8 on 4 and 997 DF,  p-value: < 2.2e-16
summary(lm(hinctnta ~ eisced + agea + gndr + health, data = occu_homog_data))
## 
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = occu_homog_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.1461 -1.1925  0.4604  1.5773  4.2467 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.164341   0.533153   9.686  < 2e-16 ***
## eisced       0.514623   0.045673  11.268  < 2e-16 ***
## agea         0.022569   0.007004   3.222 0.001334 ** 
## gndr        -0.315692   0.163776  -1.928 0.054333 .  
## health      -0.403078   0.105505  -3.820 0.000146 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.101 on 664 degrees of freedom
## Multiple R-squared:    0.2,  Adjusted R-squared:  0.1952 
## F-statistic: 41.51 on 4 and 664 DF,  p-value: < 2.2e-16
summary(lm(hinctnta ~ eisced + agea + gndr + health, data = both_homog_data))
## 
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = both_homog_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.8235 -1.0129  0.4206  1.3502  3.5459 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.338395   0.736819   5.888  1.1e-08 ***
## eisced       0.603594   0.063279   9.539  < 2e-16 ***
## agea         0.028903   0.009397   3.076  0.00230 ** 
## gndr        -0.165678   0.222174  -0.746  0.45646    
## health      -0.470413   0.147149  -3.197  0.00155 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.864 on 284 degrees of freedom
## Multiple R-squared:  0.2953, Adjusted R-squared:  0.2853 
## F-statistic: 29.75 on 4 and 284 DF,  p-value: < 2.2e-16
summary(lm(hinctnta ~ eisced + agea + gndr + health, data = non_homog_data))
## 
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = non_homog_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -7.394 -1.416  0.380  1.687  4.991 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.378090   0.353657  18.035  < 2e-16 ***
## eisced       0.320065   0.032218   9.934  < 2e-16 ***
## agea         0.014495   0.004721   3.070  0.00217 ** 
## gndr        -0.342507   0.113507  -3.017  0.00259 ** 
## health      -0.446677   0.066188  -6.749 2.08e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.217 on 1600 degrees of freedom
## Multiple R-squared:  0.09289,    Adjusted R-squared:  0.09062 
## F-statistic: 40.96 on 4 and 1600 DF,  p-value: < 2.2e-16