Stepping Stone 2

The Code

# read the ESS File
ess <- read.fst("All-ESS-Data.fst")

# filter for gb
ess_gb <- ess %>% filter(cntry == "GB")

# subset with variables of interest and data cleaning
# split occupations by their general categories
ess_sub <- ess_gb %>% select(eisced, hinctnta, eiscedp, isco08, isco08p, agea,
      gndr, health, iscoco, iscocop, essround)
ess_sub <- ess_sub %>% mutate(eisced = if_else(eisced %in% c(1:7), eisced, NA),
        hinctnta = if_else(hinctnta %in% c(1:10), hinctnta, NA),
        eiscedp = if_else(eiscedp %in% c(1:7), eiscedp, NA),
        isco08 = case_when(isco08 %in% c(0:999) ~ 0,
                  isco08 %in% c(1000:1999) ~ 1,
                  isco08 %in% c(2000:2999) ~ 2,
                  isco08 %in% c(3000:3999) ~ 3,
                  isco08 %in% c(4000:4999) ~ 4,
                  isco08 %in% c(5000:5999) ~ 5,
                  isco08 %in% c(6000:6999) ~ 6,
                  isco08 %in% c(7000:7999) ~ 7,
                  isco08 %in% c(8000:8999) ~ 8,
                  isco08 %in% c(9000:9999) ~ 9),
        isco08p = case_when(isco08p %in% c(0:999) ~ 0,
                  isco08p %in% c(1000:1999) ~ 1,
                  isco08p %in% c(2000:2999) ~ 2,
                  isco08p %in% c(3000:3999) ~ 3,
                  isco08p %in% c(4000:4999) ~ 4,
                  isco08p %in% c(5000:5999) ~ 5,
                  isco08p %in% c(6000:6999) ~ 6,
                  isco08p %in% c(7000:7999) ~ 7,
                  isco08p %in% c(8000:8999) ~ 8,
                  isco08p %in% c(9000:9999) ~ 9),
        iscoco = case_when(iscoco %in% c(0:999) ~ 0,
                           iscoco %in% c(1000:1999) ~ 1,
                           iscoco %in% c(2000:2999) ~ 2,
                           iscoco %in% c(3000:3999) ~ 3,
                           iscoco %in% c(4000:4999) ~ 4,
                           iscoco %in% c(5000:5999) ~ 5,
                           iscoco %in% c(6000:6999) ~ 6,
                           iscoco %in% c(7000:7999) ~ 7,
                           iscoco %in% c(8000:8999) ~ 8,
                           iscoco %in% c(9000:9999) ~ 9),
        iscocop = case_when(iscocop %in% c(0:999) ~ 0,
                           iscocop %in% c(1000:1999) ~ 1,
                           iscocop %in% c(2000:2999) ~ 2,
                           iscocop %in% c(3000:3999) ~ 3,
                           iscocop %in% c(4000:4999) ~ 4,
                           iscocop %in% c(5000:5999) ~ 5,
                           iscocop %in% c(6000:6999) ~ 6,
                           iscocop %in% c(7000:7999) ~ 7,
                           iscocop %in% c(8000:8999) ~ 8,
                           iscocop %in% c(9000:9999) ~ 9),
        agea = if_else(agea < 999, agea, NA),
        gndr = if_else(gndr %in% c(1:2), gndr, NA),
        health = if_else(health %in% c(1:5), health, NA),
        year = case_when(essround == 1 ~ 2002,
                         essround == 2 ~ 2004,
                         essround == 3 ~ 2006,
                         essround == 4 ~ 2008,
                         essround == 5 ~ 2010,
                         essround == 6 ~ 2012,
                         essround == 7 ~ 2014,
                         essround == 8 ~ 2016,
                         essround == 9 ~ 2018,
                         essround == 10 ~ 2020))

ess_sub <- ess_sub %>% mutate(isco = case_when(!is.na(isco08) ~ isco08,
                                               !is.na(iscoco) ~ iscoco),
                              iscop = case_when(!is.na(isco08p) ~ isco08p,
                                                !is.na(iscocop) ~ iscocop))

# select relevant variables
ess_sub <- na.omit(ess_sub %>% select(eisced, hinctnta, eiscedp, isco, 
          iscop, agea, gndr, health, year))

# mutate new variables to represent different homogamies
ess_sub <- ess_sub %>% mutate(edu_homo = if_else(eisced == eiscedp, 1, 0),
            occu_homo = if_else(isco == iscop, 1, 0))
ess_sub <- ess_sub %>% mutate(both_homo = if_else(edu_homo == 1 & occu_homo == 1,
            1, 0))

# skim
datasummary_skim(ess_sub %>% select(hinctnta, eisced, agea, gndr, health, edu_homo,
        occu_homo, both_homo))

	Unique (#)	Mean	SD	Min	Median	Max
hinctnta	10	7.2	2.4	1.0	8.0	10.0
eisced	7	4.6	1.9	1.0	5.0	7.0
agea	65	45.2	12.1	18.0	45.0	82.0
gndr	2	1.6	0.5	1.0	2.0	2.0
health	5	1.9	0.8	1.0	2.0	5.0
edu_homo	2	0.3	0.5	0.0	0.0	1.0
occu_homo	2	0.2	0.4	0.0	0.0	1.0
both_homo	2	0.1	0.3	0.0	0.0	1.0

# time trend: split dataset by years and find regression coefficients
# to identify the returns to education in each year
ess_2010 <- ess_sub %>% filter(year == 2010)
ess_2012 <- ess_sub %>% filter(year == 2012)
ess_2014 <- ess_sub %>% filter(year == 2014)
ess_2016 <- ess_sub %>% filter(year == 2016)
ess_2018 <- ess_sub %>% filter(year == 2018)
ess_2020 <- ess_sub %>% filter(year == 2020)
summary(lm(hinctnta ~ eisced + agea + gndr + health, data = ess_2010))

## 
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = ess_2010)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.2350 -1.2652  0.2708  1.5509  4.8340 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.487877   0.560722   9.787   <2e-16 ***
## eisced       0.470275   0.048036   9.790   <2e-16 ***
## agea         0.016031   0.007877   2.035   0.0423 *  
## gndr        -0.381871   0.179130  -2.132   0.0334 *  
## health      -0.223518   0.105032  -2.128   0.0337 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.137 on 583 degrees of freedom
## Multiple R-squared:  0.1564, Adjusted R-squared:  0.1506 
## F-statistic: 27.02 on 4 and 583 DF,  p-value: < 2.2e-16

summary(lm(hinctnta ~ eisced + agea + gndr + health, data = ess_2012))

## 
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = ess_2012)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.5570 -1.1784  0.4484  1.6815  4.2529 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.939876   0.679211   7.273 1.41e-12 ***
## eisced       0.467098   0.056604   8.252 1.46e-15 ***
## agea         0.017342   0.009131   1.899   0.0581 .  
## gndr         0.092345   0.210351   0.439   0.6609    
## health      -0.335668   0.129884  -2.584   0.0100 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.278 on 489 degrees of freedom
## Multiple R-squared:  0.1372, Adjusted R-squared:  0.1302 
## F-statistic: 19.45 on 4 and 489 DF,  p-value: 7.306e-15

summary(lm(hinctnta ~ eisced + agea + gndr + health, data = ess_2014))

## 
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = ess_2014)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.8590 -1.6010  0.4245  1.6909  4.8237 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.849881   0.634111   9.225  < 2e-16 ***
## eisced       0.437981   0.053978   8.114 3.23e-15 ***
## agea         0.014243   0.008334   1.709    0.088 .  
## gndr        -0.306184   0.196494  -1.558    0.120    
## health      -0.507060   0.117956  -4.299 2.03e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.262 on 548 degrees of freedom
## Multiple R-squared:  0.1612, Adjusted R-squared:  0.1551 
## F-statistic: 26.32 on 4 and 548 DF,  p-value: < 2.2e-16

summary(lm(hinctnta ~ eisced + agea + gndr + health, data = ess_2016))

## 
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = ess_2016)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.1831 -1.4865  0.2887  1.6547  4.7182 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.175766   0.626074   8.267  1.4e-15 ***
## eisced       0.388745   0.053475   7.270  1.5e-12 ***
## agea         0.020016   0.008347   2.398  0.01688 *  
## gndr        -0.065814   0.202177  -0.326  0.74493    
## health      -0.408678   0.129377  -3.159  0.00169 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.178 on 473 degrees of freedom
## Multiple R-squared:  0.1313, Adjusted R-squared:  0.124 
## F-statistic: 17.88 on 4 and 473 DF,  p-value: 1.114e-13

summary(lm(hinctnta ~ eisced + agea + gndr + health, data = ess_2018))

## 
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = ess_2018)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.1638 -1.2233  0.2239  1.6766  4.4294 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.273342   0.571796  10.971  < 2e-16 ***
## eisced       0.420126   0.049658   8.460  < 2e-16 ***
## agea         0.011605   0.007366   1.575  0.11566    
## gndr        -0.540403   0.179206  -3.016  0.00267 ** 
## health      -0.417197   0.111117  -3.755  0.00019 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.169 on 606 degrees of freedom
## Multiple R-squared:  0.1381, Adjusted R-squared:  0.1325 
## F-statistic: 24.28 on 4 and 606 DF,  p-value: < 2.2e-16

summary(lm(hinctnta ~ eisced + agea + gndr + health, data = ess_2020))

## 
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = ess_2020)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.0464 -1.5174  0.2731  1.7145  3.9163 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.15039    0.87637   8.159 1.49e-14 ***
## eisced       0.38111    0.07539   5.055 8.16e-07 ***
## agea         0.02191    0.01098   1.996 0.047009 *  
## gndr        -1.09588    0.26001  -4.215 3.46e-05 ***
## health      -0.50474    0.14407  -3.503 0.000541 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.064 on 258 degrees of freedom
## Multiple R-squared:  0.1911, Adjusted R-squared:  0.1785 
## F-statistic: 15.23 on 4 and 258 DF,  p-value: 3.388e-11

# record the coefficients and corresponding year to make a graph
returns <- c(0.470275, 0.467098, 0.437981, 0.388745, 0.420126, 0.38111)
years <- c(2010, 2012, 2014, 2016, 2018, 2020)

# make the plot and save it as image
# r runs into an error with times new roman, I don't know why. It works
# for the plot saved with ggsave though
returns_time <- ggplot(data = NULL, aes(x = years, y = returns)) +
  geom_line() + geom_point() + labs(x = "Year", y = "Returns to Education") +
  scale_x_continuous(breaks = c(2010, 2012, 2014, 2016, 2018, 2020)) +
  theme(text = element_text(family = "Times New Roman")) +
  geom_text(aes(label = returns), vjust = -1)
ggsave("outcome temporal graph.png", plot = returns_time, width = 9, height = 6)
print(returns_time)

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)):
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)):
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

# comparing the trend with the ESS baseline
# clean the ESS baseline dataset for variables used in regression
ess <- ess %>% mutate(hinctnta = if_else(hinctnta %in% c(1:10), hinctnta, NA),
                      eisced = if_else(eisced %in% c(1:7), eisced, NA),
                      eiscedp = if_else(eiscedp %in% c(1:7), eiscedp, NA),
                      agea = if_else(agea < 999, agea, NA),
                      gndr = if_else(gndr %in% c(1:2), gndr, NA),
                      health = if_else(health %in% c(1:5), health, NA))

# split by years using ess rounds
ess_base2010 <- na.omit(ess %>% filter(essround == 5) %>% 
                          select(hinctnta, eisced, agea, gndr, health))
ess_base2012 <- na.omit(ess %>% filter(essround == 6) %>% 
                          select(hinctnta, eisced, agea, gndr, health))
ess_base2014 <- na.omit(ess %>% filter(essround == 7) %>% 
                          select(hinctnta, eisced, agea, gndr, health))
ess_base2016 <- na.omit(ess %>% filter(essround == 8) %>% 
                          select(hinctnta, eisced, agea, gndr, health))
ess_base2018 <- na.omit(ess %>% filter(essround == 9) %>% 
                          select(hinctnta, eisced, agea, gndr, health))
ess_base2020 <- na.omit(ess %>% filter(essround == 10) %>% 
                          select(hinctnta, eisced, agea, gndr, health))
summary(lm(hinctnta ~ eisced + agea + gndr + health, data = ess_base2010))

## 
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = ess_base2010)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9294 -1.8985 -0.0996  1.9144  7.8820 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.7143383  0.0648467   88.12   <2e-16 ***
## eisced       0.4959419  0.0070479   70.37   <2e-16 ***
## agea        -0.0163674  0.0007709  -21.23   <2e-16 ***
## gndr        -0.4996855  0.0252913  -19.76   <2e-16 ***
## health      -0.4295232  0.0144874  -29.65   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.498 on 39549 degrees of freedom
## Multiple R-squared:  0.1975, Adjusted R-squared:  0.1974 
## F-statistic:  2433 on 4 and 39549 DF,  p-value: < 2.2e-16

summary(lm(hinctnta ~ eisced + agea + gndr + health, data = ess_base2012))

## 
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = ess_base2012)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.8566 -1.9447 -0.1326  1.9579  7.8261 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.223105   0.062358   83.76   <2e-16 ***
## eisced       0.537671   0.006709   80.14   <2e-16 ***
## agea        -0.012328   0.000738  -16.70   <2e-16 ***
## gndr        -0.476018   0.024317  -19.58   <2e-16 ***
## health      -0.407658   0.014254  -28.60   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.524 on 43640 degrees of freedom
## Multiple R-squared:  0.1959, Adjusted R-squared:  0.1958 
## F-statistic:  2658 on 4 and 43640 DF,  p-value: < 2.2e-16

summary(lm(hinctnta ~ eisced + agea + gndr + health, data = ess_base2014))

## 
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = ess_base2014)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9167 -1.9320 -0.0266  1.9771  6.9672 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.3681139  0.0742571   72.29   <2e-16 ***
## eisced       0.5217912  0.0078581   66.40   <2e-16 ***
## agea        -0.0101834  0.0008416  -12.10   <2e-16 ***
## gndr        -0.4954921  0.0282713  -17.53   <2e-16 ***
## health      -0.3742457  0.0169125  -22.13   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.507 on 31658 degrees of freedom
## Multiple R-squared:  0.1848, Adjusted R-squared:  0.1847 
## F-statistic:  1795 on 4 and 31658 DF,  p-value: < 2.2e-16

summary(lm(hinctnta ~ eisced + agea + gndr + health, data = ess_base2016))

## 
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = ess_base2016)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.8387 -1.8964 -0.0323  1.9054  7.7979 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.6092666  0.0672528   83.41   <2e-16 ***
## eisced       0.4879677  0.0071739   68.02   <2e-16 ***
## agea        -0.0148867  0.0007742  -19.23   <2e-16 ***
## gndr        -0.4554674  0.0259549  -17.55   <2e-16 ***
## health      -0.4330932  0.0153287  -28.25   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.462 on 36261 degrees of freedom
## Multiple R-squared:  0.1887, Adjusted R-squared:  0.1886 
## F-statistic:  2108 on 4 and 36261 DF,  p-value: < 2.2e-16

summary(lm(hinctnta ~ eisced + agea + gndr + health, data = ess_base2018))

## 
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = ess_base2018)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.1223 -1.8986 -0.0627  1.9147  7.4769 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.2568298  0.0666362   93.89   <2e-16 ***
## eisced       0.4902947  0.0071003   69.05   <2e-16 ***
## agea        -0.0207870  0.0007482  -27.78   <2e-16 ***
## gndr        -0.5796486  0.0249344  -23.25   <2e-16 ***
## health      -0.4672702  0.0148255  -31.52   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.471 on 39590 degrees of freedom
## Multiple R-squared:  0.2116, Adjusted R-squared:  0.2115 
## F-statistic:  2657 on 4 and 39590 DF,  p-value: < 2.2e-16

summary(lm(hinctnta ~ eisced + agea + gndr + health, data = ess_base2020))

## 
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = ess_base2020)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.0743 -1.8877  0.0199  1.9495  7.6590 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.0999608  0.0632803   96.40   <2e-16 ***
## eisced       0.4795954  0.0066156   72.50   <2e-16 ***
## agea        -0.0178978  0.0006912  -25.89   <2e-16 ***
## gndr        -0.5637844  0.0230659  -24.44   <2e-16 ***
## health      -0.3895181  0.0139879  -27.85   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.461 on 45748 degrees of freedom
## Multiple R-squared:  0.1844, Adjusted R-squared:  0.1844 
## F-statistic:  2586 on 4 and 45748 DF,  p-value: < 2.2e-16

# record regression coefficients that represent returns to education
base_returns <- c(0.495942, 0.537671, 0.521791, 0.487968, 0.490295, 0.479595)

# make a plot with both the baseline and GB trends
plot2 <- ggplot() + geom_line(data = NULL, aes(x = years, y = returns), color = 'red') +
  geom_line(data = NULL, aes(x = years, y = base_returns), color = 'blue') +
  geom_point(data = NULL, aes(x = years, y = returns), color = 'red') +
  geom_point(data = NULL, aes(x = years, y = base_returns), color = 'blue') +
  geom_text(data = NULL, aes(x = years, y = returns, label = returns), 
            vjust = -1, color = 'red') +
  geom_text(data = NULL, aes(x = years, y = base_returns, label = base_returns), 
            vjust = 2, color = 'blue') + 
  labs(x = "Year", y = "Returns to Education") +
  scale_x_continuous(breaks = c(2010, 2012, 2014, 2016, 2018, 2020)) +
  theme(text = element_text(family = "Times New Roman")) +
  annotate("text", x = 2010.5, y = 0.53, label = "ESS Baseline", color = 'blue') +
  annotate("text", x = 2010.5, y = 0.46, label = "GB Trend", color = 'red')

ggsave("ess baseline graph.png", plot = plot2, width = 9, height = 6)
print(plot2)

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

# column percentages
edu_percentage <- cprop(table(ess_sub %>% select(eisced, hinctnta)))
gender_percentage <- cprop(table(ess_sub %>% select(gndr, hinctnta)))
health_percentage <- cprop(table(ess_sub %>% select(health, hinctnta)))


# mean and sd
edu_stats <- ess_sub %>%
  group_by(eisced) %>%
  summarize(mean_income = mean(hinctnta), std_income = sd(hinctnta))
gender_stats <- ess_sub %>%
  group_by(gndr) %>%
  summarize(mean_income = mean(hinctnta), std_income = sd(hinctnta))
health_stats <- ess_sub %>%
  group_by(health) %>%
  summarize(mean_income = mean(hinctnta), std_income = sd(hinctnta))

Visuals for column Percentages, Mean, and SD

Edu Level	1	2	3	4	5	6	7	8	9	10	All
1	22.2	24.8	19.8	16.1	11.5	8.4	5.4	3.2	2.6	2.7	7.1
2	20.0	13.9	21.4	14.0	17.0	15.7	14.9	12.2	8.8	6.7	12.4
3	11.1	8.9	8.7	10.8	12.6	15.4	9.7	9.4	7.1	3.2	8.9
4	13.3	19.8	19.0	18.8	17.8	16.3	15.6	17.8	11.8	10.4	15.1
5	11.1	18.8	15.1	17.7	18.2	20.8	22.6	20.4	19.9	16.1	19.0
6	15.6	6.9	10.3	9.7	12.6	13.9	17.4	20.4	26.5	29.6	19.8
7	6.7	6.9	5.6	12.9	10.3	9.6	14.4	16.8	23.5	31.3	17.8
Total	100	100	100	100	100	100	100	100	100	100	100

Gender	1	2	3	4	5	6	7	8	9	10	All
1	40.0	35.6	34.1	34.9	44.3	41.9	44.4	44.9	47.2	46.8	43.7
2	60.0	64.4	65.9	65.1	55.7	58.1	55.6	55.1	52.8	53.2	56.3
Total	100.0	100.0	100.0	100.0	100.0	100.0	100.0	100.0	100.0	100.0	100.0

Health	1	2	3	4	5	6	7	8	9	10	All
1	22.2	20.8	30.2	34.4	33.2	28.6	32.8	38.7	41.5	43.8	36.3
2	35.6	50.5	37.3	43.0	47.0	49.4	44.9	42.9	41.7	45.3	44.4
3	28.9	21.8	21.4	16.1	14.6	18.1	18.7	16.2	14.3	8.9	15.5
4	8.9	5.0	7.1	4.8	4.3	3.0	3.1	2.0	1.9	2.1	3.0
5	4.4	2.0	4.0	1.6	0.8	0.9	0.5	0.2	0.6	0.0	0.8
Total	100.0	100.0	100.0	100.0	100.0	100.0	100.0	100.0	100.0	100.0	100.0

eisced	mean_income	std_income
1	5.24	2.53
2	6.49	2.35
3	6.53	2.16
4	6.78	2.35
5	7.19	2.23
6	7.98	2.09
7	8.16	2.04

gndr	mean_income	std_income
1	7.36	2.28
2	7.03	2.42

health	mean_income	std_income
1	7.53	2.25
2	7.17	2.35
3	6.68	2.42
4	6.12	2.65
5	4.78	2.52

Initial Analysis

# initial analysis of the educational returns of people engaged in different marriages
# create subsets based on type of marriage
edu_homog_data <- ess_sub %>% filter(edu_homo == 1)
occu_homog_data <- ess_sub %>% filter(occu_homo == 1)
both_homog_data <- ess_sub %>% filter(both_homo == 1)
non_homog_data <- ess_sub %>% filter(edu_homo == 0 & occu_homo == 0)

# get mean level of income for each education level separately for different types of marriages
edu_homog_mean <- edu_homog_data %>%
  group_by(eisced) %>%
  summarize(mean_income = mean(hinctnta))
occu_homog_mean <- occu_homog_data %>%
  group_by(eisced) %>%
  summarize(mean_income = mean(hinctnta))
both_homog_mean <- both_homog_data %>%
  group_by(eisced) %>%
  summarize(mean_income = mean(hinctnta))
non_homog_mean <- non_homog_data %>%
  group_by(eisced) %>%
  summarize(mean_income = mean(hinctnta))

edu_returns_plot <- ggplot() + geom_line(data = edu_homog_mean, aes(x = eisced,
    y = mean_income), color = 'red') + geom_point(data = edu_homog_mean, aes(x = eisced,
    y = mean_income), color = 'red') + geom_line(data = occu_homog_mean, aes(x = eisced,
    y = mean_income), color = 'blue') + geom_point(data = occu_homog_mean, aes(x = eisced,
    y = mean_income), color = 'blue') + geom_line(data = both_homog_mean, aes(x = eisced,
    y = mean_income), color = 'purple') + geom_point(data = both_homog_mean, aes(x = eisced,
    y = mean_income), color = 'purple') + geom_line(data = non_homog_mean, aes(x = eisced,
    y = mean_income), color = 'black') + geom_point(data = non_homog_mean, aes(x = eisced,
    y = mean_income), color = 'black') + labs(x = "Highest Level of Education ES - ISCED",
    y = "Total Household Income Decile") + scale_x_continuous(breaks = c(0:8)) +
    theme(text = element_text(family = "Times New Roman")) +
    annotate("text", x = 6, y = 6, label = "Educational Homogamy", color = 'red') +
      annotate("text", x = 6, y = 5.8, label = "Occupational Homogamy", color = 'blue') +
      annotate("text", x = 6, y = 5.6, label = "Both", color = 'purple') +
      annotate("text", x = 6, y = 5.4, label = "None", color = 'black')

ggsave("homogamy compare graph.png", plot = edu_returns_plot, width = 9, height = 6)
print(edu_returns_plot)

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

# regressions to get returns to education of different marriages
# order: educational, occupational, both, none
# returns to education for each group = regression coefficient for eisced
summary(lm(hinctnta ~ eisced + agea + gndr + health, data = edu_homog_data))

## 
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = edu_homog_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.7501 -1.3105  0.4156  1.5521  4.5410 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.884271   0.436940  11.178  < 2e-16 ***
## eisced       0.536603   0.033871  15.843  < 2e-16 ***
## agea         0.017328   0.005764   3.006 0.002712 ** 
## gndr        -0.262084   0.135563  -1.933 0.053482 .  
## health      -0.286760   0.084890  -3.378 0.000758 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.119 on 997 degrees of freedom
## Multiple R-squared:  0.2236, Adjusted R-squared:  0.2205 
## F-statistic:  71.8 on 4 and 997 DF,  p-value: < 2.2e-16

summary(lm(hinctnta ~ eisced + agea + gndr + health, data = occu_homog_data))

## 
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = occu_homog_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.1461 -1.1925  0.4604  1.5773  4.2467 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.164341   0.533153   9.686  < 2e-16 ***
## eisced       0.514623   0.045673  11.268  < 2e-16 ***
## agea         0.022569   0.007004   3.222 0.001334 ** 
## gndr        -0.315692   0.163776  -1.928 0.054333 .  
## health      -0.403078   0.105505  -3.820 0.000146 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.101 on 664 degrees of freedom
## Multiple R-squared:    0.2,  Adjusted R-squared:  0.1952 
## F-statistic: 41.51 on 4 and 664 DF,  p-value: < 2.2e-16

summary(lm(hinctnta ~ eisced + agea + gndr + health, data = both_homog_data))

## 
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = both_homog_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.8235 -1.0129  0.4206  1.3502  3.5459 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.338395   0.736819   5.888  1.1e-08 ***
## eisced       0.603594   0.063279   9.539  < 2e-16 ***
## agea         0.028903   0.009397   3.076  0.00230 ** 
## gndr        -0.165678   0.222174  -0.746  0.45646    
## health      -0.470413   0.147149  -3.197  0.00155 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.864 on 284 degrees of freedom
## Multiple R-squared:  0.2953, Adjusted R-squared:  0.2853 
## F-statistic: 29.75 on 4 and 284 DF,  p-value: < 2.2e-16

summary(lm(hinctnta ~ eisced + agea + gndr + health, data = non_homog_data))

## 
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = non_homog_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -7.394 -1.416  0.380  1.687  4.991 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.378090   0.353657  18.035  < 2e-16 ***
## eisced       0.320065   0.032218   9.934  < 2e-16 ***
## agea         0.014495   0.004721   3.070  0.00217 ** 
## gndr        -0.342507   0.113507  -3.017  0.00259 ** 
## health      -0.446677   0.066188  -6.749 2.08e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.217 on 1600 degrees of freedom
## Multiple R-squared:  0.09289,    Adjusted R-squared:  0.09062 
## F-statistic: 40.96 on 4 and 1600 DF,  p-value: < 2.2e-16

Stepping Stone 2

Kelvin Li, Yisu Hou

2023-10-30

The Code

Visuals for column Percentages, Mean, and SD

Initial Analysis