The Code
# read the ESS File
ess <- read.fst("All-ESS-Data.fst")
# filter for gb
ess_gb <- ess %>% filter(cntry == "GB")
# subset with variables of interest and data cleaning
# split occupations by their general categories
ess_sub <- ess_gb %>% select(eisced, hinctnta, eiscedp, isco08, isco08p, agea,
gndr, health, iscoco, iscocop, essround)
ess_sub <- ess_sub %>% mutate(eisced = if_else(eisced %in% c(1:7), eisced, NA),
hinctnta = if_else(hinctnta %in% c(1:10), hinctnta, NA),
eiscedp = if_else(eiscedp %in% c(1:7), eiscedp, NA),
isco08 = case_when(isco08 %in% c(0:999) ~ 0,
isco08 %in% c(1000:1999) ~ 1,
isco08 %in% c(2000:2999) ~ 2,
isco08 %in% c(3000:3999) ~ 3,
isco08 %in% c(4000:4999) ~ 4,
isco08 %in% c(5000:5999) ~ 5,
isco08 %in% c(6000:6999) ~ 6,
isco08 %in% c(7000:7999) ~ 7,
isco08 %in% c(8000:8999) ~ 8,
isco08 %in% c(9000:9999) ~ 9),
isco08p = case_when(isco08p %in% c(0:999) ~ 0,
isco08p %in% c(1000:1999) ~ 1,
isco08p %in% c(2000:2999) ~ 2,
isco08p %in% c(3000:3999) ~ 3,
isco08p %in% c(4000:4999) ~ 4,
isco08p %in% c(5000:5999) ~ 5,
isco08p %in% c(6000:6999) ~ 6,
isco08p %in% c(7000:7999) ~ 7,
isco08p %in% c(8000:8999) ~ 8,
isco08p %in% c(9000:9999) ~ 9),
iscoco = case_when(iscoco %in% c(0:999) ~ 0,
iscoco %in% c(1000:1999) ~ 1,
iscoco %in% c(2000:2999) ~ 2,
iscoco %in% c(3000:3999) ~ 3,
iscoco %in% c(4000:4999) ~ 4,
iscoco %in% c(5000:5999) ~ 5,
iscoco %in% c(6000:6999) ~ 6,
iscoco %in% c(7000:7999) ~ 7,
iscoco %in% c(8000:8999) ~ 8,
iscoco %in% c(9000:9999) ~ 9),
iscocop = case_when(iscocop %in% c(0:999) ~ 0,
iscocop %in% c(1000:1999) ~ 1,
iscocop %in% c(2000:2999) ~ 2,
iscocop %in% c(3000:3999) ~ 3,
iscocop %in% c(4000:4999) ~ 4,
iscocop %in% c(5000:5999) ~ 5,
iscocop %in% c(6000:6999) ~ 6,
iscocop %in% c(7000:7999) ~ 7,
iscocop %in% c(8000:8999) ~ 8,
iscocop %in% c(9000:9999) ~ 9),
agea = if_else(agea < 999, agea, NA),
gndr = if_else(gndr %in% c(1:2), gndr, NA),
health = if_else(health %in% c(1:5), health, NA),
year = case_when(essround == 1 ~ 2002,
essround == 2 ~ 2004,
essround == 3 ~ 2006,
essround == 4 ~ 2008,
essround == 5 ~ 2010,
essround == 6 ~ 2012,
essround == 7 ~ 2014,
essround == 8 ~ 2016,
essround == 9 ~ 2018,
essround == 10 ~ 2020))
ess_sub <- ess_sub %>% mutate(isco = case_when(!is.na(isco08) ~ isco08,
!is.na(iscoco) ~ iscoco),
iscop = case_when(!is.na(isco08p) ~ isco08p,
!is.na(iscocop) ~ iscocop))
# select relevant variables
ess_sub <- na.omit(ess_sub %>% select(eisced, hinctnta, eiscedp, isco,
iscop, agea, gndr, health, year))
# mutate new variables to represent different homogamies
ess_sub <- ess_sub %>% mutate(edu_homo = if_else(eisced == eiscedp, 1, 0),
occu_homo = if_else(isco == iscop, 1, 0))
ess_sub <- ess_sub %>% mutate(both_homo = if_else(edu_homo == 1 & occu_homo == 1,
1, 0))
# skim
datasummary_skim(ess_sub %>% select(hinctnta, eisced, agea, gndr, health, edu_homo,
occu_homo, both_homo))
Unique (#)
Missing (%)
Mean
SD
Min
Median
Max
hinctnta
10
0
7.2
2.4
1.0
8.0
10.0
eisced
7
0
4.6
1.9
1.0
5.0
7.0
agea
65
0
45.2
12.1
18.0
45.0
82.0
gndr
2
0
1.6
0.5
1.0
2.0
2.0
health
5
0
1.9
0.8
1.0
2.0
5.0
edu_homo
2
0
0.3
0.5
0.0
0.0
1.0
occu_homo
2
0
0.2
0.4
0.0
0.0
1.0
both_homo
2
0
0.1
0.3
0.0
0.0
1.0
# time trend: split dataset by years and find regression coefficients
# to identify the returns to education in each year
ess_2010 <- ess_sub %>% filter(year == 2010)
ess_2012 <- ess_sub %>% filter(year == 2012)
ess_2014 <- ess_sub %>% filter(year == 2014)
ess_2016 <- ess_sub %>% filter(year == 2016)
ess_2018 <- ess_sub %>% filter(year == 2018)
ess_2020 <- ess_sub %>% filter(year == 2020)
summary(lm(hinctnta ~ eisced + agea + gndr + health, data = ess_2010))
##
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = ess_2010)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.2350 -1.2652 0.2708 1.5509 4.8340
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.487877 0.560722 9.787 <2e-16 ***
## eisced 0.470275 0.048036 9.790 <2e-16 ***
## agea 0.016031 0.007877 2.035 0.0423 *
## gndr -0.381871 0.179130 -2.132 0.0334 *
## health -0.223518 0.105032 -2.128 0.0337 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.137 on 583 degrees of freedom
## Multiple R-squared: 0.1564, Adjusted R-squared: 0.1506
## F-statistic: 27.02 on 4 and 583 DF, p-value: < 2.2e-16
summary(lm(hinctnta ~ eisced + agea + gndr + health, data = ess_2012))
##
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = ess_2012)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.5570 -1.1784 0.4484 1.6815 4.2529
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.939876 0.679211 7.273 1.41e-12 ***
## eisced 0.467098 0.056604 8.252 1.46e-15 ***
## agea 0.017342 0.009131 1.899 0.0581 .
## gndr 0.092345 0.210351 0.439 0.6609
## health -0.335668 0.129884 -2.584 0.0100 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.278 on 489 degrees of freedom
## Multiple R-squared: 0.1372, Adjusted R-squared: 0.1302
## F-statistic: 19.45 on 4 and 489 DF, p-value: 7.306e-15
summary(lm(hinctnta ~ eisced + agea + gndr + health, data = ess_2014))
##
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = ess_2014)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.8590 -1.6010 0.4245 1.6909 4.8237
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.849881 0.634111 9.225 < 2e-16 ***
## eisced 0.437981 0.053978 8.114 3.23e-15 ***
## agea 0.014243 0.008334 1.709 0.088 .
## gndr -0.306184 0.196494 -1.558 0.120
## health -0.507060 0.117956 -4.299 2.03e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.262 on 548 degrees of freedom
## Multiple R-squared: 0.1612, Adjusted R-squared: 0.1551
## F-statistic: 26.32 on 4 and 548 DF, p-value: < 2.2e-16
summary(lm(hinctnta ~ eisced + agea + gndr + health, data = ess_2016))
##
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = ess_2016)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.1831 -1.4865 0.2887 1.6547 4.7182
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.175766 0.626074 8.267 1.4e-15 ***
## eisced 0.388745 0.053475 7.270 1.5e-12 ***
## agea 0.020016 0.008347 2.398 0.01688 *
## gndr -0.065814 0.202177 -0.326 0.74493
## health -0.408678 0.129377 -3.159 0.00169 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.178 on 473 degrees of freedom
## Multiple R-squared: 0.1313, Adjusted R-squared: 0.124
## F-statistic: 17.88 on 4 and 473 DF, p-value: 1.114e-13
summary(lm(hinctnta ~ eisced + agea + gndr + health, data = ess_2018))
##
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = ess_2018)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.1638 -1.2233 0.2239 1.6766 4.4294
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.273342 0.571796 10.971 < 2e-16 ***
## eisced 0.420126 0.049658 8.460 < 2e-16 ***
## agea 0.011605 0.007366 1.575 0.11566
## gndr -0.540403 0.179206 -3.016 0.00267 **
## health -0.417197 0.111117 -3.755 0.00019 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.169 on 606 degrees of freedom
## Multiple R-squared: 0.1381, Adjusted R-squared: 0.1325
## F-statistic: 24.28 on 4 and 606 DF, p-value: < 2.2e-16
summary(lm(hinctnta ~ eisced + agea + gndr + health, data = ess_2020))
##
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = ess_2020)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.0464 -1.5174 0.2731 1.7145 3.9163
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.15039 0.87637 8.159 1.49e-14 ***
## eisced 0.38111 0.07539 5.055 8.16e-07 ***
## agea 0.02191 0.01098 1.996 0.047009 *
## gndr -1.09588 0.26001 -4.215 3.46e-05 ***
## health -0.50474 0.14407 -3.503 0.000541 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.064 on 258 degrees of freedom
## Multiple R-squared: 0.1911, Adjusted R-squared: 0.1785
## F-statistic: 15.23 on 4 and 258 DF, p-value: 3.388e-11
# record the coefficients and corresponding year to make a graph
returns <- c(0.470275, 0.467098, 0.437981, 0.388745, 0.420126, 0.38111)
years <- c(2010, 2012, 2014, 2016, 2018, 2020)
# make the plot and save it as image
# r runs into an error with times new roman, I don't know why. It works
# for the plot saved with ggsave though
returns_time <- ggplot(data = NULL, aes(x = years, y = returns)) +
geom_line() + geom_point() + labs(x = "Year", y = "Returns to Education") +
scale_x_continuous(breaks = c(2010, 2012, 2014, 2016, 2018, 2020)) +
theme(text = element_text(family = "Times New Roman")) +
geom_text(aes(label = returns), vjust = -1)
ggsave("outcome temporal graph.png", plot = returns_time, width = 9, height = 6)
print(returns_time)
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)):
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)):
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
# comparing the trend with the ESS baseline
# clean the ESS baseline dataset for variables used in regression
ess <- ess %>% mutate(hinctnta = if_else(hinctnta %in% c(1:10), hinctnta, NA),
eisced = if_else(eisced %in% c(1:7), eisced, NA),
eiscedp = if_else(eiscedp %in% c(1:7), eiscedp, NA),
agea = if_else(agea < 999, agea, NA),
gndr = if_else(gndr %in% c(1:2), gndr, NA),
health = if_else(health %in% c(1:5), health, NA))
# split by years using ess rounds
ess_base2010 <- na.omit(ess %>% filter(essround == 5) %>%
select(hinctnta, eisced, agea, gndr, health))
ess_base2012 <- na.omit(ess %>% filter(essround == 6) %>%
select(hinctnta, eisced, agea, gndr, health))
ess_base2014 <- na.omit(ess %>% filter(essround == 7) %>%
select(hinctnta, eisced, agea, gndr, health))
ess_base2016 <- na.omit(ess %>% filter(essround == 8) %>%
select(hinctnta, eisced, agea, gndr, health))
ess_base2018 <- na.omit(ess %>% filter(essround == 9) %>%
select(hinctnta, eisced, agea, gndr, health))
ess_base2020 <- na.omit(ess %>% filter(essround == 10) %>%
select(hinctnta, eisced, agea, gndr, health))
summary(lm(hinctnta ~ eisced + agea + gndr + health, data = ess_base2010))
##
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = ess_base2010)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9294 -1.8985 -0.0996 1.9144 7.8820
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.7143383 0.0648467 88.12 <2e-16 ***
## eisced 0.4959419 0.0070479 70.37 <2e-16 ***
## agea -0.0163674 0.0007709 -21.23 <2e-16 ***
## gndr -0.4996855 0.0252913 -19.76 <2e-16 ***
## health -0.4295232 0.0144874 -29.65 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.498 on 39549 degrees of freedom
## Multiple R-squared: 0.1975, Adjusted R-squared: 0.1974
## F-statistic: 2433 on 4 and 39549 DF, p-value: < 2.2e-16
summary(lm(hinctnta ~ eisced + agea + gndr + health, data = ess_base2012))
##
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = ess_base2012)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.8566 -1.9447 -0.1326 1.9579 7.8261
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.223105 0.062358 83.76 <2e-16 ***
## eisced 0.537671 0.006709 80.14 <2e-16 ***
## agea -0.012328 0.000738 -16.70 <2e-16 ***
## gndr -0.476018 0.024317 -19.58 <2e-16 ***
## health -0.407658 0.014254 -28.60 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.524 on 43640 degrees of freedom
## Multiple R-squared: 0.1959, Adjusted R-squared: 0.1958
## F-statistic: 2658 on 4 and 43640 DF, p-value: < 2.2e-16
summary(lm(hinctnta ~ eisced + agea + gndr + health, data = ess_base2014))
##
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = ess_base2014)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9167 -1.9320 -0.0266 1.9771 6.9672
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.3681139 0.0742571 72.29 <2e-16 ***
## eisced 0.5217912 0.0078581 66.40 <2e-16 ***
## agea -0.0101834 0.0008416 -12.10 <2e-16 ***
## gndr -0.4954921 0.0282713 -17.53 <2e-16 ***
## health -0.3742457 0.0169125 -22.13 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.507 on 31658 degrees of freedom
## Multiple R-squared: 0.1848, Adjusted R-squared: 0.1847
## F-statistic: 1795 on 4 and 31658 DF, p-value: < 2.2e-16
summary(lm(hinctnta ~ eisced + agea + gndr + health, data = ess_base2016))
##
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = ess_base2016)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.8387 -1.8964 -0.0323 1.9054 7.7979
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.6092666 0.0672528 83.41 <2e-16 ***
## eisced 0.4879677 0.0071739 68.02 <2e-16 ***
## agea -0.0148867 0.0007742 -19.23 <2e-16 ***
## gndr -0.4554674 0.0259549 -17.55 <2e-16 ***
## health -0.4330932 0.0153287 -28.25 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.462 on 36261 degrees of freedom
## Multiple R-squared: 0.1887, Adjusted R-squared: 0.1886
## F-statistic: 2108 on 4 and 36261 DF, p-value: < 2.2e-16
summary(lm(hinctnta ~ eisced + agea + gndr + health, data = ess_base2018))
##
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = ess_base2018)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.1223 -1.8986 -0.0627 1.9147 7.4769
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.2568298 0.0666362 93.89 <2e-16 ***
## eisced 0.4902947 0.0071003 69.05 <2e-16 ***
## agea -0.0207870 0.0007482 -27.78 <2e-16 ***
## gndr -0.5796486 0.0249344 -23.25 <2e-16 ***
## health -0.4672702 0.0148255 -31.52 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.471 on 39590 degrees of freedom
## Multiple R-squared: 0.2116, Adjusted R-squared: 0.2115
## F-statistic: 2657 on 4 and 39590 DF, p-value: < 2.2e-16
summary(lm(hinctnta ~ eisced + agea + gndr + health, data = ess_base2020))
##
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = ess_base2020)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.0743 -1.8877 0.0199 1.9495 7.6590
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.0999608 0.0632803 96.40 <2e-16 ***
## eisced 0.4795954 0.0066156 72.50 <2e-16 ***
## agea -0.0178978 0.0006912 -25.89 <2e-16 ***
## gndr -0.5637844 0.0230659 -24.44 <2e-16 ***
## health -0.3895181 0.0139879 -27.85 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.461 on 45748 degrees of freedom
## Multiple R-squared: 0.1844, Adjusted R-squared: 0.1844
## F-statistic: 2586 on 4 and 45748 DF, p-value: < 2.2e-16
# record regression coefficients that represent returns to education
base_returns <- c(0.495942, 0.537671, 0.521791, 0.487968, 0.490295, 0.479595)
# make a plot with both the baseline and GB trends
plot2 <- ggplot() + geom_line(data = NULL, aes(x = years, y = returns), color = 'red') +
geom_line(data = NULL, aes(x = years, y = base_returns), color = 'blue') +
geom_point(data = NULL, aes(x = years, y = returns), color = 'red') +
geom_point(data = NULL, aes(x = years, y = base_returns), color = 'blue') +
geom_text(data = NULL, aes(x = years, y = returns, label = returns),
vjust = -1, color = 'red') +
geom_text(data = NULL, aes(x = years, y = base_returns, label = base_returns),
vjust = 2, color = 'blue') +
labs(x = "Year", y = "Returns to Education") +
scale_x_continuous(breaks = c(2010, 2012, 2014, 2016, 2018, 2020)) +
theme(text = element_text(family = "Times New Roman")) +
annotate("text", x = 2010.5, y = 0.53, label = "ESS Baseline", color = 'blue') +
annotate("text", x = 2010.5, y = 0.46, label = "GB Trend", color = 'red')
ggsave("ess baseline graph.png", plot = plot2, width = 9, height = 6)
print(plot2)
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
# column percentages
edu_percentage <- cprop(table(ess_sub %>% select(eisced, hinctnta)))
gender_percentage <- cprop(table(ess_sub %>% select(gndr, hinctnta)))
health_percentage <- cprop(table(ess_sub %>% select(health, hinctnta)))
# mean and sd
edu_stats <- ess_sub %>%
group_by(eisced) %>%
summarize(mean_income = mean(hinctnta), std_income = sd(hinctnta))
gender_stats <- ess_sub %>%
group_by(gndr) %>%
summarize(mean_income = mean(hinctnta), std_income = sd(hinctnta))
health_stats <- ess_sub %>%
group_by(health) %>%
summarize(mean_income = mean(hinctnta), std_income = sd(hinctnta))
Initial Analysis
# initial analysis of the educational returns of people engaged in different marriages
# create subsets based on type of marriage
edu_homog_data <- ess_sub %>% filter(edu_homo == 1)
occu_homog_data <- ess_sub %>% filter(occu_homo == 1)
both_homog_data <- ess_sub %>% filter(both_homo == 1)
non_homog_data <- ess_sub %>% filter(edu_homo == 0 & occu_homo == 0)
# get mean level of income for each education level separately for different types of marriages
edu_homog_mean <- edu_homog_data %>%
group_by(eisced) %>%
summarize(mean_income = mean(hinctnta))
occu_homog_mean <- occu_homog_data %>%
group_by(eisced) %>%
summarize(mean_income = mean(hinctnta))
both_homog_mean <- both_homog_data %>%
group_by(eisced) %>%
summarize(mean_income = mean(hinctnta))
non_homog_mean <- non_homog_data %>%
group_by(eisced) %>%
summarize(mean_income = mean(hinctnta))
edu_returns_plot <- ggplot() + geom_line(data = edu_homog_mean, aes(x = eisced,
y = mean_income), color = 'red') + geom_point(data = edu_homog_mean, aes(x = eisced,
y = mean_income), color = 'red') + geom_line(data = occu_homog_mean, aes(x = eisced,
y = mean_income), color = 'blue') + geom_point(data = occu_homog_mean, aes(x = eisced,
y = mean_income), color = 'blue') + geom_line(data = both_homog_mean, aes(x = eisced,
y = mean_income), color = 'purple') + geom_point(data = both_homog_mean, aes(x = eisced,
y = mean_income), color = 'purple') + geom_line(data = non_homog_mean, aes(x = eisced,
y = mean_income), color = 'black') + geom_point(data = non_homog_mean, aes(x = eisced,
y = mean_income), color = 'black') + labs(x = "Highest Level of Education ES - ISCED",
y = "Total Household Income Decile") + scale_x_continuous(breaks = c(0:8)) +
theme(text = element_text(family = "Times New Roman")) +
annotate("text", x = 6, y = 6, label = "Educational Homogamy", color = 'red') +
annotate("text", x = 6, y = 5.8, label = "Occupational Homogamy", color = 'blue') +
annotate("text", x = 6, y = 5.6, label = "Both", color = 'purple') +
annotate("text", x = 6, y = 5.4, label = "None", color = 'black')
ggsave("homogamy compare graph.png", plot = edu_returns_plot, width = 9, height = 6)
print(edu_returns_plot)
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列
# regressions to get returns to education of different marriages
# order: educational, occupational, both, none
# returns to education for each group = regression coefficient for eisced
summary(lm(hinctnta ~ eisced + agea + gndr + health, data = edu_homog_data))
##
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = edu_homog_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.7501 -1.3105 0.4156 1.5521 4.5410
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.884271 0.436940 11.178 < 2e-16 ***
## eisced 0.536603 0.033871 15.843 < 2e-16 ***
## agea 0.017328 0.005764 3.006 0.002712 **
## gndr -0.262084 0.135563 -1.933 0.053482 .
## health -0.286760 0.084890 -3.378 0.000758 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.119 on 997 degrees of freedom
## Multiple R-squared: 0.2236, Adjusted R-squared: 0.2205
## F-statistic: 71.8 on 4 and 997 DF, p-value: < 2.2e-16
summary(lm(hinctnta ~ eisced + agea + gndr + health, data = occu_homog_data))
##
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = occu_homog_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.1461 -1.1925 0.4604 1.5773 4.2467
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.164341 0.533153 9.686 < 2e-16 ***
## eisced 0.514623 0.045673 11.268 < 2e-16 ***
## agea 0.022569 0.007004 3.222 0.001334 **
## gndr -0.315692 0.163776 -1.928 0.054333 .
## health -0.403078 0.105505 -3.820 0.000146 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.101 on 664 degrees of freedom
## Multiple R-squared: 0.2, Adjusted R-squared: 0.1952
## F-statistic: 41.51 on 4 and 664 DF, p-value: < 2.2e-16
summary(lm(hinctnta ~ eisced + agea + gndr + health, data = both_homog_data))
##
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = both_homog_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.8235 -1.0129 0.4206 1.3502 3.5459
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.338395 0.736819 5.888 1.1e-08 ***
## eisced 0.603594 0.063279 9.539 < 2e-16 ***
## agea 0.028903 0.009397 3.076 0.00230 **
## gndr -0.165678 0.222174 -0.746 0.45646
## health -0.470413 0.147149 -3.197 0.00155 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.864 on 284 degrees of freedom
## Multiple R-squared: 0.2953, Adjusted R-squared: 0.2853
## F-statistic: 29.75 on 4 and 284 DF, p-value: < 2.2e-16
summary(lm(hinctnta ~ eisced + agea + gndr + health, data = non_homog_data))
##
## Call:
## lm(formula = hinctnta ~ eisced + agea + gndr + health, data = non_homog_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.394 -1.416 0.380 1.687 4.991
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.378090 0.353657 18.035 < 2e-16 ***
## eisced 0.320065 0.032218 9.934 < 2e-16 ***
## agea 0.014495 0.004721 3.070 0.00217 **
## gndr -0.342507 0.113507 -3.017 0.00259 **
## health -0.446677 0.066188 -6.749 2.08e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.217 on 1600 degrees of freedom
## Multiple R-squared: 0.09289, Adjusted R-squared: 0.09062
## F-statistic: 40.96 on 4 and 1600 DF, p-value: < 2.2e-16