SOC252 Final Report

data cleaning

# read the ESS File
ess <- read.fst("All-ESS-Data.fst")

# filter for gb
ess_gb <- ess %>% filter(cntry == "GB")

# find birth year of partner to find age homogamies
ess_gb <- ess_gb %>% mutate(partner_birth = case_when(
   rshipa2 == 1 ~ yrbrn2,
      rshipa3 == 1 ~ yrbrn3,
      rshipa4 == 1 ~ yrbrn4,
      rshipa5 == 1 ~ yrbrn5,
      rshipa6 == 1 ~ yrbrn6,
      rshipa7 == 1 ~ yrbrn7,
      rshipa8 == 1 ~ yrbrn8,
      rshipa9 == 1 ~ yrbrn9,
      rshipa10 == 1 ~ yrbrn10,
      rshipa11 == 1 ~ yrbrn11,
      rshipa12 == 1 ~ yrbrn12,
      rshipa13 == 1 ~ yrbrn13,
      rshipa14 == 1 ~ yrbrn14,
      rshipa15 == 1 ~ yrbrn15
))

# subset with variables of interest and data cleaning
# split occupations by their general categories
ess_sub <- ess_gb %>% select(eisced, hinctnta, eiscedp, isco08, isco08p, agea,
      gndr, health, iscoco, iscocop, essround, yrbrn, partner_birth)
ess_sub <- ess_sub %>% mutate(eisced = if_else(eisced %in% c(1:7), eisced, NA),
        hinctnta = if_else(hinctnta %in% c(1:10), hinctnta, NA),
        eiscedp = if_else(eiscedp %in% c(1:7), eiscedp, NA),
        isco08 = case_when(isco08 %in% c(0:999) ~ 0,
                  isco08 %in% c(1000:1999) ~ 1,
                  isco08 %in% c(2000:2999) ~ 2,
                  isco08 %in% c(3000:3999) ~ 3,
                  isco08 %in% c(4000:4999) ~ 4,
                  isco08 %in% c(5000:5999) ~ 5,
                  isco08 %in% c(6000:6999) ~ 6,
                  isco08 %in% c(7000:7999) ~ 7,
                  isco08 %in% c(8000:8999) ~ 8,
                  isco08 %in% c(9000:9999) ~ 9),
        isco08p = case_when(isco08p %in% c(0:999) ~ 0,
                  isco08p %in% c(1000:1999) ~ 1,
                  isco08p %in% c(2000:2999) ~ 2,
                  isco08p %in% c(3000:3999) ~ 3,
                  isco08p %in% c(4000:4999) ~ 4,
                  isco08p %in% c(5000:5999) ~ 5,
                  isco08p %in% c(6000:6999) ~ 6,
                  isco08p %in% c(7000:7999) ~ 7,
                  isco08p %in% c(8000:8999) ~ 8,
                  isco08p %in% c(9000:9999) ~ 9),
        iscoco = case_when(iscoco %in% c(0:999) ~ 0,
                           iscoco %in% c(1000:1999) ~ 1,
                           iscoco %in% c(2000:2999) ~ 2,
                           iscoco %in% c(3000:3999) ~ 3,
                           iscoco %in% c(4000:4999) ~ 4,
                           iscoco %in% c(5000:5999) ~ 5,
                           iscoco %in% c(6000:6999) ~ 6,
                           iscoco %in% c(7000:7999) ~ 7,
                           iscoco %in% c(8000:8999) ~ 8,
                           iscoco %in% c(9000:9999) ~ 9),
        iscocop = case_when(iscocop %in% c(0:999) ~ 0,
                           iscocop %in% c(1000:1999) ~ 1,
                           iscocop %in% c(2000:2999) ~ 2,
                           iscocop %in% c(3000:3999) ~ 3,
                           iscocop %in% c(4000:4999) ~ 4,
                           iscocop %in% c(5000:5999) ~ 5,
                           iscocop %in% c(6000:6999) ~ 6,
                           iscocop %in% c(7000:7999) ~ 7,
                           iscocop %in% c(8000:8999) ~ 8,
                           iscocop %in% c(9000:9999) ~ 9),
        agea = if_else(agea < 999, agea, NA),
        gndr = if_else(gndr %in% c(1:2), gndr, NA),
        health = if_else(health %in% c(1:5), health, NA),
        year = case_when(essround == 1 ~ 2002,
                         essround == 2 ~ 2004,
                         essround == 3 ~ 2006,
                         essround == 4 ~ 2008,
                         essround == 5 ~ 2010,
                         essround == 6 ~ 2012,
                         essround == 7 ~ 2014,
                         essround == 8 ~ 2016,
                         essround == 9 ~ 2018,
                         essround == 10 ~ 2020),
        yrbrn = if_else(yrbrn %in% c(1900:2020), yrbrn, NA),
        partner_birth = if_else(partner_birth %in% c(1900:2020), partner_birth, NA))

ess_sub <- ess_sub %>% mutate(isco = case_when(!is.na(isco08) ~ isco08,
                                               !is.na(iscoco) ~ iscoco),
                              iscop = case_when(!is.na(isco08p) ~ isco08p,
                                                !is.na(iscocop) ~ iscocop))


# select relevant variables
ess_sub <- na.omit(ess_sub %>% select(eisced, hinctnta, eiscedp, isco, 
          iscop, agea, gndr, health, year, yrbrn, partner_birth))

# mutate new variables to represent different homogamies
ess_sub <- ess_sub %>% mutate(edu_homo = if_else(eisced == eiscedp, 1, 0),
            occu_homo = if_else(isco == iscop, 1, 0),
            age_homo = if_else(abs(yrbrn - partner_birth) <= 1, 1, 0))
ess_sub <- ess_sub %>% mutate(all_homo = if_else(edu_homo == 1 & occu_homo == 1,
            1, 0))

ess_sub_1 <- ess_sub %>% rename(highest_edu = eisced, household_income = hinctnta,
        highest_edu_partner = eiscedp, occu_category = isco, 
        occu_category_partner = iscop, age = agea, gender = gndr,
        birth_year = yrbrn)

# skim
datasummary_skim(ess_sub_1 %>% select(household_income, highest_edu,
  highest_edu_partner, age, gender, health, birth_year, partner_birth, 
  occu_category, occu_category_partner, edu_homo, occu_homo, age_homo, all_homo),
   fmt = fmt_significant(4))

	Unique (#)	Mean	SD	Min	Median	Max
household_income	10	7.177	2.363	1	8	10
highest_edu	7	4.571	1.865	1	5	7
highest_edu_partner	7	4.404	1.942	1	5	7
age	65	45.19	12.08	18	45	82
gender	2	1.562	0.4962	1	2	2
health	5	1.874	0.8315	1	2	5
birth_year	68	1970	12.34	1933	1970	2002
partner_birth	66	1970	12.03	1924	1970	2000
occu_category	10	3.8	2.362	0	3	9
occu_category_partner	10	4.007	2.486	0	3	9
edu_homo	2	0.3359	0.4724	0	0	1
occu_homo	2	0.2243	0.4172	0	0	1
age_homo	2	0.3269	0.4691	0	0	1
all_homo	2	0.09688	0.2958	0	0	1

Data Analysis

# center age since it is a large continuous interaction term
# so the regression coefficients can be interpreted more intuitively
ess_sub$agea_centered <- ess_sub$agea - min(ess_sub$agea)
ess_sub_1$age_centered <- ess_sub_1$age - min(ess_sub_1$age)
# age_centered is essentially "years after 18"
ess_sub$gndr <- ess_sub$gndr - min(ess_sub$gndr)
ess_sub_1$gender <- ess_sub_1$gender - min(ess_sub_1$gender)
# gender was changed to 0 = male and 1 = female, by the same logic

# initial analysis of the educational returns of people engaged in different marriages
# create subsets based on type of marriage
edu_homog_data <- ess_sub %>% filter(edu_homo == 1)
occu_homog_data <- ess_sub %>% filter(occu_homo == 1)
age_homog_data <- ess_sub %>% filter(age_homo == 1)
all_homog_data <- ess_sub %>% filter(all_homo == 1)
non_homog_data <- ess_sub %>% filter(edu_homo == 0 & occu_homo == 0 & age_homo == 0)

# get mean level of income for each education level separately for different types of marriages
edu_homog_mean <- edu_homog_data %>%
  group_by(eisced) %>%
  summarize(mean_income = mean(hinctnta))
occu_homog_mean <- occu_homog_data %>%
  group_by(eisced) %>%
  summarize(mean_income = mean(hinctnta))
age_homog_mean <- age_homog_data %>%
  group_by(eisced) %>%
  summarize(mean_income = mean(hinctnta))
all_homog_mean <- all_homog_data %>%
  group_by(eisced) %>%
  summarize(mean_income = mean(hinctnta))
non_homog_mean <- non_homog_data %>%
  group_by(eisced) %>%
  summarize(mean_income = mean(hinctnta))

edu_returns_plot <- ggplot() + geom_line(data = edu_homog_mean, aes(x = eisced,
    y = mean_income), color = 'red') + geom_point(data = edu_homog_mean, aes(x = eisced,
    y = mean_income), color = 'red') + geom_line(data = occu_homog_mean, aes(x = eisced,
    y = mean_income), color = 'blue') + geom_point(data = occu_homog_mean, aes(x = eisced,
    y = mean_income), color = 'blue') + geom_line(data = all_homog_mean, aes(x = eisced,
    y = mean_income), color = 'purple') + geom_point(data = all_homog_mean, aes(x = eisced,
    y = mean_income), color = 'purple') + geom_line(data = non_homog_mean, aes(x = eisced,
    y = mean_income), color = 'black') + geom_point(data = non_homog_mean, aes(x = eisced,
    y = mean_income), color = 'black') + geom_line(data = age_homog_mean, aes(x = eisced,
    y = mean_income), color = 'green') + geom_point(data = age_homog_mean, aes(x = eisced,
    y = mean_income), color = 'green') + labs(x = "Highest Level of Education ES - ISCED",
    y = "Total Household Income Decile") + scale_x_continuous(breaks = c(0:8)) +
    theme(text = element_text(family = "Times New Roman")) +
    annotate("text", x = 6, y = 6, label = "Educational Homogamy", color = 'red') +
    annotate("text", x = 6, y = 5.8, label = "Occupational Homogamy", color = 'blue') +
    annotate("text", x = 6, y = 5.6, label = "Age Homogamy", color = 'green') +
    annotate("text", x = 6, y = 5.4, label = "All", color = 'purple') +
    annotate("text", x = 6, y = 5.2, label = "None", color = 'black')

ggsave("homogamy compare graph.png", plot = edu_returns_plot, width = 9, height = 6)
print(edu_returns_plot)

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)):
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)):
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

# time trend of general returns to education
# split dataset by years and find regression coefficients
# to identify the returns to education in each year
ess_2010 <- ess_sub %>% filter(year == 2010)
ess_2012 <- ess_sub %>% filter(year == 2012)
ess_2014 <- ess_sub %>% filter(year == 2014)
ess_2016 <- ess_sub %>% filter(year == 2016)
ess_2018 <- ess_sub %>% filter(year == 2018)
ess_2020 <- ess_sub %>% filter(year == 2020)
summary(lm(hinctnta ~ eisced, data = ess_2010))

## 
## Call:
## lm(formula = hinctnta ~ eisced, data = ess_2010)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9489 -1.4042  0.3274  1.5958  4.3274 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.21737    0.21421  24.356   <2e-16 ***
## eisced       0.45526    0.04691   9.704   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.154 on 586 degrees of freedom
## Multiple R-squared:  0.1384, Adjusted R-squared:  0.137 
## F-statistic: 94.17 on 1 and 586 DF,  p-value: < 2.2e-16

summary(lm(hinctnta ~ eisced, data = ess_2012))

## 
## Call:
## lm(formula = hinctnta ~ eisced, data = ess_2012)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.0455 -1.2144  0.4967  1.4967  4.2434 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.29878    0.26330  20.125  < 2e-16 ***
## eisced       0.45779    0.05551   8.247  1.5e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.293 on 491 degrees of freedom
## Multiple R-squared:  0.1217, Adjusted R-squared:  0.1199 
## F-statistic: 68.01 on 1 and 491 DF,  p-value: 1.502e-15

summary(lm(hinctnta ~ eisced, data = ess_2014))

## 
## Call:
## lm(formula = hinctnta ~ eisced, data = ess_2014)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.2200 -1.3601  0.6866  1.7800  4.6399 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.88351    0.26560  18.387   <2e-16 ***
## eisced       0.47664    0.05354   8.902   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.303 on 551 degrees of freedom
## Multiple R-squared:  0.1257, Adjusted R-squared:  0.1242 
## F-statistic: 79.25 on 1 and 551 DF,  p-value: < 2.2e-16

summary(lm(hinctnta ~ eisced, data = ess_2016))

## 
## Call:
## lm(formula = hinctnta ~ eisced, data = ess_2016)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9749 -1.5809  0.2069  1.8130  4.3887 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.21741    0.26663  19.568  < 2e-16 ***
## eisced       0.39392    0.05303   7.428 5.16e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.191 on 475 degrees of freedom
## Multiple R-squared:  0.1041, Adjusted R-squared:  0.1022 
## F-statistic: 55.18 on 1 and 475 DF,  p-value: 5.156e-13

summary(lm(hinctnta ~ eisced, data = ess_2018))

## 
## Call:
## lm(formula = hinctnta ~ eisced, data = ess_2018)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.8864 -1.2994  0.2876  1.8746  4.3526 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.23442    0.25893  20.215  < 2e-16 ***
## eisced       0.41300    0.05006   8.251 9.82e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.203 on 607 degrees of freedom
## Multiple R-squared:  0.1008, Adjusted R-squared:  0.09935 
## F-statistic: 68.07 on 1 and 607 DF,  p-value: 9.825e-16

summary(lm(hinctnta ~ eisced, data = ess_2020))

## 
## Call:
## lm(formula = hinctnta ~ eisced, data = ess_2020)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.7268 -1.3917  0.2786  1.9381  3.9488 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.71606    0.40124  14.246  < 2e-16 ***
## eisced       0.33512    0.07569   4.427  1.4e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.2 on 261 degrees of freedom
## Multiple R-squared:  0.06985,    Adjusted R-squared:  0.06629 
## F-statistic:  19.6 on 1 and 261 DF,  p-value: 1.404e-05

# record the coefficients and corresponding year to make a graph
returns <- c(0.4553, 0.4578, 0.4766, 0.3940, 0.4130, 0.3351)
years <- c(2010, 2012, 2014, 2016, 2018, 2020)

# make the plot and save it as image
# r runs into an error with times new roman, I don't know why. It works
# for the plot saved with ggsave though
returns_time <- ggplot(data = NULL, aes(x = years, y = returns)) +
  geom_line() + geom_point() + labs(x = "Year", y = "Returns to Education") +
  scale_x_continuous(breaks = c(2010, 2012, 2014, 2016, 2018, 2020)) +
  theme(text = element_text(family = "Times New Roman")) +
  geom_text(aes(label = returns), vjust = -1)
ggsave("outcome temporal graph.png", plot = returns_time, width = 9, height = 6)
print(returns_time)

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

# regressions to get returns to education of different marriages
# order: educational, occupational, all, none
# returns to education for each group = regression coefficient for eisced
model_edu <- lm(hinctnta ~ eisced + agea_centered + gndr + health
        + eisced:agea_centered + eisced:gndr, data = edu_homog_data)
model_occu <- lm(hinctnta ~ eisced + agea_centered + gndr + health
        + eisced:agea_centered + eisced:gndr, data = occu_homog_data)
model_age <- lm(hinctnta ~ eisced + agea_centered + gndr + health
        + eisced:agea_centered + eisced:gndr, data = age_homog_data)
model_all <- lm(hinctnta ~ eisced + agea_centered + gndr + health
        + eisced:agea_centered + eisced:gndr, data = all_homog_data)
model_none <- lm(hinctnta ~ eisced + agea_centered + gndr + health
        + eisced:agea_centered + eisced:gndr, data = non_homog_data)
model_base <- lm(household_income ~ highest_edu + age_centered + gender + health
        + highest_edu:age_centered + highest_edu:gender, data = ess_sub_1)


# assumptions test on the base model
# Normal Distribution of Errors
qqnorm(residuals(model_base))
qqline(residuals(model_base))

# Linearity
plot(model_base$fitted.values, residuals(model_base))

# Full Rank / No Perfect Multicollinearity
print(vif(model_base, type = 'predictor'))

## GVIFs computed for predictors

##                  GVIF Df GVIF^(1/(2*Df))       Interacts With
## highest_edu  1.046148  5        1.004522 age_centered, gender
## age_centered 2.353206  3        1.153303          highest_edu
## gender       6.618458  3        1.370228          highest_edu
## health       1.046148  1        1.022814                 --  
##                               Other Predictors
## highest_edu                             health
## age_centered                    gender, health
## gender                    age_centered, health
## health       highest_edu, age_centered, gender

# Zero Expected Error
print(mean(residuals(model_base)))

## [1] -1.466815e-16

# verify the significance of educational returns in general by doing the
# 4 step infer analysis
test_stat <- ess_sub_1 %>%
  specify(explanatory = highest_edu,
          response = household_income) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "slope")

observed_slope <- coef(model_base)[["highest_edu"]]

p_value <- test_stat %>%
  get_p_value(obs_stat = observed_slope, direction = "two-sided")

## Warning: Please be cautious in reporting a p-value of 0. This result is an
## approximation based on the number of `reps` chosen in the `generate()` step.
## See `?get_p_value()` for more information.

# p value = 0

infer_plot <- ggplot(test_stat, aes(x = stat)) +
  geom_histogram(binwidth = .025, color = "black", fill = "white") +
  geom_vline(xintercept = observed_slope, color = "red", size = 1) +
  theme_minimal() + 
  theme( plot.background = element_rect(fill = "white", colour = NA))

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

ggsave("infer plot.png", plot = infer_plot, width = 9, height = 6)
print(infer_plot)

# does not overlap


# model summary
# 2 other models to compare: model with only education, model with
# additional interaction term
model_general <- lm(household_income ~ highest_edu, data = ess_sub_1)
model_no_inter <- lm(household_income ~ highest_edu + age_centered + gender
                     + health, data = ess_sub_1)

modelsummary(
  list(model_general, model_base, model_no_inter),
  fmt = 1,
  estimate  = c( "{estimate} ({std.error}){stars}",
                "{estimate} ({std.error}){stars}",
                "{estimate} ({std.error}){stars}"),
  statistic = NULL,
  coef_omit = "Intercept")

	(1)	(2)	(3)
highest_edu	0.4 (0.0)***	0.3 (0.1)***	0.4 (0.0)***
age_centered		0.0 (0.0)	0.0 (0.0)***
gender		-0.7 (0.2)**	-0.3 (0.1)***
health		-0.4 (0.0)***	-0.4 (0.0)***
highest_edu × age_centered		0.0 (0.0)
highest_edu × gender		0.1 (0.0)+
Num.Obs.	2983	2983	2983
R2	0.113	0.140	0.139
R2 Adj.	0.112	0.138	0.138
AIC	13245.4	13160.9	13161.2
BIC	13263.4	13209.0	13197.2
Log.Lik.	-6619.681	-6572.473	-6574.599
RMSE	2.23	2.19	2.19

# now compare regression coefficients of different marriage types
# to answer the research question
# first identify coefficients of all regression models
 summary(model_none)

## 
## Call:
## lm(formula = hinctnta ~ eisced + agea_centered + gndr + health + 
##     eisced:agea_centered + eisced:gndr, data = non_homog_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.4267 -1.4479  0.3484  1.7034  5.0729 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           6.581444   0.560527  11.742  < 2e-16 ***
## eisced                0.224858   0.117205   1.918   0.0553 .  
## agea_centered        -0.001902   0.014596  -0.130   0.8963    
## gndr                 -0.346309   0.359587  -0.963   0.3357    
## health               -0.404608   0.079369  -5.098 4.04e-07 ***
## eisced:agea_centered  0.003808   0.003190   1.194   0.2328    
## eisced:gndr           0.016887   0.078425   0.215   0.8296    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.234 on 1099 degrees of freedom
## Multiple R-squared:  0.09477,    Adjusted R-squared:  0.08983 
## F-statistic: 19.18 on 6 and 1099 DF,  p-value: < 2.2e-16

  summary(model_edu)

## 
## Call:
## lm(formula = hinctnta ~ eisced + agea_centered + gndr + health + 
##     eisced:agea_centered + eisced:gndr, data = edu_homog_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.4394 -1.2786  0.4054  1.5330  4.8987 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           6.071181   0.513257  11.829  < 2e-16 ***
## eisced                0.297216   0.093859   3.167 0.001589 ** 
## agea_centered        -0.003650   0.012967  -0.282 0.778369    
## gndr                 -1.205540   0.343372  -3.511 0.000467 ***
## health               -0.291157   0.084536  -3.444 0.000597 ***
## eisced:agea_centered  0.004656   0.002619   1.778 0.075693 .  
## eisced:gndr           0.198417   0.066402   2.988 0.002876 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.11 on 995 degrees of freedom
## Multiple R-squared:  0.2319, Adjusted R-squared:  0.2273 
## F-statistic: 50.08 on 6 and 995 DF,  p-value: < 2.2e-16

   summary(model_occu)

## 
## Call:
## lm(formula = hinctnta ~ eisced + agea_centered + gndr + health + 
##     eisced:agea_centered + eisced:gndr, data = occu_homog_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.2135 -1.1746  0.4837  1.5676  4.2727 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           5.889794   0.729374   8.075 3.19e-15 ***
## eisced                0.391358   0.126240   3.100 0.002017 ** 
## agea_centered         0.005947   0.020612   0.289 0.773026    
## gndr                 -0.588426   0.492033  -1.196 0.232161    
## health               -0.399989   0.105623  -3.787 0.000166 ***
## eisced:agea_centered  0.003239   0.003726   0.869 0.384960    
## eisced:gndr           0.053866   0.089404   0.603 0.547047    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.102 on 662 degrees of freedom
## Multiple R-squared:  0.2014, Adjusted R-squared:  0.1942 
## F-statistic: 27.83 on 6 and 662 DF,  p-value: < 2.2e-16

    summary(model_age)

## 
## Call:
## lm(formula = hinctnta ~ eisced + agea_centered + gndr + health + 
##     eisced:agea_centered + eisced:gndr, data = age_homog_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.5732 -1.2705  0.3948  1.6191  4.7089 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           6.1335519  0.5405725  11.346  < 2e-16 ***
## eisced                0.3508144  0.1013869   3.460 0.000563 ***
## agea_centered         0.0229279  0.0149422   1.534 0.125247    
## gndr                 -0.9155894  0.3919458  -2.336 0.019694 *  
## health               -0.4497986  0.0889127  -5.059 5.04e-07 ***
## eisced:agea_centered -0.0001771  0.0030209  -0.059 0.953258    
## eisced:gndr           0.1008350  0.0771366   1.307 0.191446    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.17 on 968 degrees of freedom
## Multiple R-squared:  0.1429, Adjusted R-squared:  0.1376 
## F-statistic:  26.9 on 6 and 968 DF,  p-value: < 2.2e-16

     summary(model_all)

## 
## Call:
## lm(formula = hinctnta ~ eisced + agea_centered + gndr + health + 
##     eisced:agea_centered + eisced:gndr, data = all_homog_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -6.792 -1.017  0.412  1.348  3.520 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           4.8577397  1.0843577   4.480 1.09e-05 ***
## eisced                0.5735774  0.1781797   3.219  0.00144 ** 
## agea_centered         0.0253236  0.0296054   0.855  0.39307    
## gndr                 -0.2710360  0.7134174  -0.380  0.70430    
## health               -0.4691763  0.1478864  -3.173  0.00168 ** 
## eisced:agea_centered  0.0006545  0.0050632   0.129  0.89723    
## eisced:gndr           0.0191734  0.1224155   0.157  0.87565    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.87 on 282 degrees of freedom
## Multiple R-squared:  0.2954, Adjusted R-squared:  0.2804 
## F-statistic:  19.7 on 6 and 282 DF,  p-value: < 2.2e-16

# compare regression coefficient of all homogamies vs non-homogamies
coef1 <- summary(model_all)$coefficients["eisced", ]
coef2 <- summary(model_none)$coefficients["eisced", ]

# Calculating the standard error of the difference
SE_difference <- sqrt(coef1["Std. Error"]^2 + coef2["Std. Error"]^2)

# Calculating the t-statistic
t_value <- (coef1["Estimate"] - coef2["Estimate"]) / SE_difference

# Calculating degrees of freedom
df <- min(nrow(model_all$model) - 2, nrow(model_none$model) - 2)

# Getting the p-value
p_value <- 2 * pt(-abs(t_value), df)

# Print results
print(paste("all vs none t-value:", t_value, "p-value:", p_value))

## [1] "all vs none t-value: 1.63508957832183 p-value: 0.103126539322105"

# educational homogamies vs non-homogamies
coef1 <- summary(model_edu)$coefficients["eisced", ]
coef2 <- summary(model_none)$coefficients["eisced", ]

# Calculating the standard error of the difference
SE_difference <- sqrt(coef1["Std. Error"]^2 + coef2["Std. Error"]^2)

# Calculating the t-statistic
t_value <- (coef1["Estimate"] - coef2["Estimate"]) / SE_difference

# Calculating degrees of freedom
df <- min(nrow(model_edu$model) - 2, nrow(model_none$model) - 2)

# Getting the p-value
p_value <- 2 * pt(-abs(t_value), df)

# Print results
print(paste("edu vs none t-value:", t_value, "p-value:", p_value))

## [1] "edu vs none t-value: 0.481885426659551 p-value: 0.629992782620243"

# occupational homogamies vs non-homogamies
coef1 <- summary(model_occu)$coefficients["eisced", ]
coef2 <- summary(model_none)$coefficients["eisced", ]

# Calculating the standard error of the difference
SE_difference <- sqrt(coef1["Std. Error"]^2 + coef2["Std. Error"]^2)

# Calculating the t-statistic
t_value <- (coef1["Estimate"] - coef2["Estimate"]) / SE_difference

# Calculating degrees of freedom
df <- min(nrow(model_occu$model) - 2, nrow(model_none$model) - 2)

# Getting the p-value
p_value <- 2 * pt(-abs(t_value), df)

# Print results
print(paste("occu vs none t-value:", t_value, "p-value:", p_value))

## [1] "occu vs none t-value: 0.966558713779183 p-value: 0.334115003167491"

# age homogamies vs non-homogamies
coef1 <- summary(model_age)$coefficients["eisced", ]
coef2 <- summary(model_none)$coefficients["eisced", ]

# Calculating the standard error of the difference
SE_difference <- sqrt(coef1["Std. Error"]^2 + coef2["Std. Error"]^2)

# Calculating the t-statistic
t_value <- (coef1["Estimate"] - coef2["Estimate"]) / SE_difference

# Calculating degrees of freedom
df <- min(nrow(model_age$model) - 2, nrow(model_none$model) - 2)

# Getting the p-value
p_value <- 2 * pt(-abs(t_value), df)

# Print results
print(paste("age vs none t-value:", t_value, "p-value:", p_value))

## [1] "age vs none t-value: 0.8127682761422 p-value: 0.416549745459296"

# Graph interaction effect
edu_homog_male <- edu_homog_data %>% filter(gndr == 0)
edu_homog_female <- edu_homog_data %>% filter(gndr == 1)

male_mean <- edu_homog_male %>%
  group_by(eisced) %>%
  summarize(mean_income = mean(hinctnta))
female_mean <- edu_homog_female %>%
  group_by(eisced) %>%
  summarize(mean_income = mean(hinctnta))


gender_plot <- ggplot() + geom_line(data = female_mean, aes(x = eisced,
    y = mean_income), color = 'red') + geom_point(data = female_mean, aes(x = eisced,
    y = mean_income), color = 'red') + geom_line(data = male_mean, aes(x = eisced,
    y = mean_income), color = 'blue') + geom_point(data = male_mean, aes(x = eisced,
    y = mean_income), color = 'blue') + labs(x = "Highest Level of Education ES - ISCED",
    y = "Total Household Income Decile") + scale_x_continuous(breaks = c(0:8)) +
    theme(text = element_text(family = "Times New Roman")) +
    annotate("text", x = 6, y = 6, label = "Females", color = 'red') +
    annotate("text", x = 6, y = 5.8, label = "Males", color = 'blue') 

ggsave("gender compare graph.png", plot = gender_plot, width = 9, height = 6)
print(gender_plot)

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## Windows字体数据库里没有这样的字体系列

SOC252 Final Report

Kelvin Li, Yisu Hou

2023-12-19

data cleaning

Data Analysis