load("gss2022.Rdata")
gss <- df
gss <- gss %>% filter(educ >= 0) %>%
filter(!is.na(educ)) %>%
filter(paeduc >= 0) %>%
filter(!is.na(paeduc)) %>%
filter(maeduc >= 0) %>%
filter(!is.na(maeduc)) %>%
filter(!is.na(sex)) %>%
filter(!is.na(race))
summary_stats <- gss %>%
  summarise(
    Variable = c("Respondent's Education (years)", "Father's Education (years)", "Mother's Education (years)"),
    Mean = round(c(mean(educ), mean(paeduc), mean(maeduc)), 2),
    Median = c(median(educ), median(paeduc), median(maeduc)),
    SD = round(c(sd(educ), sd(paeduc), sd(maeduc)), 2),
    Min = c(min(educ), min(paeduc), min(maeduc)),
    Max = c(max(educ), max(paeduc), max(maeduc)),
    Range = c(max(educ) - min(educ), max(paeduc) - min(paeduc), max(maeduc) - min(maeduc))
  )
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
kable(summary_stats, col.names = c("Variable", "Mean", "Median", "SD", "Min", "Max", "Range")) %>%
  kable_styling(full_width = FALSE, position = "center") %>%
  add_header_above(c(" " = 1, "Table 1: Statistics of Respondent's Education, Father's Education and Mother's Education" = 6 ))
Table 1: Statistics of Respondent’s Education, Father’s Education and Mother’s Education
Variable Mean Median SD Min Max Range
Respondent’s Education (years) 13.52 13 3.04 0 20 20
Father’s Education (years) 10.92 12 4.31 0 20 20
Mother’s Education (years) 11.06 12 3.72 0 20 20
gss_cleaned <- gss %>%
  mutate(
    race = case_when(
      race %in% c("white", "black", "other") ~ race,
      TRUE ~ NA_character_
    ),
    sex = case_when(
      sex %in% c("male", "female") ~ sex,
      TRUE ~ NA_character_
    )
  )

gss_cleaned <- gss_cleaned %>%
  dplyr::select(race, sex, degree)


categorical_summary <- datasummary_skim(gss_cleaned, type = "categorical")



gss_cleaned <- gss_cleaned %>%
  rename(
    "Respondent Race" = race,
    "Respondent Sex" = sex
  )

categorical_summary_flextable <- datasummary_skim(
  gss_cleaned %>%
    dplyr::select(`Respondent Race`, `Respondent Sex`),
  type = "categorical",
  output = "flextable"
)
## Warning: Inline histograms in `datasummary_skim()` are only supported for tables
##   produced by the `tinytable` backend.
categorical_summary_flextable <- categorical_summary_flextable %>%
  set_header_labels(Variable = "Variable", Value = "Value", Freq = "Frequency") %>%
  theme_box() %>%
  bold(part = "header") %>%
  bg(part = "header", bg = "#4CAF50") %>%
  color(part = "header", color = "white") %>%
  border_remove() %>%
  border_inner_v(border = fp_border(color = "black", width = 1)) %>%
  set_caption("Table 2:statistics for respondent's race and sex")

categorical_summary_flextable
Table 2:statistics for respondent's race and sex

N

%

Respondent Race

black

4484

9.9

other

2460

5.4

white

38578

84.7

Respondent Sex

female

24922

54.7

male

20600

45.3

ggplot(gss, aes(x = paeduc, y = educ)) +
  geom_jitter(alpha = 0.1, color = "black", size = 1, width = 0.1, height = 0.2) +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(title = "Table 3:Relationship Between Father's Education and Child's Education",
       x = "Father's Education (years)",
       y = "Child's Education (years)") 
## `geom_smooth()` using formula = 'y ~ x'

gss_filtered <- gss %>%
  mutate(educ_group = case_when(
    educ >= 0 & educ <= 6 ~ "Elementary school",
    educ >= 7 & educ <= 9 ~ "Middle school",
    educ >= 10 & educ <= 12 ~ "High school",
    educ >= 13 & educ <= 16 ~ "Undergraduate",
    educ >= 17 ~ "Graduate or Higher"
  )) 

gss_filtered$educ_group <- factor(gss_filtered$educ_group, levels = c("Elementary school", "Middle school", "High school", "Undergraduate", "Graduate or Higher"))

# Create the side-by-side (clustered) bar chart
ggplot(gss_filtered, aes(x = educ_group, fill = sex)) +
  geom_bar(position = position_dodge(width = 0.8)) +
  labs(
    title = "Table 4: Distribution of Education Level Across Sex",
    x = "Education Group",
    y = "Count"
  ) +
  scale_fill_manual(values = c("male" = "skyblue", "female" = "pink")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

gss_cleaned <- gss %>% select(educ, paeduc, maeduc, sex, race)
gss_scaled <- gss_cleaned %>%
  mutate(
    educ = r2sd(educ),
    paeduc = r2sd(paeduc),
    maeduc = r2sd(maeduc)
  )

model1 <- lm(educ ~ paeduc, data = gss_cleaned)
model2 <- lm(educ ~ maeduc, data = gss_cleaned)
model3 <- lm(educ ~ maeduc + paeduc, data = gss_cleaned)
model4 <- lm(educ ~ maeduc + paeduc + sex + race, data = gss_scaled)

model_list <- list("Model 1: Father's Education" = model1, 
               "Model 2: Mather's Education" = model2,
               "Model 3: Father and Mather's Education" = model3,
               "Model 4: Full models(standardized)" = model4)
modelsummary(model_list, title = "Table 5: Linear regression")
tinytable_1jg1swxd6xuru4haf0cd
Table 5: Linear regression
Model 1: Father's Education Model 2: Mather's Education Model 3: Father and Mather's Education Model 4: Full models(standardized)
(Intercept) 9.770 9.225 8.778 0.012
(0.034) (0.039) (0.039) (0.003)
paeduc 0.344 0.215 0.300
(0.003) (0.004) (0.006)
maeduc 0.388 0.216 0.268
(0.003) (0.005) (0.006)
sexfemale -0.020
(0.004)
raceblack -0.048
(0.007)
raceother 0.067
(0.009)
Num.Obs. 45522 45522 45522 45522
R2 0.237 0.225 0.274 0.276
R2 Adj. 0.237 0.225 0.274 0.276
AIC 218217.3 218946.8 215977.1 51377.3
BIC 218243.5 218973.0 216012.0 51438.4
Log.Lik. -109105.663 -109470.396 -107984.527 -25681.653
F 14156.617 13207.951 8584.661 3473.783
RMSE 2.66 2.68 2.59 0.43

Appendix polynomial

model1_poly <- lm(educ ~ paeduc + I(paeduc^2), data = gss_cleaned)
model2_poly <- lm(educ ~ maeduc + I(maeduc^2), data = gss_cleaned)
model3_poly <- lm(educ ~ maeduc + I(maeduc^2) + paeduc + I(paeduc^2), data = gss_cleaned)
model4_poly <- lm(educ ~ maeduc + I(maeduc^2) + paeduc + I(paeduc^2) + sex + race, data = gss_scaled)
model_list <- list("Model 1: Father's Education" = model1_poly, 
               "Model 2: Mather's Education" = model2_poly,
               "Model 3: Father and Mather's Education" = model3_poly,
               "Model 4: Full models(standardized)" = model4_poly)
modelsummary(model_list, title = "Appendix 1: polynomial regression")
tinytable_7m3oue1u09lbk2ngwmkx
Appendix 1: polynomial regression
Model 1: Father's Education Model 2: Mather's Education Model 3: Father and Mather's Education Model 4: Full models(standardized)
(Intercept) 9.523 9.215 8.786 0.012
(0.054) (0.061) (0.062) (0.003)
paeduc 0.403 0.201 0.301
(0.010) (0.013) (0.006)
I(paeduc^2) -0.003 0.001 0.006
(0.000) (0.001) (0.007)
maeduc 0.391 0.228 0.266
(0.012) (0.014) (0.006)
I(maeduc^2) 0.000 -0.001 -0.008
(0.001) (0.001) (0.006)
sexfemale -0.020
(0.004)
raceblack -0.048
(0.007)
raceother 0.068
(0.009)
Num.Obs. 45522 45522 45522 45522
R2 0.238 0.225 0.274 0.276
R2 Adj. 0.238 0.225 0.274 0.276
AIC 218184.8 218948.7 215979.6 51379.3
BIC 218219.7 218983.6 216031.9 51457.8
Log.Lik. -109088.411 -109470.372 -107983.793 -25680.657
F 7100.778 6603.862 4292.648 2481.558
RMSE 2.66 2.68 2.59 0.43
# Categorize the 'maeduc' variable
gss_cleaned <- gss_cleaned %>%
  mutate(maeduc_group = case_when(
    maeduc >= 0 & maeduc <= 6 ~ "Elementary school",
    maeduc >= 7 & maeduc <= 9 ~ "Middle school",
    maeduc >= 10 & maeduc <= 12 ~ "High school",
    maeduc >= 13 & maeduc <= 16 ~ "Undergraduate",
    maeduc >= 17 ~ "Graduate or Higher"
  ))
# Convert 'maeduc_group' to an ordered factor
gss_cleaned <- gss_cleaned %>%
  mutate(maeduc_group = factor(maeduc_group, 
                               levels = c("Elementary school", 
                                          "Middle school", 
                                          "High school", 
                                          "Undergraduate", 
                                          "Graduate or Higher"),
                               ordered = TRUE))


interaction_model <- lm(educ ~ paeduc * maeduc_group, data = gss_cleaned)

# Generate predicted values
gss_cleaned$predicted_educ <- predict(interaction_model)

ggplot(gss_cleaned, aes(x = paeduc, y = educ)) +
  geom_point(alpha = 0.5, aes(color = maeduc_group)) +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  facet_wrap(~ maeduc_group) +
  theme_minimal() +
  labs(title = "Table 6: Interaction between Father's and Mother's Education on Respondent's Education",
       x = "Father's Education",
       y = "Respondent's Education",
       caption = "Faceted by Mother's Education Level")
## `geom_smooth()` using formula = 'y ~ x'