# Read the GSS extract from disk. The code below expects `load()` to create an
# object named `df` in the workspace.
load("gss2022.Rdata")

# Build the analysis sample: keep rows whose respondent, father and mother
# education values are present and non-negative, and whose sex and race are
# present. Comma-separated conditions in one `filter()` are combined with AND,
# and rows where a condition evaluates to NA are dropped.
gss <- df %>%
  filter(
    educ >= 0, !is.na(educ),
    paeduc >= 0, !is.na(paeduc),
    maeduc >= 0, !is.na(maeduc),
    !is.na(sex),
    !is.na(race)
  )
# Descriptive statistics for the three education variables, one row per
# variable (respondent, father, mother).
#
# `reframe()` is used instead of `summarise()` because each expression returns
# three values; returning more than one row per group from `summarise()` is
# deprecated since dplyr 1.1.0. `reframe()` always returns an ungrouped data
# frame, which matches the ungrouped input used here.
summary_stats <- gss %>%
  reframe(
    Variable = c(
      "Respondent's Education (years)",
      "Father's Education (years)",
      "Mother's Education (years)"
    ),
    # Mean and SD are rounded to two decimals for display.
    Mean = round(c(mean(educ), mean(paeduc), mean(maeduc)), 2),
    Median = c(median(educ), median(paeduc), median(maeduc)),
    SD = round(c(sd(educ), sd(paeduc), sd(maeduc)), 2),
    Min = c(min(educ), min(paeduc), min(maeduc)),
    Max = c(max(educ), max(paeduc), max(maeduc)),
    # Range is reported as a single width (max - min), not as a pair.
    Range = c(
      max(educ) - min(educ),
      max(paeduc) - min(paeduc),
      max(maeduc) - min(maeduc)
    )
  )
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Render the descriptive statistics as a centred, content-width table. The
# caption is placed in a spanning header row: one blank cell above the
# "Variable" column, and the title across the six statistic columns.
summary_stats %>%
  kable(
    col.names = c("Variable", "Mean", "Median", "SD", "Min", "Max", "Range")
  ) %>%
  kable_styling(full_width = FALSE, position = "center") %>%
  add_header_above(
    c(
      " " = 1,
      "Table 1: Statistics of Respondent's Education, Father's Education and Mother's Education" = 6
    )
  )
| Variable | Mean | Median | SD | Min | Max | Range |
|---|---|---|---|---|---|---|
| Respondent’s Education (years) | 13.52 | 13 | 3.04 | 0 | 20 | 20 |
| Father’s Education (years) | 10.92 | 12 | 4.31 | 0 | 20 | 20 |
| Mother’s Education (years) | 11.06 | 12 | 3.72 | 0 | 20 | 20 |
# Prepare the categorical variables: any race value outside
# white/black/other, and any sex value outside male/female, is set to NA.
# Only race, sex and degree are kept.
gss_cleaned <- gss %>%
  mutate(
    race = case_when(
      race %in% c("white", "black", "other") ~ race,
      TRUE ~ NA_character_
    ),
    sex = case_when(
      sex %in% c("male", "female") ~ sex,
      TRUE ~ NA_character_
    )
  ) %>%
  dplyr::select(race, sex, degree)
# Categorical overview of all retained columns, in the default output format.
categorical_summary <- gss_cleaned %>%
  datasummary_skim(type = "categorical")

# Give race and sex reader-facing names; these names become the row labels of
# the summary table built next.
gss_cleaned <- gss_cleaned %>%
  rename(
    `Respondent Race` = race,
    `Respondent Sex` = sex
  )

# Same categorical summary, restricted to race and sex and returned as a
# flextable object so it can be styled afterwards.
categorical_summary_flextable <- gss_cleaned %>%
  dplyr::select(`Respondent Race`, `Respondent Sex`) %>%
  datasummary_skim(type = "categorical", output = "flextable")
## Warning: Inline histograms in `datasummary_skim()` are only supported for tables
## produced by the `tinytable` backend.
# Style the race/sex summary table. Step order matters: the box theme is
# applied first, then all borders are cleared, then only the inner vertical
# rules are drawn back in.
categorical_summary_flextable <- categorical_summary_flextable %>%
  set_header_labels(
    Variable = "Variable",
    Value = "Value",
    Freq = "Frequency"
  ) %>%
  theme_box() %>%
  # Header row: bold white text on a green background.
  bold(part = "header") %>%
  bg(part = "header", bg = "#4CAF50") %>%
  color(part = "header", color = "white") %>%
  border_remove() %>%
  border_inner_v(
    border = fp_border(color = "black", width = 1)
  ) %>%
  set_caption("Table 2:statistics for respondent's race and sex")

# Auto-print the styled table.
categorical_summary_flextable
| | | N | % |
|---|---|---|---|
| Respondent Race | black | 4484 | 9.9 |
| | other | 2460 | 5.4 |
| | white | 38578 | 84.7 |
| Respondent Sex | female | 24922 | 54.7 |
| | male | 20600 | 45.3 |
# Scatter of respondent's education against father's education. Points are
# jittered and highly transparent because both axes take a small set of
# repeated values; the red line is a straight-line (lm) fit drawn without a
# confidence band.
gss %>%
  ggplot(mapping = aes(x = paeduc, y = educ)) +
  geom_jitter(
    width = 0.1,
    height = 0.2,
    size = 1,
    alpha = 0.1,
    color = "black"
  ) +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  labs(
    title = "Table 3:Relationship Between Father's Education and Child's Education",
    x = "Father's Education (years)",
    y = "Child's Education (years)"
  )
## `geom_smooth()` using formula = 'y ~ x'
# Bin respondent's years of education into five schooling levels and store
# the result as a factor whose level order follows the schooling sequence
# (this order drives the x-axis of plots that use `educ_group`). Values that
# match none of the ranges become NA.
gss_filtered <- gss %>%
  mutate(
    educ_group = factor(
      case_when(
        educ >= 0 & educ <= 6 ~ "Elementary school",
        educ >= 7 & educ <= 9 ~ "Middle school",
        educ >= 10 & educ <= 12 ~ "High school",
        educ >= 13 & educ <= 16 ~ "Undergraduate",
        educ >= 17 ~ "Graduate or Higher"
      ),
      levels = c(
        "Elementary school",
        "Middle school",
        "High school",
        "Undergraduate",
        "Graduate or Higher"
      )
    )
  )
# Clustered bar chart: number of respondents in each education group, with
# one bar per sex placed side by side. X-axis labels are rotated 45 degrees.
gss_filtered %>%
  ggplot(mapping = aes(x = educ_group, fill = sex)) +
  geom_bar(position = position_dodge(width = 0.8)) +
  scale_fill_manual(
    values = c("male" = "skyblue", "female" = "pink")
  ) +
  labs(
    title = "Table 4: Distribution of Education Level Across Sex",
    x = "Education Group",
    y = "Count"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1)
  )
# Modelling data: respondent's, father's and mother's education plus sex and
# race. `select()` is namespace-qualified, as elsewhere in this file, so the
# call cannot be hijacked by another attached package that also exports
# `select()`.
gss_cleaned <- gss %>%
  dplyr::select(educ, paeduc, maeduc, sex, race)

# Copy of the modelling data with the three education variables passed
# through `r2sd()`.
# NOTE(review): `r2sd()` is defined outside this section; presumably it
# rescales by two standard deviations - confirm against its source package.
gss_scaled <- gss_cleaned %>%
  mutate(
    educ = r2sd(educ),
    paeduc = r2sd(paeduc),
    maeduc = r2sd(maeduc)
  )

# Models 1-3 are fitted on the raw variables; Model 4 adds sex and race and
# is fitted on the rescaled data, so its coefficients are not on the same
# scale as those of Models 1-3.
model1 <- lm(educ ~ paeduc, data = gss_cleaned)
model2 <- lm(educ ~ maeduc, data = gss_cleaned)
model3 <- lm(educ ~ maeduc + paeduc, data = gss_cleaned)
model4 <- lm(educ ~ maeduc + paeduc + sex + race, data = gss_scaled)

# The list names become the column headers of the regression table
# (spelling corrected: "Mother's", previously "Mather's").
model_list <- list(
  "Model 1: Father's Education" = model1,
  "Model 2: Mother's Education" = model2,
  "Model 3: Father and Mother's Education" = model3,
  "Model 4: Full models(standardized)" = model4
)
modelsummary(model_list, title = "Table 5: Linear regression")
| | Model 1: Father's Education | Model 2: Mother's Education | Model 3: Father and Mother's Education | Model 4: Full models(standardized) |
|---|---|---|---|---|
| (Intercept) | 9.770 | 9.225 | 8.778 | 0.012 |
| | (0.034) | (0.039) | (0.039) | (0.003) |
| paeduc | 0.344 | | 0.215 | 0.300 |
| | (0.003) | | (0.004) | (0.006) |
| maeduc | | 0.388 | 0.216 | 0.268 |
| | | (0.003) | (0.005) | (0.006) |
| sexfemale | | | | -0.020 |
| | | | | (0.004) |
| raceblack | | | | -0.048 |
| | | | | (0.007) |
| raceother | | | | 0.067 |
| | | | | (0.009) |
| Num.Obs. | 45522 | 45522 | 45522 | 45522 |
| R2 | 0.237 | 0.225 | 0.274 | 0.276 |
| R2 Adj. | 0.237 | 0.225 | 0.274 | 0.276 |
| AIC | 218217.3 | 218946.8 | 215977.1 | 51377.3 |
| BIC | 218243.5 | 218973.0 | 216012.0 | 51438.4 |
| Log.Lik. | -109105.663 | -109470.396 | -107984.527 | -25681.653 |
| F | 14156.617 | 13207.951 | 8584.661 | 3473.783 |
| RMSE | 2.66 | 2.68 | 2.59 | 0.43 |
Appendix: polynomial regression
# Appendix models: the same four specifications as the main regression table,
# each with an added squared term for every parental-education predictor.
# `I()` makes `^2` arithmetic squaring rather than formula syntax. As in the
# main table, only Model 4 is fitted on the rescaled data (`gss_scaled`).
model1_poly <- lm(educ ~ paeduc + I(paeduc^2), data = gss_cleaned)
model2_poly <- lm(educ ~ maeduc + I(maeduc^2), data = gss_cleaned)
model3_poly <- lm(
  educ ~ maeduc + I(maeduc^2) + paeduc + I(paeduc^2),
  data = gss_cleaned
)
model4_poly <- lm(
  educ ~ maeduc + I(maeduc^2) + paeduc + I(paeduc^2) + sex + race,
  data = gss_scaled
)

# Overwrites the earlier `model_list`; the names become the column headers
# (spelling corrected: "Mother's", previously "Mather's").
model_list <- list(
  "Model 1: Father's Education" = model1_poly,
  "Model 2: Mother's Education" = model2_poly,
  "Model 3: Father and Mother's Education" = model3_poly,
  "Model 4: Full models(standardized)" = model4_poly
)
modelsummary(model_list, title = "Appendix 1: polynomial regression")
| | Model 1: Father's Education | Model 2: Mother's Education | Model 3: Father and Mother's Education | Model 4: Full models(standardized) |
|---|---|---|---|---|
| (Intercept) | 9.523 | 9.215 | 8.786 | 0.012 |
| | (0.054) | (0.061) | (0.062) | (0.003) |
| paeduc | 0.403 | | 0.201 | 0.301 |
| | (0.010) | | (0.013) | (0.006) |
| I(paeduc^2) | -0.003 | | 0.001 | 0.006 |
| | (0.000) | | (0.001) | (0.007) |
| maeduc | | 0.391 | 0.228 | 0.266 |
| | | (0.012) | (0.014) | (0.006) |
| I(maeduc^2) | | 0.000 | -0.001 | -0.008 |
| | | (0.001) | (0.001) | (0.006) |
| sexfemale | | | | -0.020 |
| | | | | (0.004) |
| raceblack | | | | -0.048 |
| | | | | (0.007) |
| raceother | | | | 0.068 |
| | | | | (0.009) |
| Num.Obs. | 45522 | 45522 | 45522 | 45522 |
| R2 | 0.238 | 0.225 | 0.274 | 0.276 |
| R2 Adj. | 0.238 | 0.225 | 0.274 | 0.276 |
| AIC | 218184.8 | 218948.7 | 215979.6 | 51379.3 |
| BIC | 218219.7 | 218983.6 | 216031.9 | 51457.8 |
| Log.Lik. | -109088.411 | -109470.372 | -107983.793 | -25680.657 |
| F | 7100.778 | 6603.862 | 4292.648 | 2481.558 |
| RMSE | 2.66 | 2.68 | 2.59 | 0.43 |
# Bin mother's years of education into five schooling levels and store the
# result as an ORDERED factor (Elementary < Middle < High < Undergraduate <
# Graduate). Values that match none of the ranges become NA.
gss_cleaned <- gss_cleaned %>%
  mutate(
    maeduc_group = factor(
      case_when(
        maeduc >= 0 & maeduc <= 6 ~ "Elementary school",
        maeduc >= 7 & maeduc <= 9 ~ "Middle school",
        maeduc >= 10 & maeduc <= 12 ~ "High school",
        maeduc >= 13 & maeduc <= 16 ~ "Undergraduate",
        maeduc >= 17 ~ "Graduate or Higher"
      ),
      levels = c(
        "Elementary school",
        "Middle school",
        "High school",
        "Undergraduate",
        "Graduate or Higher"
      ),
      ordered = TRUE
    )
  )
# Father's education interacted with mother's education level: `*` expands to
# both main effects plus their interaction.
interaction_model <- lm(educ ~ paeduc * maeduc_group, data = gss_cleaned)

# Store the model's fitted values next to the observed data.
gss_cleaned[["predicted_educ"]] <- predict(interaction_model)

# One panel per mother's-education level. Points are coloured by that level;
# the blue line in each panel is a separate straight-line fit drawn by
# `geom_smooth()` within the panel, not the stored `predicted_educ` values.
gss_cleaned %>%
  ggplot(mapping = aes(x = paeduc, y = educ)) +
  geom_point(mapping = aes(color = maeduc_group), alpha = 0.5) +
  geom_smooth(method = "lm", color = "blue", se = FALSE) +
  facet_wrap(~ maeduc_group) +
  theme_minimal() +
  labs(
    title = "Table 6: Interaction between Father's and Mother's Education on Respondent's Education",
    x = "Father's Education",
    y = "Respondent's Education",
    caption = "Faceted by Mother's Education Level"
  )
## `geom_smooth()` using formula = 'y ~ x'