Final.knit

load("gss2022.Rdata")
gss <- df

gss <- gss %>% filter(educ >= 0) %>%
filter(!is.na(educ)) %>%
filter(paeduc >= 0) %>%
filter(!is.na(paeduc)) %>%
filter(maeduc >= 0) %>%
filter(!is.na(maeduc)) %>%
filter(!is.na(sex)) %>%
filter(!is.na(race))

summary_stats <- gss %>%
  summarise(
    Variable = c("Respondent's Education (years)", "Father's Education (years)", "Mother's Education (years)"),
    Mean = round(c(mean(educ), mean(paeduc), mean(maeduc)), 2),
    Median = c(median(educ), median(paeduc), median(maeduc)),
    SD = round(c(sd(educ), sd(paeduc), sd(maeduc)), 2),
    Min = c(min(educ), min(paeduc), min(maeduc)),
    Max = c(max(educ), max(paeduc), max(maeduc)),
    Range = c(max(educ) - min(educ), max(paeduc) - min(paeduc), max(maeduc) - min(maeduc))
  )

## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

kable(summary_stats, col.names = c("Variable", "Mean", "Median", "SD", "Min", "Max", "Range")) %>%
  kable_styling(full_width = FALSE, position = "center") %>%
  add_header_above(c(" " = 1, "Table 1: Statistics of Respondent's Education, Father's Education and Mother's Education" = 6 ))

	Table 1: Statistics of Respondent’s Education, Father’s Education and Mother’s Education
Variable	Mean	Median	SD	Min	Max	Range
Respondent’s Education (years)	13.52	13	3.04	0	20	20
Father’s Education (years)	10.92	12	4.31	0	20	20
Mother’s Education (years)	11.06	12	3.72	0	20	20

gss_cleaned <- gss %>%
  mutate(
    race = case_when(
      race %in% c("white", "black", "other") ~ race,
      TRUE ~ NA_character_
    ),
    sex = case_when(
      sex %in% c("male", "female") ~ sex,
      TRUE ~ NA_character_
    )
  )

gss_cleaned <- gss_cleaned %>%
  dplyr::select(race, sex, degree)


categorical_summary <- datasummary_skim(gss_cleaned, type = "categorical")



gss_cleaned <- gss_cleaned %>%
  rename(
    "Respondent Race" = race,
    "Respondent Sex" = sex
  )

categorical_summary_flextable <- datasummary_skim(
  gss_cleaned %>%
    dplyr::select(`Respondent Race`, `Respondent Sex`),
  type = "categorical",
  output = "flextable"
)

## Warning: Inline histograms in `datasummary_skim()` are only supported for tables
##   produced by the `tinytable` backend.

categorical_summary_flextable <- categorical_summary_flextable %>%
  set_header_labels(Variable = "Variable", Value = "Value", Freq = "Frequency") %>%
  theme_box() %>%
  bold(part = "header") %>%
  bg(part = "header", bg = "#4CAF50") %>%
  color(part = "header", color = "white") %>%
  border_remove() %>%
  border_inner_v(border = fp_border(color = "black", width = 1)) %>%
  set_caption("Table 2:statistics for respondent's race and sex")

categorical_summary_flextable

Table 2:statistics for respondent's race and sex
		N	%
Respondent Race	black	4484	9.9
	other	2460	5.4
	white	38578	84.7
Respondent Sex	female	24922	54.7
	male	20600	45.3

ggplot(gss, aes(x = paeduc, y = educ)) +
  geom_jitter(alpha = 0.1, color = "black", size = 1, width = 0.1, height = 0.2) +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(title = "Table 3:Relationship Between Father's Education and Child's Education",
       x = "Father's Education (years)",
       y = "Child's Education (years)")

## `geom_smooth()` using formula = 'y ~ x'

gss_filtered <- gss %>%
  mutate(educ_group = case_when(
    educ >= 0 & educ <= 6 ~ "Elementary school",
    educ >= 7 & educ <= 9 ~ "Middle school",
    educ >= 10 & educ <= 12 ~ "High school",
    educ >= 13 & educ <= 16 ~ "Undergraduate",
    educ >= 17 ~ "Graduate or Higher"
  )) 

gss_filtered$educ_group <- factor(gss_filtered$educ_group, levels = c("Elementary school", "Middle school", "High school", "Undergraduate", "Graduate or Higher"))

# Create the side-by-side (clustered) bar chart
ggplot(gss_filtered, aes(x = educ_group, fill = sex)) +
  geom_bar(position = position_dodge(width = 0.8)) +
  labs(
    title = "Table 4: Distribution of Education Level Across Sex",
    x = "Education Group",
    y = "Count"
  ) +
  scale_fill_manual(values = c("male" = "skyblue", "female" = "pink")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

gss_cleaned <- gss %>% select(educ, paeduc, maeduc, sex, race)
gss_scaled <- gss_cleaned %>%
  mutate(
    educ = r2sd(educ),
    paeduc = r2sd(paeduc),
    maeduc = r2sd(maeduc)
  )

model1 <- lm(educ ~ paeduc, data = gss_cleaned)
model2 <- lm(educ ~ maeduc, data = gss_cleaned)
model3 <- lm(educ ~ maeduc + paeduc, data = gss_cleaned)
model4 <- lm(educ ~ maeduc + paeduc + sex + race, data = gss_scaled)

model_list <- list("Model 1: Father's Education" = model1, 
               "Model 2: Mather's Education" = model2,
               "Model 3: Father and Mather's Education" = model3,
               "Model 4: Full models(standardized)" = model4)
modelsummary(model_list, title = "Table 5: Linear regression")

tinytable_1jg1swxd6xuru4haf0cd

Table 5: Linear regression
	Model 1: Father's Education	Model 2: Mather's Education	Model 3: Father and Mather's Education	Model 4: Full models(standardized)
(Intercept)	9.770	9.225	8.778	0.012
	(0.034)	(0.039)	(0.039)	(0.003)
paeduc	0.344		0.215	0.300
	(0.003)		(0.004)	(0.006)
maeduc		0.388	0.216	0.268
		(0.003)	(0.005)	(0.006)
sexfemale				-0.020
				(0.004)
raceblack				-0.048
				(0.007)
raceother				0.067
				(0.009)
Num.Obs.	45522	45522	45522	45522
R2	0.237	0.225	0.274	0.276
R2 Adj.	0.237	0.225	0.274	0.276
AIC	218217.3	218946.8	215977.1	51377.3
BIC	218243.5	218973.0	216012.0	51438.4
Log.Lik.	-109105.663	-109470.396	-107984.527	-25681.653
F	14156.617	13207.951	8584.661	3473.783
RMSE	2.66	2.68	2.59	0.43

Appendix polynomial

model1_poly <- lm(educ ~ paeduc + I(paeduc^2), data = gss_cleaned)
model2_poly <- lm(educ ~ maeduc + I(maeduc^2), data = gss_cleaned)
model3_poly <- lm(educ ~ maeduc + I(maeduc^2) + paeduc + I(paeduc^2), data = gss_cleaned)
model4_poly <- lm(educ ~ maeduc + I(maeduc^2) + paeduc + I(paeduc^2) + sex + race, data = gss_scaled)
model_list <- list("Model 1: Father's Education" = model1_poly, 
               "Model 2: Mather's Education" = model2_poly,
               "Model 3: Father and Mather's Education" = model3_poly,
               "Model 4: Full models(standardized)" = model4_poly)
modelsummary(model_list, title = "Appendix 1: polynomial regression")

tinytable_7m3oue1u09lbk2ngwmkx

Appendix 1: polynomial regression
	Model 1: Father's Education	Model 2: Mather's Education	Model 3: Father and Mather's Education	Model 4: Full models(standardized)
(Intercept)	9.523	9.215	8.786	0.012
	(0.054)	(0.061)	(0.062)	(0.003)
paeduc	0.403		0.201	0.301
	(0.010)		(0.013)	(0.006)
I(paeduc^2)	-0.003		0.001	0.006
	(0.000)		(0.001)	(0.007)
maeduc		0.391	0.228	0.266
		(0.012)	(0.014)	(0.006)
I(maeduc^2)		0.000	-0.001	-0.008
		(0.001)	(0.001)	(0.006)
sexfemale				-0.020
				(0.004)
raceblack				-0.048
				(0.007)
raceother				0.068
				(0.009)
Num.Obs.	45522	45522	45522	45522
R2	0.238	0.225	0.274	0.276
R2 Adj.	0.238	0.225	0.274	0.276
AIC	218184.8	218948.7	215979.6	51379.3
BIC	218219.7	218983.6	216031.9	51457.8
Log.Lik.	-109088.411	-109470.372	-107983.793	-25680.657
F	7100.778	6603.862	4292.648	2481.558
RMSE	2.66	2.68	2.59	0.43

# Categorize the 'maeduc' variable
gss_cleaned <- gss_cleaned %>%
  mutate(maeduc_group = case_when(
    maeduc >= 0 & maeduc <= 6 ~ "Elementary school",
    maeduc >= 7 & maeduc <= 9 ~ "Middle school",
    maeduc >= 10 & maeduc <= 12 ~ "High school",
    maeduc >= 13 & maeduc <= 16 ~ "Undergraduate",
    maeduc >= 17 ~ "Graduate or Higher"
  ))
# Convert 'maeduc_group' to an ordered factor
gss_cleaned <- gss_cleaned %>%
  mutate(maeduc_group = factor(maeduc_group, 
                               levels = c("Elementary school", 
                                          "Middle school", 
                                          "High school", 
                                          "Undergraduate", 
                                          "Graduate or Higher"),
                               ordered = TRUE))


interaction_model <- lm(educ ~ paeduc * maeduc_group, data = gss_cleaned)

# Generate predicted values
gss_cleaned$predicted_educ <- predict(interaction_model)

ggplot(gss_cleaned, aes(x = paeduc, y = educ)) +
  geom_point(alpha = 0.5, aes(color = maeduc_group)) +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  facet_wrap(~ maeduc_group) +
  theme_minimal() +
  labs(title = "Table 6: Interaction between Father's and Mother's Education on Respondent's Education",
       x = "Father's Education",
       y = "Respondent's Education",
       caption = "Faceted by Mother's Education Level")

## `geom_smooth()` using formula = 'y ~ x'