library(fst)
library(dplyr)  
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)  
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.5.1     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(gt) 
# Load the full GSS dataset
gss <- read_fst("gss2022.fst")

# Initial look at data dimensions
dim(gss)
## [1] 72390  6646
gssia <- gss %>%
  filter(year >= 2002 & year <= 2022) %>%  
  mutate(
    educ_level = case_when(
      degree == "less than high school" ~ "No High School Diploma",
      degree == "high school" ~ "High School Diploma",
      degree == "associate/junior college" ~ "Associate/Junior College",
      degree == "bachelor's" ~ "Bachelor's Degree",
      degree == "graduate" ~ "Graduate Degree",
      TRUE ~ NA_character_  
    ),
    educ_level = factor(educ_level, 
                        levels = c("No High School Diploma", "High School Diploma", "Associate/Junior College", "Bachelor's Degree", "Graduate Degree"))
  )


table(gssia$educ_level)
## 
##   No High School Diploma      High School Diploma Associate/Junior College 
##                     3870                    15332                     2557 
##        Bachelor's Degree          Graduate Degree 
##                     6043                     3614
gssS <- gss %>%
  filter(year >= 2002 & year <= 2022) %>%  
  mutate(
    male_female = case_when(
      sex == "male" ~ "Male",
      sex == "female" ~ "Female",
      TRUE ~ NA_character_  
    ),
    male_female = factor(male_female, 
                        levels = c("Male", "Female"))
  )


table(gssS$male_female)
## 
##   Male Female 
##  14049  17296
gssA <- gss %>%
  filter(year >= 2002 & year <= 2022) %>%  
  mutate(
    young_old = case_when(
      age >= 24 & age <= 29 ~ "Young Adulthood",  # 24 to 29 inclusive
      age >= 30 & age <= 39 ~ "Adulthood",        # 30 to 39
      age >= 40 & age <= 49 ~ "Midlife",          # 40 to 49
      age >= 50 & age <= 64 ~ "Late Adulthood",   # 50 to 64
      age >= 65 & age <= 75 ~ "Senior",           # 65 to 75
      TRUE ~ NA_character_  
    ),
    young_old = factor(young_old, 
                       levels = c("Young Adulthood", "Adulthood", "Midlife", "Late Adulthood", "Senior"))  
  )


table(gssA$young_old)
## 
## Young Adulthood       Adulthood         Midlife  Late Adulthood          Senior 
##            3102            5892            5582            7854            4036
gssC <- gss %>%
  filter(year >= 2002 & year <= 2022) %>% 
  mutate(
    number_kids = case_when(
      childs == 0 ~ "No children",                # 0 children
      childs >= 1 & childs <= 2 ~ "Small family",  # 1-2 children
      childs >= 3 & childs <= 4 ~ "Medium-sized family",  # 3-4 children
      childs >= 5 & childs <= 6 ~ "Large family",  # 5-6 children
      childs >= 7 & childs <= 8 ~ "Very large family",  # 7-8 children (grouped)
      TRUE ~ NA_character_  
    ),
    number_kids = factor(number_kids, 
                         levels = c("No children", "Small family", "Medium-sized family", "Large family", "Very large family")) 
  )


table(gssC$number_kids)
## 
##         No children        Small family Medium-sized family        Large family 
##                8835               13235                7355                1405 
##   Very large family 
##                 516
gssia <- gssia %>% select(year, educ_level)
gssS <- gssS %>% select(year, male_female)
gssA <- gssA %>% select(year, young_old)
gssC <- gssC %>% select(year, number_kids)
table_gender_edu <- gssia %>%
  left_join(gssS, by = "year") %>%
  group_by(male_female, educ_level) %>%
  summarise(N = n(), .groups = "drop") %>%
  mutate(Proportion = N / sum(N) * 100)
## Warning in left_join(., gssS, by = "year"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
print(table_gender_edu)
## # A tibble: 18 × 4
##    male_female educ_level                      N Proportion
##    <fct>       <fct>                       <int>      <dbl>
##  1 Male        No High School Diploma    5225869    5.38   
##  2 Male        High School Diploma      20916511   21.6    
##  3 Male        Associate/Junior College  3565740    3.67   
##  4 Male        Bachelor's Degree         8408081    8.66   
##  5 Male        Graduate Degree           5087084    5.24   
##  6 Male        <NA>                        64495    0.0665 
##  7 Female      No High School Diploma    6440320    6.64   
##  8 Female      High School Diploma      25773930   26.6    
##  9 Female      Associate/Junior College  4393837    4.53   
## 10 Female      Bachelor's Degree        10372585   10.7    
## 11 Female      Graduate Degree           6279480    6.47   
## 12 Female      <NA>                        81251    0.0837 
## 13 <NA>        No High School Diploma      29812    0.0307 
## 14 <NA>        High School Diploma        180004    0.185  
## 15 <NA>        Associate/Junior College    40380    0.0416 
## 16 <NA>        Bachelor's Degree          110052    0.113  
## 17 <NA>        Graduate Degree             79460    0.0819 
## 18 <NA>        <NA>                         2116    0.00218
table_gender_edu %>%
  gt() %>%
  
  cols_label(
    male_female = "Gender",
    educ_level = "Education Level",
    N = "Count (N)",
    Proportion = "Proportion (%)"
  ) %>%
  
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_column_labels(
      columns = vars(Proportion)
    )
  ) %>%
  

  fmt_number(
    columns = vars(N),
    decimals = 0,
    use_seps = TRUE
  ) %>%
  

  fmt_percent(
    columns = vars(Proportion),
    decimals = 1,
    scale_values = FALSE
  ) %>%
  

  tab_header(
    title = md("**Proportion of Education Levels by Gender**")
  ) %>%
  tab_source_note(
    source_note = "Note: Percentages are based on the total sample. Data reflects the distribution of education levels across gender."
  ) %>%
  

  tab_options(
    table.border.top.width = px(2),    
    table.border.bottom.width = px(2), 
    column_labels.border.top.width = px(1),   
    column_labels.border.bottom.width = px(1)  
  )
## Warning: Since gt v0.3.0, `columns = vars(...)` has been deprecated.
## • Please use `columns = c(...)` instead.
## Since gt v0.3.0, `columns = vars(...)` has been deprecated.
## • Please use `columns = c(...)` instead.
## Since gt v0.3.0, `columns = vars(...)` has been deprecated.
## • Please use `columns = c(...)` instead.
Proportion of Education Levels by Gender
Gender Education Level Count (N) Proportion (%)
Male No High School Diploma 5,225,869 5.4%
Male High School Diploma 20,916,511 21.6%
Male Associate/Junior College 3,565,740 3.7%
Male Bachelor's Degree 8,408,081 8.7%
Male Graduate Degree 5,087,084 5.2%
Male NA 64,495 0.1%
Female No High School Diploma 6,440,320 6.6%
Female High School Diploma 25,773,930 26.6%
Female Associate/Junior College 4,393,837 4.5%
Female Bachelor's Degree 10,372,585 10.7%
Female Graduate Degree 6,279,480 6.5%
Female NA 81,251 0.1%
NA No High School Diploma 29,812 0.0%
NA High School Diploma 180,004 0.2%
NA Associate/Junior College 40,380 0.0%
NA Bachelor's Degree 110,052 0.1%
NA Graduate Degree 79,460 0.1%
NA NA 2,116 0.0%
Note: Percentages are based on the total sample. Data reflects the distribution of education levels across gender.
table_age_edu <- gssia %>%
  left_join(gssA, by = "year") %>%
  group_by(young_old, educ_level) %>%
  summarise(N = n(), .groups = "drop") %>%
  mutate(Proportion = N / sum(N) * 100)
## Warning in left_join(., gssA, by = "year"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
print(table_age_edu)
## # A tibble: 36 × 4
##    young_old       educ_level                     N Proportion
##    <fct>           <fct>                      <int>      <dbl>
##  1 Young Adulthood No High School Diploma   1171865     1.21  
##  2 Young Adulthood High School Diploma      4600139     4.74  
##  3 Young Adulthood Associate/Junior College  768759     0.792 
##  4 Young Adulthood Bachelor's Degree        1793003     1.85  
##  5 Young Adulthood Graduate Degree          1065671     1.10  
##  6 Young Adulthood <NA>                       12368     0.0127
##  7 Adulthood       No High School Diploma   2220591     2.29  
##  8 Adulthood       High School Diploma      8794306     9.06  
##  9 Adulthood       Associate/Junior College 1485580     1.53  
## 10 Adulthood       Bachelor's Degree        3481284     3.59  
## # ℹ 26 more rows
table_age_edu %>%
  gt() %>%
  

  cols_label(
    young_old = "Age Group",
    educ_level = "Education Level",
    N = "Count (N)",
    Proportion = "Proportion (%)"
  ) %>%
  

  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_column_labels(
      columns = vars(Proportion)
    )
  ) %>%
  
 
  fmt_number(
    columns = vars(N),
    decimals = 0,
    use_seps = TRUE
  ) %>%
  

  fmt_percent(
    columns = vars(Proportion),
    decimals = 1,
    scale_values = FALSE
  ) %>%
  

  tab_header(
    title = md("**Proportion of Education Levels by Age Group**")
  ) %>%
  tab_source_note(
    source_note = "Note: Percentages are based on the total sample. Data reflects the distribution of education levels across age groups."
  ) %>%
  

  tab_options(
    table.border.top.width = px(2),   
    table.border.bottom.width = px(2), 
    column_labels.border.top.width = px(1),  
    column_labels.border.bottom.width = px(1)  
  )
## Warning: Since gt v0.3.0, `columns = vars(...)` has been deprecated.
## • Please use `columns = c(...)` instead.
## Since gt v0.3.0, `columns = vars(...)` has been deprecated.
## • Please use `columns = c(...)` instead.
## Since gt v0.3.0, `columns = vars(...)` has been deprecated.
## • Please use `columns = c(...)` instead.
Proportion of Education Levels by Age Group
Age Group Education Level Count (N) Proportion (%)
Young Adulthood No High School Diploma 1,171,865 1.2%
Young Adulthood High School Diploma 4,600,139 4.7%
Young Adulthood Associate/Junior College 768,759 0.8%
Young Adulthood Bachelor's Degree 1,793,003 1.8%
Young Adulthood Graduate Degree 1,065,671 1.1%
Young Adulthood NA 12,368 0.0%
Adulthood No High School Diploma 2,220,591 2.3%
Adulthood High School Diploma 8,794,306 9.1%
Adulthood Associate/Junior College 1,485,580 1.5%
Adulthood Bachelor's Degree 3,481,284 3.6%
Adulthood Graduate Degree 2,087,998 2.2%
Adulthood NA 25,518 0.0%
Midlife No High School Diploma 2,152,802 2.2%
Midlife High School Diploma 8,423,293 8.7%
Midlife Associate/Junior College 1,419,663 1.5%
Midlife Bachelor's Degree 3,305,265 3.4%
Midlife Graduate Degree 1,971,621 2.0%
Midlife NA 23,957 0.0%
Late Adulthood No High School Diploma 2,902,358 3.0%
Late Adulthood High School Diploma 11,625,959 12.0%
Late Adulthood Associate/Junior College 1,979,810 2.0%
Late Adulthood Bachelor's Degree 4,687,072 4.8%
Late Adulthood Graduate Degree 2,840,185 2.9%
Late Adulthood NA 37,150 0.0%
Senior No High School Diploma 1,433,703 1.5%
Senior High School Diploma 5,985,596 6.2%
Senior Associate/Junior College 1,051,889 1.1%
Senior Bachelor's Degree 2,534,805 2.6%
Senior Graduate Degree 1,576,632 1.6%
Senior NA 22,716 0.0%
NA No High School Diploma 1,814,682 1.9%
NA High School Diploma 7,441,152 7.7%
NA Associate/Junior College 1,294,256 1.3%
NA Bachelor's Degree 3,089,289 3.2%
NA Graduate Degree 1,903,917 2.0%
NA NA 26,153 0.0%
Note: Percentages are based on the total sample. Data reflects the distribution of education levels across age groups.
table_children_edu <- gssia %>%
  left_join(gssC, by = "year") %>%
  group_by(number_kids, educ_level) %>%
  summarise(N = n(), .groups = "drop") %>%
  mutate(Proportion = N / sum(N) * 100)
## Warning in left_join(., gssC, by = "year"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
print(table_children_edu)
## # A tibble: 36 × 4
##    number_kids  educ_level                      N Proportion
##    <fct>        <fct>                       <int>      <dbl>
##  1 No children  No High School Diploma    3270512     3.37  
##  2 No children  High School Diploma      13186485    13.6   
##  3 No children  Associate/Junior College  2258470     2.33  
##  4 No children  Bachelor's Degree         5344585     5.51  
##  5 No children  Graduate Degree           3249765     3.35  
##  6 No children  <NA>                        42056     0.0433
##  7 Small family No High School Diploma    4895625     5.04  
##  8 Small family High School Diploma      19691813    20.3   
##  9 Small family Associate/Junior College  3371830     3.47  
## 10 Small family Bachelor's Degree         7984991     8.23  
## # ℹ 26 more rows
table_children_edu %>%
  gt() %>%
  

  cols_label(
    number_kids = "Number of Children",
    educ_level = "Education Level",
    N = "Count (N)",
    Proportion = "Proportion (%)"
  ) %>%
  
 
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_column_labels(
      columns = vars(Proportion)
    )
  ) %>%
  
 
  fmt_number(
    columns = vars(N),
    decimals = 0,
    use_seps = TRUE
  ) %>%
  

  fmt_percent(
    columns = vars(Proportion),
    decimals = 1,
    scale_values = FALSE
  ) %>%
  
  
  tab_header(
    title = md("**Proportion of Education Levels by Number of Children**")
  ) %>%
  tab_source_note(
    source_note = "Note: Percentages are based on the total sample. Data reflects the distribution of education levels across different numbers of children."
  ) %>%
  
  
  tab_options(
    table.border.top.width = px(2),    
    table.border.bottom.width = px(2), 
    column_labels.border.top.width = px(1),   
    column_labels.border.bottom.width = px(1)  
  )
## Warning: Since gt v0.3.0, `columns = vars(...)` has been deprecated.
## • Please use `columns = c(...)` instead.
## Since gt v0.3.0, `columns = vars(...)` has been deprecated.
## • Please use `columns = c(...)` instead.
## Since gt v0.3.0, `columns = vars(...)` has been deprecated.
## • Please use `columns = c(...)` instead.
Proportion of Education Levels by Number of Children
Number of Children Education Level Count (N) Proportion (%)
No children No High School Diploma 3,270,512 3.4%
No children High School Diploma 13,186,485 13.6%
No children Associate/Junior College 2,258,470 2.3%
No children Bachelor's Degree 5,344,585 5.5%
No children Graduate Degree 3,249,765 3.3%
No children NA 42,056 0.0%
Small family No High School Diploma 4,895,625 5.0%
Small family High School Diploma 19,691,813 20.3%
Small family Associate/Junior College 3,371,830 3.5%
Small family Bachelor's Degree 7,984,991 8.2%
Small family Graduate Degree 4,854,488 5.0%
Small family NA 64,142 0.1%
Medium-sized family No High School Diploma 2,766,185 2.9%
Medium-sized family High School Diploma 10,975,956 11.3%
Medium-sized family Associate/Junior College 1,860,179 1.9%
Medium-sized family Bachelor's Degree 4,367,414 4.5%
Medium-sized family Graduate Degree 2,626,485 2.7%
Medium-sized family NA 32,991 0.0%
Large family No High School Diploma 530,500 0.5%
Large family High School Diploma 2,081,720 2.1%
Large family Associate/Junior College 349,133 0.4%
Large family Bachelor's Degree 814,776 0.8%
Large family Graduate Degree 484,783 0.5%
Large family NA 5,613 0.0%
Very large family No High School Diploma 195,755 0.2%
Very large family High School Diploma 759,818 0.8%
Very large family Associate/Junior College 126,514 0.1%
Very large family Bachelor's Degree 292,743 0.3%
Very large family Graduate Degree 172,872 0.2%
Very large family NA 1,798 0.0%
NA No High School Diploma 37,424 0.0%
NA High School Diploma 174,653 0.2%
NA Associate/Junior College 33,831 0.0%
NA Bachelor's Degree 86,209 0.1%
NA Graduate Degree 57,631 0.1%
NA NA 1,262 0.0%
Note: Percentages are based on the total sample. Data reflects the distribution of education levels across different numbers of children.
ggplot(gssia %>% left_join(gssS, by = "year"), aes(x = male_female, fill = educ_level)) +
  geom_bar(position = "fill") +
  labs(title = "Proportion of Education Levels by Gender", x = "Gender", y = "Proportion") +
  scale_y_continuous(labels = scales::percent) +
  theme_minimal()
## Warning in left_join(., gssS, by = "year"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

ggplot(gssia %>% left_join(gssA, by = "year"), aes(x = young_old, fill = educ_level)) +
  geom_bar(position = "fill") +
  labs(title = "Proportion of Education Levels by Age Group", 
       x = "Age Group", 
       y = "Proportion") +
  scale_y_continuous(labels = scales::percent) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),  
        plot.title = element_text(hjust = 0.5)) +  
  scale_x_discrete(expand = expansion(mult = 0.2))  
## Warning in left_join(., gssA, by = "year"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

edu_fam_visual <- ggplot(gssia %>% left_join(gssC, by = "year"), aes(x = number_kids, fill = educ_level)) +
  geom_bar(position = "fill") +
  labs(title = "Proportion of Education Levels by Family Size", 
       x = "Number of Children", 
       y = "Proportion") +
  scale_y_continuous(labels = scales::percent) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),  
        plot.title = element_text(hjust = 0.5)) + 
  scale_x_discrete(expand = expansion(mult = 0.2))  
## Warning in left_join(., gssC, by = "year"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.