library(fst)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 3.5.1 ✔ stringr 1.5.1
## ✔ lubridate 1.9.4 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(gt)
# Load the full GSS dataset
gss <- read_fst("gss2022.fst")
# Initial look at data dimensions
dim(gss)
## [1] 72390 6646
gssia <- gss %>%
filter(year >= 2002 & year <= 2022) %>%
mutate(
educ_level = case_when(
degree == "less than high school" ~ "No High School Diploma",
degree == "high school" ~ "High School Diploma",
degree == "associate/junior college" ~ "Associate/Junior College",
degree == "bachelor's" ~ "Bachelor's Degree",
degree == "graduate" ~ "Graduate Degree",
TRUE ~ NA_character_
),
educ_level = factor(educ_level,
levels = c("No High School Diploma", "High School Diploma", "Associate/Junior College", "Bachelor's Degree", "Graduate Degree"))
)
table(gssia$educ_level)
##
## No High School Diploma High School Diploma Associate/Junior College
## 3870 15332 2557
## Bachelor's Degree Graduate Degree
## 6043 3614
gssS <- gss %>%
filter(year >= 2002 & year <= 2022) %>%
mutate(
male_female = case_when(
sex == "male" ~ "Male",
sex == "female" ~ "Female",
TRUE ~ NA_character_
),
male_female = factor(male_female,
levels = c("Male", "Female"))
)
table(gssS$male_female)
##
## Male Female
## 14049 17296
gssA <- gss %>%
filter(year >= 2002 & year <= 2022) %>%
mutate(
young_old = case_when(
age >= 24 & age <= 29 ~ "Young Adulthood", # 24 to 29 inclusive
age >= 30 & age <= 39 ~ "Adulthood", # 30 to 39
age >= 40 & age <= 49 ~ "Midlife", # 40 to 49
age >= 50 & age <= 64 ~ "Late Adulthood", # 50 to 64
age >= 65 & age <= 75 ~ "Senior", # 65 to 75
TRUE ~ NA_character_
),
young_old = factor(young_old,
levels = c("Young Adulthood", "Adulthood", "Midlife", "Late Adulthood", "Senior"))
)
table(gssA$young_old)
##
## Young Adulthood Adulthood Midlife Late Adulthood Senior
## 3102 5892 5582 7854 4036
gssC <- gss %>%
filter(year >= 2002 & year <= 2022) %>%
mutate(
number_kids = case_when(
childs == 0 ~ "No children", # 0 children
childs >= 1 & childs <= 2 ~ "Small family", # 1-2 children
childs >= 3 & childs <= 4 ~ "Medium-sized family", # 3-4 children
childs >= 5 & childs <= 6 ~ "Large family", # 5-6 children
childs >= 7 & childs <= 8 ~ "Very large family", # 7-8 children (grouped)
TRUE ~ NA_character_
),
number_kids = factor(number_kids,
levels = c("No children", "Small family", "Medium-sized family", "Large family", "Very large family"))
)
table(gssC$number_kids)
##
## No children Small family Medium-sized family Large family
## 8835 13235 7355 1405
## Very large family
## 516
gssia <- gssia %>% select(year, educ_level)
gssS <- gssS %>% select(year, male_female)
gssA <- gssA %>% select(year, young_old)
gssC <- gssC %>% select(year, number_kids)
table_gender_edu <- gssia %>%
left_join(gssS, by = "year") %>%
group_by(male_female, educ_level) %>%
summarise(N = n(), .groups = "drop") %>%
mutate(Proportion = N / sum(N) * 100)
## Warning in left_join(., gssS, by = "year"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
print(table_gender_edu)
## # A tibble: 18 × 4
## male_female educ_level N Proportion
## <fct> <fct> <int> <dbl>
## 1 Male No High School Diploma 5225869 5.38
## 2 Male High School Diploma 20916511 21.6
## 3 Male Associate/Junior College 3565740 3.67
## 4 Male Bachelor's Degree 8408081 8.66
## 5 Male Graduate Degree 5087084 5.24
## 6 Male <NA> 64495 0.0665
## 7 Female No High School Diploma 6440320 6.64
## 8 Female High School Diploma 25773930 26.6
## 9 Female Associate/Junior College 4393837 4.53
## 10 Female Bachelor's Degree 10372585 10.7
## 11 Female Graduate Degree 6279480 6.47
## 12 Female <NA> 81251 0.0837
## 13 <NA> No High School Diploma 29812 0.0307
## 14 <NA> High School Diploma 180004 0.185
## 15 <NA> Associate/Junior College 40380 0.0416
## 16 <NA> Bachelor's Degree 110052 0.113
## 17 <NA> Graduate Degree 79460 0.0819
## 18 <NA> <NA> 2116 0.00218
table_gender_edu %>%
gt() %>%
cols_label(
male_female = "Gender",
educ_level = "Education Level",
N = "Count (N)",
Proportion = "Proportion (%)"
) %>%
tab_style(
style = cell_text(weight = "bold"),
locations = cells_column_labels(
columns = vars(Proportion)
)
) %>%
fmt_number(
columns = vars(N),
decimals = 0,
use_seps = TRUE
) %>%
fmt_percent(
columns = vars(Proportion),
decimals = 1,
scale_values = FALSE
) %>%
tab_header(
title = md("**Proportion of Education Levels by Gender**")
) %>%
tab_source_note(
source_note = "Note: Percentages are based on the total sample. Data reflects the distribution of education levels across gender."
) %>%
tab_options(
table.border.top.width = px(2),
table.border.bottom.width = px(2),
column_labels.border.top.width = px(1),
column_labels.border.bottom.width = px(1)
)
## Warning: Since gt v0.3.0, `columns = vars(...)` has been deprecated.
## • Please use `columns = c(...)` instead.
## Since gt v0.3.0, `columns = vars(...)` has been deprecated.
## • Please use `columns = c(...)` instead.
## Since gt v0.3.0, `columns = vars(...)` has been deprecated.
## • Please use `columns = c(...)` instead.
| Proportion of Education Levels by Gender | |||
| Gender | Education Level | Count (N) | Proportion (%) |
|---|---|---|---|
| Male | No High School Diploma | 5,225,869 | 5.4% |
| Male | High School Diploma | 20,916,511 | 21.6% |
| Male | Associate/Junior College | 3,565,740 | 3.7% |
| Male | Bachelor's Degree | 8,408,081 | 8.7% |
| Male | Graduate Degree | 5,087,084 | 5.2% |
| Male | NA | 64,495 | 0.1% |
| Female | No High School Diploma | 6,440,320 | 6.6% |
| Female | High School Diploma | 25,773,930 | 26.6% |
| Female | Associate/Junior College | 4,393,837 | 4.5% |
| Female | Bachelor's Degree | 10,372,585 | 10.7% |
| Female | Graduate Degree | 6,279,480 | 6.5% |
| Female | NA | 81,251 | 0.1% |
| NA | No High School Diploma | 29,812 | 0.0% |
| NA | High School Diploma | 180,004 | 0.2% |
| NA | Associate/Junior College | 40,380 | 0.0% |
| NA | Bachelor's Degree | 110,052 | 0.1% |
| NA | Graduate Degree | 79,460 | 0.1% |
| NA | NA | 2,116 | 0.0% |
| Note: Percentages are based on the total sample. Data reflects the distribution of education levels across gender. | |||
table_age_edu <- gssia %>%
left_join(gssA, by = "year") %>%
group_by(young_old, educ_level) %>%
summarise(N = n(), .groups = "drop") %>%
mutate(Proportion = N / sum(N) * 100)
## Warning in left_join(., gssA, by = "year"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
print(table_age_edu)
## # A tibble: 36 × 4
## young_old educ_level N Proportion
## <fct> <fct> <int> <dbl>
## 1 Young Adulthood No High School Diploma 1171865 1.21
## 2 Young Adulthood High School Diploma 4600139 4.74
## 3 Young Adulthood Associate/Junior College 768759 0.792
## 4 Young Adulthood Bachelor's Degree 1793003 1.85
## 5 Young Adulthood Graduate Degree 1065671 1.10
## 6 Young Adulthood <NA> 12368 0.0127
## 7 Adulthood No High School Diploma 2220591 2.29
## 8 Adulthood High School Diploma 8794306 9.06
## 9 Adulthood Associate/Junior College 1485580 1.53
## 10 Adulthood Bachelor's Degree 3481284 3.59
## # ℹ 26 more rows
table_age_edu %>%
gt() %>%
cols_label(
young_old = "Age Group",
educ_level = "Education Level",
N = "Count (N)",
Proportion = "Proportion (%)"
) %>%
tab_style(
style = cell_text(weight = "bold"),
locations = cells_column_labels(
columns = vars(Proportion)
)
) %>%
fmt_number(
columns = vars(N),
decimals = 0,
use_seps = TRUE
) %>%
fmt_percent(
columns = vars(Proportion),
decimals = 1,
scale_values = FALSE
) %>%
tab_header(
title = md("**Proportion of Education Levels by Age Group**")
) %>%
tab_source_note(
source_note = "Note: Percentages are based on the total sample. Data reflects the distribution of education levels across age groups."
) %>%
tab_options(
table.border.top.width = px(2),
table.border.bottom.width = px(2),
column_labels.border.top.width = px(1),
column_labels.border.bottom.width = px(1)
)
## Warning: Since gt v0.3.0, `columns = vars(...)` has been deprecated.
## • Please use `columns = c(...)` instead.
## Since gt v0.3.0, `columns = vars(...)` has been deprecated.
## • Please use `columns = c(...)` instead.
## Since gt v0.3.0, `columns = vars(...)` has been deprecated.
## • Please use `columns = c(...)` instead.
| Proportion of Education Levels by Age Group | |||
| Age Group | Education Level | Count (N) | Proportion (%) |
|---|---|---|---|
| Young Adulthood | No High School Diploma | 1,171,865 | 1.2% |
| Young Adulthood | High School Diploma | 4,600,139 | 4.7% |
| Young Adulthood | Associate/Junior College | 768,759 | 0.8% |
| Young Adulthood | Bachelor's Degree | 1,793,003 | 1.8% |
| Young Adulthood | Graduate Degree | 1,065,671 | 1.1% |
| Young Adulthood | NA | 12,368 | 0.0% |
| Adulthood | No High School Diploma | 2,220,591 | 2.3% |
| Adulthood | High School Diploma | 8,794,306 | 9.1% |
| Adulthood | Associate/Junior College | 1,485,580 | 1.5% |
| Adulthood | Bachelor's Degree | 3,481,284 | 3.6% |
| Adulthood | Graduate Degree | 2,087,998 | 2.2% |
| Adulthood | NA | 25,518 | 0.0% |
| Midlife | No High School Diploma | 2,152,802 | 2.2% |
| Midlife | High School Diploma | 8,423,293 | 8.7% |
| Midlife | Associate/Junior College | 1,419,663 | 1.5% |
| Midlife | Bachelor's Degree | 3,305,265 | 3.4% |
| Midlife | Graduate Degree | 1,971,621 | 2.0% |
| Midlife | NA | 23,957 | 0.0% |
| Late Adulthood | No High School Diploma | 2,902,358 | 3.0% |
| Late Adulthood | High School Diploma | 11,625,959 | 12.0% |
| Late Adulthood | Associate/Junior College | 1,979,810 | 2.0% |
| Late Adulthood | Bachelor's Degree | 4,687,072 | 4.8% |
| Late Adulthood | Graduate Degree | 2,840,185 | 2.9% |
| Late Adulthood | NA | 37,150 | 0.0% |
| Senior | No High School Diploma | 1,433,703 | 1.5% |
| Senior | High School Diploma | 5,985,596 | 6.2% |
| Senior | Associate/Junior College | 1,051,889 | 1.1% |
| Senior | Bachelor's Degree | 2,534,805 | 2.6% |
| Senior | Graduate Degree | 1,576,632 | 1.6% |
| Senior | NA | 22,716 | 0.0% |
| NA | No High School Diploma | 1,814,682 | 1.9% |
| NA | High School Diploma | 7,441,152 | 7.7% |
| NA | Associate/Junior College | 1,294,256 | 1.3% |
| NA | Bachelor's Degree | 3,089,289 | 3.2% |
| NA | Graduate Degree | 1,903,917 | 2.0% |
| NA | NA | 26,153 | 0.0% |
| Note: Percentages are based on the total sample. Data reflects the distribution of education levels across age groups. | |||
table_children_edu <- gssia %>%
left_join(gssC, by = "year") %>%
group_by(number_kids, educ_level) %>%
summarise(N = n(), .groups = "drop") %>%
mutate(Proportion = N / sum(N) * 100)
## Warning in left_join(., gssC, by = "year"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
print(table_children_edu)
## # A tibble: 36 × 4
## number_kids educ_level N Proportion
## <fct> <fct> <int> <dbl>
## 1 No children No High School Diploma 3270512 3.37
## 2 No children High School Diploma 13186485 13.6
## 3 No children Associate/Junior College 2258470 2.33
## 4 No children Bachelor's Degree 5344585 5.51
## 5 No children Graduate Degree 3249765 3.35
## 6 No children <NA> 42056 0.0433
## 7 Small family No High School Diploma 4895625 5.04
## 8 Small family High School Diploma 19691813 20.3
## 9 Small family Associate/Junior College 3371830 3.47
## 10 Small family Bachelor's Degree 7984991 8.23
## # ℹ 26 more rows
table_children_edu %>%
gt() %>%
cols_label(
number_kids = "Number of Children",
educ_level = "Education Level",
N = "Count (N)",
Proportion = "Proportion (%)"
) %>%
tab_style(
style = cell_text(weight = "bold"),
locations = cells_column_labels(
columns = vars(Proportion)
)
) %>%
fmt_number(
columns = vars(N),
decimals = 0,
use_seps = TRUE
) %>%
fmt_percent(
columns = vars(Proportion),
decimals = 1,
scale_values = FALSE
) %>%
tab_header(
title = md("**Proportion of Education Levels by Number of Children**")
) %>%
tab_source_note(
source_note = "Note: Percentages are based on the total sample. Data reflects the distribution of education levels across different numbers of children."
) %>%
tab_options(
table.border.top.width = px(2),
table.border.bottom.width = px(2),
column_labels.border.top.width = px(1),
column_labels.border.bottom.width = px(1)
)
## Warning: Since gt v0.3.0, `columns = vars(...)` has been deprecated.
## • Please use `columns = c(...)` instead.
## Since gt v0.3.0, `columns = vars(...)` has been deprecated.
## • Please use `columns = c(...)` instead.
## Since gt v0.3.0, `columns = vars(...)` has been deprecated.
## • Please use `columns = c(...)` instead.
| Proportion of Education Levels by Number of Children | |||
| Number of Children | Education Level | Count (N) | Proportion (%) |
|---|---|---|---|
| No children | No High School Diploma | 3,270,512 | 3.4% |
| No children | High School Diploma | 13,186,485 | 13.6% |
| No children | Associate/Junior College | 2,258,470 | 2.3% |
| No children | Bachelor's Degree | 5,344,585 | 5.5% |
| No children | Graduate Degree | 3,249,765 | 3.3% |
| No children | NA | 42,056 | 0.0% |
| Small family | No High School Diploma | 4,895,625 | 5.0% |
| Small family | High School Diploma | 19,691,813 | 20.3% |
| Small family | Associate/Junior College | 3,371,830 | 3.5% |
| Small family | Bachelor's Degree | 7,984,991 | 8.2% |
| Small family | Graduate Degree | 4,854,488 | 5.0% |
| Small family | NA | 64,142 | 0.1% |
| Medium-sized family | No High School Diploma | 2,766,185 | 2.9% |
| Medium-sized family | High School Diploma | 10,975,956 | 11.3% |
| Medium-sized family | Associate/Junior College | 1,860,179 | 1.9% |
| Medium-sized family | Bachelor's Degree | 4,367,414 | 4.5% |
| Medium-sized family | Graduate Degree | 2,626,485 | 2.7% |
| Medium-sized family | NA | 32,991 | 0.0% |
| Large family | No High School Diploma | 530,500 | 0.5% |
| Large family | High School Diploma | 2,081,720 | 2.1% |
| Large family | Associate/Junior College | 349,133 | 0.4% |
| Large family | Bachelor's Degree | 814,776 | 0.8% |
| Large family | Graduate Degree | 484,783 | 0.5% |
| Large family | NA | 5,613 | 0.0% |
| Very large family | No High School Diploma | 195,755 | 0.2% |
| Very large family | High School Diploma | 759,818 | 0.8% |
| Very large family | Associate/Junior College | 126,514 | 0.1% |
| Very large family | Bachelor's Degree | 292,743 | 0.3% |
| Very large family | Graduate Degree | 172,872 | 0.2% |
| Very large family | NA | 1,798 | 0.0% |
| NA | No High School Diploma | 37,424 | 0.0% |
| NA | High School Diploma | 174,653 | 0.2% |
| NA | Associate/Junior College | 33,831 | 0.0% |
| NA | Bachelor's Degree | 86,209 | 0.1% |
| NA | Graduate Degree | 57,631 | 0.1% |
| NA | NA | 1,262 | 0.0% |
| Note: Percentages are based on the total sample. Data reflects the distribution of education levels across different numbers of children. | |||
ggplot(gssia %>% left_join(gssS, by = "year"), aes(x = male_female, fill = educ_level)) +
geom_bar(position = "fill") +
labs(title = "Proportion of Education Levels by Gender", x = "Gender", y = "Proportion") +
scale_y_continuous(labels = scales::percent) +
theme_minimal()
## Warning in left_join(., gssS, by = "year"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
ggplot(gssia %>% left_join(gssA, by = "year"), aes(x = young_old, fill = educ_level)) +
geom_bar(position = "fill") +
labs(title = "Proportion of Education Levels by Age Group",
x = "Age Group",
y = "Proportion") +
scale_y_continuous(labels = scales::percent) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5)) +
scale_x_discrete(expand = expansion(mult = 0.2))
## Warning in left_join(., gssA, by = "year"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
edu_fam_visual <- ggplot(gssia %>% left_join(gssC, by = "year"), aes(x = number_kids, fill = educ_level)) +
geom_bar(position = "fill") +
labs(title = "Proportion of Education Levels by Family Size",
x = "Number of Children",
y = "Proportion") +
scale_y_continuous(labels = scales::percent) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5)) +
scale_x_discrete(expand = expansion(mult = 0.2))
## Warning in left_join(., gssC, by = "year"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.