A short, reproducible vignette demonstrating key tidyverse functions (select, rename, mutate, arrange, slice_head, group_by, summarise, pivot_longer, ggplot, and write_csv) using the fivethirtyeight::college_recent_grads dataset.
library(tidyverse)
library(fivethirtyeight)
# Load the recent-graduates data set shipped with the fivethirtyeight package
# into the workspace, then print a compact column-by-column overview of it.
data(list = "college_recent_grads", package = "fivethirtyeight")
college_recent_grads %>% glimpse()
## Rows: 173
## Columns: 21
## $ rank <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,…
## $ major_code <int> 2419, 2416, 2415, 2417, 2405, 2418, 6202, …
## $ major <chr> "Petroleum Engineering", "Mining And Miner…
## $ major_category <chr> "Engineering", "Engineering", "Engineering…
## $ total <int> 2339, 756, 856, 1258, 32260, 2573, 3777, 1…
## $ sample_size <int> 36, 7, 3, 16, 289, 17, 51, 10, 1029, 631, …
## $ men <int> 2057, 679, 725, 1123, 21239, 2200, 2110, 8…
## $ women <int> 282, 77, 131, 135, 11021, 373, 1667, 960, …
## $ sharewomen <dbl> 0.1205643, 0.1018519, 0.1530374, 0.1073132…
## $ employed <int> 1976, 640, 648, 758, 25694, 1857, 2912, 15…
## $ employed_fulltime <int> 1849, 556, 558, 1069, 23170, 2038, 2924, 1…
## $ employed_parttime <int> 270, 170, 133, 150, 5180, 264, 296, 553, 1…
## $ employed_fulltime_yearround <int> 1207, 388, 340, 692, 16697, 1449, 2482, 82…
## $ unemployed <int> 37, 85, 16, 40, 1672, 400, 308, 33, 4650, …
## $ unemployment_rate <dbl> 0.018380527, 0.117241379, 0.024096386, 0.0…
## $ p25th <dbl> 95000, 55000, 50000, 43000, 50000, 50000, …
## $ median <dbl> 110000, 75000, 73000, 70000, 65000, 65000,…
## $ p75th <dbl> 125000, 90000, 105000, 80000, 75000, 10200…
## $ college_jobs <int> 1534, 350, 456, 529, 18314, 1142, 1768, 97…
## $ non_college_jobs <int> 364, 257, 176, 102, 4440, 657, 314, 500, 1…
## $ low_wage_jobs <int> 193, 50, 0, 0, 972, 244, 259, 220, 3253, 3…
Variables of interest
# Build the working table: give the earnings and share columns friendlier
# names first, then keep only the columns used in the rest of the vignette.
# select() fixes the final column order.
grads <- college_recent_grads %>%
  rename(
    median_earn = median,
    p25 = p25th,
    p75 = p75th,
    share_women = sharewomen
  ) %>%
  select(
    major, major_category,
    median_earn, p25, p75,
    share_women, unemployment_rate
  )
grads %>% glimpse()
## Rows: 173
## Columns: 7
## $ major <chr> "Petroleum Engineering", "Mining And Mineral Enginee…
## $ major_category <chr> "Engineering", "Engineering", "Engineering", "Engine…
## $ median_earn <dbl> 110000, 75000, 73000, 70000, 65000, 65000, 62000, 62…
## $ p25 <dbl> 95000, 55000, 50000, 43000, 50000, 50000, 53000, 315…
## $ p75 <dbl> 125000, 90000, 105000, 80000, 75000, 102000, 72000, …
## $ share_women <dbl> 0.1205643, 0.1018519, 0.1530374, 0.1073132, 0.341630…
## $ unemployment_rate <dbl> 0.018380527, 0.117241379, 0.024096386, 0.050125313, …
# Derive helper columns, then tidy up the earnings figures.
#   * share_women_pct: share_women multiplied by 100.
#   * earn_bucket: label for the median-earnings band. case_when() uses the
#     first condition that matches, so each cutoff only needs an upper bound.
# The earnings columns are rounded to the nearest 1,000 (digits = -3) only
# after the buckets have been assigned, so the buckets reflect the unrounded
# medians.
grads2 <- grads %>%
  mutate(
    share_women_pct = share_women * 100,
    earn_bucket = case_when(
      median_earn < 30000 ~ "< 30k",
      median_earn < 40000 ~ "30–39k",
      median_earn < 50000 ~ "40–49k",
      TRUE ~ "50k+"
    ),
    across(c(median_earn, p25, p75), \(x) round(x, digits = -3))
  )
# Number of majors in each earnings bucket, largest bucket first
grads2 %>% count(earn_bucket, sort = TRUE)
## # A tibble: 4 × 2
## earn_bucket n
## <chr> <int>
## 1 30–39k 84
## 2 40–49k 40
## 3 50k+ 34
## 4 < 30k 15
# Top 10 majors by median earnings:
# sort from highest to lowest (rounded) median earnings and keep the first ten
# rows, then display the columns most relevant for comparing them.
top10 <- grads2 %>%
  arrange(desc(median_earn)) %>%
  head(n = 10)
top10 %>%
  select(major, major_category, median_earn, unemployment_rate)
## # A tibble: 10 × 4
## major major_category median_earn unemployment_rate
## <chr> <chr> <dbl> <dbl>
## 1 Petroleum Engineering Engineering 110000 0.0184
## 2 Mining And Mineral Engineering Engineering 75000 0.117
## 3 Metallurgical Engineering Engineering 73000 0.0241
## 4 Naval Architecture And Marine E… Engineering 70000 0.0501
## 5 Chemical Engineering Engineering 65000 0.0611
## 6 Nuclear Engineering Engineering 65000 0.177
## 7 Actuarial Science Business 62000 0.0957
## 8 Astronomy And Astrophysics Physical Scie… 62000 0.0212
## 9 Mechanical Engineering Engineering 60000 0.0573
## 10 Electrical Engineering Engineering 60000 0.0592
# Reshape the three earnings columns into long form: one row per input row and
# statistic, with the statistic's name in `stat` and its value in `earn`.
# After the select(), everything except the two identifier columns is pivoted.
earn_long <- grads2 %>%
  select(major, major_category, median_earn, p25, p75) %>%
  pivot_longer(
    cols = !c(major, major_category),
    names_to = "stat",
    values_to = "earn"
  )
# Row count per statistic in the long table
count(earn_long, stat)
## # A tibble: 3 × 2
## stat n
## <chr> <int>
## 1 median_earn 173
## 2 p25 173
## 3 p75 173
# Per-category summary: number of majors plus the mean of the (rounded) median
# earnings, the share of women in percent, and the unemployment rate. Missing
# values are ignored in each mean. Rows are sorted from the highest to the
# lowest average median earnings.
by_category <- grads2 %>%
  group_by(major_category) %>%
  summarise(
    majors_n = n(),
    median_earn_avg = mean(median_earn, na.rm = TRUE),
    share_women_avg = mean(share_women_pct, na.rm = TRUE),
    unemp_avg = mean(unemployment_rate, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(median_earn_avg))
# Show the eight categories with the highest average median earnings
head(by_category, n = 8)
## # A tibble: 8 × 5
## major_category majors_n median_earn_avg share_women_avg unemp_avg
## <chr> <int> <dbl> <dbl> <dbl>
## 1 Engineering 29 57379. 23.9 0.0633
## 2 Business 13 43538. 48.3 0.0711
## 3 Computers & Mathematics 11 42727. 31.2 0.0843
## 4 Law & Public Policy 5 42200 48.4 0.0908
## 5 Physical Sciences 10 41900 50.9 0.0465
## 6 Social Science 9 37333. 55.4 0.0957
## 7 Agriculture & Natural Reso… 10 36900 40.5 0.0563
## 8 Health 12 36833. 79.5 0.0659
# Distribution of median earnings within each major category.
#
# fct_lump_n() keeps the categories that contain the most majors and pools the
# remaining ones into "Other"; fct_reorder() then orders the categories by
# their median of median_earn so the boxes appear sorted.
#
# Every major is drawn as a jittered point on top of its box, so:
#   * the box plot's own outlier markers are hidden (outlier.shape = NA);
#     otherwise each outlier would be drawn twice, once by each layer;
#   * points are jittered only along the category axis (height = 0), so each
#     point stays at its true earnings value;
#   * the jitter is seeded, so the figure is identical on every render.
grads2 %>%
  mutate(major_category = fct_lump_n(major_category, n = 10)) %>%
  ggplot(aes(
    x = fct_reorder(major_category, median_earn, .fun = median, na.rm = TRUE),
    y = median_earn
  )) +
  geom_boxplot(outlier.shape = NA) +
  geom_point(
    position = position_jitter(width = 0.15, height = 0, seed = 1),
    alpha = 0.2
  ) +
  coord_flip() +
  labs(
    title = "Median Earnings by Major Category (Top 10)",
    x = "Major Category",
    y = "Median Earnings (USD)",
    caption = "Data: fivethirtyeight::college_recent_grads"
  )
# Relationship between share of women and earnings:
# one point per major, coloured by major category. share_women_pct is already
# on a 0-100 scale, hence scale = 1 so the labeller only appends the "%" sign.
ggplot(
  grads2,
  aes(x = share_women_pct, y = median_earn, color = major_category)
) +
  geom_point(size = 2, alpha = 0.6) +
  scale_x_continuous(labels = scales::label_percent(scale = 1)) +
  labs(
    title = "Share of Women vs. Median Earnings",
    x = "Share of Women (%)",
    y = "Median Earnings (USD)",
    color = "Category"
  )
# Save a processed CSV
# (relative path, so the file is written to the current working directory)
readr::write_csv(
  x = grads2,
  file = "college_recent_grads_processed.csv"
)
# Record the R and package versions used for this run
sessionInfo()
## R version 4.5.0 (2025-04-11)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sequoia 15.7.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] fivethirtyeight_0.6.2 lubridate_1.9.4 forcats_1.0.0
## [4] stringr_1.5.1 dplyr_1.1.4 purrr_1.0.4
## [7] readr_2.1.5 tidyr_1.3.1 tibble_3.2.1
## [10] ggplot2_3.5.2 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 jsonlite_2.0.0 compiler_4.5.0 tidyselect_1.2.1
## [5] jquerylib_0.1.4 scales_1.4.0 yaml_2.3.10 fastmap_1.2.0
## [9] R6_2.6.1 labeling_0.4.3 generics_0.1.3 knitr_1.50
## [13] bslib_0.9.0 pillar_1.10.2 RColorBrewer_1.1-3 tzdb_0.5.0
## [17] rlang_1.1.6 utf8_1.2.4 cachem_1.1.0 stringi_1.8.7
## [21] xfun_0.52 sass_0.4.10 timechange_0.3.0 cli_3.6.5
## [25] withr_3.0.2 magrittr_2.0.3 digest_0.6.37 grid_4.5.0
## [29] rstudioapi_0.17.1 hms_1.1.3 lifecycle_1.0.4 vctrs_0.6.5
## [33] evaluate_1.0.3 glue_1.8.0 farver_2.1.2 rmarkdown_2.29
## [37] tools_4.5.0 pkgconfig_2.0.3 htmltools_0.5.8.1
This vignette shows how the Tidyverse simplifies data analysis through consistent and readable syntax. Using the college_recent_grads dataset from fivethirtyeight, I explored how functions from dplyr, tidyr, and ggplot2 can transform, summarize, and visualize real-world data efficiently. Each step—from selecting and mutating variables to reshaping and plotting—showed how the Tidyverse brings a clear, logical workflow for data wrangling and exploration. This example highlights how quickly meaningful insights can be derived when powerful tools like the Tidyverse are applied to structured data.