Scatterplots were used to examine the relationship between median salary and
unemployment rate, as well as between median salary and the share of women.
#Step 1: Load Libraries
library(fivethirtyeight)
## Some larger datasets need to be installed separately, like senators and
## house_district_forecast. To install these, we recommend you install the
## fivethirtyeightdata package by running:
## install.packages('fivethirtyeightdata', repos =
## 'https://fivethirtyeightdata.github.io/drat/', type = 'source')
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.5.2
## Warning: package 'readr' was built under R version 4.5.2
## Warning: package 'purrr' was built under R version 4.5.2
## Warning: package 'dplyr' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.2.0
## ✔ forcats 1.0.0 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ purrr::flatten() masks jsonlite::flatten()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Step 2: Load Data
# Load the recent-graduates dataset into the workspace, then preview its
# columns and types.
data(list = "college_recent_grads")
college_recent_grads %>%
  glimpse()
## Rows: 173
## Columns: 21
## $ rank <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,…
## $ major_code <int> 2419, 2416, 2415, 2417, 2405, 2418, 6202, …
## $ major <chr> "Petroleum Engineering", "Mining And Miner…
## $ major_category <chr> "Engineering", "Engineering", "Engineering…
## $ total <int> 2339, 756, 856, 1258, 32260, 2573, 3777, 1…
## $ sample_size <int> 36, 7, 3, 16, 289, 17, 51, 10, 1029, 631, …
## $ men <int> 2057, 679, 725, 1123, 21239, 2200, 2110, 8…
## $ women <int> 282, 77, 131, 135, 11021, 373, 1667, 960, …
## $ sharewomen <dbl> 0.1205643, 0.1018519, 0.1530374, 0.1073132…
## $ employed <int> 1976, 640, 648, 758, 25694, 1857, 2912, 15…
## $ employed_fulltime <int> 1849, 556, 558, 1069, 23170, 2038, 2924, 1…
## $ employed_parttime <int> 270, 170, 133, 150, 5180, 264, 296, 553, 1…
## $ employed_fulltime_yearround <int> 1207, 388, 340, 692, 16697, 1449, 2482, 82…
## $ unemployed <int> 37, 85, 16, 40, 1672, 400, 308, 33, 4650, …
## $ unemployment_rate <dbl> 0.018380527, 0.117241379, 0.024096386, 0.0…
## $ p25th <dbl> 95000, 55000, 50000, 43000, 50000, 50000, …
## $ median <dbl> 110000, 75000, 73000, 70000, 65000, 65000,…
## $ p75th <dbl> 125000, 90000, 105000, 80000, 75000, 10200…
## $ college_jobs <int> 1534, 350, 456, 529, 18314, 1142, 1768, 97…
## $ non_college_jobs <int> 364, 257, 176, 102, 4440, 657, 314, 500, 1…
## $ low_wage_jobs <int> 193, 50, 0, 0, 972, 244, 259, 220, 3253, 3…
# Step 3: Clean Data
# Keep only the rows that report a median salary.
college_clean <- drop_na(college_recent_grads, median)
# Step 4: Descriptive Statistics
# Min, quartiles, mean, and max of the median-salary column.
college_clean %>%
  pull(median) %>%
  summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 22000 33000 36000 40151 45000 110000
# Step 5: Salary by Major Category
# Order the categories by the mean of their median salaries, then draw one
# boxplot per category. coord_flip() puts the category names on the vertical
# axis so the labels stay readable.
college_clean %>%
  mutate(major_category = reorder(major_category, median, FUN = mean)) %>%
  ggplot(aes(x = major_category, y = median)) +
  geom_boxplot() +
  coord_flip() +
  labs(
    title = "Median Salary by Major Category",
    x = "Major Category",
    y = "Median Salary"
  ) +
  theme_minimal()

# Step 6: Unemployment vs Salary
# Scatterplot of median salary against unemployment rate, one point per row.
# Points are semi-transparent so overlapping observations remain visible.
college_clean %>%
  ggplot(mapping = aes(x = unemployment_rate, y = median)) +
  geom_point(alpha = 0.5) +
  ggtitle("Unemployment Rate vs Median Salary") +
  xlab("Unemployment Rate") +
  ylab("Median Salary") +
  theme_minimal()

# Step 7: Share of Women vs Salary
# `sharewomen` is stored as a proportion (values such as 0.12), so the title
# says "Share" to match the axis rather than "Percent".
# One row has no `sharewomen` value. Plotting it as-is makes geom_point() drop
# it with a "Removed 1 row" warning, so it is filtered out explicitly here to
# make the exclusion visible in the code instead.
college_clean %>%
  filter(!is.na(sharewomen)) %>%
  ggplot(aes(x = sharewomen, y = median)) +
  geom_point(alpha = 0.5) +
  labs(
    title = "Share of Women vs Median Salary",
    x = "Share of Women",
    y = "Median Salary"
  ) +
  theme_minimal()
