library(tidyverse)
source("../mytheme.R")
survey_full <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-05-18/survey.csv')
survey <- survey_full %>% replace_na(list(other_monetary_comp = 0)) %>%
mutate(other_monetary_comp = as.numeric(other_monetary_comp),
compensation = annual_salary + other_monetary_comp,
gender = factor(case_when(
gender == "Man" ~ "Men",
gender == "Woman" ~ "Women",
TRUE ~ "Other"
), levels = c("Men", "Women", "Other"))) %>%
filter(currency == "USD", compensation != 0) %>%
select(compensation, gender)
survey_full
survey
mytheme
## function (angle = c(0.5, 35), legend.position = "bottom")
## {
## ggplot2::theme(text = element_text(family = "serif"), panel.background = element_blank(),
## axis.ticks = element_blank(), plot.title = element_text(hjust = 0.5),
## plot.subtitle = element_text(hjust = 0.5), plot.caption = element_text(hjust = 0),
## legend.position = legend.position, legend.background = element_blank(),
## legend.key = element_blank(), axis.text.x = element_text(vjust = angle[1],
## angle = angle[2]))
## }
GreenPal
## function (n = 7, colors = c(7, 5, 3))
## {
## RColorBrewer::brewer.pal(n, "Greens")[colors]
## }
survey_full %>%
filter(currency == "USD") %>%
group_by(gender) %>% summarize(count = n()) %>%
arrange(desc(count)) %>%
ggplot(aes(x = reorder(gender, count), y = count, fill = reorder(gender, count))) + geom_col() +
#scale_fill_manual(values = GreenPal(9, colors = c(4:9))) +
geom_label(aes(label = count), position = position_stack(vjust = 0.5), size = 3) +
coord_flip() + xlab(NULL) + ylab(NULL) + ggtitle("Gender Categories in USD Survey Data") +
mytheme(angle=NULL, legend.position = "none")
survey %>% group_by(gender) %>% summarize(count = n()) %>%
arrange(desc(count)) %>%
ggplot(aes(x = reorder(gender, count), y = count, fill = gender)) + geom_col() +
scale_fill_manual(values = GreenPal()) +
geom_label(aes(label = count), position = position_stack(vjust = 0.5), size = 3) +
coord_flip() + xlab(NULL) + ylab(NULL) + ggtitle("Gender Categories in USD Survey Data") +
mytheme(angle = NULL, legend.position = "none")
survey %>%
filter(compensation != 0) %>%
group_by(gender) %>%
summarize(count = as.factor(n()),
min = min(compensation),
median = median(compensation),
max = max(compensation)) %>%
arrange(desc(median)) %>%
mutate_if(is.numeric, scales::dollar) %>%
knitr::kable(col.names = c("Gender", "Count", "Minimum", "Median", "Maxium"),
caption = "Total Compensation by Gender")
| Gender | Count | Minimum | Median | Maxium |
|---|---|---|---|---|
| Men | 3726 | $52 | $110,650 | $102,000,000 |
| Women | 17201 | $35 | $78,000 | $3,000,000 |
| Other | 917 | $80 | $67,600 | $800,000 |
I created three plots for this project to visualize the distribution of total compensation by gender: (1) heatmap, (2) histogram, and (3) density plot. I ended up keeping only the density plot, as depicted below because it provided the cleanest representation of a pattern.
survey %>%
ggplot(aes(x = compensation,
y = gender)) +
ggridges::geom_density_ridges(aes(fill = gender),
#stat = "density",
scale = 2, alpha = 0.4, color = "white",
quantile_lines = TRUE, quantiles = 2,
from = 0, to = 400000) +
#jittered_points = TRUE, position = "raincloud",
#alpha = 0.7, scale = 0.9) +
scale_fill_manual(values = GreenPal()) + # defined in mytheme.R
coord_cartesian(xlim=c(0, 400000)) + # truncate here to not impact quantiles
xlab(NULL) + ylab(NULL) +
ggtitle("Gender Disparity in Total Annual Compensation\n") +
scale_x_continuous(labels = scales::dollar_format()) +
mytheme(angle = NULL, legend.position = "none")
# cut_interval
survey %>%
filter(compensation < 500000) %>%
ggplot() +
geom_bin2d(aes(x = compensation,
y = gender),
bins = 100) +
#geom_line(aes(x = annual_salary, y = ..density..), stat = "identity") +
coord_cartesian(xlim=c(0, 400000)) +
scale_fill_viridis_c(option = "magma", direction = -1) +
scale_x_continuous(labels = scales::dollar_format()) +
ylab(NULL) + xlab("\nAnnual Total Compensation") +
ggtitle("Gender Disparity in Total Compensation") +
guides(color = guide_colorbar()) +
mytheme(angle=NULL)
survey %>% group_by(gender) %>% mutate(median = median(compensation)) %>%
# plot
ggplot() +
geom_histogram(aes(x = compensation, fill = gender, group = gender),
alpha = 0.4, position = "identity", binwidth = 5000) +
geom_vline(aes(xintercept = median, color = gender), legend.show = FALSE) +
# labels
scale_x_continuous(labels = scales::dollar_format()) +
ylab(NULL) + xlab("\nAnnual Salary") +
## truncate here so as not to affect the median calculation above
coord_cartesian(xlim = c(0, 400000)) +
scale_fill_discrete(name = "") + ggtitle("Gender Disparity in Total Compensation\n") +
guides(color = FALSE) +
mytheme()