Setup

library(tidyverse)
source("../mytheme.R")
survey_full <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-05-18/survey.csv')

survey <- survey_full %>% replace_na(list(other_monetary_comp = 0)) %>%
  mutate(other_monetary_comp = as.numeric(other_monetary_comp),
         compensation = annual_salary + other_monetary_comp,
         gender = factor(case_when(
           gender == "Man" ~ "Men",
           gender == "Woman" ~ "Women",
           TRUE ~ "Other"
         ), levels = c("Men", "Women", "Other"))) %>%
  filter(currency == "USD", compensation != 0) %>%
  select(compensation, gender)

Data

survey_full

survey

Formatting

mytheme

## function (angle = c(0.5, 35), legend.position = "bottom") 
## {
##     ggplot2::theme(text = element_text(family = "serif"), panel.background = element_blank(), 
##         axis.ticks = element_blank(), plot.title = element_text(hjust = 0.5), 
##         plot.subtitle = element_text(hjust = 0.5), plot.caption = element_text(hjust = 0), 
##         legend.position = legend.position, legend.background = element_blank(), 
##         legend.key = element_blank(), axis.text.x = element_text(vjust = angle[1], 
##             angle = angle[2]))
## }

GreenPal

## function (n = 7, colors = c(7, 5, 3)) 
## {
##     RColorBrewer::brewer.pal(n, "Greens")[colors]
## }

Metadata

survey_full %>%
  filter(currency == "USD") %>%
  group_by(gender) %>% summarize(count = n()) %>%
  arrange(desc(count)) %>%
  
  ggplot(aes(x = reorder(gender, count), y = count, fill = reorder(gender, count))) + geom_col() +
  #scale_fill_manual(values = GreenPal(9, colors = c(4:9))) +
  geom_label(aes(label = count), position = position_stack(vjust = 0.5), size = 3) +
  coord_flip() + xlab(NULL) + ylab(NULL) + ggtitle("Gender Categories in USD Survey Data") +
  mytheme(angle=NULL, legend.position = "none")

survey %>% group_by(gender) %>% summarize(count = n()) %>%
  arrange(desc(count)) %>%
  
  ggplot(aes(x = reorder(gender, count), y = count, fill = gender)) + geom_col() +
  scale_fill_manual(values = GreenPal()) +
  geom_label(aes(label = count), position = position_stack(vjust = 0.5), size = 3) +
  coord_flip() + xlab(NULL) + ylab(NULL) + ggtitle("Gender Categories in USD Survey Data") +
  mytheme(angle = NULL, legend.position = "none")

survey %>%
  filter(compensation != 0) %>%
  group_by(gender) %>%
  summarize(count = as.factor(n()), 
            min = min(compensation),
            median = median(compensation),
            max = max(compensation)) %>%
  arrange(desc(median)) %>%
  mutate_if(is.numeric, scales::dollar) %>%
  knitr::kable(col.names = c("Gender", "Count", "Minimum", "Median", "Maxium"),
               caption = "Total Compensation by Gender")

Total Compensation by Gender
Gender	Count	Minimum	Median	Maxium
Men	3726	$52	$110,650	$102,000,000
Women	17201	$35	$78,000	$3,000,000
Other	917	$80	$67,600	$800,000

Plots

I created three plots for this project to visualize the distribution of total compensation by gender: (1) heatmap, (2) histogram, and (3) density plot. I ended up keeping only the density plot, as depicted below because it provided the cleanest representation of a pattern.

Density plot

survey %>%
 
  ggplot(aes(x = compensation,
             y = gender)) +
  ggridges::geom_density_ridges(aes(fill = gender),
                                #stat = "density",
                                scale = 2, alpha = 0.4, color = "white",
                                quantile_lines = TRUE, quantiles = 2,
                                from = 0, to = 400000) +
                                #jittered_points = TRUE, position = "raincloud",
                                #alpha = 0.7, scale = 0.9) +
  
  
  scale_fill_manual(values = GreenPal()) + # defined in mytheme.R
  coord_cartesian(xlim=c(0, 400000)) + # truncate here to not impact quantiles
  xlab(NULL) + ylab(NULL) +
  ggtitle("Gender Disparity in Total Annual Compensation\n") +
  scale_x_continuous(labels = scales::dollar_format()) +
  
  mytheme(angle = NULL, legend.position = "none")

Heatmap

# cut_interval
survey %>%
  filter(compensation < 500000) %>%
  ggplot() +
  geom_bin2d(aes(x = compensation,
                 y = gender),
             bins = 100) +
  #geom_line(aes(x = annual_salary, y = ..density..), stat = "identity") +
  coord_cartesian(xlim=c(0, 400000)) +
  scale_fill_viridis_c(option = "magma", direction = -1) +
  scale_x_continuous(labels = scales::dollar_format()) +
  ylab(NULL) + xlab("\nAnnual Total Compensation") +
  ggtitle("Gender Disparity in Total Compensation") +
  guides(color = guide_colorbar()) +
  mytheme(angle=NULL)

Histogram

survey %>% group_by(gender) %>% mutate(median = median(compensation)) %>%
  # plot
  ggplot() +
  geom_histogram(aes(x = compensation, fill = gender, group = gender),
                 alpha = 0.4, position = "identity", binwidth = 5000) +
  geom_vline(aes(xintercept = median, color = gender), legend.show = FALSE) +
  
  # labels
  scale_x_continuous(labels = scales::dollar_format()) +
  ylab(NULL) + xlab("\nAnnual Salary") + 
  ## truncate here so as not to affect the median calculation above
  coord_cartesian(xlim = c(0, 400000)) + 
  scale_fill_discrete(name = "") + ggtitle("Gender Disparity in Total Compensation\n") +
  guides(color = FALSE) +
  mytheme()

Tidy Tuesday Week 21: Survey Data

Arielle Herman

2021-05-18