Data Sources: Salary Data: https://www.zippia.com/ 3/2/2024 Cost of Living Index: https://www.datapandas.org/ranking/cost-of-living-by-state

Inspiration for dumbbell viz: Stalder, T., Holtz, Y. (2021): Extended Dumbbell Plot in R with ggplot2. R graph gallery. Access: r-graph-gallery.com/web-extended-dumbbell-plot-ggplot2.html. Date: 03-03-2024.

library(tidyverse)
library(tidytext)
library(ggtext)
library(extrafont)
library(scales)

Bring in Data

#salary data
sal <- read.csv("/Users/diana/Documents/dmp/CUNY/608 Data Viz/Story 4/Salary data.csv")

#cost of living data 
col<- read.csv("/Users/diana/Documents/dmp/CUNY/608 Data Viz/Story 4/COL.csv")

Combine Cost of Living and Salary Data

#combine col with sal
sal <- merge(x=sal, y = col, by = "State" )

#create sal_adj - salary adjusted for cost of living index
#plus some renaming and cleaning
sal<- sal %>% rename(coli=Cost.of.Living.Index, an_sal=Annual.Salary) %>% select(-X) %>%
  mutate (sal_adj = an_sal/(coli/100))

Max and Min for Dumbbell Chart

#make small dataset with max min by title for use in dumbbell
#also 
sal_db <- sal %>%
  group_by(Title) %>%
  summarise(mx_adj = max(sal_adj), mn_adj = min(sal_adj),
            mx_sal = max(an_sal), mn_sal = min(an_sal)) %>%
  mutate (diff_adj = mx_adj- mn_adj, diff_sal=mx_sal- mn_sal)
  

#then make a max and min and diff df from this, so we can plot each
sal_mx <- sal_db %>% select(-mn_adj,mn_sal)
sal_mn <- sal_db %>% select(-mx_adj,mx_sal)
diff <- sal_db %>% mutate(x_pos_adj = mn_adj+(diff_adj/2), 
                          x_pos_sal = mn_sal+(diff_sal/2))

Dumbbell Adjusted COL

# this is with salary adjusted for cost of living
ggplot(sal_db)+
 geom_segment(data = sal_mn,
              aes(x = mn_adj, y = Title,
                  yend = sal_mx$Title, 
                  xend = sal_mx$mx_adj), 
              color = "#aeb6bf",
              size = 3, 
              alpha = .5) +
  geom_point(aes(x = mn_adj, y = Title),color = "#b8d8be", size = 4)+
  geom_point(aes(x = mx_adj, y = Title ),color = "#084f09", size = 4)+
  geom_text(data=diff, aes(label=paste("Δ:", comma(round(diff_adj,2))), parse = TRUE,
              x=x_pos_adj, y=Title),
              fill = "white",
              color = "#4a4e4d",
              size = 2)+
ggtitle("Salary Adjusted by Cost of Living")+
  theme_classic()+
  theme(axis.title.y = element_blank())+
  scale_x_continuous(labels=comma, limits = c(0, 125000) )+
  xlab("Average Salary ($)")

Dumbbell Raw

# this is with raw salary NOT adjusted for cost of living
ggplot(sal_db)+
 geom_segment(data = sal_mn,
              aes(x = mn_sal, y = Title,
                  yend = sal_mx$Title, 
                  xend = sal_mx$mx_sal), 
              color = "#aeb6bf",
              size = 3, 
              alpha = .5) +
  geom_point(aes(x = mn_sal, y = Title),color = "#b8d8be", size = 4.5)+
  geom_point(aes(x = mx_sal, y = Title ),color = "#084f09", size = 4.5)+
  geom_text(data=diff, aes(label=paste("Δ:", comma(round(diff_sal,2))), parse = TRUE,
              x=x_pos_sal, y=Title),
              fill = "white",
              color = "#4a4e4d",
              size = 2)+
ggtitle("Raw Salary")+
  theme_classic()+
  theme(axis.title.y = element_blank())+
  scale_x_continuous(labels=comma, limits = c(0, 125000) )+
  xlab("Average Salary ($)")

States in top / bottom most frequently

# first get the top 5 states, for both raw and adj
top5raw<- sal %>%
  group_by(Title) %>%
  top_n(5, an_sal) %>%
  ungroup()

top5adj <- sal %>%
  group_by(Title) %>%
  top_n(5, sal_adj) %>%
  ungroup()


#and the bottom 5 states, for both raw and adj
bottom5raw<- sal %>%
  group_by(Title) %>%
  top_n(-5, an_sal) %>%
  ungroup()


bottom5adj <- sal %>%
  group_by(Title) %>%
  top_n(-5, sal_adj) %>%
  ungroup()
topStateraw <- top5raw %>% group_by(State) %>% 
  summarise(frequency = n()) %>%
  ungroup()
topStateadj <- top5adj %>% group_by(State) %>% 
  summarise(frequency = n()) %>%
  ungroup()
bottomStateraw<- bottom5raw %>% group_by(State) %>% 
  summarise(frequency = n()) %>%
  ungroup()
bottomStateadj<- bottom5adj %>% group_by(State) %>% 
  summarise(frequency = n()) %>%
  ungroup()

Small Multiple Bars

#first try at the small multiple bar
ggplot(top5raw, aes(x= reorder_within(State,an_sal, Title), y=an_sal)) + 
  geom_bar(fill = "#084f09",stat = "identity")+
  coord_flip() +
  scale_x_discrete(labels = label_wrap_gen(width = 1)) +
  facet_wrap( ~ Title, drop=TRUE, scale="free") +
  theme_classic()+
  theme(axis.text=element_text(size=6),
        axis.title=element_blank())

sal <- sal %>% 
  mutate(topRaw = ifelse(State == "California", "Top", "Other"),
         topAdj = ifelse(State == "Michigan", "Top", ifelse(State== "Ohio", "Top", "Other")),
         botRaw = ifelse(State == "Hawaii", "Bottom", ifelse(State== "South Dakota", "Bottom", "Other")),
         botAdj = ifelse(State == "Hawaii", "Bottom", "Other"))

This is more complex, but nicer results

#RAW
sal %>%
  group_by(Title) %>%
  top_n(5, an_sal) %>%
  ungroup %>%
  mutate(Title = as.factor(Title),
         State = reorder_within(State, -an_sal, Title)) %>%
  arrange(Title, an_sal, desc(State)) %>% 
  mutate(State = fct_inorder(State)) %>% 
  ggplot(aes(State, an_sal )) +
  geom_col(aes(fill = topRaw), show.legend = FALSE) +
  geom_text(aes(label = label_dollar()(an_sal) ), color='white', hjust = 1, size=2) +
  facet_wrap(~Title, scales = "free_y") +
  coord_flip() +
  scale_x_reordered() +
  scale_y_continuous(labels=NULL, breaks=NULL) +
  labs(y = NULL,
       x = NULL,
       title = "States with Top 5 Raw Salaries per Title") +
  theme_classic() +
  scale_fill_manual(values = c("Top" = "#084f09", "Other" = "gray"))

#ADJUSTED FOR COL
sal %>%
  group_by(Title) %>%
  top_n(5, sal_adj) %>%
  ungroup %>%
  mutate(Title = as.factor(Title),
         State = reorder_within(State, -sal_adj, Title)) %>%
  arrange(Title, sal_adj, desc(State)) %>% 
  mutate(State = fct_inorder(State)) %>% 
  ggplot(aes(State, sal_adj )) +
  geom_col(aes(fill = topAdj), show.legend = FALSE) +
  geom_text(aes(label = label_dollar()(sal_adj) ), color='white', hjust = 1, size=2) +
  facet_wrap(~Title, scales = "free_y") +
  coord_flip() +
  scale_x_reordered() +
  scale_y_continuous(labels=NULL, breaks=NULL) +
  labs(y = NULL,
       x = NULL,
       title = "States with Top 5 COL Adjusted Salaries per Title") +
  theme_classic() +
  scale_fill_manual(values = c("Top" = "#084f09", "Other" = "gray"))

#RAW
sal %>%
  group_by(Title) %>%
  top_n(-5, an_sal) %>%
  ungroup %>%
  mutate(Title = as.factor(Title),
         State = reorder_within(State, -an_sal, Title)) %>%
  arrange(Title, an_sal, desc(State)) %>% 
  mutate(State = fct_inorder(State)) %>% 
  ggplot(aes(State, an_sal )) +
  geom_col(aes(fill = botRaw), show.legend = FALSE) +
  geom_text(aes(label = label_dollar()(an_sal) ), color='black', hjust = 1, size=2) +
  facet_wrap(~Title, scales = "free_y") +
  coord_flip() +
  scale_x_reordered() +
  scale_y_continuous(labels=NULL, breaks=NULL) +
  labs(y = NULL,
       x = NULL,
       title = "States with Bottom 5 Raw Salaries per Title") +
  theme_classic() +
  scale_fill_manual(values = c("Bottom" = "#b8d8be", "Other" = "gray"))

#ADJUSTED COL
sal %>%
  group_by(Title) %>%
  top_n(-5, sal_adj) %>%
  ungroup %>%
  mutate(Title = as.factor(Title),
         State = reorder_within(State, -sal_adj, Title)) %>%
  arrange(Title, sal_adj, desc(State)) %>% 
  mutate(State = fct_inorder(State)) %>% 
  ggplot(aes(State, sal_adj )) +
  geom_col(aes(fill = botAdj), show.legend = FALSE) +
  geom_text(aes(label = label_dollar()(sal_adj) ), color='black', hjust = 1, size=2) +
  facet_wrap(~Title, scales = "free_y") +
  coord_flip() +
  scale_x_reordered() +
  scale_y_continuous(labels=NULL, breaks=NULL) +
  labs(y = NULL,
       x = NULL,
       title = "States with Bottom 5 COL Adjusted Salaries per Title") +
  theme_classic() +
  scale_fill_manual(values = c("Bottom" = "#b8d8be", "Other" = "gray"))