Data Sources: Salary Data: https://www.zippia.com/ 3/2/2024 Cost of Living Index: https://www.datapandas.org/ranking/cost-of-living-by-state
Inspiration for dumbbell viz: Stalder, T., Holtz, Y. (2021): Extended Dumbbell Plot in R with ggplot2. R graph gallery. Access: r-graph-gallery.com/web-extended-dumbbell-plot-ggplot2.html. Date: 03-03-2024.
library(tidyverse)
library(tidytext)
library(ggtext)
library(extrafont)
library(scales)
#salary data
sal <- read.csv("/Users/diana/Documents/dmp/CUNY/608 Data Viz/Story 4/Salary data.csv")
#cost of living data
col<- read.csv("/Users/diana/Documents/dmp/CUNY/608 Data Viz/Story 4/COL.csv")
#combine col with sal
sal <- merge(x=sal, y = col, by = "State" )
#create sal_adj - salary adjusted for cost of living index
#plus some renaming and cleaning
sal<- sal %>% rename(coli=Cost.of.Living.Index, an_sal=Annual.Salary) %>% select(-X) %>%
mutate (sal_adj = an_sal/(coli/100))
#make small dataset with max min by title for use in dumbbell
#also
sal_db <- sal %>%
group_by(Title) %>%
summarise(mx_adj = max(sal_adj), mn_adj = min(sal_adj),
mx_sal = max(an_sal), mn_sal = min(an_sal)) %>%
mutate (diff_adj = mx_adj- mn_adj, diff_sal=mx_sal- mn_sal)
#then make a max and min and diff df from this, so we can plot each
sal_mx <- sal_db %>% select(-mn_adj,mn_sal)
sal_mn <- sal_db %>% select(-mx_adj,mx_sal)
diff <- sal_db %>% mutate(x_pos_adj = mn_adj+(diff_adj/2),
x_pos_sal = mn_sal+(diff_sal/2))
# this is with salary adjusted for cost of living
ggplot(sal_db)+
geom_segment(data = sal_mn,
aes(x = mn_adj, y = Title,
yend = sal_mx$Title,
xend = sal_mx$mx_adj),
color = "#aeb6bf",
size = 3,
alpha = .5) +
geom_point(aes(x = mn_adj, y = Title),color = "#b8d8be", size = 4)+
geom_point(aes(x = mx_adj, y = Title ),color = "#084f09", size = 4)+
geom_text(data=diff, aes(label=paste("Δ:", comma(round(diff_adj,2))), parse = TRUE,
x=x_pos_adj, y=Title),
fill = "white",
color = "#4a4e4d",
size = 2)+
ggtitle("Salary Adjusted by Cost of Living")+
theme_classic()+
theme(axis.title.y = element_blank())+
scale_x_continuous(labels=comma, limits = c(0, 125000) )+
xlab("Average Salary ($)")
# this is with raw salary NOT adjusted for cost of living
ggplot(sal_db)+
geom_segment(data = sal_mn,
aes(x = mn_sal, y = Title,
yend = sal_mx$Title,
xend = sal_mx$mx_sal),
color = "#aeb6bf",
size = 3,
alpha = .5) +
geom_point(aes(x = mn_sal, y = Title),color = "#b8d8be", size = 4.5)+
geom_point(aes(x = mx_sal, y = Title ),color = "#084f09", size = 4.5)+
geom_text(data=diff, aes(label=paste("Δ:", comma(round(diff_sal,2))), parse = TRUE,
x=x_pos_sal, y=Title),
fill = "white",
color = "#4a4e4d",
size = 2)+
ggtitle("Raw Salary")+
theme_classic()+
theme(axis.title.y = element_blank())+
scale_x_continuous(labels=comma, limits = c(0, 125000) )+
xlab("Average Salary ($)")
# first get the top 5 states, for both raw and adj
top5raw<- sal %>%
group_by(Title) %>%
top_n(5, an_sal) %>%
ungroup()
top5adj <- sal %>%
group_by(Title) %>%
top_n(5, sal_adj) %>%
ungroup()
#and the bottom 5 states, for both raw and adj
bottom5raw<- sal %>%
group_by(Title) %>%
top_n(-5, an_sal) %>%
ungroup()
bottom5adj <- sal %>%
group_by(Title) %>%
top_n(-5, sal_adj) %>%
ungroup()
topStateraw <- top5raw %>% group_by(State) %>%
summarise(frequency = n()) %>%
ungroup()
topStateadj <- top5adj %>% group_by(State) %>%
summarise(frequency = n()) %>%
ungroup()
bottomStateraw<- bottom5raw %>% group_by(State) %>%
summarise(frequency = n()) %>%
ungroup()
bottomStateadj<- bottom5adj %>% group_by(State) %>%
summarise(frequency = n()) %>%
ungroup()
#first try at the small multiple bar
ggplot(top5raw, aes(x= reorder_within(State,an_sal, Title), y=an_sal)) +
geom_bar(fill = "#084f09",stat = "identity")+
coord_flip() +
scale_x_discrete(labels = label_wrap_gen(width = 1)) +
facet_wrap( ~ Title, drop=TRUE, scale="free") +
theme_classic()+
theme(axis.text=element_text(size=6),
axis.title=element_blank())
sal <- sal %>%
mutate(topRaw = ifelse(State == "California", "Top", "Other"),
topAdj = ifelse(State == "Michigan", "Top", ifelse(State== "Ohio", "Top", "Other")),
botRaw = ifelse(State == "Hawaii", "Bottom", ifelse(State== "South Dakota", "Bottom", "Other")),
botAdj = ifelse(State == "Hawaii", "Bottom", "Other"))
This is more complex, but nicer results
#RAW
sal %>%
group_by(Title) %>%
top_n(5, an_sal) %>%
ungroup %>%
mutate(Title = as.factor(Title),
State = reorder_within(State, -an_sal, Title)) %>%
arrange(Title, an_sal, desc(State)) %>%
mutate(State = fct_inorder(State)) %>%
ggplot(aes(State, an_sal )) +
geom_col(aes(fill = topRaw), show.legend = FALSE) +
geom_text(aes(label = label_dollar()(an_sal) ), color='white', hjust = 1, size=2) +
facet_wrap(~Title, scales = "free_y") +
coord_flip() +
scale_x_reordered() +
scale_y_continuous(labels=NULL, breaks=NULL) +
labs(y = NULL,
x = NULL,
title = "States with Top 5 Raw Salaries per Title") +
theme_classic() +
scale_fill_manual(values = c("Top" = "#084f09", "Other" = "gray"))
#ADJUSTED FOR COL
sal %>%
group_by(Title) %>%
top_n(5, sal_adj) %>%
ungroup %>%
mutate(Title = as.factor(Title),
State = reorder_within(State, -sal_adj, Title)) %>%
arrange(Title, sal_adj, desc(State)) %>%
mutate(State = fct_inorder(State)) %>%
ggplot(aes(State, sal_adj )) +
geom_col(aes(fill = topAdj), show.legend = FALSE) +
geom_text(aes(label = label_dollar()(sal_adj) ), color='white', hjust = 1, size=2) +
facet_wrap(~Title, scales = "free_y") +
coord_flip() +
scale_x_reordered() +
scale_y_continuous(labels=NULL, breaks=NULL) +
labs(y = NULL,
x = NULL,
title = "States with Top 5 COL Adjusted Salaries per Title") +
theme_classic() +
scale_fill_manual(values = c("Top" = "#084f09", "Other" = "gray"))
#RAW
sal %>%
group_by(Title) %>%
top_n(-5, an_sal) %>%
ungroup %>%
mutate(Title = as.factor(Title),
State = reorder_within(State, -an_sal, Title)) %>%
arrange(Title, an_sal, desc(State)) %>%
mutate(State = fct_inorder(State)) %>%
ggplot(aes(State, an_sal )) +
geom_col(aes(fill = botRaw), show.legend = FALSE) +
geom_text(aes(label = label_dollar()(an_sal) ), color='black', hjust = 1, size=2) +
facet_wrap(~Title, scales = "free_y") +
coord_flip() +
scale_x_reordered() +
scale_y_continuous(labels=NULL, breaks=NULL) +
labs(y = NULL,
x = NULL,
title = "States with Bottom 5 Raw Salaries per Title") +
theme_classic() +
scale_fill_manual(values = c("Bottom" = "#b8d8be", "Other" = "gray"))
#ADJUSTED COL
sal %>%
group_by(Title) %>%
top_n(-5, sal_adj) %>%
ungroup %>%
mutate(Title = as.factor(Title),
State = reorder_within(State, -sal_adj, Title)) %>%
arrange(Title, sal_adj, desc(State)) %>%
mutate(State = fct_inorder(State)) %>%
ggplot(aes(State, sal_adj )) +
geom_col(aes(fill = botAdj), show.legend = FALSE) +
geom_text(aes(label = label_dollar()(sal_adj) ), color='black', hjust = 1, size=2) +
facet_wrap(~Title, scales = "free_y") +
coord_flip() +
scale_x_reordered() +
scale_y_continuous(labels=NULL, breaks=NULL) +
labs(y = NULL,
x = NULL,
title = "States with Bottom 5 COL Adjusted Salaries per Title") +
theme_classic() +
scale_fill_manual(values = c("Bottom" = "#b8d8be", "Other" = "gray"))