Salaries for Data-Roles in the United States

Loading Necessary Packages:

library(readr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(stringr)
library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.3.1

library(hrbrthemes)

## Warning: package 'hrbrthemes' was built under R version 4.3.1

library(forcats)

Loading Data:

data_scientist = read_csv('data_scientist.csv')

## Rows: 51 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (18): Area name, Employment(1), Employment percent relative standard err...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

database_admin = read_csv('database_administrator.csv')

## Rows: 51 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): Area name
## dbl (17): Employment(1), Employment percent relative standard error(3), Hour...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

database_architect = read_csv('database_architect.csv')

## Rows: 48 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (18): Area name, Employment(1), Employment percent relative standard err...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

operation_analyst = read_csv('operation_research_analyst.csv')

## Rows: 51 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): Area name
## dbl (17): Employment(1), Employment percent relative standard error(3), Hour...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

statistician = read_csv('statistician.csv')

## Rows: 47 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): Area name, Hourly 90th percentile wage, Annual 90th percentile wag...
## dbl (15): Employment(1), Employment percent relative standard error(3), Hour...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Adding Roles:

data_scientist$role = 'Data Scientist'
database_admin$role = 'Database Administrator'
database_architect$role = 'Database Architect'
operation_analyst$role = 'Operation Research Analyst'
statistician$role = 'Statistican'

Combining Data:

df = rbind(data_scientist, database_admin, database_architect, operation_analyst, statistician)

Selecting Relevant Columns:

df = df %>%
  select(-c(`Employment percent relative standard error(3)`, `Wage percent relative standard error(3)`, `Employment per 1,000 jobs`, `Location Quotient`))

Cleaning the Dataframe

df = df %>%
  mutate(Area = str_replace_all(`Area name`, "\\([0-9]*\\)", "")) %>%
  select(-`Area name`)

colnames(df) = c('Employment', 'Mean Hourly Wage','Mean Annual Wage', 'Hourly Wage 10th Percentile', 'Hourly Wage 25th Percentile','Median Hourly Wage','Hourly Wage 75th Percentile','Hourly Wage 90th Percentile','Annual Wage 10th Percentile','Annual Wage 25th Percentile','Median Annual Wage','Annual Wage 75th Percentile','Annual Wage 90th Percentile', 'Role', 'Area')

df[df == '(8) -'] = NA

df = df %>%
  mutate_at(c('Employment','Mean Hourly Wage','Mean Annual Wage', 'Hourly Wage 10th Percentile', 'Hourly Wage 25th Percentile','Median Hourly Wage','Hourly Wage 75th Percentile','Hourly Wage 90th Percentile','Annual Wage 10th Percentile','Annual Wage 25th Percentile','Median Annual Wage','Annual Wage 75th Percentile','Annual Wage 90th Percentile'), as.numeric)

## Warning: There were 2 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `Hourly Wage 90th Percentile = .Primitive("as.double")(`Hourly
##   Wage 90th Percentile`)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.

Employment by Role:

emp_by_role = df %>%
  select(c(Role, Employment)) %>%
  group_by(Role) %>%
  summarise(Jobs = sum(Employment, na.rm = TRUE))

emp_by_role$Role <- factor(emp_by_role$Role, levels = emp_by_role$Role[order(emp_by_role$Jobs)])

ggplot(emp_by_role, aes(x = Jobs, y = Role)) +
  geom_bar(stat = "identity", fill = "skyblue") +  # Create horizontal bars
  geom_text(aes(label = Jobs), hjust = 1.1, size = 4, color = "black") + # Add numerical figure to bars
  theme_minimal() +  # Set minimal theme
  labs(x = "Jobs", y = "", title = "Employment by Role")

Employment by Area (Top 5):

emp_by_area = df %>%
  select(c(Area, Employment)) %>%
  group_by(Area) %>%
  summarise(Jobs = sum(Employment, na.rm = TRUE))

# Arrange the dataframe by the number of jobs
emp_by_area <- emp_by_area %>% 
  arrange(desc(Jobs))

emp_by_area$Area <- factor(emp_by_area$Area, levels = emp_by_area$Area[order(emp_by_area$Jobs)])

# Select the top 5 areas by jobs
emp_by_area_top <- head(emp_by_area, 5)

# Plotting the horizontal bar chart for top 5 areas by jobs
ggplot(emp_by_area_top, aes(x = Jobs, y = Area)) +
  geom_bar(stat = "identity", fill = "skyblue") +  # Create horizontal bars
  geom_text(aes(label = Jobs), hjust = 1.1, size = 4, color = "black") + # Add numerical figure to bars
  theme_minimal() +  # Set minimal theme
  labs(x = "Jobs", y = "", title = "Top 5 Areas by Employment")

Employment by Area (Bottom 5):

emp_by_area = df %>%
  select(c(Area, Employment)) %>%
  group_by(Area) %>%
  summarise(Jobs = sum(Employment, na.rm = TRUE))

# Arrange the dataframe by the number of jobs
emp_by_area <- emp_by_area %>% 
  arrange(desc(Jobs))

emp_by_area$Area <- factor(emp_by_area$Area, levels = emp_by_area$Area[order(emp_by_area$Jobs)])

# Select the top 5 areas by jobs
emp_by_area_top <- tail(emp_by_area, 5)

# Plotting the horizontal bar chart for top 5 areas by jobs
ggplot(emp_by_area_top, aes(x = Jobs, y = Area)) +
  geom_bar(stat = "identity", fill = "skyblue") +  # Create horizontal bars
  geom_text(aes(label = Jobs), hjust = 1.1, size = 4, color = "black") + # Add numerical figure to bars
  theme_minimal() +  # Set minimal theme
  labs(x = "Jobs", y = "", title = "Bottom 5 Areas by Employment")

Figures Aggregated by Role

by_role = df %>%
  select(-c(Area, Employment)) %>%
  group_by(Role) %>%
  summarise_at(vars('Mean Hourly Wage','Mean Annual Wage', 'Hourly Wage 10th Percentile', 'Hourly Wage 25th Percentile','Median Hourly Wage','Hourly Wage 75th Percentile','Hourly Wage 90th Percentile','Annual Wage 10th Percentile','Annual Wage 25th Percentile','Median Annual Wage','Annual Wage 75th Percentile','Annual Wage 90th Percentile'), median, na.rm = TRUE)

Hourly Wage IQR Lollipop Chart by Role

by_role$Role <- factor(by_role$Role, levels = by_role$Role[order(by_role$`Hourly Wage 75th Percentile`)])

ggplot(by_role) +
  geom_segment(aes(x = Role, xend = Role, y = `Hourly Wage 25th Percentile`, yend = `Hourly Wage 75th Percentile`), color = "black") +
  geom_point(aes(x = Role, y = `Hourly Wage 25th Percentile`), color = 'black', size = 3) +
  geom_point(aes(x = Role, y = `Hourly Wage 75th Percentile`), color = 'black', size = 3) +
  geom_point(aes(x = Role, y = `Median Hourly Wage`, color = 'red'), size = 3) +  # Add median points with color aesthetic
  coord_flip() +
  theme_ipsum() +
  theme(
    legend.position = "right",  # Position the legend on the right
    legend.title = element_blank()  # Remove the legend title
  ) +
  xlab("") +
  ylab("") +
  ggtitle("IQR of Hourly Wages (USD) by Data-Role") +
  scale_color_manual(values = c("red" = "red"), labels = c("Median")) +  # Customize legend labels
  scale_y_continuous(labels = scales::dollar_format(prefix = "$"))

Annual Wage IQR Lollipop Chart by Role

by_role$Role <- factor(by_role$Role, levels = by_role$Role[order(by_role$`Annual Wage 75th Percentile`)])


ggplot(by_role) +
  geom_segment(aes(x = Role, xend = Role, y = `Annual Wage 25th Percentile`, yend = `Annual Wage 75th Percentile`), color = "black") +
  geom_point(aes(x = Role, y = `Annual Wage 25th Percentile`), color = 'black', size = 3) +
  geom_point(aes(x = Role, y = `Annual Wage 75th Percentile`), color = 'black', size = 3) +
  geom_point(aes(x = Role, y = `Median Annual Wage`, color = 'red'), size = 3) +  
  coord_flip() +
  theme_ipsum() +
  theme(
    legend.position = "right",  # Position the legend on the right
    legend.title = element_blank()  # Remove the legend title
  ) +
  xlab("") +
  ylab("") +
  ggtitle("IQR of Annual Wages (USD) by Data-Role") +
  scale_color_manual(values = c("red" = "red"), labels = c("Median")) +
  scale_y_continuous(labels = scales::dollar_format(prefix = "$"))

Figures Aggregated by Territory

by_area = df %>%
  select(-c(Role,Employment)) %>%
  group_by(Area) %>%
  summarise_at(vars('Mean Hourly Wage','Mean Annual Wage', 'Hourly Wage 10th Percentile', 'Hourly Wage 25th Percentile','Median Hourly Wage','Hourly Wage 75th Percentile','Hourly Wage 90th Percentile','Annual Wage 10th Percentile','Annual Wage 25th Percentile','Median Annual Wage','Annual Wage 75th Percentile','Annual Wage 90th Percentile'), median, na.rm = TRUE)

Distribtion of Area Median Hourly Salary:

ggplot(by_area, aes(x = `Median Hourly Wage`)) +
  geom_histogram(binwidth = 4, color = "black", alpha = 0.7) +  # Adjust binwidth as needed
  scale_fill_brewer(palette = "Set1") +  # Set color palette
  theme_minimal() +
  labs(x = "", y = "Frequency", title = "Distribution of Median Hourly Wage by Area") +
  theme(legend.position = "none") +
  scale_x_continuous(labels = scales::dollar_format(prefix = "$"))

Distribtion of Area Median Annual Salary:

ggplot(by_area, aes(x = `Median Annual Wage`)) +
  geom_histogram(binwidth = 4 * 2080, color = "black", alpha = 0.7) +  # Adjust binwidth as needed
  scale_fill_brewer(palette = "Set1") +  # Set color palette
  theme_minimal() +
  labs(x = "", y = "Frequency", title = "Distribution of Median Annual Wage by Area") +
  theme(legend.position = "none") +
  scale_x_continuous(labels = scales::dollar_format(prefix = "$"))

Hourly Wages IQR for Area filtered to Highest 5 Median Wage

# Order the data frame by median hourly wage
by_area <- by_area[order(by_area$`Median Hourly Wage`), ]

# Top 5 and bottom 5 areas by median hourly wage
top_areas <- tail(by_area, 5)

# Reorder levels of Area based on the Median Hourly Wage
top_areas$Area <- factor(top_areas$Area, levels = top_areas$Area)

# Plot with legend on the right
ggplot(top_areas) +
  geom_segment(aes(x = Area, xend = Area, y = `Hourly Wage 25th Percentile`, yend = `Hourly Wage 75th Percentile`)) +
  geom_point(aes(x = Area, y = `Hourly Wage 25th Percentile`), color = 'black', size = 3) +
  geom_point(aes(x = Area, y = `Hourly Wage 75th Percentile`), color = 'black', size = 3) +
  geom_point(aes(x = Area, y = `Median Hourly Wage`, color = 'red'), size = 3) +  # Add median points with color aesthetic
  coord_flip() +
  theme_ipsum() +
  theme(
    legend.position = "right",  # Position the legend on the right
    legend.title = element_blank()  # Remove the legend title
  ) +
  xlab("") +
  ylab("") +
  ggtitle("Hourly Wages IQR for Area (USD)\n(Highest 5 by Median Wage)") +
  scale_color_manual(values = c("red" = "red"), labels = c("Median")) +
  scale_y_continuous(labels = scales::dollar_format(prefix = "$"))

Hourly Wages IQR for Area filtered to Lowest 5 by Median Wage

# Order the data frame by median hourly wage
by_area <- by_area[order(by_area$`Median Hourly Wage`), ]

# Bottom 5 areas by median hourly wage
bottom_areas <- head(by_area, 5)

# Reorder levels of Area based on the Median Hourly Wage
bottom_areas$Area <- factor(bottom_areas$Area, levels = bottom_areas$Area)

# Plot with legend on the right
ggplot(bottom_areas) +
  geom_segment(aes(x = Area, xend = Area, y = `Hourly Wage 25th Percentile`, yend = `Hourly Wage 75th Percentile`)) +
  geom_point(aes(x = Area, y = `Hourly Wage 25th Percentile`), color = 'black', size = 3) +
  geom_point(aes(x = Area, y = `Hourly Wage 75th Percentile`), color = 'black', size = 3) +
  geom_point(aes(x = Area, y = `Median Hourly Wage`, color = 'red'), size = 3) +  # Add median points with color aesthetic
  coord_flip() +
  theme_ipsum() +
  theme(
    legend.position = "right",  # Position the legend on the right
    legend.title = element_blank()  # Remove the legend title
  ) +
  xlab("") +
  ylab("") +
  ggtitle("Hourly Wages IQR for Area (USD)\n(Lowest 5 by Median Wage)") +
  scale_color_manual(values = c("red" = "red"), labels = c("Median")) +
  scale_y_continuous(labels = scales::dollar_format(prefix = "$"))

Annual Wages IQR for Area filtered to Highest 5 by Median Wage

# Order the data frame by median hourly wage
by_area <- by_area[order(by_area$`Median Annual Wage`), ]

# Top 5 areas by median hourly wage
top_areas <- tail(by_area, 5)

# Reorder levels of Area based on the Median Hourly Wage
top_areas$Area <- factor(top_areas$Area, levels = top_areas$Area)

# Plot with legend on the right
ggplot(top_areas) +
  geom_segment(aes(x = Area, xend = Area, y = `Annual Wage 25th Percentile`, yend = `Annual Wage 75th Percentile`), color = 'black') +
  geom_point(aes(x = Area, y = `Annual Wage 25th Percentile`), color = 'black', size = 3) +
  geom_point(aes(x = Area, y = `Annual Wage 75th Percentile`), color = 'black', size = 3) +
  geom_point(aes(x = Area, y = `Median Annual Wage`, color = 'red'), size = 3) +  # Add median points with color aesthetic
  coord_flip() +
  theme_ipsum() +
  theme(
    legend.position = "right",  # Position the legend on the right
    legend.title = element_blank()  # Remove the legend title
  ) +
  xlab("") +
  ylab("") +
  ggtitle("Annual Wages IQR for Area (USD)\n(Highest 5 by Median Wage)") +
  scale_color_manual(values = c("red" = "red"), labels = c("Median")) +
  scale_y_continuous(labels = scales::dollar_format(prefix = "$"))

Annual Wages IQR for Area filtered to Lowest 5 by Median Wage

# Order the data frame by median hourly wage
by_area <- by_area[order(by_area$`Median Annual Wage`), ]

# Bottom 5 areas by median hourly wage
bottom_areas <- head(by_area, 5)

# Reorder levels of Area based on the Median Hourly Wage
bottom_areas$Area <- factor(bottom_areas$Area, levels = bottom_areas$Area)

# Plot with legend on the right
ggplot(bottom_areas) +
  geom_segment(aes(x = Area, xend = Area, y = `Annual Wage 25th Percentile`, yend = `Annual Wage 75th Percentile`), color = 'black') +
  geom_point(aes(x = Area, y = `Annual Wage 25th Percentile`), color = 'black', size = 3) +
  geom_point(aes(x = Area, y = `Annual Wage 75th Percentile`), color = 'black', size = 3) +
  geom_point(aes(x = Area, y = `Median Annual Wage`, color = 'red'), size = 3) +  # Add median points with color aesthetic
  coord_flip() +
  theme_ipsum() +
  theme(
    legend.position = "right",  # Position the legend on the right
    legend.title = element_blank()  # Remove the legend title
  ) +
  xlab("") +
  ylab("") +
  ggtitle("Annual Wages IQR for Area (USD)\n(Lowest 5 by Median Wage)") +
  scale_color_manual(values = c("red" = "red"), labels = c("Median")) +
  scale_y_continuous(labels = scales::dollar_format(prefix = "$"))

Salaries for Data-Roles in the United States

Coco Donovan

2024-03-17

Loading Necessary Packages:

Loading Data:

Adding Roles:

Combining Data:

Selecting Relevant Columns:

Cleaning the Dataframe

Employment by Role:

Employment by Area (Top 5):

Employment by Area (Bottom 5):

Figures Aggregated by Role

Hourly Wage IQR Lollipop Chart by Role

Annual Wage IQR Lollipop Chart by Role

Figures Aggregated by Territory

Distribtion of Area Median Hourly Salary:

Distribtion of Area Median Annual Salary:

Hourly Wages IQR for Area filtered to Highest 5 Median Wage

Hourly Wages IQR for Area filtered to Lowest 5 by Median Wage

Annual Wages IQR for Area filtered to Highest 5 by Median Wage

Annual Wages IQR for Area filtered to Lowest 5 by Median Wage