library(chromote)
library(htmltools)
library(rvest)
library(readxl)
library(plotly)
library(leaflet)
library(sf)
library(ggplot2)
library(maps)
library(RColorBrewer) # lots of color palettes for these kind of charts
library(data.table) # for sorting by key
library(mapproj) #coord_maps() needed this
#library(lattice)
#library(vegalite)
library(tidyverse)
#library(leaflet.minicharts)
library(magrittr)
library(usmap)
# This analysis compares the average salaries of the top data-science-related
# job roles: Data Scientist, Data Analyst, Data Engineer, Big Data Engineer,
# Data Manager, Data Architect, Data Visualization Engineer, Machine Learning
# Engineer, and Business Analyst. It also compares the salary of each role by
# state. The data for this analysis was obtained from ZipRecruiter.
# ---- Scrape the ZipRecruiter "salary by state" tables ----------------------

#' Download the salary-by-state table for one job role.
#'
#' Opens a headless Chrome tab, loads the ZipRecruiter page for `role_slug`,
#' and extracts the element with class `salary_by_state_table`.
#'
#' @param role_slug Role name exactly as it appears in the ZipRecruiter URL,
#'   e.g. "DATA-Scientist".
#' @param user_agent User-agent string sent with the request.
#' @return The table as parsed by `html_table()`, or an empty tibble with
#'   columns `State` and `Annual Salary` when the table is missing or holds
#'   100 characters of text or fewer.
scrape_salary_table <- function(
    role_slug,
    user_agent = paste(
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
      "AppleWebKit/537.36 (KHTML, like Gecko)",
      "Chrome/113.0.0.0 Safari/537.36"
    )) {
  uri <- paste0(
    "https://www.ziprecruiter.com/Salaries/What-Is-the-Average-",
    role_slug,
    "-Salary-by-State"
  )

  session <- ChromoteSession$new()
  # Close the tab even if scraping fails, so sessions do not pile up.
  on.exit(session$close(), add = TRUE)

  session$Network$setUserAgentOverride(userAgent = user_agent)

  # Start listening for the load event *before* navigating; otherwise a fast
  # page load can fire the event before anyone is waiting for it.
  page_loaded <- session$Page$loadEventFired(wait_ = FALSE)
  session$Page$navigate(uri, wait_ = FALSE)
  session$wait_for(page_loaded)

  page_html <- session$Runtime$evaluate(
    "document.querySelector('html').outerHTML"
  )$result$value

  table_node <- html_element(read_html(page_html), ".salary_by_state_table")

  # html_text() yields NA when the node is missing; isTRUE() turns that into
  # the empty-table fallback instead of an error inside `if`.
  if (isTRUE(nchar(html_text(table_node)) > 100)) {
    html_table(table_node)
  } else {
    tibble(State = character(), `Annual Salary` = numeric())
  }
}

#' Reduce a scraped table to `State` plus one numeric salary column.
#'
#' @param raw_table Table returned by `scrape_salary_table()`.
#' @param salary_col Name to give the parsed `Annual Salary` column.
#' @return A table with columns `State` and `salary_col`.
tidy_salary_table <- function(raw_table, salary_col) {
  raw_table %>%
    select(State, `Annual Salary`) %>%
    # as.character() lets parse_number() also accept the numeric column of the
    # empty fallback table.
    mutate(`Annual Salary` = parse_number(as.character(`Annual Salary`))) %>%
    rename(!!salary_col := `Annual Salary`)
}

# Data Scientist (the base table that the other roles are joined onto).
df_data_scientist <- scrape_salary_table("DATA-Scientist")
df_main <- tidy_salary_table(df_data_scientist, "DS_Annual_Salary")

# Data Analyst
df_data_analyst <- scrape_salary_table("Data-Analyst")
df_data_analyst2 <- tidy_salary_table(
  df_data_analyst, "Data_Analyst_Annual_Salary"
)

# Big Data Engineer
df_big_data_engineer <- scrape_salary_table("BIG-DATA-Engineer")
df_big_data_engineer2 <- tidy_salary_table(
  df_big_data_engineer, "Big_Data_Engineer_Annual_Salary"
)

# Machine Learning Engineer
df_ml_engineer <- scrape_salary_table("Machine-Learning-Engineer")
df_ml_engineer2 <- tidy_salary_table(
  df_ml_engineer, "ML_Engineer_Annual_Salary"
)

# Business Analyst
df_business_analyst <- scrape_salary_table("Business-Analyst")
df_business_analyst2 <- tidy_salary_table(
  df_business_analyst, "Business_Analyst_Annual_Salary"
)

# Data Architect (the source page is the "Lead DATA Architect" listing).
df_data_architect <- scrape_salary_table("Lead-DATA-Architect")
df_data_architect2 <- tidy_salary_table(
  df_data_architect, "Data_Architect_Annual_Salary"
)

# Data Manager
df_data_manager <- scrape_salary_table("Data-Manager")
df_data_manager2 <- tidy_salary_table(
  df_data_manager, "Data_Manager_Annual_Salary"
)

# Data Visualization Engineer
df_data_visualization <- scrape_salary_table("Data-Visualization-Engineer")
df_data_visualization2 <- tidy_salary_table(
  df_data_visualization, "Data_Visualization_Annual_Salary"
)

# Data Engineer
df_data_engineer <- scrape_salary_table("DATA-Engineer")
df_data_engineer2 <- tidy_salary_table(
  df_data_engineer, "Data_Engineer_Annual_Salary"
)
# ---- Combine every role into one wide table --------------------------------
# Fold the per-role tables together on State, starting from the Data Scientist
# table. Each step is a left join, so the rows of df_main decide which states
# appear in the result.
all_data <- Reduce(
  function(joined, role_table) left_join(joined, role_table, by = "State"),
  list(
    df_main,
    df_data_analyst2,
    df_big_data_engineer2,
    df_ml_engineer2,
    df_business_analyst2,
    df_data_architect2,
    df_data_manager2,
    df_data_visualization2,
    df_data_engineer2
  )
)
all_data
# Output captured from an earlier run (values come from the live site):
## # A tibble: 50 × 10
## State DS_Annual_Salary Data_Analyst_Annual_S…¹ Big_Data_Engineer_An…²
## <chr> <dbl> <dbl> <dbl>
## 1 New York 145027 98238 136712
## 2 California 143099 75874 132134
## 3 Vermont 130783 72641 122857
## 4 Maine 129931 69468 120340
## 5 Massachusetts 128900 83473 125443
## 6 Nevada 128653 83624 124884
## 7 New Jersey 127259 84878 121320
## 8 Wisconsin 126987 84340 121427
## 9 Washington 126680 81244 143356
## 10 Oregon 125467 81828 121511
## # ℹ 40 more rows
## # ℹ abbreviated names: ¹Data_Analyst_Annual_Salary,
## # ²Big_Data_Engineer_Annual_Salary
## # ℹ 6 more variables: ML_Engineer_Annual_Salary <dbl>,
## # Business_Analyst_Annual_Salary <dbl>, Data_Architect_Annual_Salary <dbl>,
## # Data_Manager_Annual_Salary <dbl>, Data_Visualization_Annual_Salary <dbl>,
## # Data_Engineer_Annual_Salary <dbl>
# ---- Average salary per role across states ---------------------------------
# Column means over every numeric column of all_data, ignoring missing values.
# The result is a named numeric vector (names are the salary column names).
is_salary_col <- vapply(all_data, is.numeric, logical(1))
mean_of_each_role <- colMeans(all_data[is_salary_col], na.rm = TRUE)

# Two-column data frame form of the same numbers, used for the bar chart.
new_mean_df2 <- data.frame(
  Job_Titles = names(mean_of_each_role),
  mean_of_each_role = unname(mean_of_each_role)
)
new_mean_df2
# Output captured from an earlier run:
## Job_Titles mean_of_each_role
## 1 DS_Annual_Salary 116193.76
## 2 Data_Analyst_Annual_Salary 75121.02
## 3 Big_Data_Engineer_Annual_Salary 114454.82
## 4 ML_Engineer_Annual_Salary 116884.48
## 5 Business_Analyst_Annual_Salary 86956.96
## 6 Data_Architect_Annual_Salary 126962.74
## 7 Data_Manager_Annual_Salary 94585.12
## 8 Data_Visualization_Annual_Salary 116432.52
## 9 Data_Engineer_Annual_Salary 119563.08
# ---- Line chart: salary of every role, one point per row of all_data -------
# No x aesthetic is supplied, so each trace is drawn against row position
# (hence the "State Count" axis title).
t1 <- list(family = "Arial", size = 14, color = "black")
t2 <- list(family = "sans serif", size = 16, color = "blue")

# line$color must be a real colour (CSS name, hex, or rgb string). ColorBrewer
# palette names such as "PuRd" are not colours, so those traces use one shade
# taken from the palette that was originally named.
fig <- plot_ly(
  all_data,
  y = ~DS_Annual_Salary,
  name = "Data Scientist",
  type = "scatter",
  mode = "lines", # valid scatter modes are built from lines/markers/text
  line = list(color = "#CE1256", width = 4) # from PuRd
) %>%
  add_trace(
    y = ~Data_Analyst_Annual_Salary, name = "Data Analyst",
    line = list(color = "red", width = 4)
  ) %>%
  add_trace(
    y = ~Big_Data_Engineer_Annual_Salary, name = "Big Data Engineer",
    line = list(color = "#762A83", width = 4) # from PRGn
  ) %>%
  add_trace(
    y = ~ML_Engineer_Annual_Salary, name = "ML Engineer",
    line = list(color = "black", width = 4)
  ) %>%
  add_trace(
    y = ~Business_Analyst_Annual_Salary, name = "Business Analyst",
    line = list(color = "yellow", width = 4)
  ) %>%
  add_trace(
    y = ~Data_Architect_Annual_Salary, name = "Data Architect",
    line = list(color = "blue", width = 4)
  ) %>%
  add_trace(
    y = ~Data_Manager_Annual_Salary, name = "Data Manager",
    line = list(color = "orange", width = 4)
  ) %>%
  add_trace(
    y = ~Data_Visualization_Annual_Salary, name = "Data Visualization",
    line = list(color = "#4393C3", width = 4) # from RdBu
  ) %>%
  add_trace(
    y = ~Data_Engineer_Annual_Salary, name = "Data Engineer",
    line = list(color = "#238B45", width = 4) # from BuGn
  ) %>%
  layout(
    title = "Variations of Annual Salaries for Each Job Title",
    font = t1,
    xaxis = list(title = "State Count", rangemode = "tozero"),
    yaxis = list(title = "Annual Salary", rangemode = "tozero"),
    # Headline placed in paper coordinates so it stays put when zooming.
    annotations = list(
      text = "Data Architect has the Highest Average, Data Analyst the Lowest",
      font = t2,
      showarrow = FALSE,
      xref = "paper", x = 0.6,
      yref = "paper", y = 1
    )
  )
fig
# ---- Bar chart: mean salary per role ---------------------------------------
t1 <- list(family = "Arial", size = 14, color = "black")
t2 <- list(family = "sans serif", size = 20, color = "blue")

fig <- new_mean_df2 %>%
  plot_ly(
    x = ~Job_Titles,
    y = ~mean_of_each_role,
    type = "bar",
    # Bar labels come from the standalone mean_of_each_role vector, which is
    # in the same order as the rows of new_mean_df2.
    text = mean_of_each_role,
    textposition = "auto",
    marker = list(
      color = "rgb(158,202,225)",
      line = list(color = "rgb(8,48,107)", width = 1.5)
    )
  ) %>%
  layout(
    title = "Average Annual Salary for Each Role in the U.S.",
    font = t1,
    # Tallest bar first.
    xaxis = list(categoryorder = "total descending", title = "Job Titles"),
    yaxis = list(title = "Annual Salary"),
    annotations = list(
      text = "Data Architect has the Highest Average, Data Analyst the Lowest",
      font = t2,
      showarrow = FALSE,
      xref = "paper", x = 0.6,
      yref = "paper", y = 1
    )
  )
fig
# ---- Attach salaries to state polygon outlines -----------------------------
# Polygon vertices (long/lat/group/order/region) for the "state" map.
all_states <- map_data("state")
# state_labels <- usmapdata::centroid_labels("states")

# The map data identifies states by a lower-case `region` column, so derive a
# matching key from State before merging.
all_data <- mutate(all_data, region = tolower(State))
totaldf <- merge(all_states, all_data, by = "region")
# totaldf2 <- merge(state_labels, totaldf, by = "fips")

# merge() does not keep the vertex order, which produced a cut-up map.
# Keying the data.table by region and then `order` re-sorts the rows so each
# polygon is drawn in sequence again.
totaldt <- as.data.table(totaldf)
setkeyv(totaldt, c("region", "order"))
# ---- Choropleth maps: one per role -----------------------------------------

#' Draw a state-level choropleth of one salary column.
#'
#' @param fill_col Name (string) of the salary column used for the fill.
#' @param title Plot title.
#' @param subtitle Plot subtitle.
#' @param subtitle_size,subtitle_face,subtitle_hjust Text settings for the
#'   subtitle; long subtitles need a smaller size or negative hjust to fit.
#' @param states_dt Polygon data with `long`, `lat`, `group` and the salary
#'   columns; defaults to the `totaldt` table built earlier in the script.
#' @return A ggplot object (auto-printed when called at top level).
plot_salary_map <- function(fill_col, title, subtitle,
                            subtitle_size = 12,
                            subtitle_face = "bold.italic",
                            subtitle_hjust = 0.5,
                            states_dt = totaldt) {
  ggplot(
    data = states_dt,
    aes(x = long, y = lat, group = group, fill = .data[[fill_col]])
  ) +
    geom_polygon() +
    coord_map() +
    # State-abbreviation labels were attempted but are left disabled:
    # geom_text(data = totaldf2, aes(x = x, y = y, label = stateabbr),
    #           color = "white")
    # geom_text(data = totaldt, aes(x = long, y = lat, label = state.abb),
    #           size = 3)
    scale_fill_gradientn("", colours = brewer.pal(9, "YlGnBu")) +
    ggtitle(title, subtitle = subtitle) +
    theme_void() +
    theme(
      plot.subtitle = element_text(
        color = "blue", size = subtitle_size,
        face = subtitle_face, hjust = subtitle_hjust
      ),
      plot.title = element_text(
        color = "red", size = 12, face = "bold", hjust = 0.5
      )
    )
}

plot_salary_map(
  "DS_Annual_Salary",
  title = "Data Scientist Annual Salary",
  subtitle = "New York + California Contain the Highest Salary Earners; South and Midwest Lag Behind",
  subtitle_face = "bold",
  subtitle_hjust = -0.2
)

plot_salary_map(
  "Data_Analyst_Annual_Salary",
  title = "Data Analyst Annual Salary",
  subtitle = "New York has the Highest Salary Earners; South and Midwest Lag Behind"
)

plot_salary_map(
  "Big_Data_Engineer_Annual_Salary",
  title = "Big Data Engineer Annual Salary",
  subtitle = "Washington State has the Highest Salary Earners; Southeast States Lag Behind"
)

plot_salary_map(
  "ML_Engineer_Annual_Salary",
  title = "Machine Learning Engineer Annual Salary",
  subtitle = "New York Contains the Highest Salary Earners; South and Midwest Lag Behind"
)

plot_salary_map(
  "Business_Analyst_Annual_Salary",
  title = "Business Analyst Annual Salary",
  subtitle = "Washington State and NY Contain the Highest Salary Earners; Southeast States Lag Behind",
  subtitle_size = 11,
  subtitle_hjust = -0.5
)

plot_salary_map(
  "Data_Architect_Annual_Salary",
  title = "Data Architect Annual Salary",
  subtitle = "Nevada and Massachusetts Contain the Highest Salary Earners; Most States Lag Behind",
  subtitle_hjust = -0.3
)

plot_salary_map(
  "Data_Manager_Annual_Salary",
  title = "Data Manager Annual Salary",
  subtitle = "New York Contains the Highest Salary Earners; South and Midwest Lag Behind"
)

plot_salary_map(
  "Data_Visualization_Annual_Salary",
  title = "Data Visualization Engineer Annual Salary",
  subtitle = "Washington State and New York Contain the Highest Salary Earners; South and Midwest Lag Behind",
  subtitle_size = 10,
  subtitle_hjust = -0.3
)

plot_salary_map(
  "Data_Engineer_Annual_Salary",
  title = "Data Engineer Annual Salary",
  subtitle = "Nevada, Oregon and Massachusetts Contain the Highest Salary Earners"
)
# I tried to add state-abbreviation labels to each state on the map, but ran out of time.
# Get centroids
#centroid_labels <- usmapdata::centroid_labels("states")
# Join centroids to data
#state_labels <- merge(all_data, centroid_labels, by = "full")
#plot_usmap(data = statecounts, regions = "state", values = "n") +
# geom_text(data = state_labels, aes(
#x = x, y = y,
# label = stateabbr,
#), color = "white") +
#labs(title = "Frequency of Unique Users in the United States",
#caption = "",
#fill = "Data_Scientist_Annual_Salary")
# References
# https://stackoverflow.com/questions/24422719/plotting-barchart-on-states-map-in-r
# https://www.ziprecruiter.com/Salaries/What-Is-the-Average-DATA-Scientist-Salary-by-State
# https://www.ziprecruiter.com/Salaries/What-Is-the-Average-Data-Analyst-Salary-by-State
# https://www.ziprecruiter.com/Salaries/What-Is-the-Average-BIG-DATA-Engineer-Salary-by-State
# https://www.ziprecruiter.com/Salaries/What-Is-the-Average-Machine-Learning-Engineer-Salary-by-State
# https://www.ziprecruiter.com/Salaries/What-Is-the-Average-Business-Analyst-Salary-by-State
# https://www.ziprecruiter.com/Salaries/What-Is-the-Average-Lead-DATA-Architect-Salary-by-State
# https://www.ziprecruiter.com/Salaries/What-Is-the-Average-Data-Manager-Salary-by-State
# https://www.ziprecruiter.com/Salaries/What-Is-the-Average-Data-Visualization-Engineer-Salary-by-State
# https://www.ziprecruiter.com/Salaries/What-Is-the-Average-DATA-Engineer-Salary-by-State