library(tidyverse)
library(here)
library(scales)
library(countrycode)
library(gridExtra)
library(plotly)
# Load the salary data and derive readable country names.
#
# NOTE: the former `rm(list = ls())` call was removed. Wiping the caller's
# global environment is a surprising side effect of running a script, and it
# does not reset loaded packages or options; restart R for a clean session.
d <- read_csv(here("Final Project", "ds_salaries.csv")) %>%
  # The analysis below only uses `salary_in_usd`, so `salary` is dropped.
  select(-salary) %>%
  mutate(
    # Employee residence (ISO 2-letter code) as an English country name;
    # stored as `region`, the key later used for the world-map plots.
    region = countrycode(
      employee_residence,
      origin = "iso2c",
      destination = "country.name"
    ),
    # Company location is converted in place from ISO code to country name.
    company_location = countrycode(
      company_location,
      origin = "iso2c",
      destination = "country.name"
    )
  )
# Country names that are rewritten to an alternative spelling. `region` is
# later used as the map_id against map_data("world"), so these are presumably
# the spellings that data set expects -- TODO confirm.
country_aliases <- c(
  "United States"        = "USA",
  "United Kingdom"       = "UK",
  "Bosnia & Herzegovina" = "Bosnia and Herzegovina",
  "Czechia"              = "Czech Republic",
  "Hong Kong SAR China"  = "China"
)

# Replace every name listed in `country_aliases` with its alias; all other
# values (including NA) are returned unchanged.
use_map_names <- function(x) {
  x <- as.character(x)
  alias <- unname(country_aliases[x])
  unmatched <- is.na(alias)
  alias[unmatched] <- x[unmatched]
  alias
}

# Apply the same renaming to employee region and company location.
d <- d %>%
  mutate(
    region = use_map_names(region),
    company_location = use_map_names(company_location)
  )
# Label the work arrangement from `remote_ratio`:
# 0 -> "On-site", 50 -> "Hybrid", 100 -> "Remote"; any other value gives NA.
d <- d %>%
  mutate(
    work_setting = c("On-site", "Hybrid", "Remote")[
      match(remote_ratio, c(0, 50, 100))
    ]
  )
We first look at the salary statistics across different experience levels:
# Mean, median, min and max of `salary_in_usd` per experience level, reshaped
# to one row per (experience level, statistic) so each statistic can be
# plotted as its own series.
salary_by_experience <- d %>%
  mutate(
    experience_level = factor(
      experience_level,
      levels = c("EX", "SE", "MI", "EN"),
      labels = c("Expert", "Senior", "Mid", "Entry")
    )
  ) %>%
  group_by(experience_level) %>%
  summarize(
    `Mean Salary`   = mean(salary_in_usd),
    `Median Salary` = median(salary_in_usd),
    `Min Salary`    = min(salary_in_usd),
    `Max Salary`    = max(salary_in_usd)
  ) %>%
  # Everything except the grouping column is a statistic column.
  pivot_longer(
    cols = -experience_level,
    names_to = "salary_statistics",
    values_to = "salary"
  )
# Dot plot: one point per (experience level, statistic), coloured by statistic.
salary_by_experience %>%
  ggplot(aes(x = experience_level, y = salary)) +
  geom_point(aes(color = salary_statistics), size = 4) +
  scale_y_continuous(
    breaks = seq(0, 500000, 50000),
    labels = dollar_format()
  ) +
  labs(
    title = "Basic Salary Statistics Across Experience Levels",
    x = "Experience Level",
    y = "US Dollars",
    color = "Salary Statistics"
  ) +
  theme_minimal()
Insights:
Mean and Median salaries are relatively stable across experience levels, suggesting that, on average, salary progression is somewhat steady.
The max salary for Mid-level professionals is the highest among all levels, which is surprising. Typically, one would expect Experts to have the highest max salary. This could be due to outliers or specific high-paying roles within the Mid-level category. It may also indicate a few Mid-level employees securing exceptionally high-paying positions.
The difference in max salary between Entry-level and other levels is enormous, suggesting that career progression significantly impacts earning potential.
We will look into the distribution of salaries to see whether there are any outliers at the Mid level:
# Box plot of `salary_in_usd` for each experience level (Expert -> Entry).
d %>%
  mutate(
    experience_level = factor(
      experience_level,
      levels = c("EX", "SE", "MI", "EN"),
      labels = c("Expert", "Senior", "Mid", "Entry")
    )
  ) %>%
  ggplot(aes(x = experience_level, y = salary_in_usd)) +
  geom_boxplot(aes(fill = experience_level)) +
  scale_y_continuous(labels = dollar_format()) +
  labs(
    title = "Salary Distribution by Experience Levels",
    x = "Experience Level",
    y = "Salary in USD",
    fill = "Experience Level"
  ) +
  theme_minimal()
Insights:
=> Mid-level employees have a broader earning potential, meaning some people at this stage leap into high salaries earlier, while others remain closer to the median.
=> Senior salaries stabilize, suggesting that once professionals reach senior positions, their pay is more predictable and standardized across companies.
Next, we look at employee locations to see whether salary patterns differ by experience level.
# World polygons used as the base layer of the salary maps. Only rows with
# lat > -60 and long >= -180 are kept (presumably to leave out Antarctica --
# TODO confirm).
world_map <- map_data("world")
world_map_filtered <- world_map %>%
  dplyr::filter(lat > -60 & long >= -180)

# Mean `salary_in_usd` for every (employee region, experience level) pair.
average_salary_data <- d %>%
  group_by(region, experience_level) %>%
  summarise(
    avg_salary = mean(salary_in_usd, na.rm = TRUE),
    .groups = "drop"
  )
# Draw an interactive choropleth of average salary by employee region.
#
# Arguments:
#   salary_data  Data frame with `region` and `avg_salary` columns (the rows
#                of `average_salary_data` for one experience level).
#   level_label  Text appended to the plot title, e.g. "ENTRY".
#   base_map     Polygon data frame with `long`, `lat`, `group` and `region`
#                columns; defaults to `world_map_filtered`.
#
# Returns the plotly object created by ggplotly().
plot_salary_map <- function(salary_data, level_label,
                            base_map = world_map_filtered) {
  salary_map <- ggplot() +
    # Light-gray backdrop so regions without salary data are still drawn.
    geom_polygon(
      data = base_map,
      aes(x = long, y = lat, group = group),
      fill = "lightgray", color = "white"
    ) +
    # Regions present in `salary_data` are shaded by their average salary.
    geom_map(
      data = salary_data, map = base_map,
      aes(map_id = region, fill = avg_salary),
      color = "white"
    ) +
    scale_fill_gradient(
      low = "lightblue",
      high = "darkblue",
      na.value = "gray",
      labels = label_comma(),
      name = "Average Salary"
    ) +
    labs(title = paste("Average Salary across Regions -", level_label)) +
    theme_minimal()

  ggplotly(salary_map)
}

# One map per experience level; the per-level tables keep their names.
entry_salary <- average_salary_data %>%
  dplyr::filter(experience_level == "EN")
plot_salary_map(entry_salary, "ENTRY")

mid_salary <- average_salary_data %>%
  dplyr::filter(experience_level == "MI")
plot_salary_map(mid_salary, "MID")

senior_salary <- average_salary_data %>%
  dplyr::filter(experience_level == "SE")
plot_salary_map(senior_salary, "SENIOR")

# Fix: this previously filtered on "EN", so the "EXPERT" map silently showed
# the entry-level data again. "EX" is the expert code used elsewhere.
expert_salary <- average_salary_data %>%
  dplyr::filter(experience_level == "EX")
plot_salary_map(expert_salary, "EXPERT")
# Number of rows for every (experience level, work setting) combination.
summary_data <- d %>%
  count(experience_level, work_setting, name = "counting")

# Stacked bars normalised to 100% per experience level, showing the share of
# each work setting.
summary_data %>%
  # Use the same ordered, human-readable experience labels as the other
  # experience-level plots instead of the raw codes.
  mutate(
    experience_level = factor(
      experience_level,
      levels = c("EX", "SE", "MI", "EN"),
      labels = c("Expert", "Senior", "Mid", "Entry")
    )
  ) %>%
  ggplot(aes(x = experience_level, y = counting, fill = work_setting)) +
  # geom_col() is the idiomatic form of geom_bar(stat = "identity").
  geom_col(position = "fill") +
  scale_y_continuous(labels = scales::percent_format()) +
  labs(
    x = "Experience Level",
    # The axis shows shares (position = "fill" with a percent scale), so it
    # is labelled "Proportion" rather than "Count".
    y = "Proportion",
    fill = "Work Setting",
    title = "Proportion Of Work Setting Across Experience Level"
  ) +
  theme_minimal()
Insights:
As experience increases to senior level, there’s a clear preference for traditional on-site work.
Entry-level positions offer more hybrid flexibility than other experience levels.
Remote work remains a consistent option regardless of experience level.
Mid-level appears to be a transition point where work arrangements shift.
# Histogram of `salary_in_usd` for rows with company_size "M", coloured by
# work setting, using $10,000-wide bins.
d %>%
  dplyr::filter(company_size == "M") %>%
  ggplot(aes(x = salary_in_usd)) +
  geom_histogram(aes(fill = work_setting), binwidth = 10000, alpha = 0.5) +
  scale_x_continuous(
    breaks = seq(0, 500000, 50000),
    labels = dollar_format()
  ) +
  scale_y_continuous(breaks = seq(0, 300, 50)) +
  labs(
    title = "Distribution of Salary by Work Setting",
    x = "Salary in USD",
    y = "Count",
    fill = "Work Setting"
  )
Insights:
On-site work shows stronger representation in both the middle salary range ($150,000) and at higher salaries (above $250,000).
Remote work has significant presence across most of the salary spectrum but tails off at the highest ranges
Hybrid arrangements appear to be less common overall and are most visible in the lower to mid-salary ranges
=> On-site work is most common overall, while remote and hybrid arrangements have different salary distribution patterns.
# The ten most frequent job titles among rows with company_size "M".
chosen_jobs <- d %>%
  dplyr::filter(company_size == "M") %>%
  count(job_title, name = "count", sort = TRUE) %>%
  slice_head(n = 10) %>%
  pull(job_title)
# Heat map of mean `salary_in_usd` per (job title, work setting), restricted
# to the titles in `chosen_jobs` and to rows with company_size "M".
d %>%
  dplyr::filter(company_size == "M", job_title %in% chosen_jobs) %>%
  group_by(job_title, work_setting) %>%
  summarise(
    avg_salary = mean(salary_in_usd, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  ggplot(aes(x = job_title, y = work_setting)) +
  geom_tile(aes(fill = avg_salary)) +
  scale_fill_gradientn(colors = c("#F6F8D5", "#98D2C0", "#205781")) +
  labs(
    title = "Average Salary of Different Job Titles across Work Setting",
    x = "Job Titles",
    y = "Work Setting",
    fill = "Average Salary"
  )
1. Hybrid work settings:
Significantly lower salaries across all job titles compared to on-site and remote
Several job titles don’t appear to have hybrid options (shown as gray/white cells)
Analytics Engineer, Data Analyst, Data Engineer, Data Scientist, Research Engineer, and Research Scientist roles are available in hybrid settings
The highest hybrid salaries appear to be for Data Scientists and Research Scientists, but still lower than their on-site/remote counterparts
2. On-site vs. Remote work:
Salaries are fairly consistent between on-site and remote for most positions
Data Science Manager shows the highest average salary in both on-site and remote settings (darkest blue)
Applied Scientist, Machine Learning Engineer, and Research Scientist also command high salaries in both settings
3. Role-based observations:
Data Science Manager is the highest-paid role across settings
Data Analyst appears to have the lowest average salary (lightest color) across all settings
Specialized roles like Machine Learning Engineer, Applied Scientist, and Research Scientist command higher salaries
Data Engineer shows mid-range compensation
4. Overall patterns:
Clear salary hierarchy exists across job titles regardless of work setting
Hybrid work consistently offers lower compensation than traditional on-site or remote arrangements
Remote work is competitive with (and sometimes exceeds) on-site work in terms of compensation
The technical/specialized roles tend to have higher compensation regardless of setting
Job titles with at least 30 observations:
# Print the job titles that occur at least 30 times, most frequent first.
d %>%
  count(job_title, sort = TRUE) %>%
  dplyr::filter(n >= 30)

# Salary summary (count, min, max, median, mean) for those same job titles,
# ordered by descending count.
job_30 <- d %>%
  group_by(job_title) %>%
  summarise(
    count = n(),
    min_salary = min(salary_in_usd),
    max_salary = max(salary_in_usd),
    median_salary = median(salary_in_usd),
    mean_salary = mean(salary_in_usd)
  ) %>%
  dplyr::filter(count >= 30) %>%
  arrange(desc(count))
# Horizontal box plots of `salary_in_usd` for the job titles in `job_30`,
# ordered by median salary.
d %>%
  dplyr::filter(job_title %in% job_30$job_title) %>%
  ggplot(
    aes(
      x = reorder(job_title, salary_in_usd, FUN = median),
      y = salary_in_usd
    )
  ) +
  geom_boxplot(aes(fill = job_title), alpha = 0.7) +
  scale_y_continuous(labels = dollar_format()) +
  coord_flip() +
  labs(
    title = "Salary Distribution by Common Data Science Job Titles",
    subtitle = "For job titles with at least 30 observations",
    x = "",
    y = "Annual Salary (USD)"
  ) +
  theme_minimal() +
  # Job titles are already on the axis, so the fill legend is hidden.
  theme(
    legend.position = "none",
    axis.text.y = element_text(size = 10)
  )
Insights:
Most roles have median salaries between approximately $150,000 and $200,000, showing that data science fields generally offer substantial compensation.
Data Engineer shows a cluster of outliers in the $300,000-$350,000 range, suggesting some specialized data engineers command much higher salaries than the median.
Research Scientist has the widest overall range, with outliers reaching close to $450,000, indicating significant variability in compensation.
# One salary histogram (20 bins) per job title in `job_30`; each panel has
# its own y scale and the x axis is labelled in thousands of dollars.
d %>%
  dplyr::filter(job_title %in% job_30$job_title) %>%
  ggplot(aes(x = salary_in_usd, fill = job_title)) +
  geom_histogram(bins = 20, alpha = 0.5) +
  facet_wrap(~ job_title, scales = "free_y") +
  scale_x_continuous(
    labels = scales::label_dollar(scale = 1e-3, suffix = "K")
  ) +
  labs(
    x = "Salary in USD",
    y = "Count"
  ) +
  # The facet strips already name the job title.
  theme(legend.position = "none")
Insights:
There’s significant variation in data volume across roles - Data Engineer and Data Scientist have hundreds of observations, while roles like ML Engineer and Research Engineer have fewer than 50, suggesting these are less common positions.
Data Analyst shows a clear right-skewed distribution with most salaries clustered around $100K-$150K.
Data Engineer has a prominent peak around $150K with a long tail extending to $300K.
Data Scientist distribution is more spread out between $100K-$250K.
Machine Learning Engineer shows a multimodal distribution with peaks at around $100K, $200K, and $250K.
Research Scientist shows the most interesting high-end distribution with scattered observations extending all the way to $450K, confirming the wide range seen in the box plot.
# Job title with the largest number of rows in `d`.
most_common_job <- d %>%
  count(job_title, sort = TRUE) %>%
  slice_head(n = 1) %>%
  pull(job_title)

# Row count plus median and mean salary of that job per company location,
# keeping only locations with at least 5 rows.
most_common_job_locations <- d %>%
  dplyr::filter(job_title == most_common_job) %>%
  group_by(company_location) %>%
  summarise(
    count = n(),
    `Median salary` = median(salary_in_usd),
    `Mean salary` = mean(salary_in_usd)
  ) %>%
  dplyr::filter(count >= 5)

print(most_common_job)
## [1] "Data Engineer"
# Long format: one row per (company location, salary statistic). Each row
# also carries `max_salary`, the larger of that location's two statistics.
most_common_job_locations_modified <- most_common_job_locations %>%
  pivot_longer(
    cols = c(`Median salary`, `Mean salary`),
    names_to = "salary_type",
    values_to = "salary"
  ) %>%
  group_by(company_location) %>%
  mutate(max_salary = max(salary)) %>%
  ungroup()
# Horizontal dodged bars of median and mean salary per company location for
# `most_common_job`, with each location's sample size printed past its
# longest bar.
ggplot(
  data = most_common_job_locations_modified,
  aes(x = company_location, y = salary)
) +
  geom_col(
    aes(fill = salary_type),
    position = position_dodge(width = 0.7),
    alpha = 0.8
  ) +
  # Fix: the label data is reduced to one row per location. Previously the
  # "n=" text was drawn once per salary_type row, stacking two identical
  # labels on the same spot. The dodge was dropped from the text layer; it
  # had no effect because the group was the location itself.
  geom_text(
    data = most_common_job_locations_modified %>%
      distinct(company_location, count, max_salary),
    aes(y = max_salary * 1.05, label = paste0("n=", count)),
    vjust = -0.5,
    size = 3
  ) +
  scale_y_continuous(
    labels = dollar_format(),
    breaks = seq(0, max(most_common_job_locations_modified$salary), 30000)
  ) +
  coord_flip() +
  theme_minimal() +
  labs(
    title = paste("Salary Statistics for", most_common_job, "by Company Location"),
    subtitle = "For countries with at least 5 observations",
    x = "Company Location",
    y = "Annual Salary (USD)",
    fill = "Salary Statistic"
  )
Insights:
The USA and Canada clearly lead with median salaries around $150K for Data Engineers, while India shows the lowest at approximately $25K.
In the USA, the mean salary is noticeably higher than the median, indicating some high outliers pulling the average up.
Canada shows little difference between mean and median, suggesting a more even distribution.
Combining all three visuals, we can conclude that:
Data science compensation is heavily influenced by:
Geographic location (with North America paying significantly more)
Role seniority (managers and specialized researchers earning the most)
Technical specialization (ML and architecture roles generally outearning analyst positions)
The significant disparity between North American and other markets highlights the global nature of the talent competition, with the USA and Canada appearing to be the most lucrative markets for data professionals.
Research-oriented positions show the widest salary ranges, suggesting that specialized expertise or accomplishments in these areas can command premium compensation beyond the typical ranges.