# Make relative paths (e.g. "data_clean.csv") resolve against this script's
# folder when it is run from the RStudio editor.
# getActiveDocumentContext() needs a running RStudio session, and its path may
# be empty (e.g. a document that has not been saved yet), so both cases are
# guarded instead of erroring. Outside RStudio the working directory is left
# unchanged. local() keeps the helper variable out of the global environment.
local({
  has_rstudio <- requireNamespace("rstudioapi", quietly = TRUE) &&
    rstudioapi::isAvailable()
  if (has_rstudio) {
    script_path <- rstudioapi::getActiveDocumentContext()$path
    if (isTRUE(nzchar(script_path))) {
      setwd(dirname(script_path))
    }
  }
})
This report investigates the determinants of data science salaries using a cross-sectional dataset of reported compensation. We combine exploratory data analysis (EDA) with a fixed-effects regression framework to quantify how experience, work model (remote/hybrid/on-site), company size, and year affect (log) salary, while absorbing country-level heterogeneity through location fixed effects. We then assess the robustness of the main findings to (i) restricting the sample to full-time employees, (ii) replacing company-location with employee-residence fixed effects, (iii) trimming the top 1% of salaries, and (iv) allowing the remote-work effect to vary by experience level.
# Packages this report depends on; install whichever ones are not yet present.
required <- c("tidyverse", "fixest", "modelsummary", "scales")
to_install <- required[!required %in% rownames(installed.packages())]
if (length(to_install) > 0) {
  install.packages(to_install)
}
library(tidyverse)
library(scales)
library(fixest)
library(modelsummary)
# Load the salary dataset and print a compact column-by-column overview.
raw_data <- read_csv(file = "data_clean.csv")
raw_data %>% glimpse()
## Rows: 6,599
## Columns: 11
## $ job_title <chr> "Data Engineer", "Data Engineer", "Data Scientist",…
## $ experience_level <chr> "Mid-level", "Mid-level", "Senior-level", "Senior-l…
## $ employment_type <chr> "Full-time", "Full-time", "Full-time", "Full-time",…
## $ work_models <chr> "Remote", "Remote", "Remote", "Remote", "On-site", …
## $ work_year <dbl> 2024, 2024, 2024, 2024, 2024, 2024, 2024, 2024, 202…
## $ employee_residence <chr> "United States", "United States", "United States", …
## $ salary_in_usd <dbl> 148100, 98700, 140032, 100022, 120000, 62100, 25000…
## $ company_location <chr> "United States", "United States", "United States", …
## $ company_size <chr> "Medium", "Medium", "Medium", "Medium", "Medium", "…
## $ job_category <chr> "Data Engineering", "Data Engineering", "Data Scien…
## $ log_salary <dbl> 11.90564, 11.49984, 11.84963, 11.51315, 11.69525, 1…
A single cleaning pipeline is reused by both the EDA and the econometric models, removing the duplicated cleaning logic from the two original scripts.
# Single cleaning pipeline shared by the EDA plots and the regression models.
cleaned_data <- raw_data %>%
  # Drop rows with a missing value in any column, then keep strictly positive
  # salaries so that log(salary_in_usd) below is finite.
  drop_na() %>%
  filter(salary_in_usd > 0) %>%
  mutate(
    # Explicit level order: the first level ("Entry-level" / "Small") is the
    # omitted baseline category in the regressions.
    experience_level = factor(
      experience_level,
      levels = c("Entry-level", "Mid-level", "Senior-level", "Executive-level")
    ),
    company_size = factor(company_size, levels = c("Small", "Medium", "Large")),
    # Remaining categorical variables (and the year) are treated as factors.
    work_year = as.factor(work_year),
    employment_type = as.factor(employment_type),
    work_models = as.factor(work_models),
    company_location = as.factor(company_location),
    employee_residence = as.factor(employee_residence),
    log_salary = log(salary_in_usd),
    # Coarse job family derived from the title; the first matching pattern wins.
    job_category = case_when(
      str_detect(job_title, "Data Scientist") ~ "Data Science",
      str_detect(job_title, "Data Engineer") ~ "Data Engineering",
      str_detect(job_title, "Analyst") ~ "Data Analysis",
      str_detect(job_title, "Machine Learning|ML") ~ "Machine Learning",
      str_detect(job_title, "Manager|Lead|Head") ~ "Management",
      TRUE ~ "Other"
    )
  ) %>%
  # factor(levels = ...) turns any label outside the declared levels into NA,
  # and that happens after the drop_na() above. Remove such rows here so they
  # cannot show up as an "NA" group in the plots.
  drop_na(experience_level, company_size) %>%
  select(-any_of(c("salary", "salary_currency")))

# NOTE(review): this overwrites "data_clean.csv", the same path raw_data was
# read from -- confirm that replacing the input file in place is intended.
write_csv(cleaned_data, "data_clean.csv")
dim(cleaned_data)
## [1] 6599 11
# Boxplot of salary (USD) by experience level; legend hidden because the
# fill colour duplicates the x axis.
cleaned_data %>%
  ggplot(aes(x = experience_level, y = salary_in_usd, fill = experience_level)) +
  geom_boxplot(alpha = 0.7) +
  scale_y_continuous(labels = label_dollar()) +
  ggtitle(
    label = "Does Experience Actually Pay Off?",
    subtitle = "Salary distribution across experience levels"
  ) +
  xlab("Level of Experience") +
  ylab("Salary (USD)") +
  theme_minimal() +
  theme(legend.position = "none")
# Histogram (30 bins) of the salary distribution in USD.
cleaned_data %>%
  ggplot(aes(x = salary_in_usd)) +
  geom_histogram(bins = 30, fill = "#2d5a3f", color = "white") +
  scale_x_continuous(labels = label_dollar()) +
  ggtitle(
    label = "The Spread of Data Science Salaries",
    subtitle = "Most salaries cluster between $100k and $200k"
  ) +
  xlab("Salary (USD)") +
  ylab("Number of Employees") +
  theme_minimal()
# Mean salary per job category, highest first.
category_summary <- cleaned_data %>%
  group_by(job_category) %>%
  summarise(
    avg_salary = mean(salary_in_usd),
    .groups = "drop"
  ) %>%
  arrange(desc(avg_salary))

# Horizontal bar chart of the category means, bars ordered by average salary.
category_summary %>%
  ggplot(
    aes(
      x = reorder(job_category, avg_salary),
      y = avg_salary,
      fill = job_category
    )
  ) +
  geom_col() +
  coord_flip() +
  scale_y_continuous(labels = label_dollar()) +
  ggtitle("Which Field Pays the Best?") +
  xlab("Job Category") +
  ylab("Average Salary (USD)") +
  theme_minimal() +
  theme(legend.position = "none")
# Violin plot of salary by work model with a narrow boxplot overlaid;
# boxplot outliers are hidden because the violin already shows the tails.
cleaned_data %>%
  ggplot(aes(x = work_models, y = salary_in_usd, fill = work_models)) +
  geom_violin(alpha = 0.6, trim = FALSE) +
  geom_boxplot(outlier.shape = NA, color = "black", width = 0.1) +
  scale_y_continuous(labels = label_dollar()) +
  ggtitle("Remote vs. On-site: Is There a Pay Gap?") +
  xlab("Work Model") +
  ylab("Salary (USD)") +
  theme_minimal() +
  theme(legend.position = "none")
# Boxplot of salary (USD) by company size.
cleaned_data %>%
  ggplot(aes(x = company_size, y = salary_in_usd, fill = company_size)) +
  geom_boxplot() +
  scale_y_continuous(labels = label_dollar()) +
  ggtitle("Does Company Size Matter?") +
  xlab("Company Size") +
  ylab("Salary (USD)") +
  theme_minimal() +
  theme(legend.position = "none")
# Average salary per company location, restricted to locations with more than
# 10 rows, keeping the 10 highest-paying ones.
geo_summary <- cleaned_data %>%
  group_by(company_location) %>%
  summarise(
    n_roles = n(),
    avg_salary = mean(salary_in_usd),
    .groups = "drop"
  ) %>%
  filter(n_roles > 10) %>%
  select(-n_roles) %>%
  arrange(desc(avg_salary)) %>%
  slice_head(n = 10)

# Horizontal bar chart of those locations, ordered by average salary.
geo_summary %>%
  ggplot(aes(x = reorder(company_location, avg_salary), y = avg_salary)) +
  geom_col(fill = "#5a8a6e") +
  coord_flip() +
  scale_y_continuous(labels = label_dollar()) +
  ggtitle(
    label = "Top 10 Locations with Highest Average Salaries",
    subtitle = "Only countries with more than 10 reported roles included"
  ) +
  xlab("Country") +
  ylab("Average Salary (USD)") +
  theme_minimal()
# Main specification: log salary on experience level, work model, company size
# and year, with company-location fixed effects (the part after "|") and
# heteroskedasticity-robust standard errors.
model1 <- feols(
  fml = log_salary ~ experience_level + work_models + company_size + work_year |
    company_location,
  vcov = "hetero",
  data = cleaned_data
)
summary(model1)
## OLS estimation, Dep. Var.: log_salary
## Observations: 6,581
## Fixed-effects: company_location: 57
## Standard-errors: Heteroskedasticity-robust
## Estimate Std. Error t value Pr(>|t|)
## experience_levelMid-level 0.319570 0.022049 14.493956 < 2.2e-16 ***
## experience_levelSenior-level 0.589480 0.020541 28.697087 < 2.2e-16 ***
## experience_levelExecutive-level 0.776942 0.032577 23.849254 < 2.2e-16 ***
## work_modelsOn-site 0.094438 0.038746 2.437359 1.4822e-02 *
## work_modelsRemote 0.056425 0.037854 1.490594 1.3612e-01
## company_sizeMedium 0.155068 0.043267 3.583981 3.4087e-04 ***
## company_sizeLarge 0.183858 0.043882 4.189796 2.8289e-05 ***
## work_year2021 -0.004570 0.068649 -0.066573 9.4692e-01
## work_year2022 0.005000 0.065542 0.076288 9.3919e-01
## work_year2023 0.074351 0.066156 1.123863 2.6111e-01
## work_year2024 0.075900 0.068741 1.104146 2.6957e-01
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.403023 Adj. R2: 0.417205
## Within R2: 0.187955
# Robustness check (i): same specification as the main model, estimated on
# full-time employees only.
cleaned_data_fulltime <- filter(cleaned_data, employment_type == "Full-time")

model2 <- feols(
  fml = log_salary ~ experience_level + work_models + company_size + work_year |
    company_location,
  vcov = "hetero",
  data = cleaned_data_fulltime
)
summary(model2)
## OLS estimation, Dep. Var.: log_salary
## Observations: 6,537
## Fixed-effects: company_location: 57
## Standard-errors: Heteroskedasticity-robust
## Estimate Std. Error t value Pr(>|t|)
## experience_levelMid-level 0.317821 0.021948 14.480722 < 2.2e-16 ***
## experience_levelSenior-level 0.584756 0.020499 28.525454 < 2.2e-16 ***
## experience_levelExecutive-level 0.767264 0.032310 23.747060 < 2.2e-16 ***
## work_modelsOn-site 0.083990 0.038300 2.192964 0.0283454 *
## work_modelsRemote 0.048041 0.037514 1.280634 0.2003683
## company_sizeMedium 0.125672 0.041633 3.018536 0.0025499 **
## company_sizeLarge 0.158123 0.042274 3.740415 0.0001853 ***
## work_year2021 -0.020637 0.069505 -0.296916 0.7665400
## work_year2022 -0.005215 0.067409 -0.077368 0.9383334
## work_year2023 0.064892 0.068162 0.952027 0.3411189
## work_year2024 0.067448 0.070632 0.954925 0.3396514
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.399932 Adj. R2: 0.411489
## Within R2: 0.182948
# Robustness check (ii): same regressors as the main model, but with
# employee-residence fixed effects instead of company-location fixed effects.
model3 <- feols(
  fml = log_salary ~ experience_level + work_models + company_size + work_year |
    employee_residence,
  vcov = "hetero",
  data = cleaned_data
)
summary(model3)
## OLS estimation, Dep. Var.: log_salary
## Observations: 6,573
## Fixed-effects: employee_residence: 61
## Standard-errors: Heteroskedasticity-robust
## Estimate Std. Error t value Pr(>|t|)
## experience_levelMid-level 0.315612 0.022047 14.315312 < 2.2e-16 ***
## experience_levelSenior-level 0.580040 0.020584 28.179227 < 2.2e-16 ***
## experience_levelExecutive-level 0.765084 0.031693 24.140449 < 2.2e-16 ***
## work_modelsOn-site 0.115846 0.036659 3.160069 0.0015846 **
## work_modelsRemote 0.088142 0.035917 2.454035 0.0141524 *
## company_sizeMedium 0.071116 0.044221 1.608189 0.1078423
## company_sizeLarge 0.115596 0.044555 2.594428 0.0094961 **
## work_year2021 -0.057095 0.060117 -0.949735 0.3422824
## work_year2022 -0.066373 0.057159 -1.161193 0.2456059
## work_year2023 0.006300 0.057651 0.109269 0.9129923
## work_year2024 0.003890 0.060417 0.064387 0.9486641
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.399377 Adj. R2: 0.424603
## Within R2: 0.178992
# Robustness check (iii): drop salaries at or above the 99th percentile and
# re-estimate the main specification on the trimmed sample.
salary_cutoff <- quantile(cleaned_data[["salary_in_usd"]], probs = 0.99)
cleaned_data_trimmed <- filter(cleaned_data, salary_in_usd < salary_cutoff)

model4 <- feols(
  fml = log_salary ~ experience_level + work_models + company_size + work_year |
    company_location,
  vcov = "hetero",
  data = cleaned_data_trimmed
)
summary(model4)
## OLS estimation, Dep. Var.: log_salary
## Observations: 6,515
## Fixed-effects: company_location: 57
## Standard-errors: Heteroskedasticity-robust
## Estimate Std. Error t value Pr(>|t|)
## experience_levelMid-level 0.306057 0.021670 14.123456 < 2.2e-16 ***
## experience_levelSenior-level 0.580920 0.020354 28.540872 < 2.2e-16 ***
## experience_levelExecutive-level 0.760460 0.032090 23.697760 < 2.2e-16 ***
## work_modelsOn-site 0.102666 0.037470 2.739989 6.1610e-03 **
## work_modelsRemote 0.071710 0.036622 1.958121 5.0259e-02 .
## company_sizeMedium 0.156018 0.042812 3.644263 2.7027e-04 ***
## company_sizeLarge 0.174353 0.043346 4.022363 5.8276e-05 ***
## work_year2021 0.021618 0.065194 0.331601 7.4020e-01
## work_year2022 0.033234 0.061874 0.537113 5.9121e-01
## work_year2023 0.100833 0.062397 1.615988 1.0615e-01
## work_year2024 0.085196 0.064735 1.316075 1.8820e-01
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.388659 Adj. R2: 0.433696
## Within R2: 0.198968
# Robustness check (iv): interact experience level with work model so the
# work-model effect can differ by experience; other terms as in the main model.
model5 <- feols(
  fml = log_salary ~ experience_level * work_models + company_size + work_year |
    company_location,
  vcov = "hetero",
  data = cleaned_data
)
summary(model5)
## OLS estimation, Dep. Var.: log_salary
## Observations: 6,581
## Fixed-effects: company_location: 57
## Standard-errors: Heteroskedasticity-robust
## Estimate Std. Error
## experience_levelMid-level 0.343409 0.075560
## experience_levelSenior-level 0.619033 0.083473
## experience_levelExecutive-level 1.115405 0.143974
## work_modelsOn-site 0.191847 0.065791
## work_modelsRemote 0.028368 0.066908
## company_sizeMedium 0.147008 0.043080
## company_sizeLarge 0.179725 0.043826
## work_year2021 -0.001642 0.067986
## work_year2022 0.002267 0.064911
## work_year2023 0.071562 0.065461
## work_year2024 0.072747 0.067980
## experience_levelMid-level:work_modelsOn-site -0.065259 0.081521
## experience_levelSenior-level:work_modelsOn-site -0.094003 0.087791
## experience_levelExecutive-level:work_modelsOn-site -0.478176 0.152330
## experience_levelMid-level:work_modelsRemote 0.018247 0.084400
## experience_levelSenior-level:work_modelsRemote 0.050915 0.090276
## experience_levelExecutive-level:work_modelsRemote -0.194188 0.151842
## t value Pr(>|t|)
## experience_levelMid-level 4.544838 5.5971e-06 ***
## experience_levelSenior-level 7.415999 1.3606e-13 ***
## experience_levelExecutive-level 7.747288 1.0821e-14 ***
## work_modelsOn-site 2.915993 3.5577e-03 **
## work_modelsRemote 0.423984 6.7159e-01
## company_sizeMedium 3.412463 6.4771e-04 ***
## company_sizeLarge 4.100886 4.1659e-05 ***
## work_year2021 -0.024156 9.8073e-01
## work_year2022 0.034920 9.7214e-01
## work_year2023 1.093207 2.7434e-01
## work_year2024 1.070136 2.8460e-01
## experience_levelMid-level:work_modelsOn-site -0.800525 4.2344e-01
## experience_levelSenior-level:work_modelsOn-site -1.070751 2.8432e-01
## experience_levelExecutive-level:work_modelsOn-site -3.139086 1.7023e-03 **
## experience_levelMid-level:work_modelsRemote 0.216193 8.2884e-01
## experience_levelSenior-level:work_modelsRemote 0.563995 5.7278e-01
## experience_levelExecutive-level:work_modelsRemote -1.278882 2.0098e-01
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.402039 Adj. R2: 0.419514
## Within R2: 0.191916
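The main specification confirms the patterns visible in the EDA: salaries rise sharply with experience, and there is meaningful variation across work models and company sizes once country effects are absorbed. Coefficients are in log points relative to the omitted category (entry-level experience, hybrid work, small company, and the earliest year in the sample). For small coefficients they approximate percentage changes; for larger ones the exact effect is $100 \times (e^{\beta} - 1)$ percent — for example, the executive-level coefficient of 0.777 in the main model corresponds to roughly 117% higher pay than entry level, not 78%.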
# Side-by-side regression table of the main model and the four robustness
# checks, rendered as markdown with significance stars.
modelsummary(
  models = setNames(
    list(model1, model2, model3, model4, model5),
    c(
      "Main Model",
      "Full-Time Only",
      "Residence FE",
      "Trimmed Sample",
      "Remote x Exp."
    )
  ),
  output = "markdown",
  stars = TRUE
)
| | Main Model | Full-Time Only | Residence FE | Trimmed Sample | Remote x Exp. |
|---|---|---|---|---|---|
| experience_levelMid-level | 0.320*** | 0.318*** | 0.316*** | 0.306*** | 0.343*** |
| | (0.022) | (0.022) | (0.022) | (0.022) | (0.076) |
| experience_levelSenior-level | 0.589*** | 0.585*** | 0.580*** | 0.581*** | 0.619*** |
| | (0.021) | (0.020) | (0.021) | (0.020) | (0.083) |
| experience_levelExecutive-level | 0.777*** | 0.767*** | 0.765*** | 0.760*** | 1.115*** |
| | (0.033) | (0.032) | (0.032) | (0.032) | (0.144) |
| work_modelsOn-site | 0.094* | 0.084* | 0.116** | 0.103** | 0.192** |
| | (0.039) | (0.038) | (0.037) | (0.037) | (0.066) |
| work_modelsRemote | 0.056 | 0.048 | 0.088* | 0.072+ | 0.028 |
| | (0.038) | (0.038) | (0.036) | (0.037) | (0.067) |
| company_sizeMedium | 0.155*** | 0.126** | 0.071 | 0.156*** | 0.147*** |
| | (0.043) | (0.042) | (0.044) | (0.043) | (0.043) |
| company_sizeLarge | 0.184*** | 0.158*** | 0.116** | 0.174*** | 0.180*** |
| | (0.044) | (0.042) | (0.045) | (0.043) | (0.044) |
| work_year2021 | -0.005 | -0.021 | -0.057 | 0.022 | -0.002 |
| | (0.069) | (0.070) | (0.060) | (0.065) | (0.068) |
| work_year2022 | 0.005 | -0.005 | -0.066 | 0.033 | 0.002 |
| | (0.066) | (0.067) | (0.057) | (0.062) | (0.065) |
| work_year2023 | 0.074 | 0.065 | 0.006 | 0.101 | 0.072 |
| | (0.066) | (0.068) | (0.058) | (0.062) | (0.065) |
| work_year2024 | 0.076 | 0.067 | 0.004 | 0.085 | 0.073 |
| | (0.069) | (0.071) | (0.060) | (0.065) | (0.068) |
| experience_levelMid-level × work_modelsOn-site | | | | | -0.065 |
| | | | | | (0.082) |
| experience_levelSenior-level × work_modelsOn-site | | | | | -0.094 |
| | | | | | (0.088) |
| experience_levelExecutive-level × work_modelsOn-site | | | | | -0.478** |
| | | | | | (0.152) |
| experience_levelMid-level × work_modelsRemote | | | | | 0.018 |
| | | | | | (0.084) |
| experience_levelSenior-level × work_modelsRemote | | | | | 0.051 |
| | | | | | (0.090) |
| experience_levelExecutive-level × work_modelsRemote | | | | | -0.194 |
| | | | | | (0.152) |
| Num.Obs. | 6581 | 6537 | 6573 | 6515 | 6581 |
| R2 | 0.423 | 0.418 | 0.431 | 0.440 | 0.426 |
| R2 Adj. | 0.417 | 0.411 | 0.425 | 0.434 | 0.420 |
| R2 Within | 0.188 | 0.183 | 0.179 | 0.199 | 0.192 |
| R2 Within Adj. | 0.187 | 0.182 | 0.178 | 0.198 | 0.190 |
| AIC | 6850.9 | 6705.4 | 6731.3 | 6310.7 | 6830.8 |
| BIC | 7312.8 | 7166.8 | 7220.3 | 6771.9 | 7333.4 |
| RMSE | 0.40 | 0.40 | 0.40 | 0.39 | 0.40 |
| FE: company_location | X | X | | X | X |
| FE: employee_residence | | | X | | |

Note: + p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001. Heteroskedasticity-robust standard errors in parentheses.
Columns 2–5 of the table above re-estimate the wage equation under alternative modelling choices.
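Across all four alternatives, the experience-level coefficients remain large, positive, and ordered in the expected direction, and the work-model and company-size effects keep their sign — indicating that the headline findings are not artefacts of sample composition or fixed-effects choice. Magnitudes are broadly similar, with one exception: under residence fixed effects the company-size premia shrink, and the medium-company coefficient falls from 0.155 to 0.071 and is no longer statistically significant.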
The combined evidence suggests that experience is the single largest determinant of compensation in data science roles, with executive-level employees earning substantially more than entry-level peers even after absorbing country-level cost-of-living differences. Company size is positively associated with pay. The remote-work premium (relative to hybrid work) is small and statistically significant only under residence fixed effects, and the interaction model gives little evidence that it varies with experience: none of the remote × experience terms is significant, and the only significant interaction is a smaller on-site premium for executives. Results are stable across the full-time subsample, an alternative residence-based fixed effect, and a top-1% trimmed sample, indicating that the headline findings are not driven by part-time outliers, country mis-coding, or extreme top earners.
# Print the R version, platform, locale and package versions of the session
# that produced this report, for reproducibility.
sessionInfo()
## R version 4.4.1 (2024-06-14 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 10 x64 (build 19045)
##
## Matrix products: default
##
##
## locale:
## [1] LC_COLLATE=Polish_Poland.utf8 LC_CTYPE=Polish_Poland.utf8
## [3] LC_MONETARY=Polish_Poland.utf8 LC_NUMERIC=C
## [5] LC_TIME=Polish_Poland.utf8
##
## time zone: Europe/Warsaw
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] modelsummary_2.6.0 fixest_0.14.1 scales_1.4.0 lubridate_1.9.4
## [5] forcats_1.0.0 stringr_1.5.1 dplyr_1.1.4 purrr_1.0.2
## [9] readr_2.1.5 tidyr_1.3.1 tibble_3.2.1 ggplot2_4.0.3
## [13] tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 bayestestR_0.17.0 xfun_0.49
## [4] bslib_0.8.0 insight_1.5.0 lattice_0.22-6
## [7] tzdb_0.4.0 numDeriv_2016.8-1.1 vctrs_0.6.5
## [10] tools_4.4.1 generics_0.1.3 datawizard_1.3.1
## [13] parallel_4.4.1 sandwich_3.1-1 fansi_1.0.6
## [16] pkgconfig_2.0.3 tinytable_0.16.0 checkmate_2.3.2
## [19] data.table_1.17.8 RColorBrewer_1.1-3 S7_0.2.0
## [22] stringmagic_1.2.0 lifecycle_1.0.5 compiler_4.4.1
## [25] farver_2.1.2 codetools_0.2-20 htmltools_0.5.8.1
## [28] sass_0.4.9 yaml_2.3.10 Formula_1.2-5
## [31] pillar_1.9.0 crayon_1.5.3 jquerylib_0.1.4
## [34] cachem_1.1.0 parallelly_1.41.0 nlme_3.1-164
## [37] tidyselect_1.2.1 digest_0.6.37 performance_0.16.0
## [40] future_1.34.0 mvtnorm_1.3-2 stringi_1.8.4
## [43] listenv_0.9.1 labeling_0.4.3 fastmap_1.2.0
## [46] grid_4.4.1 cli_3.6.3 magrittr_2.0.3
## [49] utf8_1.2.4 future.apply_1.11.3 withr_3.0.2
## [52] dreamerr_1.5.0 backports_1.5.0 bit64_4.5.2
## [55] timechange_0.3.0 estimability_1.5.1 rmarkdown_2.29
## [58] globals_0.16.3 emmeans_1.10.6 bit_4.5.0.1
## [61] zoo_1.8-12 hms_1.1.3 evaluate_1.0.5
## [64] knitr_1.49 parameters_0.29.0 rlang_1.1.4
## [67] Rcpp_1.1.1 xtable_1.8-4 glue_1.8.0
## [70] rstudioapi_0.17.1 vroom_1.6.5 jsonlite_1.8.9
## [73] R6_2.6.1 tables_0.9.33