1 Set working directory to the current folder

# Point the working directory at this document's folder so that relative paths
# such as "data_clean.csv" resolve next to it. This only makes sense inside an
# interactive RStudio session: outside RStudio the rstudioapi call errors, and
# for an unsaved document the reported path is empty, so both cases are skipped
# and the current working directory is left untouched.
if (requireNamespace("rstudioapi", quietly = TRUE) &&
      rstudioapi::isAvailable()) {
  doc_path <- rstudioapi::getActiveDocumentContext()$path
  if (nzchar(doc_path)) {
    setwd(dirname(doc_path))
  }
}

2 Introduction

This report investigates the determinants of data science salaries using a cross-sectional dataset of reported compensation. We combine exploratory data analysis (EDA) with a fixed-effects regression framework to quantify how experience, work model (remote/hybrid/on-site), company size, and year affect (log) salary, while absorbing country-level heterogeneity through location fixed effects. We then assess the robustness of the main findings to (i) restricting the sample to full-time employees, (ii) replacing company-location with employee-residence fixed effects, (iii) trimming the top 1% of salaries, and (iv) allowing the remote-work effect to vary by experience level.

3 Data

3.1 Dependencies

# Packages this analysis depends on. Any that are not yet installed are
# installed here, before they are attached with library().
required <- c("tidyverse", "fixest", "modelsummary", "scales")
to_install <- required[!required %in% rownames(installed.packages())]
if (length(to_install) > 0) {
  install.packages(to_install)
}

library(tidyverse)
library(scales)
library(fixest)
library(modelsummary)

3.2 Load the raw data

# Read the salary data from the working directory and print a compact
# column-by-column overview (name, type, first values).
raw_data <- readr::read_csv(file = "data_clean.csv")
dplyr::glimpse(raw_data)
## Rows: 6,599
## Columns: 11
## $ job_title          <chr> "Data Engineer", "Data Engineer", "Data Scientist",…
## $ experience_level   <chr> "Mid-level", "Mid-level", "Senior-level", "Senior-l…
## $ employment_type    <chr> "Full-time", "Full-time", "Full-time", "Full-time",…
## $ work_models        <chr> "Remote", "Remote", "Remote", "Remote", "On-site", …
## $ work_year          <dbl> 2024, 2024, 2024, 2024, 2024, 2024, 2024, 2024, 202…
## $ employee_residence <chr> "United States", "United States", "United States", …
## $ salary_in_usd      <dbl> 148100, 98700, 140032, 100022, 120000, 62100, 25000…
## $ company_location   <chr> "United States", "United States", "United States", …
## $ company_size       <chr> "Medium", "Medium", "Medium", "Medium", "Medium", "…
## $ job_category       <chr> "Data Engineering", "Data Engineering", "Data Scien…
## $ log_salary         <dbl> 11.90564, 11.49984, 11.84963, 11.51315, 11.69525, 1…

3.3 Cleaning and feature construction

A single cleaning pipeline is reused by both the EDA and the econometric models, removing the duplicated cleaning logic from the two original scripts.

# Single cleaning pipeline shared by the EDA and the regression models:
#   * drop rows with any missing value and rows with a non-positive salary;
#   * convert the categorical columns to factors (experience level and company
#     size get an explicit level order, the others use the default ordering);
#   * derive log_salary and a coarse job_category from job_title;
#   * drop the "salary" / "salary_currency" columns when they are present.
cleaned_data <- raw_data %>%
  drop_na() %>%
  filter(salary_in_usd > 0) %>%
  mutate(
    experience_level = factor(
      experience_level,
      levels = c("Entry-level", "Mid-level", "Senior-level", "Executive-level")
    ),
    company_size       = factor(company_size, levels = c("Small", "Medium", "Large")),
    work_year          = as.factor(work_year),
    employment_type    = as.factor(employment_type),
    work_models        = as.factor(work_models),
    company_location   = as.factor(company_location),
    employee_residence = as.factor(employee_residence),
    log_salary         = log(salary_in_usd),
    # case_when() uses the first pattern that matches, so a title matching
    # several patterns is assigned to the earliest category listed here.
    job_category = case_when(
      str_detect(job_title, "Data Scientist")      ~ "Data Science",
      str_detect(job_title, "Data Engineer")       ~ "Data Engineering",
      str_detect(job_title, "Analyst")             ~ "Data Analysis",
      str_detect(job_title, "Machine Learning|ML") ~ "Machine Learning",
      str_detect(job_title, "Manager|Lead|Head")   ~ "Management",
      TRUE                                          ~ "Other"
    )
  ) %>%
  select(-any_of(c("salary", "salary_currency")))

# Save the cleaned table under its own name. The previous target,
# "data_clean.csv", is the very file read into raw_data, so each run overwrote
# the pipeline's input with its output and the original data could not be
# recovered after a change to the cleaning rules.
write_csv(cleaned_data, "data_clean_processed.csv")
dim(cleaned_data)
## [1] 6599   11

4 Exploratory Data Analysis

4.1 Experience level vs. salary

# Box plots of salary (USD) for each experience level. The fill legend is
# hidden because it only repeats the x axis.
cleaned_data %>%
  ggplot(
    aes(x = experience_level, y = salary_in_usd, fill = experience_level)
  ) +
  geom_boxplot(alpha = 0.7) +
  scale_y_continuous(labels = label_dollar()) +
  labs(
    title = "Does Experience Actually Pay Off?",
    subtitle = "Salary distribution across experience levels",
    x = "Level of Experience",
    y = "Salary (USD)"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

4.2 Salary distribution

# Histogram of salaries in USD using 30 bins, with dollar-formatted x axis.
cleaned_data %>%
  ggplot(aes(x = salary_in_usd)) +
  geom_histogram(bins = 30, fill = "#2d5a3f", color = "white") +
  scale_x_continuous(labels = label_dollar()) +
  labs(
    title = "The Spread of Data Science Salaries",
    subtitle = "Most salaries cluster between $100k and $200k",
    x = "Salary (USD)",
    y = "Number of Employees"
  ) +
  theme_minimal()

4.3 Average salary by job category

# Mean salary per job category, sorted from highest to lowest.
category_summary <- cleaned_data %>%
  group_by(job_category) %>%
  summarise(
    avg_salary = mean(salary_in_usd),
    .groups = "drop"
  ) %>%
  arrange(desc(avg_salary))

# Horizontal bar chart of those means; bars are ordered by average salary and
# the fill legend is hidden because it repeats the category axis.
category_summary %>%
  ggplot(
    aes(
      x = reorder(job_category, avg_salary),
      y = avg_salary,
      fill = job_category
    )
  ) +
  geom_col() +
  coord_flip() +
  scale_y_continuous(labels = label_dollar()) +
  labs(
    title = "Which Field Pays the Best?",
    x = "Job Category",
    y = "Average Salary (USD)"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

4.4 Remote vs. on-site

# Salary distribution per work model: an untrimmed violin with a narrow box
# plot drawn on top of it (box-plot outlier points are not drawn).
cleaned_data %>%
  ggplot(aes(x = work_models, y = salary_in_usd, fill = work_models)) +
  geom_violin(trim = FALSE, alpha = 0.6) +
  geom_boxplot(width = 0.1, color = "black", outlier.shape = NA) +
  scale_y_continuous(labels = label_dollar()) +
  labs(
    title = "Remote vs. On-site: Is There a Pay Gap?",
    x = "Work Model",
    y = "Salary (USD)"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

4.5 Company size

# Box plots of salary (USD) by company size, with the fill legend hidden.
cleaned_data %>%
  ggplot(aes(x = company_size, y = salary_in_usd, fill = company_size)) +
  geom_boxplot() +
  scale_y_continuous(labels = label_dollar()) +
  labs(
    title = "Does Company Size Matter?",
    x = "Company Size",
    y = "Salary (USD)"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

4.6 Top 10 locations by average salary

# Mean salary per company location, restricted to locations with more than 10
# rows, keeping the ten locations with the highest averages.
geo_summary <- cleaned_data %>%
  group_by(company_location) %>%
  filter(n() > 10) %>%
  summarise(
    avg_salary = mean(salary_in_usd),
    .groups = "drop"
  ) %>%
  arrange(desc(avg_salary)) %>%
  head(10)

# Horizontal bar chart of those ten locations, ordered by average salary.
geo_summary %>%
  ggplot(aes(x = reorder(company_location, avg_salary), y = avg_salary)) +
  geom_col(fill = "#5a8a6e") +
  coord_flip() +
  scale_y_continuous(labels = label_dollar()) +
  labs(
    title = "Top 10 Locations with Highest Average Salaries",
    subtitle = "Only countries with more than 10 reported roles included",
    x = "Country",
    y = "Average Salary (USD)"
  ) +
  theme_minimal()

5 Model

# Main specification: log salary on experience level, work model, company size
# and year, with company-location fixed effects and
# heteroskedasticity-robust standard errors.
model1 <- feols(
  fml = log_salary ~ experience_level + work_models + company_size + work_year |
    company_location,
  data = cleaned_data,
  vcov = "hetero"
)
summary(model1)
## OLS estimation, Dep. Var.: log_salary
## Observations: 6,581
## Fixed-effects: company_location: 57
## Standard-errors: Heteroskedasticity-robust 
##                                  Estimate Std. Error   t value   Pr(>|t|)    
## experience_levelMid-level        0.319570   0.022049 14.493956  < 2.2e-16 ***
## experience_levelSenior-level     0.589480   0.020541 28.697087  < 2.2e-16 ***
## experience_levelExecutive-level  0.776942   0.032577 23.849254  < 2.2e-16 ***
## work_modelsOn-site               0.094438   0.038746  2.437359 1.4822e-02 *  
## work_modelsRemote                0.056425   0.037854  1.490594 1.3612e-01    
## company_sizeMedium               0.155068   0.043267  3.583981 3.4087e-04 ***
## company_sizeLarge                0.183858   0.043882  4.189796 2.8289e-05 ***
## work_year2021                   -0.004570   0.068649 -0.066573 9.4692e-01    
## work_year2022                    0.005000   0.065542  0.076288 9.3919e-01    
## work_year2023                    0.074351   0.066156  1.123863 2.6111e-01    
## work_year2024                    0.075900   0.068741  1.104146 2.6957e-01    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.403023     Adj. R2: 0.417205
##                  Within R2: 0.187955

5.1 Full-time employees only

# Robustness check 1: same specification as the main model, estimated only on
# rows whose employment type is "Full-time".
cleaned_data_fulltime <- filter(cleaned_data, employment_type == "Full-time")

model2 <- feols(
  fml = log_salary ~ experience_level + work_models + company_size + work_year |
    company_location,
  data = cleaned_data_fulltime,
  vcov = "hetero"
)
summary(model2)
## OLS estimation, Dep. Var.: log_salary
## Observations: 6,537
## Fixed-effects: company_location: 57
## Standard-errors: Heteroskedasticity-robust 
##                                  Estimate Std. Error   t value  Pr(>|t|)    
## experience_levelMid-level        0.317821   0.021948 14.480722 < 2.2e-16 ***
## experience_levelSenior-level     0.584756   0.020499 28.525454 < 2.2e-16 ***
## experience_levelExecutive-level  0.767264   0.032310 23.747060 < 2.2e-16 ***
## work_modelsOn-site               0.083990   0.038300  2.192964 0.0283454 *  
## work_modelsRemote                0.048041   0.037514  1.280634 0.2003683    
## company_sizeMedium               0.125672   0.041633  3.018536 0.0025499 ** 
## company_sizeLarge                0.158123   0.042274  3.740415 0.0001853 ***
## work_year2021                   -0.020637   0.069505 -0.296916 0.7665400    
## work_year2022                   -0.005215   0.067409 -0.077368 0.9383334    
## work_year2023                    0.064892   0.068162  0.952027 0.3411189    
## work_year2024                    0.067448   0.070632  0.954925 0.3396514    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.399932     Adj. R2: 0.411489
##                  Within R2: 0.182948

5.2 Employee-residence fixed effects

# Robustness check 2: same regressors as the main model, but the fixed effect
# is the employee's country of residence instead of the company location.
model3 <- feols(
  fml = log_salary ~ experience_level + work_models + company_size + work_year |
    employee_residence,
  data = cleaned_data,
  vcov = "hetero"
)
summary(model3)
## OLS estimation, Dep. Var.: log_salary
## Observations: 6,573
## Fixed-effects: employee_residence: 61
## Standard-errors: Heteroskedasticity-robust 
##                                  Estimate Std. Error   t value  Pr(>|t|)    
## experience_levelMid-level        0.315612   0.022047 14.315312 < 2.2e-16 ***
## experience_levelSenior-level     0.580040   0.020584 28.179227 < 2.2e-16 ***
## experience_levelExecutive-level  0.765084   0.031693 24.140449 < 2.2e-16 ***
## work_modelsOn-site               0.115846   0.036659  3.160069 0.0015846 ** 
## work_modelsRemote                0.088142   0.035917  2.454035 0.0141524 *  
## company_sizeMedium               0.071116   0.044221  1.608189 0.1078423    
## company_sizeLarge                0.115596   0.044555  2.594428 0.0094961 ** 
## work_year2021                   -0.057095   0.060117 -0.949735 0.3422824    
## work_year2022                   -0.066373   0.057159 -1.161193 0.2456059    
## work_year2023                    0.006300   0.057651  0.109269 0.9129923    
## work_year2024                    0.003890   0.060417  0.064387 0.9486641    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.399377     Adj. R2: 0.424603
##                  Within R2: 0.178992

5.3 Trimming the top 1% of salaries

# Robustness check 3: re-estimate the main specification after dropping every
# row whose salary is at or above the 99th percentile of salary_in_usd.
salary_cutoff <- quantile(cleaned_data$salary_in_usd, probs = 0.99)

cleaned_data_trimmed <- filter(cleaned_data, salary_in_usd < salary_cutoff)

model4 <- feols(
  fml = log_salary ~ experience_level + work_models + company_size + work_year |
    company_location,
  data = cleaned_data_trimmed,
  vcov = "hetero"
)
summary(model4)
## OLS estimation, Dep. Var.: log_salary
## Observations: 6,515
## Fixed-effects: company_location: 57
## Standard-errors: Heteroskedasticity-robust 
##                                 Estimate Std. Error   t value   Pr(>|t|)    
## experience_levelMid-level       0.306057   0.021670 14.123456  < 2.2e-16 ***
## experience_levelSenior-level    0.580920   0.020354 28.540872  < 2.2e-16 ***
## experience_levelExecutive-level 0.760460   0.032090 23.697760  < 2.2e-16 ***
## work_modelsOn-site              0.102666   0.037470  2.739989 6.1610e-03 ** 
## work_modelsRemote               0.071710   0.036622  1.958121 5.0259e-02 .  
## company_sizeMedium              0.156018   0.042812  3.644263 2.7027e-04 ***
## company_sizeLarge               0.174353   0.043346  4.022363 5.8276e-05 ***
## work_year2021                   0.021618   0.065194  0.331601 7.4020e-01    
## work_year2022                   0.033234   0.061874  0.537113 5.9121e-01    
## work_year2023                   0.100833   0.062397  1.615988 1.0615e-01    
## work_year2024                   0.085196   0.064735  1.316075 1.8820e-01    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.388659     Adj. R2: 0.433696
##                  Within R2: 0.198968

5.4 Heterogeneous effect of remote work by experience

# Robustness check 4: the main specification with experience level fully
# interacted with work model (main effects plus all pairwise interactions),
# keeping company-location fixed effects and robust standard errors.
model5 <- feols(
  fml = log_salary ~ experience_level * work_models + company_size + work_year |
    company_location,
  data = cleaned_data,
  vcov = "hetero"
)
summary(model5)
## OLS estimation, Dep. Var.: log_salary
## Observations: 6,581
## Fixed-effects: company_location: 57
## Standard-errors: Heteroskedasticity-robust 
##                                                     Estimate Std. Error
## experience_levelMid-level                           0.343409   0.075560
## experience_levelSenior-level                        0.619033   0.083473
## experience_levelExecutive-level                     1.115405   0.143974
## work_modelsOn-site                                  0.191847   0.065791
## work_modelsRemote                                   0.028368   0.066908
## company_sizeMedium                                  0.147008   0.043080
## company_sizeLarge                                   0.179725   0.043826
## work_year2021                                      -0.001642   0.067986
## work_year2022                                       0.002267   0.064911
## work_year2023                                       0.071562   0.065461
## work_year2024                                       0.072747   0.067980
## experience_levelMid-level:work_modelsOn-site       -0.065259   0.081521
## experience_levelSenior-level:work_modelsOn-site    -0.094003   0.087791
## experience_levelExecutive-level:work_modelsOn-site -0.478176   0.152330
## experience_levelMid-level:work_modelsRemote         0.018247   0.084400
## experience_levelSenior-level:work_modelsRemote      0.050915   0.090276
## experience_levelExecutive-level:work_modelsRemote  -0.194188   0.151842
##                                                      t value   Pr(>|t|)    
## experience_levelMid-level                           4.544838 5.5971e-06 ***
## experience_levelSenior-level                        7.415999 1.3606e-13 ***
## experience_levelExecutive-level                     7.747288 1.0821e-14 ***
## work_modelsOn-site                                  2.915993 3.5577e-03 ** 
## work_modelsRemote                                   0.423984 6.7159e-01    
## company_sizeMedium                                  3.412463 6.4771e-04 ***
## company_sizeLarge                                   4.100886 4.1659e-05 ***
## work_year2021                                      -0.024156 9.8073e-01    
## work_year2022                                       0.034920 9.7214e-01    
## work_year2023                                       1.093207 2.7434e-01    
## work_year2024                                       1.070136 2.8460e-01    
## experience_levelMid-level:work_modelsOn-site       -0.800525 4.2344e-01    
## experience_levelSenior-level:work_modelsOn-site    -1.070751 2.8432e-01    
## experience_levelExecutive-level:work_modelsOn-site -3.139086 1.7023e-03 ** 
## experience_levelMid-level:work_modelsRemote         0.216193 8.2884e-01    
## experience_levelSenior-level:work_modelsRemote      0.563995 5.7278e-01    
## experience_levelExecutive-level:work_modelsRemote  -1.278882 2.0098e-01    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.402039     Adj. R2: 0.419514
##                  Within R2: 0.191916

6 Results

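The main specification confirms the patterns visible in the EDA: salaries rise sharply with experience, and there is meaningful variation across work models and company sizes once country effects are absorbed. Coefficients are in log points relative to the omitted category; small coefficients approximate percentage changes directly, while larger ones should be converted with exp(β) − 1 (for example, the executive-level coefficient of 0.777 in the main model corresponds to a premium of roughly 117% over entry-level).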

6.1 Combined regression table

# Side-by-side table of the main model and the four robustness checks, with
# significance stars, rendered as markdown. The list names become the column
# headers.
fitted_models <- list(
  "Main Model"     = model1,
  "Full-Time Only" = model2,
  "Residence FE"   = model3,
  "Trimmed Sample" = model4,
  "Remote x Exp."  = model5
)
modelsummary(fitted_models, stars = TRUE, output = "markdown")
Main Model Full-Time Only Residence FE Trimmed Sample Remote x Exp.
experience_levelMid-level 0.320*** 0.318*** 0.316*** 0.306*** 0.343***
(0.022) (0.022) (0.022) (0.022) (0.076)
experience_levelSenior-level 0.589*** 0.585*** 0.580*** 0.581*** 0.619***
(0.021) (0.020) (0.021) (0.020) (0.083)
experience_levelExecutive-level 0.777*** 0.767*** 0.765*** 0.760*** 1.115***
(0.033) (0.032) (0.032) (0.032) (0.144)
work_modelsOn-site 0.094* 0.084* 0.116** 0.103** 0.192**
(0.039) (0.038) (0.037) (0.037) (0.066)
work_modelsRemote 0.056 0.048 0.088* 0.072+ 0.028
(0.038) (0.038) (0.036) (0.037) (0.067)
company_sizeMedium 0.155*** 0.126** 0.071 0.156*** 0.147***
(0.043) (0.042) (0.044) (0.043) (0.043)
company_sizeLarge 0.184*** 0.158*** 0.116** 0.174*** 0.180***
(0.044) (0.042) (0.045) (0.043) (0.044)
work_year2021 -0.005 -0.021 -0.057 0.022 -0.002
(0.069) (0.070) (0.060) (0.065) (0.068)
work_year2022 0.005 -0.005 -0.066 0.033 0.002
(0.066) (0.067) (0.057) (0.062) (0.065)
work_year2023 0.074 0.065 0.006 0.101 0.072
(0.066) (0.068) (0.058) (0.062) (0.065)
work_year2024 0.076 0.067 0.004 0.085 0.073
(0.069) (0.071) (0.060) (0.065) (0.068)
experience_levelMid-level × work_modelsOn-site -0.065
(0.082)
experience_levelSenior-level × work_modelsOn-site -0.094
(0.088)
experience_levelExecutive-level × work_modelsOn-site -0.478**
(0.152)
experience_levelMid-level × work_modelsRemote 0.018
(0.084)
experience_levelSenior-level × work_modelsRemote 0.051
(0.090)
experience_levelExecutive-level × work_modelsRemote -0.194
(0.152)
Num.Obs. 6581 6537 6573 6515 6581
R2 0.423 0.418 0.431 0.440 0.426
R2 Adj. 0.417 0.411 0.425 0.434 0.420
R2 Within 0.188 0.183 0.179 0.199 0.192
R2 Within Adj. 0.187 0.182 0.178 0.198 0.190
AIC 6850.9 6705.4 6731.3 6310.7 6830.8
BIC 7312.8 7166.8 7220.3 6771.9 7333.4
RMSE 0.40 0.40 0.40 0.39 0.40
FE: company_location X X X X
FE: employee_residence X
  + p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001

7 Robustness

Columns 2–5 of the table above re-estimate the wage equation under alternative modelling choices.

  • Full-time only (model 2): restricting to full-time employees removes part-time and contract workers as a source of noise.
  • Employee-residence (model 3): absorbing the worker’s country of residence rather than the company’s location addresses concerns about cross-border remote arrangements.
  • Trimmed sample (model 4): removing the top 1% of salaries checks that extreme top earners are not driving the experience premium.
  • Remote × experience (model 5): allowing the remote-work effect to vary by experience level tests whether the average premium hides meaningful heterogeneity.

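Across all four alternatives, the experience-level coefficients remain large, positive, and ordered in the expected direction, and the work-model and company-size effects keep their sign. Magnitudes are broadly similar, with two caveats: under residence fixed effects the medium-company coefficient roughly halves (0.071 versus 0.155) and is no longer statistically significant, and in the interaction model the main-effect coefficients refer to the omitted category of the interacted variable, so they are not directly comparable with the other columns. Overall, this indicates that the headline findings are not artefacts of sample composition or fixed-effects choice.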

8 Conclusion

The combined evidence suggests that experience is the single largest determinant of compensation in data science roles, with executive-level employees earning substantially more than entry-level peers even after absorbing country-level cost-of-living differences. Company size is positively associated with pay. The work-model coefficients are small: the remote coefficient is positive but statistically insignificant in the main specification, and none of the remote × experience interaction terms is significant, so the data provide little evidence that the remote-work effect differs between senior and junior workers; the only significant interaction is a negative executive × on-site term. Results are stable across the full-time subsample, an alternative residence-based fixed effect, and a top-1% trimmed sample, indicating that the headline findings are not driven by part-time outliers, country mis-coding, or extreme top earners.

9 Session info

# Print the R version, platform, locale and attached/loaded package versions
# used for this run, so the results can be reproduced later.
sessionInfo()
## R version 4.4.1 (2024-06-14 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 10 x64 (build 19045)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=Polish_Poland.utf8  LC_CTYPE=Polish_Poland.utf8   
## [3] LC_MONETARY=Polish_Poland.utf8 LC_NUMERIC=C                  
## [5] LC_TIME=Polish_Poland.utf8    
## 
## time zone: Europe/Warsaw
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] modelsummary_2.6.0 fixest_0.14.1      scales_1.4.0       lubridate_1.9.4   
##  [5] forcats_1.0.0      stringr_1.5.1      dplyr_1.1.4        purrr_1.0.2       
##  [9] readr_2.1.5        tidyr_1.3.1        tibble_3.2.1       ggplot2_4.0.3     
## [13] tidyverse_2.0.0   
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6        bayestestR_0.17.0   xfun_0.49          
##  [4] bslib_0.8.0         insight_1.5.0       lattice_0.22-6     
##  [7] tzdb_0.4.0          numDeriv_2016.8-1.1 vctrs_0.6.5        
## [10] tools_4.4.1         generics_0.1.3      datawizard_1.3.1   
## [13] parallel_4.4.1      sandwich_3.1-1      fansi_1.0.6        
## [16] pkgconfig_2.0.3     tinytable_0.16.0    checkmate_2.3.2    
## [19] data.table_1.17.8   RColorBrewer_1.1-3  S7_0.2.0           
## [22] stringmagic_1.2.0   lifecycle_1.0.5     compiler_4.4.1     
## [25] farver_2.1.2        codetools_0.2-20    htmltools_0.5.8.1  
## [28] sass_0.4.9          yaml_2.3.10         Formula_1.2-5      
## [31] pillar_1.9.0        crayon_1.5.3        jquerylib_0.1.4    
## [34] cachem_1.1.0        parallelly_1.41.0   nlme_3.1-164       
## [37] tidyselect_1.2.1    digest_0.6.37       performance_0.16.0 
## [40] future_1.34.0       mvtnorm_1.3-2       stringi_1.8.4      
## [43] listenv_0.9.1       labeling_0.4.3      fastmap_1.2.0      
## [46] grid_4.4.1          cli_3.6.3           magrittr_2.0.3     
## [49] utf8_1.2.4          future.apply_1.11.3 withr_3.0.2        
## [52] dreamerr_1.5.0      backports_1.5.0     bit64_4.5.2        
## [55] timechange_0.3.0    estimability_1.5.1  rmarkdown_2.29     
## [58] globals_0.16.3      emmeans_1.10.6      bit_4.5.0.1        
## [61] zoo_1.8-12          hms_1.1.3           evaluate_1.0.5     
## [64] knitr_1.49          parameters_0.29.0   rlang_1.1.4        
## [67] Rcpp_1.1.1          xtable_1.8-4        glue_1.8.0         
## [70] rstudioapi_0.17.1   vroom_1.6.5         jsonlite_1.8.9     
## [73] R6_2.6.1            tables_0.9.33