Data Scientist Salaries Exploration

This R notebook explores the Data Scientist salary dataset downloaded from Kaggle.com

https://www.kaggle.com/datasets/henryshan/2023-data-scientists-salary

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggplot2)
library(tmap)
library(tmaptools)
library(sf)

## Linking to GEOS 3.13.1, GDAL 3.11.4, PROJ 9.7.0; sf_use_s2() is TRUE

library(geojsonsf)
library(rjson)
library(sp)
library(reshape2)

## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths

options(scipen = 999)

Read the Data

sal.data <- read.csv(file = "ds_salaries.csv", header = TRUE, sep = ",")
head(sal.data, 10)

##    work_year experience_level employment_type                job_title salary
## 1       2023               SE              FT Principal Data Scientist  80000
## 2       2023               MI              CT              ML Engineer  30000
## 3       2023               MI              CT              ML Engineer  25500
## 4       2023               SE              FT           Data Scientist 175000
## 5       2023               SE              FT           Data Scientist 120000
## 6       2023               SE              FT        Applied Scientist 222200
## 7       2023               SE              FT        Applied Scientist 136000
## 8       2023               SE              FT           Data Scientist 219000
## 9       2023               SE              FT           Data Scientist 141000
## 10      2023               SE              FT           Data Scientist 147100
##    salary_currency salary_in_usd employee_residence remote_ratio
## 1              EUR         85847                 ES          100
## 2              USD         30000                 US          100
## 3              USD         25500                 US          100
## 4              USD        175000                 CA          100
## 5              USD        120000                 CA          100
## 6              USD        222200                 US            0
## 7              USD        136000                 US            0
## 8              USD        219000                 CA            0
## 9              USD        141000                 CA            0
## 10             USD        147100                 US            0
##    company_location company_size
## 1                ES            L
## 2                US            S
## 3                US            S
## 4                CA            M
## 5                CA            M
## 6                US            L
## 7                US            L
## 8                CA            M
## 9                CA            M
## 10               US            M

countries <- fromJSON(file = "world-ash-ms.geojson")

countries <- geojson_sf(toJSON(countries))

countries <- subset(countries, st_is_valid(geometry) & name != "Antarctica")
head(countries[, c("admin", "iso_a2", "geometry")])

## Simple feature collection with 6 features and 2 fields
## Geometry type: GEOMETRY
## Dimension:     XY
## Bounding box:  xmin: -87.67017 ymin: 8.070654 xmax: -63.00942 ymax: 20.09365
## Geodetic CRS:  WGS 84
##                admin iso_a2                       geometry
## 1         Costa Rica     CR POLYGON ((-82.56357 9.57666...
## 2          Nicaragua     NI POLYGON ((-83.15752 14.9930...
## 3       Saint Martin     MF POLYGON ((-63.01118 18.0689...
## 4       Sint Maarten     SX POLYGON ((-63.12305 18.0689...
## 5              Haiti     HT MULTIPOLYGON (((-71.77925 1...
## 6 Dominican Republic     DO POLYGON ((-71.76831 18.0391...

Exploratory Data Analysis

hist(
    sal.data$work_year, 
    main = "Histogram of Work Years", 
    xlab = "Years",
    col = "royalblue"
)
grid(nx = NULL, ny = NULL)

hist(
    sal.data$salary_in_usd,
    main = "Histogram of Salaries", breaks = 30,
    col = "royalblue", xlab = "USD"
)
grid(nx = NULL, ny = NULL)

plot(
    salary_in_usd ~ as.factor(work_year),
    data = sal.data,
    col = "green",
    main = "Distribution of Salaries by Year",
    xlab = "Year",
    ylab = "Salary (USD)"
)
grid(nx = NULL, ny = NULL)

plot(
    salary_in_usd ~ as.factor(company_location),
    data = sal.data,
    col = "green",
    main = "Distribution of Salaries by Country",
    xlab = "Year",
    ylab = "Salary (USD)"
)
grid(nx = NULL, ny = NULL)

chart.data <- sal.data %>%
    group_by(employment_type) %>%
    summarise(counts = n()) %>%
    arrange(employment_type)

barplot(
    chart.data$counts, 
    col = "royalblue", 
    xlab = "Employment Type",
    ylab = "Counts", 
    names.arg = chart.data$employment_type,
    main = "Employment Type Counts"
)
grid(nx = NULL, ny = NULL)

chart.data

## # A tibble: 4 × 2
##   employment_type counts
##   <chr>            <int>
## 1 CT                  10
## 2 FL                  10
## 3 FT                3718
## 4 PT                  17

chart.data <- sal.data %>%
    group_by(company_location) %>%
    summarise(
        salary_in_usd = mean(salary_in_usd), 
        .groups = "keep"
    ) %>%
    arrange(desc(salary_in_usd)) %>%
    head(20)

ggplot(chart.data, aes(x = salary_in_usd, y = company_location)) +
    geom_col(fill = "royalblue") +
    ggtitle("Top 20 Average Salary (USD) per Country") +
    labs(x = "USD", y = "") +
    theme_bw() +
    theme(
        plot.title = element_text(hjust = 0.5, size = 14),
        legend.text = element_text(size = 10),
        legend.title = element_text(size = 10),
        axis.text = element_text(size = 9),
        axis.title = element_text(size = 10)
    )

chart.data

## # A tibble: 20 × 2
## # Groups:   company_location [20]
##    company_location salary_in_usd
##    <chr>                    <dbl>
##  1 IL                     271446.
##  2 PR                     167500 
##  3 US                     151822.
##  4 RU                     140333.
##  5 CA                     131918.
##  6 NZ                     125000 
##  7 BA                     120000 
##  8 IE                     114943.
##  9 JP                     114127.
## 10 SE                     105000 
## 11 AE                     100000 
## 12 CN                     100000 
## 13 DZ                     100000 
## 14 IQ                     100000 
## 15 IR                     100000 
## 16 MX                      97151.
## 17 LT                      94812 
## 18 DE                      88289.
## 19 GB                      86890.
## 20 CH                      81722

sum.data <- sal.data %>%
    group_by(company_location) %>%
    summarise(
        salary_in_usd = sum(salary_in_usd), 
        .groups = "keep"
    ) %>%
    arrange(desc(salary_in_usd)) %>%
    head(20)

chart.data <- subset(sal.data,
                     company_location %in% unique(sum.data$company_location))

ggplot(chart.data) +
    geom_point(
        aes(x = salary_in_usd, y = company_location), 
        col = "royalblue"
    ) +
    ggtitle(
        "Salaries (USD) per Country", 
        "Top 20 Countries with Highest Salary"
    ) +
    labs(
        x = "USD", 
        y = ""
    ) +
    theme_bw() +
    theme(
        plot.title = element_text(hjust = 0.5, size = 14),
        plot.subtitle = element_text(hjust = 0.5),
        legend.text = element_text(size = 10),
        legend.title = element_text(size = 10),
        axis.text = element_text(size = 9),
        axis.title = element_text(size = 10)
    )

chart.data <- sal.data %>%
    group_by(job_title) %>%
    summarise(
        avg.salary = mean(salary_in_usd), 
        .groups = "keep"
    ) %>%
    arrange(desc(avg.salary)) %>%
    head(20)

ggplot(chart.data, aes(x = avg.salary, y = job_title)) +
    geom_col(fill = "royalblue") +
    ggtitle(
        "Top 20 Job Titles", 
        "with Highest Average Salary (USD)"
    ) +
    labs(
        x = "USD", 
        y = ""
    ) +
    theme_bw() +
    theme(
        plot.title = element_text(hjust = 0.5, size = 14),
        plot.subtitle = element_text(hjust = 0.5),
        legend.text = element_text(size = 10),
        legend.title = element_text(size = 10),
        axis.text = element_text(size = 9),
        axis.title = element_text(size = 10)
    )

sum.data <- sal.data %>%
    group_by(job_title) %>%
    summarise(
        avg.salary = mean(salary_in_usd), 
        .groups = "keep"
    ) %>%
    arrange(desc(avg.salary)) %>%
    head(20)

chart.data <- subset(sal.data,
                     job_title %in% unique(sum.data$job_title)
            )

plot(
    salary_in_usd ~ as.factor(job_title),
    data = chart.data,
    col = "royalblue",
    main = paste("Distribution of Salaries",
                  "for Top 20 Job Titles with Highest Average Salary",
                  sep = "\n"),
    xlab = "",
    cex.axis = 0.8,
    ylab = "Salary (USD)"
)
grid(nx = NULL, ny = NULL)

sum.data %>% arrange(job_title)

## # A tibble: 20 × 2
## # Groups:   job_title [20]
##    job_title                           avg.salary
##    <chr>                                    <dbl>
##  1 Applied Scientist                      190264.
##  2 Business Intelligence Engineer         174150 
##  3 Cloud Data Architect                   250000 
##  4 Data Analytics Lead                    211254.
##  5 Data Architect                         161714.
##  6 Data Infrastructure Engineer           175052.
##  7 Data Lead                              212500 
##  8 Data Science Manager                   191279.
##  9 Data Science Tech Lead                 375000 
## 10 Director of Data Science               195141.
## 11 Head of Data                           183858.
## 12 Head of Data Science                   160592.
## 13 ML Engineer                            158352.
## 14 Machine Learning Scientist             163220.
## 15 Machine Learning Software Engineer     192420 
## 16 Principal Data Engineer                192500 
## 17 Principal Data Scientist               198171.
## 18 Principal Machine Learning Engineer    190000 
## 19 Research Engineer                      163108.
## 20 Research Scientist                     161214.

colors <- c("red", "orange", "coral", "purple", "navy", 
            "royalblue", "brown", "yellow", "green", "darkgreen")

chart.data <- sum.data %>% 
    arrange(desc(avg.salary)) %>% 
    .[1:10, ]

pie(
    chart.data$avg.salary, labels = chart.data$job_title, 
    radius = 1, 
    cex = 0.8,
    main = "Distribution of Average Salary\nfor Top 10 Job Titles",
    col = colors
)

sum.data <- sal.data %>%
    group_by(company_size) %>%
    summarise(
        Min = min(salary_in_usd),
        Avg = mean(salary_in_usd),
        Median = median(salary_in_usd),
        Max = max(salary_in_usd),
        .groups = "keep"
    ) %>%
    mutate(
        company_size = ifelse(company_size == "L", "Large",
                            ifelse(company_size == "M", "Medium",
                                 "Small")
                        )
    )
    
sum.data

## # A tibble: 3 × 5
## # Groups:   company_size [3]
##   company_size   Min     Avg Median    Max
##   <chr>        <int>   <dbl>  <dbl>  <int>
## 1 Large         5409 118301. 108500 423834
## 2 Medium        5132 143131. 140000 450000
## 3 Small         5679  78227.  62146 416000

# Melt the data into variable columns
chart.data <- 
    melt(
        sum.data[, c("company_size", "Min", "Avg", "Median", "Max")], 
        id="company_size"
    ) %>%
    arrange(company_size, variable)

chart.data

##    company_size variable     value
## 1         Large      Min   5409.00
## 2         Large      Avg 118300.98
## 3         Large   Median 108500.00
## 4         Large      Max 423834.00
## 5        Medium      Min   5132.00
## 6        Medium      Avg 143130.55
## 7        Medium   Median 140000.00
## 8        Medium      Max 450000.00
## 9         Small      Min   5679.00
## 10        Small      Avg  78226.68
## 11        Small   Median  62146.00
## 12        Small      Max 416000.00

ggplot(chart.data, 
       aes(x = company_size, y = value, fill = variable)
    ) +
    geom_bar(
        stat = "identity", 
        width = 0.5, 
        position = "dodge"
    ) +
    scale_fill_manual(
        values = c("maroon", "royalblue", "coral", "green")
    ) +
    ggtitle("Salary Statistics by Company Size") +
    labs(x = "", y = "USD") +
    theme_bw() +
    theme(
        plot.title = element_text(hjust = 0.5, size = 14),
        plot.subtitle = element_text(hjust = 0.5),
        legend.text = element_text(size = 10),
        legend.title = element_text(size = 10),
        axis.text = element_text(size = 9),
        axis.title = element_text(size = 10)
    )

chart.data <- sal.data[, c("company_size", "salary_in_usd")] %>%
    mutate(
        company_size = ifelse(company_size == "L", "Large",
                            ifelse(company_size == "M", "Medium",
                                 "Small"
                            )
                        )
    )

plot(
    salary_in_usd ~ as.factor(company_size),
    data = chart.data,
    col = c("green", "orange", "royalblue"),
    main = "Distribution of Salaries by Company Size",
    xlab = "",
    cex.axis = 0.8,
    ylab = "Salary (USD)"
)
grid(nx = NULL, ny = NULL)

sum.data <- sal.data %>%
    group_by(company_location) %>%
    summarise(
        salary_in_usd = sum(salary_in_usd), 
        .groups = "keep"
    ) %>%
    arrange(desc(salary_in_usd)) %>%
    head(25)

chart.data <- subset(sal.data,
                     company_location %in% unique(sum.data$company_location)
            )

chart.data$company_size <- as.factor(
    ifelse(chart.data$company_size == "L", "Large",
           ifelse(chart.data$company_size == "M", "Medium", "Small"))
)

ggplot(chart.data, aes(x = salary_in_usd, y = company_location,
                       colour = company_size)) +
    geom_point() +
    scale_colour_manual(
        values = c("green", "orange", "royalblue"),
        aesthetics = "colour",
        name = "Company Size"
    ) +
    ggtitle(
        "Salaries (USD) by Company Size per Country", 
        "Top 25 Countries with Highest Salary"
    ) +
    labs(x = "USD", y = "") +
    theme_bw() +
    theme(
        plot.title = element_text(hjust = 0.5, size = 14),
        plot.subtitle = element_text(hjust = 0.5),
        legend.text = element_text(size = 10),
        legend.title = element_text(size = 10),
        axis.text = element_text(size = 9),
        axis.title = element_text(size = 10)
    )

Spatial Data Analytics

x <- as.data.frame(countries)[, c("iso_a2", "continent")]
y <- sal.data[, c("company_location", "salary_in_usd")]

chart.data <- merge(
                x, y,
                by.x = "iso_a2",
                by.y = "company_location",
                all.x = FALSE,
                all.y = FALSE
            )

plot(
    salary_in_usd ~ as.factor(continent),
    data = chart.data,
    col = "royalblue",
    xlab = "",
    ylab = "Salary (USD)",
    main = "Distribution of Salaries by Continent",
    cex.axis = 0.8
)
grid(nx = NULL, ny = NULL)

map.data <- sal.data %>%
    group_by(company_location) %>%
    summarise(
        salary = mean(salary_in_usd)
    ) %>%
    merge(
        countries[, c("iso_a2", "continent", "admin")], .,
        by.x = "iso_a2",
        by.y = "company_location",
        all.x = FALSE,
        all.y = FALSE
    )

head(map.data, 5)

## Simple feature collection with 5 features and 4 fields
## Geometry type: GEOMETRY
## Dimension:     XY
## Bounding box:  xmin: -170.8205 ymin: -55.03213 xmax: 56.38799 ymax: 42.64795
## Geodetic CRS:  WGS 84
##   iso_a2     continent                admin salary
## 1     AE          Asia United Arab Emirates 100000
## 2     AL        Europe              Albania  10000
## 3     AM          Asia              Armenia  50000
## 4     AR South America            Argentina  25000
## 5     AS       Oceania       American Samoa  29351
##                         geometry
## 1 MULTIPOLYGON (((56.29785 25...
## 2 POLYGON ((19.34238 41.86909...
## 3 MULTIPOLYGON (((44.76826 39...
## 4 MULTIPOLYGON (((-57.60889 -...
## 5 POLYGON ((-170.7263 -14.351...

tm_shape(countries) +
    tm_polygons(
        fill = "darkgray"
    ) +
    tm_shape(map.data) +
    tm_polygons(
        fill = "salary",
        fill.legend = tm_legend(
            "USD",
            position = tm_pos_in("left", "bottom")
        ),
        fill.scale = tm_scale_intervals(
            n = 5,
            style = "quantile",
            values = "brewer.spectral"
        )
    ) +
    tm_title(
        "Average Salary (USD) by Country"
    )

## [tip] Consider a suitable map projection, e.g. by adding `+ tm_crs("auto")`.
## This message is displayed once per session.

as.data.frame(map.data)[, c("admin", "salary")] %>%
    arrange(desc(salary)) %>%
    head(8)

##                      admin   salary
## 1                   Israel 271446.5
## 2              Puerto Rico 167500.0
## 3 United States of America 151822.0
## 4                   Russia 140333.3
## 5                   Canada 131917.7
## 6              New Zealand 125000.0
## 7   Bosnia and Herzegovina 120000.0
## 8                  Ireland 114943.4

map.data <- sal.data %>%
    group_by(company_location) %>%
    summarise(counts = n(), avg.salary = mean(salary_in_usd)) %>%
    merge(countries[, c("iso_a2", "continent", "admin")], .,
                     by.x = "iso_a2",
                     by.y = "company_location",
                     all.x = FALSE,
                     all.y = FALSE)

tm_shape(countries) +
    tm_polygons(
        fill = "darkgray"
    ) +
    tm_shape(map.data) +
    tm_polygons(
        fill = "counts",
        fill.legend = tm_legend(
            "Count",
            position = tm_pos_in("left", "bottom")
        ),
        fill.scale = tm_scale_intervals(
            n = 5,
            style = "quantile",
            values = "brewer.spectral"
        )
    ) +
    tm_title("Count of Data Scientists by Country")

as.data.frame(map.data)[, c("admin", "counts", "avg.salary")] %>%
    arrange(desc(counts)) %>%
    head(8)

##                      admin counts avg.salary
## 1 United States of America   3040  151822.01
## 2           United Kingdom    172   86890.05
## 3                   Canada     87  131917.69
## 4                    Spain     77   57676.06
## 5                    India     58   30197.74
## 6                  Germany     56   88288.80
## 7                   Brazil     15   40579.20
## 8                Australia     14   80033.43

map.data <- sal.data %>%
    group_by(company_location, company_size) %>%
    summarise(
        counts = n(),
        avg.salary = mean(salary_in_usd),
        .groups = "keep"
    ) %>%
    arrange(desc(counts)) %>%
    group_by(company_location) %>%
    summarise(
        company_size = first(company_size), 
        counts = first(counts),
        .groups = "keep"
    ) %>%
    mutate(
        company_size = ifelse(company_size == "L", "Large",
                            ifelse(company_size == "M", "Medium",
                                 "Small")
                        )
        ) %>%
    merge(
        countries[, c("iso_a2", "continent", "admin")], .,
        by.x = "iso_a2",
        by.y = "company_location",
        all.x = FALSE,
        all.y = FALSE
    ) %>%
    arrange(desc(counts))

head(as.data.frame(map.data), 5)

##   iso_a2     continent                    admin company_size counts
## 1     US North America United States of America       Medium   2723
## 2     GB        Europe           United Kingdom       Medium    144
## 3     ES        Europe                    Spain       Medium     69
## 4     CA North America                   Canada       Medium     65
## 5     IN          Asia                    India        Large     38
##                         geometry
## 1 MULTIPOLYGON (((-132.7469 5...
## 2 MULTIPOLYGON (((-2.667676 5...
## 3 MULTIPOLYGON (((1.593945 38...
## 4 MULTIPOLYGON (((-132.6555 5...
## 5 MULTIPOLYGON (((68.16504 23...

# Map company size with highest count by country

tm_shape(countries) +
    tm_polygons(
        fill = "darkgray"
    ) +
    tm_shape(map.data) +
    # tm_fill(col = "company_size", title = "Size", 
    #         palette = c("green", "orange", "royalblue")) +
    tm_polygons(
        fill = "company_size",
        fill.legend = tm_legend(
            "Size",
            position = tm_pos_in("left", "bottom")
        ),
        fill.scale = tm_scale_categorical(
            values = c("green", "orange", "royalblue")
        )
    ) +
    tm_title(
        text = "Company Size with most Data Scientists by Country",
        position = tm_pos_out("center", "top")
    )

## Multiple palettes called "green" found: "kovesi.green", "tableau.green". The first one, "kovesi.green", is returned.

map.data <- sal.data %>%
    group_by(company_location, company_size) %>%
    summarise(
        counts = n(), 
        avg.salary = mean(salary_in_usd), 
        .groups = "keep"
    ) %>%
    arrange(desc(avg.salary)) %>%
    group_by(company_location) %>%
    summarise(
        company_size = first(company_size), 
        avg.salary = first(avg.salary),
        .groups = "keep"
    ) %>%
    mutate(
        company_size = ifelse(company_size == "L", "Large",
                            ifelse(company_size == "M", "Medium",
                                 "Small")
                        )
        ) %>%
    merge(
        countries[, c("iso_a2", "continent", "admin")], .,
        by.x = "iso_a2",
        by.y = "company_location",
        all.x = FALSE,
        all.y = FALSE
    ) %>%
    arrange(desc(avg.salary))

tm_shape(countries) +
    tm_polygons(
        fill = "darkgray"
    ) +
    tm_shape(map.data) +
    tm_polygons(
        fill = "company_size",
        fill.legend = tm_legend(
            "Size",
            position = tm_pos_in("left", "bottom")
        ),
        fill.scale = tm_scale_categorical(
            values = c("green", "orange", "royalblue")
        )
    ) +
    tm_title(
        text = "Company Size with Higest Average Salary by Country",
        position = tm_pos_out("center", "top")
    )

## Multiple palettes called "green" found: "kovesi.green", "tableau.green". The first one, "kovesi.green", is returned.