This R notebook explores the Data Scientist salary dataset downloaded from Kaggle.com
https://www.kaggle.com/datasets/henryshan/2023-data-scientists-salary
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(tmap)
library(tmaptools)
library(sf)
## Linking to GEOS 3.13.1, GDAL 3.11.4, PROJ 9.7.0; sf_use_s2() is TRUE
library(geojsonsf)
library(rjson)
library(sp)
library(reshape2)
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
options(scipen = 999)
sal.data <- read.csv(file = "ds_salaries.csv", header = TRUE, sep = ",")
head(sal.data, 10)
## work_year experience_level employment_type job_title salary
## 1 2023 SE FT Principal Data Scientist 80000
## 2 2023 MI CT ML Engineer 30000
## 3 2023 MI CT ML Engineer 25500
## 4 2023 SE FT Data Scientist 175000
## 5 2023 SE FT Data Scientist 120000
## 6 2023 SE FT Applied Scientist 222200
## 7 2023 SE FT Applied Scientist 136000
## 8 2023 SE FT Data Scientist 219000
## 9 2023 SE FT Data Scientist 141000
## 10 2023 SE FT Data Scientist 147100
## salary_currency salary_in_usd employee_residence remote_ratio
## 1 EUR 85847 ES 100
## 2 USD 30000 US 100
## 3 USD 25500 US 100
## 4 USD 175000 CA 100
## 5 USD 120000 CA 100
## 6 USD 222200 US 0
## 7 USD 136000 US 0
## 8 USD 219000 CA 0
## 9 USD 141000 CA 0
## 10 USD 147100 US 0
## company_location company_size
## 1 ES L
## 2 US S
## 3 US S
## 4 CA M
## 5 CA M
## 6 US L
## 7 US L
## 8 CA M
## 9 CA M
## 10 US M
countries <- fromJSON(file = "world-ash-ms.geojson")
countries <- geojson_sf(toJSON(countries))
countries <- subset(countries, st_is_valid(geometry) & name != "Antarctica")
head(countries[, c("admin", "iso_a2", "geometry")])
## Simple feature collection with 6 features and 2 fields
## Geometry type: GEOMETRY
## Dimension: XY
## Bounding box: xmin: -87.67017 ymin: 8.070654 xmax: -63.00942 ymax: 20.09365
## Geodetic CRS: WGS 84
## admin iso_a2 geometry
## 1 Costa Rica CR POLYGON ((-82.56357 9.57666...
## 2 Nicaragua NI POLYGON ((-83.15752 14.9930...
## 3 Saint Martin MF POLYGON ((-63.01118 18.0689...
## 4 Sint Maarten SX POLYGON ((-63.12305 18.0689...
## 5 Haiti HT MULTIPOLYGON (((-71.77925 1...
## 6 Dominican Republic DO POLYGON ((-71.76831 18.0391...
hist(
sal.data$work_year,
main = "Histogram of Work Years",
xlab = "Years",
col = "royalblue"
)
grid(nx = NULL, ny = NULL)
hist(
sal.data$salary_in_usd,
main = "Histogram of Salaries", breaks = 30,
col = "royalblue", xlab = "USD"
)
grid(nx = NULL, ny = NULL)
plot(
salary_in_usd ~ as.factor(work_year),
data = sal.data,
col = "green",
main = "Distribution of Salaries by Year",
xlab = "Year",
ylab = "Salary (USD)"
)
grid(nx = NULL, ny = NULL)
plot(
salary_in_usd ~ as.factor(company_location),
data = sal.data,
col = "green",
main = "Distribution of Salaries by Country",
xlab = "Year",
ylab = "Salary (USD)"
)
grid(nx = NULL, ny = NULL)
chart.data <- sal.data %>%
group_by(employment_type) %>%
summarise(counts = n()) %>%
arrange(employment_type)
barplot(
chart.data$counts,
col = "royalblue",
xlab = "Employment Type",
ylab = "Counts",
names.arg = chart.data$employment_type,
main = "Employment Type Counts"
)
grid(nx = NULL, ny = NULL)
chart.data
## # A tibble: 4 × 2
## employment_type counts
## <chr> <int>
## 1 CT 10
## 2 FL 10
## 3 FT 3718
## 4 PT 17
chart.data <- sal.data %>%
group_by(company_location) %>%
summarise(
salary_in_usd = mean(salary_in_usd),
.groups = "keep"
) %>%
arrange(desc(salary_in_usd)) %>%
head(20)
ggplot(chart.data, aes(x = salary_in_usd, y = company_location)) +
geom_col(fill = "royalblue") +
ggtitle("Top 20 Average Salary (USD) per Country") +
labs(x = "USD", y = "") +
theme_bw() +
theme(
plot.title = element_text(hjust = 0.5, size = 14),
legend.text = element_text(size = 10),
legend.title = element_text(size = 10),
axis.text = element_text(size = 9),
axis.title = element_text(size = 10)
)
chart.data
## # A tibble: 20 × 2
## # Groups: company_location [20]
## company_location salary_in_usd
## <chr> <dbl>
## 1 IL 271446.
## 2 PR 167500
## 3 US 151822.
## 4 RU 140333.
## 5 CA 131918.
## 6 NZ 125000
## 7 BA 120000
## 8 IE 114943.
## 9 JP 114127.
## 10 SE 105000
## 11 AE 100000
## 12 CN 100000
## 13 DZ 100000
## 14 IQ 100000
## 15 IR 100000
## 16 MX 97151.
## 17 LT 94812
## 18 DE 88289.
## 19 GB 86890.
## 20 CH 81722
sum.data <- sal.data %>%
group_by(company_location) %>%
summarise(
salary_in_usd = sum(salary_in_usd),
.groups = "keep"
) %>%
arrange(desc(salary_in_usd)) %>%
head(20)
chart.data <- subset(sal.data,
company_location %in% unique(sum.data$company_location))
ggplot(chart.data) +
geom_point(
aes(x = salary_in_usd, y = company_location),
col = "royalblue"
) +
ggtitle(
"Salaries (USD) per Country",
"Top 20 Countries with Highest Salary"
) +
labs(
x = "USD",
y = ""
) +
theme_bw() +
theme(
plot.title = element_text(hjust = 0.5, size = 14),
plot.subtitle = element_text(hjust = 0.5),
legend.text = element_text(size = 10),
legend.title = element_text(size = 10),
axis.text = element_text(size = 9),
axis.title = element_text(size = 10)
)
chart.data <- sal.data %>%
group_by(job_title) %>%
summarise(
avg.salary = mean(salary_in_usd),
.groups = "keep"
) %>%
arrange(desc(avg.salary)) %>%
head(20)
ggplot(chart.data, aes(x = avg.salary, y = job_title)) +
geom_col(fill = "royalblue") +
ggtitle(
"Top 20 Job Titles",
"with Highest Average Salary (USD)"
) +
labs(
x = "USD",
y = ""
) +
theme_bw() +
theme(
plot.title = element_text(hjust = 0.5, size = 14),
plot.subtitle = element_text(hjust = 0.5),
legend.text = element_text(size = 10),
legend.title = element_text(size = 10),
axis.text = element_text(size = 9),
axis.title = element_text(size = 10)
)
sum.data <- sal.data %>%
group_by(job_title) %>%
summarise(
avg.salary = mean(salary_in_usd),
.groups = "keep"
) %>%
arrange(desc(avg.salary)) %>%
head(20)
chart.data <- subset(sal.data,
job_title %in% unique(sum.data$job_title)
)
plot(
salary_in_usd ~ as.factor(job_title),
data = chart.data,
col = "royalblue",
main = paste("Distribution of Salaries",
"for Top 20 Job Titles with Highest Average Salary",
sep = "\n"),
xlab = "",
cex.axis = 0.8,
ylab = "Salary (USD)"
)
grid(nx = NULL, ny = NULL)
sum.data %>% arrange(job_title)
## # A tibble: 20 × 2
## # Groups: job_title [20]
## job_title avg.salary
## <chr> <dbl>
## 1 Applied Scientist 190264.
## 2 Business Intelligence Engineer 174150
## 3 Cloud Data Architect 250000
## 4 Data Analytics Lead 211254.
## 5 Data Architect 161714.
## 6 Data Infrastructure Engineer 175052.
## 7 Data Lead 212500
## 8 Data Science Manager 191279.
## 9 Data Science Tech Lead 375000
## 10 Director of Data Science 195141.
## 11 Head of Data 183858.
## 12 Head of Data Science 160592.
## 13 ML Engineer 158352.
## 14 Machine Learning Scientist 163220.
## 15 Machine Learning Software Engineer 192420
## 16 Principal Data Engineer 192500
## 17 Principal Data Scientist 198171.
## 18 Principal Machine Learning Engineer 190000
## 19 Research Engineer 163108.
## 20 Research Scientist 161214.
colors <- c("red", "orange", "coral", "purple", "navy",
"royalblue", "brown", "yellow", "green", "darkgreen")
chart.data <- sum.data %>%
arrange(desc(avg.salary)) %>%
.[1:10, ]
pie(
chart.data$avg.salary, labels = chart.data$job_title,
radius = 1,
cex = 0.8,
main = "Distribution of Average Salary\nfor Top 10 Job Titles",
col = colors
)
sum.data <- sal.data %>%
group_by(company_size) %>%
summarise(
Min = min(salary_in_usd),
Avg = mean(salary_in_usd),
Median = median(salary_in_usd),
Max = max(salary_in_usd),
.groups = "keep"
) %>%
mutate(
company_size = ifelse(company_size == "L", "Large",
ifelse(company_size == "M", "Medium",
"Small")
)
)
sum.data
## # A tibble: 3 × 5
## # Groups: company_size [3]
## company_size Min Avg Median Max
## <chr> <int> <dbl> <dbl> <int>
## 1 Large 5409 118301. 108500 423834
## 2 Medium 5132 143131. 140000 450000
## 3 Small 5679 78227. 62146 416000
# Melt the data into variable columns
chart.data <-
melt(
sum.data[, c("company_size", "Min", "Avg", "Median", "Max")],
id="company_size"
) %>%
arrange(company_size, variable)
chart.data
## company_size variable value
## 1 Large Min 5409.00
## 2 Large Avg 118300.98
## 3 Large Median 108500.00
## 4 Large Max 423834.00
## 5 Medium Min 5132.00
## 6 Medium Avg 143130.55
## 7 Medium Median 140000.00
## 8 Medium Max 450000.00
## 9 Small Min 5679.00
## 10 Small Avg 78226.68
## 11 Small Median 62146.00
## 12 Small Max 416000.00
ggplot(chart.data,
aes(x = company_size, y = value, fill = variable)
) +
geom_bar(
stat = "identity",
width = 0.5,
position = "dodge"
) +
scale_fill_manual(
values = c("maroon", "royalblue", "coral", "green")
) +
ggtitle("Salary Statistics by Company Size") +
labs(x = "", y = "USD") +
theme_bw() +
theme(
plot.title = element_text(hjust = 0.5, size = 14),
plot.subtitle = element_text(hjust = 0.5),
legend.text = element_text(size = 10),
legend.title = element_text(size = 10),
axis.text = element_text(size = 9),
axis.title = element_text(size = 10)
)
chart.data <- sal.data[, c("company_size", "salary_in_usd")] %>%
mutate(
company_size = ifelse(company_size == "L", "Large",
ifelse(company_size == "M", "Medium",
"Small"
)
)
)
plot(
salary_in_usd ~ as.factor(company_size),
data = chart.data,
col = c("green", "orange", "royalblue"),
main = "Distribution of Salaries by Company Size",
xlab = "",
cex.axis = 0.8,
ylab = "Salary (USD)"
)
grid(nx = NULL, ny = NULL)
sum.data <- sal.data %>%
group_by(company_location) %>%
summarise(
salary_in_usd = sum(salary_in_usd),
.groups = "keep"
) %>%
arrange(desc(salary_in_usd)) %>%
head(25)
chart.data <- subset(sal.data,
company_location %in% unique(sum.data$company_location)
)
chart.data$company_size <- as.factor(
ifelse(chart.data$company_size == "L", "Large",
ifelse(chart.data$company_size == "M", "Medium", "Small"))
)
ggplot(chart.data, aes(x = salary_in_usd, y = company_location,
colour = company_size)) +
geom_point() +
scale_colour_manual(
values = c("green", "orange", "royalblue"),
aesthetics = "colour",
name = "Company Size"
) +
ggtitle(
"Salaries (USD) by Company Size per Country",
"Top 25 Countries with Highest Salary"
) +
labs(x = "USD", y = "") +
theme_bw() +
theme(
plot.title = element_text(hjust = 0.5, size = 14),
plot.subtitle = element_text(hjust = 0.5),
legend.text = element_text(size = 10),
legend.title = element_text(size = 10),
axis.text = element_text(size = 9),
axis.title = element_text(size = 10)
)
x <- as.data.frame(countries)[, c("iso_a2", "continent")]
y <- sal.data[, c("company_location", "salary_in_usd")]
chart.data <- merge(
x, y,
by.x = "iso_a2",
by.y = "company_location",
all.x = FALSE,
all.y = FALSE
)
plot(
salary_in_usd ~ as.factor(continent),
data = chart.data,
col = "royalblue",
xlab = "",
ylab = "Salary (USD)",
main = "Distribution of Salaries by Continent",
cex.axis = 0.8
)
grid(nx = NULL, ny = NULL)
map.data <- sal.data %>%
group_by(company_location) %>%
summarise(
salary = mean(salary_in_usd)
) %>%
merge(
countries[, c("iso_a2", "continent", "admin")], .,
by.x = "iso_a2",
by.y = "company_location",
all.x = FALSE,
all.y = FALSE
)
head(map.data, 5)
## Simple feature collection with 5 features and 4 fields
## Geometry type: GEOMETRY
## Dimension: XY
## Bounding box: xmin: -170.8205 ymin: -55.03213 xmax: 56.38799 ymax: 42.64795
## Geodetic CRS: WGS 84
## iso_a2 continent admin salary
## 1 AE Asia United Arab Emirates 100000
## 2 AL Europe Albania 10000
## 3 AM Asia Armenia 50000
## 4 AR South America Argentina 25000
## 5 AS Oceania American Samoa 29351
## geometry
## 1 MULTIPOLYGON (((56.29785 25...
## 2 POLYGON ((19.34238 41.86909...
## 3 MULTIPOLYGON (((44.76826 39...
## 4 MULTIPOLYGON (((-57.60889 -...
## 5 POLYGON ((-170.7263 -14.351...
tm_shape(countries) +
tm_polygons(
fill = "darkgray"
) +
tm_shape(map.data) +
tm_polygons(
fill = "salary",
fill.legend = tm_legend(
"USD",
position = tm_pos_in("left", "bottom")
),
fill.scale = tm_scale_intervals(
n = 5,
style = "quantile",
values = "brewer.spectral"
)
) +
tm_title(
"Average Salary (USD) by Country"
)
## [tip] Consider a suitable map projection, e.g. by adding `+ tm_crs("auto")`.
## This message is displayed once per session.
as.data.frame(map.data)[, c("admin", "salary")] %>%
arrange(desc(salary)) %>%
head(8)
## admin salary
## 1 Israel 271446.5
## 2 Puerto Rico 167500.0
## 3 United States of America 151822.0
## 4 Russia 140333.3
## 5 Canada 131917.7
## 6 New Zealand 125000.0
## 7 Bosnia and Herzegovina 120000.0
## 8 Ireland 114943.4
map.data <- sal.data %>%
group_by(company_location) %>%
summarise(counts = n(), avg.salary = mean(salary_in_usd)) %>%
merge(countries[, c("iso_a2", "continent", "admin")], .,
by.x = "iso_a2",
by.y = "company_location",
all.x = FALSE,
all.y = FALSE)
tm_shape(countries) +
tm_polygons(
fill = "darkgray"
) +
tm_shape(map.data) +
tm_polygons(
fill = "counts",
fill.legend = tm_legend(
"Count",
position = tm_pos_in("left", "bottom")
),
fill.scale = tm_scale_intervals(
n = 5,
style = "quantile",
values = "brewer.spectral"
)
) +
tm_title("Count of Data Scientists by Country")
as.data.frame(map.data)[, c("admin", "counts", "avg.salary")] %>%
arrange(desc(counts)) %>%
head(8)
## admin counts avg.salary
## 1 United States of America 3040 151822.01
## 2 United Kingdom 172 86890.05
## 3 Canada 87 131917.69
## 4 Spain 77 57676.06
## 5 India 58 30197.74
## 6 Germany 56 88288.80
## 7 Brazil 15 40579.20
## 8 Australia 14 80033.43
map.data <- sal.data %>%
group_by(company_location, company_size) %>%
summarise(
counts = n(),
avg.salary = mean(salary_in_usd),
.groups = "keep"
) %>%
arrange(desc(counts)) %>%
group_by(company_location) %>%
summarise(
company_size = first(company_size),
counts = first(counts),
.groups = "keep"
) %>%
mutate(
company_size = ifelse(company_size == "L", "Large",
ifelse(company_size == "M", "Medium",
"Small")
)
) %>%
merge(
countries[, c("iso_a2", "continent", "admin")], .,
by.x = "iso_a2",
by.y = "company_location",
all.x = FALSE,
all.y = FALSE
) %>%
arrange(desc(counts))
head(as.data.frame(map.data), 5)
## iso_a2 continent admin company_size counts
## 1 US North America United States of America Medium 2723
## 2 GB Europe United Kingdom Medium 144
## 3 ES Europe Spain Medium 69
## 4 CA North America Canada Medium 65
## 5 IN Asia India Large 38
## geometry
## 1 MULTIPOLYGON (((-132.7469 5...
## 2 MULTIPOLYGON (((-2.667676 5...
## 3 MULTIPOLYGON (((1.593945 38...
## 4 MULTIPOLYGON (((-132.6555 5...
## 5 MULTIPOLYGON (((68.16504 23...
# Map company size with highest count by country
tm_shape(countries) +
tm_polygons(
fill = "darkgray"
) +
tm_shape(map.data) +
# tm_fill(col = "company_size", title = "Size",
# palette = c("green", "orange", "royalblue")) +
tm_polygons(
fill = "company_size",
fill.legend = tm_legend(
"Size",
position = tm_pos_in("left", "bottom")
),
fill.scale = tm_scale_categorical(
values = c("green", "orange", "royalblue")
)
) +
tm_title(
text = "Company Size with most Data Scientists by Country",
position = tm_pos_out("center", "top")
)
## Multiple palettes called "green" found: "kovesi.green", "tableau.green". The first one, "kovesi.green", is returned.
map.data <- sal.data %>%
group_by(company_location, company_size) %>%
summarise(
counts = n(),
avg.salary = mean(salary_in_usd),
.groups = "keep"
) %>%
arrange(desc(avg.salary)) %>%
group_by(company_location) %>%
summarise(
company_size = first(company_size),
avg.salary = first(avg.salary),
.groups = "keep"
) %>%
mutate(
company_size = ifelse(company_size == "L", "Large",
ifelse(company_size == "M", "Medium",
"Small")
)
) %>%
merge(
countries[, c("iso_a2", "continent", "admin")], .,
by.x = "iso_a2",
by.y = "company_location",
all.x = FALSE,
all.y = FALSE
) %>%
arrange(desc(avg.salary))
tm_shape(countries) +
tm_polygons(
fill = "darkgray"
) +
tm_shape(map.data) +
tm_polygons(
fill = "company_size",
fill.legend = tm_legend(
"Size",
position = tm_pos_in("left", "bottom")
),
fill.scale = tm_scale_categorical(
values = c("green", "orange", "royalblue")
)
) +
tm_title(
text = "Company Size with Higest Average Salary by Country",
position = tm_pos_out("center", "top")
)
## Multiple palettes called "green" found: "kovesi.green", "tableau.green". The first one, "kovesi.green", is returned.