# Data description
# Read the salaries CSV (path is relative to the working directory) into a
# data frame, then show a per-column summary of it.
ds_salaries <- utils::read.csv(file = "ds_salaries.csv")
summary(object = ds_salaries)
## X work_year experience_level employment_type
## Min. : 0.0 Min. :2020 Length:607 Length:607
## 1st Qu.:151.5 1st Qu.:2021 Class :character Class :character
## Median :303.0 Median :2022 Mode :character Mode :character
## Mean :303.0 Mean :2021
## 3rd Qu.:454.5 3rd Qu.:2022
## Max. :606.0 Max. :2022
## job_title salary salary_currency salary_in_usd
## Length:607 Min. : 4000 Length:607 Min. : 2859
## Class :character 1st Qu.: 70000 Class :character 1st Qu.: 62726
## Mode :character Median : 115000 Mode :character Median :101570
## Mean : 324000 Mean :112298
## 3rd Qu.: 165000 3rd Qu.:150000
## Max. :30400000 Max. :600000
## employee_residence remote_ratio company_location company_size
## Length:607 Min. : 0.00 Length:607 Length:607
## Class :character 1st Qu.: 50.00 Class :character Class :character
## Mode :character Median :100.00 Mode :character Mode :character
## Mean : 70.92
## 3rd Qu.:100.00
## Max. :100.00
## 'data.frame': 607 obs. of 12 variables:
## $ X : int 0 1 2 3 4 5 6 7 8 9 ...
## $ work_year : int 2020 2020 2020 2020 2020 2020 2020 2020 2020 2020 ...
## $ experience_level : chr "MI" "SE" "SE" "MI" ...
## $ employment_type : chr "FT" "FT" "FT" "FT" ...
## $ job_title : chr "Data Scientist" "Machine Learning Scientist" "Big Data Engineer" "Product Data Analyst" ...
## $ salary : int 70000 260000 85000 20000 150000 72000 190000 11000000 135000 125000 ...
## $ salary_currency : chr "EUR" "USD" "GBP" "USD" ...
## $ salary_in_usd : int 79833 260000 109024 20000 150000 72000 190000 35735 135000 125000 ...
## $ employee_residence: chr "DE" "JP" "GB" "HN" ...
## $ remote_ratio : int 0 0 50 0 50 100 100 50 100 50 ...
## $ company_location : chr "DE" "JP" "GB" "HN" ...
## $ company_size : chr "L" "S" "M" "S" ...
# Exploratory data analysis
# Horizontal bar chart of the mean USD salary per job title, restricted to
# work years 2020-2021 and employment types "FL" / "FT". Values are plotted in
# units of 10,000 and bars are ordered by mean salary.
ds_salaries %>%
  filter(
    work_year %in% c(2020, 2021),
    employment_type %in% c("FL", "FT")
  ) %>%
  group_by(job_title) %>%
  summarise(mean_usd = mean(salary_in_usd)) %>%
  ggplot(aes(x = reorder(job_title, -mean_usd), y = mean_usd / 10000)) +
  geom_col(fill = "#005f73") +
  # Rounded value printed just past the end of each bar.
  geom_text(aes(label = round(mean_usd / 10000)), hjust = -0.1) +
  coord_flip() +
  labs(
    title = "Mean salary by job title",
    x = "Job title",
    y = "Mean salary(in 10K)",
    caption = "Source: Kaggle"
  ) +
  # Theme calls stay in this order: the complete theme first, then the
  # tweaks that must not be overwritten by it.
  theme_light() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  removeGrid()

# Horizontal bar chart counting rows per company location, limited to
# experience level "MI" with a remote ratio of 100.
ds_salaries %>%
  filter(experience_level == "MI", remote_ratio == 100) %>%
  group_by(company_location) %>%
  summarise(n_employees = n()) %>%
  ggplot(
    aes(x = n_employees, y = reorder(company_location, -n_employees))
  ) +
  geom_col(fill = "#0a9396") +
  # Count printed just past the end of each bar.
  geom_text(aes(label = n_employees), hjust = -0.1) +
  labs(
    title = "Number of employees in each company location",
    subtitle = "Employees with experience level MI & remote ratio 100%",
    caption = "Source: Kaggle",
    x = "Count",
    y = "Company location"
  ) +
  theme_light() +
  removeGrid()

# Row counts per company location for remote-ratio-100 records, drawn as one
# facet per experience level (each facet has its own free axes).
ds_salaries %>%
  filter(remote_ratio == 100) %>%
  ggplot(aes(x = company_location)) +
  geom_bar(
    aes(fill = experience_level),
    position = "dodge",
    width = 0.9
  ) +
  # Print each bar's count above it, reusing the count statistic.
  stat_count(
    aes(y = after_stat(count), label = after_stat(count)),
    geom = "text",
    vjust = -0.5
  ) +
  scale_fill_manual(
    values = c("#005f73", "#0a9396", "#94d2bd", "#e9d8a6")
  ) +
  facet_wrap(~experience_level, scales = "free") +
  labs(
    title = "Number of employees in each company location by experience level",
    subtitle = "Employees with remote ratio 100%",
    caption = "Source: Kaggle",
    x = "Company location",
    y = "Count"
  ) +
  theme_light() +
  removeGrid() +
  # The facet strips already name the experience level, so hide the legend.
  theme(legend.position = "none")

# Add a salary column expressed in thousands of USD; it is stored on
# ds_salaries so later code can use it too.
ds_salaries[["salaryKUSD"]] <- ds_salaries[["salary_in_usd"]] / 1000

# Line chart of the yearly mean salary (KUSD) for rows whose job title is
# exactly "Data Scientist".
ds_salaries %>%
  filter(job_title == "Data Scientist") %>%
  group_by(work_year) %>%
  summarise(mean_kusd = mean(salaryKUSD)) %>%
  # Turn the year into a Date (1 January) so scale_x_date() can be used.
  mutate(
    work_year = as.Date(paste0(work_year, "-01-01"), format = "%Y-%m-%d")
  ) %>%
  ggplot(aes(x = work_year, y = mean_kusd)) +
  geom_line(linetype = "dashed") +
  geom_point() +
  geom_text(aes(label = round(mean_kusd)), vjust = -0.3) +
  scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
  labs(
    title = "Mean Salary by year",
    subtitle = "Employees with job title Data Scientist",
    caption = "Source: Kaggle",
    x = "Work Year",
    y = "Mean Salary in (KUSD)"
  ) +
  theme_light()

# Salary in thousands of USD. Recomputing it is idempotent and keeps this
# section runnable on its own.
ds_salaries$salaryKUSD <- ds_salaries$salary_in_usd / 1000

# Yearly mean salary (KUSD) per job category, one facet per category.
# Categories are derived from substrings of job_title; the first matching
# rule wins, and titles matching none of them fall into "Other".
ds_salaries %>%
  mutate(
    # Turn the year into a Date (1 January) so scale_x_date() can be used.
    work_year = as.Date(paste0(work_year, "-01-01"), format = "%Y-%m-%d"),
    # case_when() replaces a three-deep nested ifelse(); the column is named
    # job_category rather than `cat` to avoid shadowing base::cat.
    job_category = case_when(
      grepl("Machine Learning", job_title) ~ "Machine Learning",
      grepl("Data Scien", job_title) ~ "Data Science",
      grepl("Data Analy", job_title) ~ "Data Analytics",
      TRUE ~ "Other"
    )
  ) %>%
  group_by(work_year, job_category) %>%
  # Drop the grouping explicitly: avoids the regrouping message and does not
  # hand a still-grouped table to ggplot().
  summarise(meanSalary = mean(salaryKUSD), .groups = "drop") %>%
  ggplot(aes(x = work_year, y = meanSalary, color = job_category)) +
  scale_color_manual(
    values = c("#005f73", "#0a9396", "#94d2bd", "#e9d8a6")
  ) +
  geom_line(linetype = "dashed") +
  geom_point() +
  scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
  labs(
    title = "Mean salary by year for each job category",
    caption = "Source: Kaggle"
  ) +
  ylab("Mean Salary in (KUSD)") +
  xlab("Work Year") +
  geom_text(aes(label = round(meanSalary)), vjust = -0.1) +
  theme_light() +
  # The facet strips already name the category, so the legend is hidden.
  theme(legend.position = "none") +
  facet_wrap(~job_category)

# Horizontal bar chart of the 2022 mean USD salary per employee residence,
# plotted in units of 10,000 and ordered by mean salary.
ds_salaries %>%
  filter(work_year == 2022) %>%
  group_by(employee_residence) %>%
  # Only the mean is plotted; the unused row-count column was removed.
  summarise(mean_salary = mean(salary_in_usd)) %>%
  ggplot(
    aes(
      x = reorder(employee_residence, -mean_salary),
      y = mean_salary / 10000
    )
  ) +
  geom_col(colour = "white", fill = "#94d2bd") +
  labs(
    title = "Mean salary by employee residence",
    subtitle = "Year: 2022",
    # Capitalised to match the caption used by the other plots.
    caption = "Source: Kaggle"
  ) +
  xlab("Employee residence") +
  ylab("Mean salary (in 10k USD)") +
  # Rounded value printed just past the end of each bar.
  geom_text(
    aes(label = round(mean_salary / 10000)),
    vjust = 0.5,
    hjust = -0.3
  ) +
  coord_flip() +
  # removeGrid() must come after theme_light(): a complete theme resets the
  # grid settings, so an earlier removeGrid() call would have no effect.
  theme_light() +
  removeGrid()

# Lollipop chart of the mean salary (KUSD) per remote ratio for rows whose
# job title is exactly "Data Scientist"; each point is annotated with the
# number of rows behind it. Uses the salaryKUSD column of ds_salaries.
ds_salaries %>%
  filter(job_title == "Data Scientist") %>%
  group_by(remote_ratio) %>%
  summarise(n_people = n(), mean_kusd = mean(salaryKUSD)) %>%
  ggplot(aes(x = remote_ratio, y = mean_kusd)) +
  # Stem from the baseline (y = 0) up to the mean value.
  geom_segment(aes(xend = remote_ratio, yend = 0)) +
  geom_point(size = 4, color = "#e9d8a6") +
  geom_text(aes(label = paste("Count\n", n_people))) +
  labs(
    title = "Number of employees and their mean salary by remote work ratio",
    subtitle = "Job title: Data Scientist",
    caption = "Source: Kaggle",
    x = "Remote ratio",
    y = "Mean salary (KUSD)"
  ) +
  theme_light() +
  removeGrid()
