# libraries
library(tidyverse)
library(knitr)
library(ggExtra)

Data description

# Read the salary records from the CSV (relative path) into a data frame,
# then print per-column summary statistics.
ds_salaries <- read.csv(file = "ds_salaries.csv")
summary(object = ds_salaries)

##        X           work_year    experience_level   employment_type   
##  Min.   :  0.0   Min.   :2020   Length:607         Length:607        
##  1st Qu.:151.5   1st Qu.:2021   Class :character   Class :character  
##  Median :303.0   Median :2022   Mode  :character   Mode  :character  
##  Mean   :303.0   Mean   :2021                                        
##  3rd Qu.:454.5   3rd Qu.:2022                                        
##  Max.   :606.0   Max.   :2022                                        
##   job_title             salary         salary_currency    salary_in_usd   
##  Length:607         Min.   :    4000   Length:607         Min.   :  2859  
##  Class :character   1st Qu.:   70000   Class :character   1st Qu.: 62726  
##  Mode  :character   Median :  115000   Mode  :character   Median :101570  
##                     Mean   :  324000                      Mean   :112298  
##                     3rd Qu.:  165000                      3rd Qu.:150000  
##                     Max.   :30400000                      Max.   :600000  
##  employee_residence  remote_ratio    company_location   company_size      
##  Length:607         Min.   :  0.00   Length:607         Length:607        
##  Class :character   1st Qu.: 50.00   Class :character   Class :character  
##  Mode  :character   Median :100.00   Mode  :character   Mode  :character  
##                     Mean   : 70.92                                        
##                     3rd Qu.:100.00                                        
##                     Max.   :100.00

# Show the compact structure (column names, types, first values) of the data.
str(object = ds_salaries)

## 'data.frame':    607 obs. of  12 variables:
##  $ X                 : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ work_year         : int  2020 2020 2020 2020 2020 2020 2020 2020 2020 2020 ...
##  $ experience_level  : chr  "MI" "SE" "SE" "MI" ...
##  $ employment_type   : chr  "FT" "FT" "FT" "FT" ...
##  $ job_title         : chr  "Data Scientist" "Machine Learning Scientist" "Big Data Engineer" "Product Data Analyst" ...
##  $ salary            : int  70000 260000 85000 20000 150000 72000 190000 11000000 135000 125000 ...
##  $ salary_currency   : chr  "EUR" "USD" "GBP" "USD" ...
##  $ salary_in_usd     : int  79833 260000 109024 20000 150000 72000 190000 35735 135000 125000 ...
##  $ employee_residence: chr  "DE" "JP" "GB" "HN" ...
##  $ remote_ratio      : int  0 0 50 0 50 100 100 50 100 50 ...
##  $ company_location  : chr  "DE" "JP" "GB" "HN" ...
##  $ company_size      : chr  "L" "S" "M" "S" ...

Exploratory data analysis

# Mean USD salary per job title, restricted to work years 2020-2021 and
# employment types "FL" / "FT", drawn as horizontal bars in units of 10k USD.
# Each bar is annotated with its rounded value.
ds_salaries %>%
  filter(
    work_year %in% c(2020, 2021),
    employment_type %in% c("FL", "FT")
  ) %>%
  group_by(job_title) %>%
  summarise(meansalary = mean(salary_in_usd)) %>%
  ggplot(aes(x = reorder(job_title, -meansalary), y = meansalary / 10000)) +
  geom_col(fill = "#005f73") +
  geom_text(aes(label = round(meansalary / 10000)), hjust = -0.1) +
  # Flip so the job titles run down the vertical axis and stay readable.
  coord_flip() +
  labs(
    title = "Mean salary by job title",
    x = "Job title",
    y = "Mean salary(in 10K)",
    caption = "Source: Kaggle"
  ) +
  # No axis.text.x rotation here: after coord_flip() the horizontal axis
  # carries the numeric salary ticks, which read best unrotated.
  theme_light() +
  removeGrid()

# Count the rows with experience level "MI" and a remote ratio of 100 per
# company location, and show the counts as labelled horizontal bars.
ds_salaries %>%
  filter(
    experience_level == "MI",
    remote_ratio == 100
  ) %>%
  group_by(company_location) %>%
  summarise(n_employees = n()) %>%
  ggplot(
    aes(
      x = n_employees,
      y = reorder(company_location, -n_employees)
    )
  ) +
  geom_col(fill = "#0a9396") +
  geom_text(aes(label = n_employees), hjust = -0.1) +
  labs(
    title = "Number of employees in each company location",
    subtitle = "Employees with experience level MI & remote ratio 100%",
    caption = "Source: Kaggle",
    x = "Count",
    y = "Company location"
  ) +
  theme_light() +
  removeGrid()

# For rows with a remote ratio of 100: bar chart of row counts per company
# location, one facet per experience level (free scales), each bar labelled
# with its count. The legend is hidden because the facets already name the
# experience level.
ds_salaries %>%
  filter(remote_ratio == 100) %>%
  ggplot(aes(x = company_location)) +
  geom_bar(
    aes(fill = experience_level),
    position = "dodge",
    width = 0.9
  ) +
  # Text layer reuses the count statistic so labels sit on top of the bars.
  stat_count(
    aes(y = after_stat(count), label = after_stat(count)),
    geom = "text",
    vjust = -0.5
  ) +
  facet_wrap(~experience_level, scales = "free") +
  scale_fill_manual(
    values = c("#005f73", "#0a9396", "#94d2bd", "#e9d8a6")
  ) +
  labs(
    title = "Number of employees in each company location by experience level",
    subtitle = "Employees with remote ratio 100%",
    caption = "Source: Kaggle",
    x = "Company location",
    y = "Count"
  ) +
  theme_light() +
  removeGrid() +
  theme(legend.position = "none")

# Add a helper column holding the USD salary divided by 1000.
ds_salaries[["salaryKUSD"]] <- ds_salaries[["salary_in_usd"]] / 1000

# Yearly mean of that column for rows whose job title is exactly
# "Data Scientist", drawn as a dashed line with labelled points.
ds_salaries %>%
  filter(job_title == "Data Scientist") %>%
  group_by(work_year) %>%
  summarise(mean_kusd = mean(salaryKUSD)) %>%
  # Turn the year into a Date (1 January) so a date scale can be used.
  mutate(
    work_year = as.Date(paste0(work_year, "-01-01"), format = "%Y-%m-%d")
  ) %>%
  ggplot(aes(x = work_year, y = mean_kusd)) +
  geom_line(linetype = "dashed") +
  geom_point() +
  geom_text(aes(label = round(mean_kusd)), vjust = -0.3) +
  scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
  labs(
    title = "Mean Salary by year",
    subtitle = "Employees with job title Data Scientist",
    caption = "Source: Kaggle",
    x = "Work Year",
    y = "Mean Salary in (KUSD)"
  ) +
  theme_light()

# Helper column: USD salary divided by 1000 (computed here so this chunk does
# not rely on an earlier chunk having added it).
ds_salaries$salaryKUSD <- ds_salaries$salary_in_usd / 1000

# Yearly mean salary (KUSD) per job category, one facet per category.
# Categories are derived from substrings of the job title; the first matching
# rule wins, and anything unmatched is "Other".
ds_salaries %>%
  mutate(
    # Turn the year into a Date (1 January) so a date scale can be used.
    work_year = as.Date(paste0(work_year, "-01-01"), format = "%Y-%m-%d"),
    job_category = case_when(
      grepl("Machine Learning", job_title) ~ "Machine Learning",
      grepl("Data Scien", job_title) ~ "Data Science",
      grepl("Data Analy", job_title) ~ "Data Analytics",
      TRUE ~ "Other"
    )
  ) %>%
  group_by(work_year, job_category) %>%
  # Drop the grouping explicitly: avoids the regrouping message and leaves a
  # plain summary table for plotting.
  summarise(meanSalary = mean(salaryKUSD), .groups = "drop") %>%
  ggplot(aes(x = work_year, y = meanSalary, color = job_category)) +
  scale_color_manual(values = c("#005f73", "#0a9396", "#94d2bd", "#e9d8a6")) +
  geom_line(linetype = "dashed") +
  geom_point() +
  scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
  labs(
    title = "Mean salary by year for each job category",
    caption = "Source: Kaggle"
  ) +
  ylab("Mean Salary in (KUSD)") +
  xlab("Work Year") +
  geom_text(aes(label = round(meanSalary)), vjust = -0.1) +
  theme_light() +
  # The facet strips already name the category, so the colour legend is hidden.
  theme(legend.position = "none") +
  facet_wrap(~job_category)

# Mean USD salary per employee residence for work year 2022, drawn as
# horizontal bars in units of 10k USD, each annotated with its rounded value.
ds_salaries %>%
  filter(work_year == 2022) %>%
  group_by(employee_residence) %>%
  # Only the mean is plotted, so no per-group row count is computed.
  summarise(mean_salary = mean(salary_in_usd)) %>%
  ggplot(
    aes(
      x = reorder(employee_residence, -mean_salary),
      y = mean_salary / 10000
    )
  ) +
  geom_col(colour = "white", fill = "#94d2bd") +
  labs(
    title = "Mean salary by employee residence",
    subtitle = "Year: 2022",
    caption = "source: kaggle"
  ) +
  xlab("Employee residence") +
  ylab("Mean salary (in 10k USD)") +
  geom_text(
    aes(label = round(mean_salary / 10000)),
    vjust = .5,
    hjust = -0.3
  ) +
  coord_flip() +
  # removeGrid() must come after theme_light(): a complete theme added later
  # would reset the grid settings.
  theme_light() +
  removeGrid()

# For rows whose job title is exactly "Data Scientist": mean of the
# salaryKUSD column and the row count per remote ratio, drawn as a lollipop
# chart (segment from zero plus a point) with the count printed at each point.
# Expects ds_salaries to already contain the salaryKUSD column.
ds_salaries %>%
  filter(job_title == "Data Scientist") %>%
  group_by(remote_ratio) %>%
  summarise(
    n_employees = n(),
    mean_kusd = mean(salaryKUSD)
  ) %>%
  ggplot(aes(x = remote_ratio, y = mean_kusd)) +
  geom_segment(aes(xend = remote_ratio, yend = 0)) +
  geom_point(color = "#e9d8a6", size = 4) +
  geom_text(aes(label = paste("Count\n", n_employees))) +
  labs(
    title = "Number of employees and their mean salary by remote work ratio",
    subtitle = "Job title: Data Scientist",
    caption = "Source: Kaggle",
    x = "Remote ratio",
    y = "Mean salary (KUSD)"
  ) +
  theme_light() +
  removeGrid()

Data Scientists' Salaries Analysis

Project report

Abdullah Sami

2023-06-18

Data description

Exploratory data analysis