Data Science Salary Analysis

library(readr)
# Load the salary data set from the working directory with base read.csv().
ds_salaries <- read.csv(file = "ds_salaries.csv")
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Compact overview of every column: name, type and the first few values.
dplyr::glimpse(ds_salaries)

## Rows: 607
## Columns: 12
## $ X                  <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1…
## $ work_year          <int> 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 202…
## $ experience_level   <chr> "MI", "SE", "SE", "MI", "SE", "EN", "SE", "MI", "MI…
## $ employment_type    <chr> "FT", "FT", "FT", "FT", "FT", "FT", "FT", "FT", "FT…
## $ job_title          <chr> "Data Scientist", "Machine Learning Scientist", "Bi…
## $ salary             <int> 70000, 260000, 85000, 20000, 150000, 72000, 190000,…
## $ salary_currency    <chr> "EUR", "USD", "GBP", "USD", "USD", "USD", "USD", "H…
## $ salary_in_usd      <int> 79833, 260000, 109024, 20000, 150000, 72000, 190000…
## $ employee_residence <chr> "DE", "JP", "GB", "HN", "US", "US", "US", "HU", "US…
## $ remote_ratio       <int> 0, 0, 50, 0, 50, 100, 100, 50, 100, 50, 0, 0, 0, 10…
## $ company_location   <chr> "DE", "JP", "GB", "HN", "US", "US", "US", "HU", "US…
## $ company_size       <chr> "L", "S", "M", "S", "L", "L", "S", "L", "L", "S", "…

library(dplyr)
# Per-column summary: quartiles and mean for numeric columns, length and class
# for character columns.
summary(object = ds_salaries)

##        X           work_year    experience_level   employment_type   
##  Min.   :  0.0   Min.   :2020   Length:607         Length:607        
##  1st Qu.:151.5   1st Qu.:2021   Class :character   Class :character  
##  Median :303.0   Median :2022   Mode  :character   Mode  :character  
##  Mean   :303.0   Mean   :2021                                        
##  3rd Qu.:454.5   3rd Qu.:2022                                        
##  Max.   :606.0   Max.   :2022                                        
##   job_title             salary         salary_currency    salary_in_usd   
##  Length:607         Min.   :    4000   Length:607         Min.   :  2859  
##  Class :character   1st Qu.:   70000   Class :character   1st Qu.: 62726  
##  Mode  :character   Median :  115000   Mode  :character   Median :101570  
##                     Mean   :  324000                      Mean   :112298  
##                     3rd Qu.:  165000                      3rd Qu.:150000  
##                     Max.   :30400000                      Max.   :600000  
##  employee_residence  remote_ratio    company_location   company_size      
##  Length:607         Min.   :  0.00   Length:607         Length:607        
##  Class :character   1st Qu.: 50.00   Class :character   Class :character  
##  Mode  :character   Median :100.00   Mode  :character   Mode  :character  
##                     Mean   : 70.92                                        
##                     3rd Qu.:100.00                                        
##                     Max.   :100.00

# Number of missing values per column: flag every NA cell, then total the
# flags column by column.
na_counts <- ds_salaries %>%
  is.na() %>%
  colSums()
na_counts

##                  X          work_year   experience_level    employment_type 
##                  0                  0                  0                  0 
##          job_title             salary    salary_currency      salary_in_usd 
##                  0                  0                  0                  0 
## employee_residence       remote_ratio   company_location       company_size 
##                  0                  0                  0                  0

# 1. What are the top 10 most common job titles (job_title) in ds_salaries?
library(ggplot2)
library(dplyr)

# Tabulate every job title; as.data.frame() on the table yields the columns
# Var1 (the title) and Freq (its count).
job_title_freq <- as.data.frame(table(ds_salaries$job_title))

# Sort by descending frequency and keep the first ten rows.
freq_rank <- order(-job_title_freq$Freq)
top10_job_title <- head(job_title_freq[freq_rank, ], n = 10)

# Fill colours handed to the manual fill scale.
job_title_palette <- c(
  "orange", "pink", "lightblue", "purple", "blue",
  "lightgreen", "darkblue", "darkgreen", "red", "green"
)

# Bars ordered from most to least frequent, each labelled with its count.
# The legend is hidden because the x axis already names every bar.
ggplot(
  data = top10_job_title,
  mapping = aes(x = reorder(Var1, -Freq), y = Freq, fill = Var1)
) +
  geom_col() +
  scale_fill_manual(values = job_title_palette) +
  geom_text(mapping = aes(label = Freq), vjust = -0.5) +
  labs(
    title = "Top 10 Job Titles",
    x = "Job Titles",
    y = "Count"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )

# 2. What are the top 5 jobs with the highest average salaries in ds_salaries?
library(dplyr)
library(ggplot2)

# Show large axis values in fixed rather than scientific notation.
# NOTE: options() is session-wide, so this also affects everything that follows.
options(scipen = 999)

# Mean salary_in_usd per job title, keeping the five highest means.
# slice_max() replaces the superseded top_n(); like top_n() it keeps ties by
# default, so more than five rows can be returned if averages are tied.
top_jobs <- ds_salaries %>%
  group_by(job_title) %>%
  summarise(avg_salary = mean(salary_in_usd)) %>%
  slice_max(order_by = avg_salary, n = 5) %>%
  arrange(desc(avg_salary))

# Bars are ordered by ascending average salary along the x axis.
ggplot(top_jobs, aes(x = reorder(job_title, avg_salary), y = avg_salary, fill = job_title)) +
  geom_bar(stat = "identity") +
  labs(title = "Top 5 Jobs with Highest Salaries",
       x = "Job Title",
       y = "Average Salary (USD)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

library(ggplot2)
# 3. What is the distribution of salaries?
# Fixed (non-scientific) notation for the salary axis; this is a session-wide
# option.
options(scipen = 999)

# A single box plot of salary_in_usd; outliers are drawn as red "o" marks.
ggplot(data = ds_salaries, mapping = aes(x = "", y = salary_in_usd)) +
  geom_boxplot(
    width = 0.2,
    fill = "lightblue",
    color = "blue",
    outlier.color = "red",
    outlier.shape = "o"
  ) +
  labs(
    title = "Distribution of Salary",
    y = "Salary (USD)"
  )

library(ggplot2)
# 4. What is the distribution of salaries across different experience levels?
# One box per experience level, filled by level; outliers are red "o" marks.
ggplot(
  data = ds_salaries,
  mapping = aes(
    x = as.factor(experience_level),
    y = salary_in_usd,
    fill = factor(experience_level)
  )
) +
  geom_boxplot(outlier.color = "red", outlier.shape = "o") +
  labs(
    title = "Salary vs. Experience level",
    x = "Experience level",
    y = "Salary (in USD)"
  )

# 5. How does the average salary (USD) vary across company sizes?
library(ggplot2)
# stat = "summary" with fun = "mean" collapses each company size to the mean
# of salary_in_usd, so every bar height is a group average.
ggplot(
  data = ds_salaries,
  mapping = aes(
    x = company_size,
    y = salary_in_usd,
    fill = as.factor(company_size)
  )
) +
  geom_bar(stat = "summary", fun = "mean")

library(ggplot2)
# 6. What is the average salary (USD) for each employment type, and how do the
#    averages compare?
# Each bar is the mean of salary_in_usd within one employment type.
ggplot(
  data = ds_salaries,
  mapping = aes(
    x = factor(employment_type),
    y = salary_in_usd,
    fill = factor(employment_type)
  )
) +
  geom_bar(stat = "summary", fun = "mean")

# 7. What is the correlation between the numeric variables (work_year, salary,
#    salary_in_usd, remote_ratio) in ds_salaries, and how can it be visualized?
# All four variables named in the question are included, so the matrix is 4 x 4.
# NOTE: `salary` is expressed in each row's salary_currency, so its correlations
# mix currencies; salary_in_usd is the comparable measure.
numeric_columns <- c("work_year", "salary", "salary_in_usd", "remote_ratio")
numeric_data <- ds_salaries[, numeric_columns]
# Pairwise correlations using cor()'s default (Pearson) method.
correlation_matrix <- cor(numeric_data)
library(corrplot)

## corrplot 0.92 loaded

# Draw the correlation matrix with corrplot's "pie" glyph in every cell.
corrplot(corr = correlation_matrix, method = "pie")

# 8. What is the linear relationship between work year and salary in USD
#    (salary_in_usd), and how well does a linear regression model fit the data?
# work_year holds calendar years (2020-2022 in this data), so the slope is the
# estimated change in salary_in_usd per calendar year.
library(ggplot2)
model <- lm(
  formula = salary_in_usd ~ work_year,
  data = ds_salaries
)
# Print the fitted call and coefficients.
model

## 
## Call:
## lm(formula = salary_in_usd ~ work_year, data = ds_salaries)
## 
## Coefficients:
## (Intercept)    work_year  
##   -35219689        17479

# Full regression report: residual quantiles, coefficient table, R-squared and
# the overall F-test.
summary(object = model)

## 
## Call:
## lm(formula = salary_in_usd ~ work_year, data = ds_salaries)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -112693  -45637   -9793   34786  494786 
## 
## Coefficients:
##              Estimate Std. Error t value  Pr(>|t|)    
## (Intercept) -35219689    8301899  -4.242 0.0000256 ***
## work_year       17479       4107   4.256 0.0000241 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 69980 on 605 degrees of freedom
## Multiple R-squared:  0.02907,    Adjusted R-squared:  0.02746 
## F-statistic: 18.11 on 1 and 605 DF,  p-value: 0.00002413

# 9. What is the distribution of experience levels?
library(ggplot2)
# geom_bar() counts the rows per experience level. The labels reuse the same
# computed count via after_stat(count), which replaces the `..count..` notation
# deprecated in ggplot2 3.4.0.
ggplot(ds_salaries, aes(x = experience_level, fill = factor(experience_level))) +
  geom_bar() +
  scale_fill_manual(values = c("lightblue", "lightgreen", "pink", "brown")) +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5, size = 3) +
  labs(title = "Experience Level Distribution",
       x = "Experience Level",
       y = "Count") +
  theme_minimal()

## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

#10  What is the distribution of work years in the ds_salaries dataset, and how can this distribution be visualized using a pie chart?
library(ggplot2)
library(scales)

## 
## Attaching package: 'scales'

## The following object is masked from 'package:readr':
## 
##     col_factor

# Slice colours passed to pie().
colors <- c("darkblue", "blue", "lightblue")

# Row counts per work year, and each year's share as a percentage rounded to
# one decimal place.
work_year_counts <- table(ds_salaries$work_year)
work_year_share <- format(
  round(prop.table(work_year_counts) * 100, 1),
  nsmall = 1
)

# Each slice label shows the year with its percentage share on a second line.
pie(
  work_year_counts,
  labels = paste0(names(work_year_counts), "\n", work_year_share, "%"),
  col = colors,
  main = "Distribution of work year"
)

# 11. How does the distribution of experience levels vary across different
#     company sizes in ds_salaries?
library(dplyr)
library(ggplot2)
# Count the rows for every (experience_level, company_size) pair.
# `.groups = "drop"` returns an ungrouped result directly, which silences the
# "summarise() has grouped output" message and makes a trailing ungroup()
# unnecessary.
exlevel_size <- ds_salaries %>%
  group_by(experience_level, company_size) %>%
  summarise(count = n(), .groups = "drop")

## `summarise()` has grouped output by 'experience_level'. You can override using
## the `.groups` argument.

# Stacked bars: one bar per company size, split by experience level using the
# pre-computed counts.
ggplot(
  data = exlevel_size,
  mapping = aes(x = company_size, y = count, fill = experience_level)
) +
  geom_col(position = "stack") +
  labs(
    title = "Experience Level with Company Size",
    x = "Company Size",
    y = "Count"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5))

# 12. What are the top 5 company locations by job count in ds_salaries, and how
#     do the job counts vary across these locations?
library(dplyr)
library(ggplot2)
# count() adds a column `n` with the number of rows per company_location.
# slice_max() replaces the superseded top_n(); like top_n() it keeps ties by
# default, so more than five rows can be returned if counts are tied.
top_locations <- ds_salaries %>%
  count(company_location) %>%
  slice_max(order_by = n, n = 5) %>%
  arrange(desc(n))

# Bars are ordered by ascending count along the x axis.
ggplot(top_locations, aes(x = reorder(company_location, n), y = n, fill = company_location)) +
  geom_bar(stat = "identity") +
  labs(title = "Top 5 Company Locations",
       x = "Company Location",
       y = "Count") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# 13. How does the average salary in USD vary over the years in ds_salaries,
#     and how can this variation be shown as a line graph?
library(ggplot2)

# Mean salary_in_usd for each work_year.
average_salary <- summarise(
  group_by(ds_salaries, work_year),
  avg_salary_in_usd = mean(salary_in_usd)
)

# One point per year, joined by a line.
ggplot(
  data = average_salary,
  mapping = aes(x = work_year, y = avg_salary_in_usd)
) +
  geom_line() +
  labs(
    title = "Average Salary in USD Over the Years",
    x = "Work Year",
    y = "Average Salary in USD"
  )

# 14. What is the distribution of employment type?
library(ggplot2)
# geom_bar() counts the rows per employment type. The labels reuse the same
# computed count via after_stat(count), which replaces the `..count..` notation
# deprecated in ggplot2 3.4.0.
ggplot(ds_salaries, aes(x = employment_type, fill = factor(employment_type))) +
  geom_bar() +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5, size = 3) +
  labs(title = "Employment type Distribution",
       x = "Employment type",
       y = "Count") +
  theme_minimal() +
  scale_fill_manual(values = c("orange", "yellow", "pink", "brown"))

# 15. How does the average salary in USD vary over the years (work_year) for
#     different experience levels (experience_level)?
library(ggplot2)
library(dplyr)
# Mean salary_in_usd for every (work_year, experience_level) pair.
# `.groups = "drop"` returns an ungrouped result directly, which silences the
# "summarise() has grouped output" message and makes a trailing ungroup()
# unnecessary.
avg_salary <- ds_salaries %>%
  group_by(work_year, experience_level) %>%
  summarise(avg_salary_in_usd = mean(salary_in_usd), .groups = "drop")

## `summarise()` has grouped output by 'work_year'. You can override using the
## `.groups` argument.

# One point per (work year, experience level) average, coloured by experience
# level.
ggplot(
  data = avg_salary,
  mapping = aes(
    x = work_year,
    y = avg_salary_in_usd,
    color = experience_level
  )
) +
  geom_point() +
  labs(
    title = "Average Salary in USD Over the Years by Experience Level",
    x = "Work Year",
    y = "Average Salary in USD",
    color = "Experience Level"
  ) +
  theme_minimal()

```

Data Science Salary Analysis

Deeshinbayar Odvogmed

2024-04-29