# Attach packages and load the salary data set.
# The file is read with base R's read.csv(); readr is attached here but none
# of its readers are called in the code shown.
library(readr)
ds_salaries <- read.csv(file = "ds_salaries.csv")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Structural overview: one line per column with its type and first values.
ds_salaries %>% glimpse()
## Rows: 607
## Columns: 12
## $ X <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1…
## $ work_year <int> 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 202…
## $ experience_level <chr> "MI", "SE", "SE", "MI", "SE", "EN", "SE", "MI", "MI…
## $ employment_type <chr> "FT", "FT", "FT", "FT", "FT", "FT", "FT", "FT", "FT…
## $ job_title <chr> "Data Scientist", "Machine Learning Scientist", "Bi…
## $ salary <int> 70000, 260000, 85000, 20000, 150000, 72000, 190000,…
## $ salary_currency <chr> "EUR", "USD", "GBP", "USD", "USD", "USD", "USD", "H…
## $ salary_in_usd <int> 79833, 260000, 109024, 20000, 150000, 72000, 190000…
## $ employee_residence <chr> "DE", "JP", "GB", "HN", "US", "US", "US", "HU", "US…
## $ remote_ratio <int> 0, 0, 50, 0, 50, 100, 100, 50, 100, 50, 0, 0, 0, 10…
## $ company_location <chr> "DE", "JP", "GB", "HN", "US", "US", "US", "HU", "US…
## $ company_size <chr> "L", "S", "M", "S", "L", "L", "S", "L", "L", "S", "…
library(dplyr)
# Per-column summary: quartiles and mean for the integer columns, length and
# class for the character columns.
ds_salaries %>% summary()
## X work_year experience_level employment_type
## Min. : 0.0 Min. :2020 Length:607 Length:607
## 1st Qu.:151.5 1st Qu.:2021 Class :character Class :character
## Median :303.0 Median :2022 Mode :character Mode :character
## Mean :303.0 Mean :2021
## 3rd Qu.:454.5 3rd Qu.:2022
## Max. :606.0 Max. :2022
## job_title salary salary_currency salary_in_usd
## Length:607 Min. : 4000 Length:607 Min. : 2859
## Class :character 1st Qu.: 70000 Class :character 1st Qu.: 62726
## Mode :character Median : 115000 Mode :character Median :101570
## Mean : 324000 Mean :112298
## 3rd Qu.: 165000 3rd Qu.:150000
## Max. :30400000 Max. :600000
## employee_residence remote_ratio company_location company_size
## Length:607 Min. : 0.00 Length:607 Length:607
## Class :character 1st Qu.: 50.00 Class :character Class :character
## Mode :character Median :100.00 Mode :character Mode :character
## Mean : 70.92
## 3rd Qu.:100.00
## Max. :100.00
# Count the missing values in every column and show the result.
na_counts <- ds_salaries %>%
  is.na() %>%
  colSums()
print(na_counts)
## X work_year experience_level employment_type
## 0 0 0 0
## job_title salary salary_currency salary_in_usd
## 0 0 0 0
## employee_residence remote_ratio company_location company_size
## 0 0 0 0
# 1. What are the 10 most common job titles (job_title) in ds_salaries?
library(ggplot2)
library(dplyr)
# Tabulate every job title. as.data.frame() on a one-way table yields the
# columns Var1 (the title) and Freq (its count).
top10_job_title <- as.data.frame(table(ds_salaries$job_title))
# Sort by descending count and keep the first ten rows.
top10_job_title <- head(
  top10_job_title[order(-top10_job_title$Freq), ],
  n = 10
)
# Bar chart ordered from most to least frequent, each bar labelled with its
# count. The legend is hidden because the x axis already names every bar.
ggplot(
  data = top10_job_title,
  mapping = aes(x = reorder(Var1, -Freq), y = Freq, fill = Var1)
) +
  geom_col() +
  scale_fill_manual(
    values = c(
      "orange", "pink", "lightblue", "purple", "blue",
      "lightgreen", "darkblue", "darkgreen", "red", "green"
    )
  ) +
  geom_text(mapping = aes(label = Freq), vjust = -0.5) +
  labs(x = "Job Titles", y = "Count", title = "Top 10 Job Titles") +
  theme_minimal() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1)
  )
# 2. Which 5 job titles have the highest average salary (salary_in_usd)?
library(dplyr)
library(ggplot2)
# Prefer fixed notation over scientific notation when numbers are printed.
options(scipen = 999)
# Average salary per job title, keeping the five largest averages.
# slice_max() replaces the superseded top_n(); with_ties = FALSE caps the
# result at five rows so the chart always matches its "Top 5" title.
top_jobs <- ds_salaries %>%
  group_by(job_title) %>%
  summarise(avg_salary = mean(salary_in_usd)) %>%
  slice_max(avg_salary, n = 5, with_ties = FALSE) %>%
  arrange(desc(avg_salary))
# Bars are ordered by ascending average salary along the x axis.
ggplot(top_jobs, aes(x = reorder(job_title, avg_salary), y = avg_salary, fill = job_title)) +
  geom_bar(stat = "identity") +
  labs(title = "Top 5 Jobs with Highest Salaries",
       x = "Job Title",
       y = "Average Salary (USD)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
library(ggplot2)
# 3. How are salaries (in USD) distributed?
options(scipen = 999)
# A single box plot: the constant x = "" places every row in one group.
# Outliers are drawn as red "o" characters.
ggplot(data = ds_salaries, mapping = aes(x = "", y = salary_in_usd)) +
  geom_boxplot(
    width = 0.2,
    fill = "lightblue",
    color = "blue",
    outlier.shape = "o",
    outlier.color = "red"
  ) +
  labs(y = "Salary (USD)", title = "Distribution of Salary")
library(ggplot2)
# 4. How are salaries distributed within each experience level?
# One box per experience_level value, filled by the same variable; outliers
# are drawn as red "o" characters.
ggplot(
  data = ds_salaries,
  mapping = aes(
    x = as.factor(experience_level),
    y = salary_in_usd,
    fill = factor(experience_level)
  )
) +
  geom_boxplot(outlier.shape = "o", outlier.color = "red") +
  labs(
    x = "Experience level",
    y = "Salary (in USD)",
    title = "Salary vs. Experience level"
  )
# 5. How does the average salary (in USD) differ between company sizes?
library(ggplot2)
# stat = "summary" with fun = "mean" draws one bar per company size whose
# height is the mean salary_in_usd of that group. Explicit labels replace the
# default axis and legend titles (raw column names and expressions), in line
# with the other charts in this script.
ggplot(ds_salaries, aes(x = company_size, y = salary_in_usd, fill = as.factor(company_size))) +
  geom_bar(stat = "summary", fun = "mean") +
  labs(title = "Average Salary by Company Size",
       x = "Company Size",
       y = "Average Salary (USD)",
       fill = "Company Size")
library(ggplot2)
# 6. How does the average salary (in USD) compare across employment types?
# stat = "summary" with fun = "mean" draws one bar per employment type whose
# height is the mean salary_in_usd of that group. Explicit labels replace the
# default axis and legend titles (raw column names and expressions), in line
# with the other charts in this script.
ggplot(ds_salaries, aes(x = factor(employment_type), y = salary_in_usd, fill = factor(employment_type))) +
  geom_bar(stat = "summary", fun = "mean") +
  labs(title = "Average Salary by Employment Type",
       x = "Employment Type",
       y = "Average Salary (USD)",
       fill = "Employment Type")
# 7. How are the numeric variables (work_year, salary, salary_in_usd,
#    remote_ratio) correlated, and how can the correlation be visualized?
# All four variables named in the question are included; previously only
# work_year and salary_in_usd were correlated.
# NOTE(review): salary sits next to a salary_currency column with several
# currencies, so its correlations presumably mix units -- interpret with care.
numeric_columns <- c("work_year", "salary", "salary_in_usd", "remote_ratio")
numeric_data <- ds_salaries[, numeric_columns]
# Pairwise correlation matrix (cor() defaults to Pearson).
correlation_matrix <- cor(numeric_data)
library(corrplot)
## corrplot 0.92 loaded
# Each cell is drawn as a pie whose filled share reflects the coefficient.
corrplot(correlation_matrix, method = 'pie')
# 8. What is the linear relationship between work_year and salary_in_usd,
#    and how well does a simple linear regression fit the data?
# In this data work_year ranges from 2020 to 2022 (see the summary above), so
# the slope is the estimated change in salary_in_usd per one-unit step in
# work_year.
library(ggplot2)
model <- lm(formula = salary_in_usd ~ work_year, data = ds_salaries)
print(model)
##
## Call:
## lm(formula = salary_in_usd ~ work_year, data = ds_salaries)
##
## Coefficients:
## (Intercept) work_year
## -35219689 17479
# Coefficient table, residual quantiles, R-squared and overall F-test.
print(summary(model))
##
## Call:
## lm(formula = salary_in_usd ~ work_year, data = ds_salaries)
##
## Residuals:
## Min 1Q Median 3Q Max
## -112693 -45637 -9793 34786 494786
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -35219689 8301899 -4.242 0.0000256 ***
## work_year 17479 4107 4.256 0.0000241 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 69980 on 605 degrees of freedom
## Multiple R-squared: 0.02907, Adjusted R-squared: 0.02746
## F-statistic: 18.11 on 1 and 605 DF, p-value: 0.00002413
# 9. How many records fall into each experience level?
library(ggplot2)
# geom_bar() counts the rows per experience level; the count is printed above
# each bar.
ggplot(ds_salaries, aes(x = experience_level, fill = factor(experience_level))) +
  geom_bar() +
  scale_fill_manual(values = c('lightblue', 'lightgreen', 'pink', 'brown')) +
  # after_stat(count) replaces the `..count..` notation deprecated in
  # ggplot2 3.4.0.
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5, size = 3) +
  labs(title = "Experience Level Distribution",
       x = "Experience Level",
       y = "Count") +
  theme_minimal()
# 10. How are the records distributed across work years (shown as a pie chart)?
library(ggplot2)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
##
## col_factor
# One colour per slice.
colors <- c("darkblue", "blue", "lightblue")
# Tabulate the years once, then label each slice with its year and its share
# of all rows as a percentage rounded to one decimal place.
local({
  year_counts <- table(ds_salaries$work_year)
  year_share <- format(round(prop.table(year_counts) * 100, 1), nsmall = 1)
  pie(
    year_counts,
    labels = paste0(names(year_counts), "\n", year_share, "%"),
    col = colors,
    main = 'Distribution of work year'
  )
})
# 11. How does the distribution of experience levels vary across company sizes?
library(dplyr)
library(ggplot2)
# Row count for every (experience_level, company_size) pair.
# .groups = "drop" returns an ungrouped result directly, which removes both the
# "grouped output" message and the need for a trailing ungroup().
exlevel_size <- ds_salaries %>%
  group_by(experience_level, company_size) %>%
  summarise(count = n(), .groups = "drop")
# Stacked bars: one bar per company size, segmented by experience level.
ggplot(exlevel_size, aes(x = company_size, y = count, fill = experience_level)) +
  geom_bar(stat = "identity", position = "stack") +
  labs(title = "Experience Level with Company Size",
       x = "Company Size",
       y = "Count") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5))
# 12. Which 5 company locations have the most records, and how do their
#     counts compare?
library(dplyr)
library(ggplot2)
# count() adds the column n (rows per company_location).
# slice_max() replaces the superseded top_n(); with_ties = FALSE caps the
# result at five rows so the chart always matches its "Top 5" title.
top_locations <- ds_salaries %>%
  count(company_location) %>%
  slice_max(n, n = 5, with_ties = FALSE) %>%
  arrange(desc(n))
# Bars are ordered by ascending count along the x axis.
ggplot(top_locations, aes(x = reorder(company_location, n), y = n, fill = company_location)) +
  geom_bar(stat = "identity") +
  labs(title = "Top 5 Company Locations",
       x = "Company Location",
       y = "Count") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# 13. How does the average salary in USD vary over the years (line graph)?
library(ggplot2)
# Mean salary_in_usd for each work_year.
average_salary <- ds_salaries %>%
  group_by(work_year) %>%
  summarise(avg_salary_in_usd = mean(salary_in_usd))
ggplot(average_salary, aes(x = work_year, y = avg_salary_in_usd)) +
  geom_line() +
  # work_year is numeric, so the default continuous axis can place breaks at
  # fractional years; break only at the years present in the data.
  scale_x_continuous(breaks = average_salary$work_year) +
  labs(title = "Average Salary in USD Over the Years",
       x = "Work Year",
       y = "Average Salary in USD")
# 14. What is the distribution of employment type?
library(ggplot2)
# geom_bar() counts the rows per employment type; the count is printed above
# each bar.
ggplot(ds_salaries, aes(x = employment_type, fill = factor(employment_type))) +
  geom_bar() +
  # after_stat(count) replaces the `..count..` notation deprecated in
  # ggplot2 3.4.0.
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5, size = 3) +
  labs(title = "Employment type Distribution",
       x = "Employment type",
       y = "Count") +
  theme_minimal() +
  scale_fill_manual(values = c('orange', 'yellow', 'pink', 'brown'))
# 15. How does the average salary in USD vary over the years (work_year) for
#     the different experience levels (experience_level)?
library(ggplot2)
library(dplyr)
# Mean salary_in_usd for every (work_year, experience_level) pair.
# .groups = "drop" returns an ungrouped result directly, which removes both the
# "grouped output" message and the need for a trailing ungroup().
avg_salary <- ds_salaries %>%
  group_by(work_year, experience_level) %>%
  summarise(avg_salary_in_usd = mean(salary_in_usd), .groups = "drop")
# One point per year and experience level, coloured by experience level.
ggplot(avg_salary, aes(x = work_year, y = avg_salary_in_usd, color = experience_level)) +
  geom_point() +
  # work_year is numeric, so the default continuous axis can place breaks at
  # fractional years; break only at the years present in the data.
  scale_x_continuous(breaks = unique(avg_salary$work_year)) +
  labs(title = "Average Salary in USD Over the Years by Experience Level",
       x = "Work Year",
       y = "Average Salary in USD",
       color = "Experience Level") +
  theme_minimal()
```