In today’s data-driven world, our project focuses on Exploratory Data Analysis (EDA) of data science job salaries using R. Utilizing a Kaggle dataset with attributes like work experience, job titles, salaries, company size, and location, we aim to uncover insights into the data science job market. Our objective is to clean and prepare the data, ensuring integrity by handling missing values and standardizing formats with R packages. Using libraries like ggplot2, dplyr, and tidyr, we perform analyses to reveal salary distributions, trends, and the impact of remote work. This project provides valuable insights for data professionals and recruiters, transforming raw data into actionable information.
library(ggplot2)
library(magrittr)
library(dplyr)
library(kableExtra)
library(randomForest)
The dataset used in this project was downloaded from Kaggle and is titled "EDA on Data Science Job Salaries".
# Load the salaries CSV into the data frame `data`.
# NOTE(review): the path is absolute and machine-specific, so the script only
# runs unchanged where this exact file exists — consider a relative path.
data <- read.csv(file = "C:/Users/kkart/Downloads/ds_salaries.csv")
# Preview the first rows to sanity-check the import.
head(x = data)
## X work_year experience_level employment_type job_title
## 1 0 2020 MI FT Data Scientist
## 2 1 2020 SE FT Machine Learning Scientist
## 3 2 2020 SE FT Big Data Engineer
## 4 3 2020 MI FT Product Data Analyst
## 5 4 2020 SE FT Machine Learning Engineer
## 6 5 2020 EN FT Data Analyst
## salary salary_currency salary_in_usd employee_residence remote_ratio
## 1 70000 EUR 79833 DE 0
## 2 260000 USD 260000 JP 0
## 3 85000 GBP 109024 GB 50
## 4 20000 USD 20000 HN 0
## 5 150000 USD 150000 US 50
## 6 72000 USD 72000 US 100
## company_location company_size
## 1 DE L
## 2 JP S
## 3 GB M
## 4 HN S
## 5 US L
## 6 US L
# Report the dataset's shape as (rows, columns).
c(nrow(data), ncol(data))
## [1] 607 12
# Compute per-column summary statistics, then render them as a styled table
# inside a full-width scroll box.
summary_table <- summary(data)
summary_table %>%
  kable(format = "markdown") %>%
  kable_styling() %>%
  scroll_box(width = "100%")
| X | work_year | experience_level | employment_type | job_title | salary | salary_currency | salary_in_usd | employee_residence | remote_ratio | company_location | company_size | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Min. : 0.0 | Min. :2020 | Length:607 | Length:607 | Length:607 | Min. : 4000 | Length:607 | Min. : 2859 | Length:607 | Min. : 0.00 | Length:607 | Length:607 | |
| 1st Qu.:151.5 | 1st Qu.:2021 | Class :character | Class :character | Class :character | 1st Qu.: 70000 | Class :character | 1st Qu.: 62726 | Class :character | 1st Qu.: 50.00 | Class :character | Class :character | |
| Median :303.0 | Median :2022 | Mode :character | Mode :character | Mode :character | Median : 115000 | Mode :character | Median :101570 | Mode :character | Median :100.00 | Mode :character | Mode :character | |
| Mean :303.0 | Mean :2021 | NA | NA | NA | Mean : 324000 | NA | Mean :112298 | NA | Mean : 70.92 | NA | NA | |
| 3rd Qu.:454.5 | 3rd Qu.:2022 | NA | NA | NA | 3rd Qu.: 165000 | NA | 3rd Qu.:150000 | NA | 3rd Qu.:100.00 | NA | NA | |
| Max. :606.0 | Max. :2022 | NA | NA | NA | Max. :30400000 | NA | Max. :600000 | NA | Max. :100.00 | NA | NA |
Two columns, experience_level and employment_type, are categorical, so we encode them as numeric values using label encoding.
# Handling Categorical Data
# Label-encode the two categorical columns: each distinct value is mapped to
# the integer position of its factor level.
# NOTE(review): `experience_level` is encoded into a NEW column, whereas
# `employment_type` is overwritten in place, so its original text labels are
# no longer available after this point.
data[["coded_experience_level"]] <- as.integer(
  as.factor(data[["experience_level"]])
)
data[["employment_type"]] <- as.integer(as.factor(data[["employment_type"]]))
# Show the first encoded experience levels.
print(head(data[["coded_experience_level"]], n = 6L))
## [1] 3 4 4 3 4 1
# Show the first encoded employment types.
print(head(data[["employment_type"]], n = 6L))
## [1] 3 3 3 3 3 3
We created a new column based on existing columns to support further analysis of the data.
# Feature Engineering
# Derive `salary_per_year`: the USD salary divided by the number of years
# between the record's work year and the reference year 2024.
# NOTE(review): the reference year is hard-coded; a `work_year` of 2024 would
# divide by zero — confirm the intended definition of this feature.
data$salary_per_year <- data$salary_in_usd / (2024 - data$work_year)
# Show the first five rows as an interactive table. DataTables option names
# are case-sensitive: horizontal scrolling is `scrollX` (the misspelling
# `scrollx` is silently ignored).
DT::datatable(
  head(data, 5),
  rownames = FALSE,
  options = list(pageLength = 10, scrollX = TRUE)
)
# Splitting the Data Set
# Fix the RNG seed so the random 80/20 split is reproducible between runs.
set.seed(42)
# Draw 80% of the row indices (rounded down) for training; seq_len() stays
# correct even if `data` has zero rows, unlike 1:nrow(data).
train_indices <- sample(seq_len(nrow(data)), floor(0.8 * nrow(data)))
train_data <- data[train_indices, ]
test_data <- data[-train_indices, ]
# Keep at most the first 1000 rows for the faceted plots. head() stops at the
# last real row, whereas data[1:1000, ] pads a shorter data frame with all-NA
# rows that ggplot2 later has to drop with a warning.
subset <- head(data, 1000)
# Density plots
# Overlaid, semi-transparent salary densities, one per experience level.
ggplot(
  data = data,
  mapping = aes(x = salary_in_usd, fill = experience_level)
) +
  geom_density(alpha = 0.5) +
  ggtitle("Double Density Plot of Salary by Experience Level") +
  xlab("Salary (USD)") +
  ylab("Density") +
  labs(fill = "Experience Level")
# Salary densities coloured by experience level, with one panel per company
# size (each panel has free axes). The title names both dimensions so this
# plot is not confused with the company-size-only density plot, and the axes
# and legend carry the same labels as the other salary plots.
ggplot(subset, aes(x = salary_in_usd, fill = experience_level)) +
  geom_density(alpha = 0.5) +
  labs(
    x = "Salary (USD)", y = "Density", fill = "Experience Level",
    title = "Density Plot of Salary by Experience Level and Company Size"
  ) +
  facet_wrap(~company_size, scales = "free") +
  theme_minimal()
## Warning: Removed 393 rows containing non-finite outside the scale range
## (`stat_density()`).
# Overlapping ("shadow") salary histograms, one per experience level; bars
# are drawn at the same position rather than stacked, with 30 bins.
ggplot(
  data = data,
  mapping = aes(x = salary_in_usd, fill = experience_level)
) +
  geom_histogram(position = "identity", bins = 30, alpha = 0.5) +
  ggtitle("Shadow Histogram of Salary by Experience Level") +
  xlab("Salary (USD)") +
  ylab("Frequency") +
  labs(fill = "Experience Level")
# Salary density per company size: one panel per size with free axes, filled
# by the same variable that defines the panels.
ggplot(
  data = subset,
  mapping = aes(x = salary_in_usd, fill = company_size)
) +
  geom_density(alpha = 0.5) +
  facet_wrap(facets = ~company_size, scales = "free") +
  ggtitle("Density Plot of Salary by Company Size") +
  theme_minimal()
## Warning: Removed 393 rows containing non-finite outside the scale range
## (`stat_density()`).
# Overlaid salary densities for every company size on a single set of axes.
ggplot(
  data = data,
  mapping = aes(x = salary_in_usd, fill = company_size)
) +
  geom_density(alpha = 0.5) +
  ggtitle("Density Plot of Salary by Company Size") +
  xlab("Salary (USD)") +
  ylab("Density") +
  labs(fill = "Company Size") +
  theme_minimal()
# Convert the remote-work ratio to a factor so each distinct ratio gets its
# own fill colour. This replaces the column in `data` for all later code.
data[["remote_ratio"]] <- factor(data[["remote_ratio"]])
# Overlaid salary densities, one per remote-work ratio.
ggplot(
  data = data,
  mapping = aes(x = salary_in_usd, fill = remote_ratio)
) +
  geom_density(alpha = 0.5) +
  ggtitle("Double Density Plot of Salary by Remote Work Ratio") +
  xlab("Salary (USD)") +
  ylab("Density") +
  labs(fill = "Remote Work Ratio")
# Preparing features (X) and target variable (y)
# Scaling numerical feature (salary_in_usd): centre on the mean and divide by
# the standard deviation. scale() returns a one-column matrix carrying
# "scaled:center"/"scaled:scale" attributes; as.numeric() flattens it so the
# new column is an ordinary numeric vector rather than a matrix embedded in
# the data frame.
data$scaled_salary <- as.numeric(scale(data$salary_in_usd))
# List the columns that are now available for selection.
names(data)
## [1] "X" "work_year" "experience_level"
## [4] "employment_type" "job_title" "salary"
## [7] "salary_currency" "salary_in_usd" "employee_residence"
## [10] "remote_ratio" "company_location" "company_size"
## [13] "coded_experience_level" "salary_per_year" "scaled_salary"
# Feature set X: predictor columns only.
# The target `salary_in_usd` and the columns that restate it (`salary` in
# local currency, `scaled_salary`, `salary_per_year`) are deliberately left
# out: keeping them as predictors leaks the answer into the model and makes
# RMSE / R-squared look far better than the model really is.
# The row-index column "X" from the CSV is likewise not selected.
X <- data[, c(
  "work_year", "experience_level", "employment_type", "job_title",
  "salary_currency", "employee_residence", "remote_ratio",
  "company_location", "company_size", "coded_experience_level"
)]
# Target variable: salary in USD.
y <- data$salary_in_usd
# Split data into training and testing sets (80% / 20%).
# This draws a new random sample and reassigns `train_indices`; X and y are
# sliced with the same indices so their rows stay aligned.
# seq_len() stays correct for a zero-row data frame (1:nrow(data) would yield
# c(1, 0)), and floor() makes the rounding of the sample size explicit.
train_indices <- sample(seq_len(nrow(data)), floor(0.8 * nrow(data)))
X_train <- X[train_indices, ]
X_test <- X[-train_indices, ]
y_train <- y[train_indices]
y_test <- y[-train_indices]
# Fit a random forest that regresses `y_train` on every column of `X_train`
# (`y_train` is not a column of `X_train`; the formula finds it in the
# calling environment).
model <- randomForest(y_train ~ ., data = X_train)
# Predict salaries for the held-out feature rows.
predicted_salaries <- predict(model, newdata = X_test)
# Combining predicted salaries with job titles.
# The titles must come from `X_test`, the rows the predictions were made for.
# `test_data` was built from an earlier, different random split, so pairing
# its titles with these predictions would mislabel them.
predicted_data <- data.frame(
  job_title = X_test$job_title,
  predicted_salaries = predicted_salaries
)
# Group by job title and collapse each title's predictions into a single
# comma-separated string.
predicted_summary <- predicted_data %>%
  group_by(job_title) %>%
  summarize(predicted_salaries = paste(predicted_salaries, collapse = ", "))
# Printing the summary
print(predicted_summary)
## # A tibble: 28 × 2
## job_title predicted_salaries
## <chr> <chr>
## 1 AI Scientist 78834.9748
## 2 Applied Machine Learning Scientist 148820.245147619
## 3 BI Data Analyst 65226.7033333333
## 4 Big Data Engineer 132692.177466667, 357746.402194118
## 5 Computer Vision Engineer 20493.6498666667, 41932.3100333333
## 6 Computer Vision Software Engineer 100498.722738095
## 7 Data Analyst 41594.8122666667, 237642.610766667, 11750…
## 8 Data Analytics Engineer 148263.209719048, 32555.7488666666, 15478…
## 9 Data Analytics Manager 90844.2951428571, 128886.210072222, 15060…
## 10 Data Architect 10767.9357666667
## # ℹ 18 more rows
**Job Titles**
# Root mean squared error of the model on the held-out rows.
predictions <- predict(model, newdata = X_test)
rmse <- sqrt(mean((y_test - predictions)^2))
# Report the RMSE.
print(paste("Root Mean Squared Error (RMSE):", rmse, sep = " "))
## [1] "Root Mean Squared Error (RMSE): 8594.01161297899"
# R-squared on the held-out rows: one minus the ratio of the residual sum of
# squares to the total sum of squares around the mean of `y_test`.
mean_y <- mean(y_test)
residual_ss <- sum((y_test - predictions)^2)
total_ss <- sum((y_test - mean_y)^2)
rsquared <- 1 - residual_ss / total_ss
print(paste("R-squared:", rsquared, sep = " "))
## [1] "R-squared: 0.983019338876961"
# Plotting a world map coloured by average salary per country
# Example average salaries (USD), keyed by the country names used in the
# "world" map's `region` column.
average_salary_by_country <- data.frame(
  country = c(
    "United Arab Emirates", "American Samoa", "Austria", "Australia",
    "Belgium", "Brazil", "Canada", "Switzerland", "Chile", "China"
  ),
  avg_salary = c(
    100000, 18053, 72921, 108043, 85699, 18603, 99824, 64114, 40038, 71666
  )
)
# Load world map data (one row per polygon vertex).
world_map <- map_data("world")
# Attach the average salary to every vertex of the matching country;
# all.x = TRUE keeps countries without a salary (they get NA).
map_data_with_salary <- merge(
  world_map, average_salary_by_country,
  by.x = "region", by.y = "country", all.x = TRUE
)
# merge() does not keep the original row order, but geom_polygon() connects
# vertices in row order; restore the drawing order so outlines are not
# scrambled.
map_data_with_salary <- map_data_with_salary[
  order(map_data_with_salary$group, map_data_with_salary$order),
]
# Plot average salary by country.
ggplot(
  map_data_with_salary,
  aes(x = long, y = lat, group = group, fill = avg_salary)
) +
  geom_polygon(color = "black") +
  scale_fill_gradient(
    low = "lightblue", high = "darkblue", name = "Average Salary (USD)"
  ) +
  labs(title = "Average Salary by Country") +
  theme_void()
Our project on data science job salaries provides valuable insights into the dynamics of the data science job market. Through exploratory data analysis (EDA) using R, we uncovered trends and patterns in salary distributions, job titles, work experience, company sizes, and remote work impact. Key findings include the identification of salary variations based on location, company size, and experience level. Our analysis also revealed the growing importance of remote work arrangements in influencing compensation. By transforming raw data into actionable insights, our project aims to empower data professionals and recruiters with the information needed to navigate and thrive in the evolving landscape of the data science industry.