About the Project

In today’s data-driven world, our project focuses on Exploratory Data Analysis (EDA) of data science job salaries using R. Utilizing a Kaggle dataset with attributes like work experience, job titles, salaries, company size, and location, we aim to uncover insights into the data science job market. Our objective is to clean and prepare the data, ensuring integrity by handling missing values and standardizing formats with R packages. Using libraries like ggplot2, dplyr, and tidyr, we perform analyses to reveal salary distributions, trends, and the impact of remote work. This project provides valuable insights for data professionals and recruiters, transforming raw data into actionable information.

Libraries Required

library(ggplot2)       # plotting
library(magrittr)      # pipe operator (%>%)
library(dplyr)         # data manipulation
library(knitr)         # kable()
library(kableExtra)    # kable_styling(), scroll_box()
library(randomForest)  # random forest model
library(maps)          # map data used by map_data("world") in the geospatial section
# (the DT package is called via DT::datatable() later and does not need to be attached)

Dataset Description

The dataset used in this project is the Data Science Job Salaries dataset, downloaded from Kaggle (EDA on Data Science Job Salaries).

data <- read.csv("C:/Users/kkart/Downloads/ds_salaries.csv")
head(data)
##   X work_year experience_level employment_type                  job_title
## 1 0      2020               MI              FT             Data Scientist
## 2 1      2020               SE              FT Machine Learning Scientist
## 3 2      2020               SE              FT          Big Data Engineer
## 4 3      2020               MI              FT       Product Data Analyst
## 5 4      2020               SE              FT  Machine Learning Engineer
## 6 5      2020               EN              FT               Data Analyst
##   salary salary_currency salary_in_usd employee_residence remote_ratio
## 1  70000             EUR         79833                 DE            0
## 2 260000             USD        260000                 JP            0
## 3  85000             GBP        109024                 GB           50
## 4  20000             USD         20000                 HN            0
## 5 150000             USD        150000                 US           50
## 6  72000             USD         72000                 US          100
##   company_location company_size
## 1               DE            L
## 2               JP            S
## 3               GB            M
## 4               HN            S
## 5               US            L
## 6               US            L
dim(data)
## [1] 607  12

Summary Of Data

summary_table <- summary(data)
# Note: kable_styling() and scroll_box() are intended for HTML (or LaTeX) tables,
# so with format = "markdown" they leave the table unchanged
kable(summary_table, format = "markdown") %>%
  kable_styling() %>%
  scroll_box(width = "100%")
|         | X     | work_year | salary   | salary_in_usd | remote_ratio |
|---------|-------|-----------|----------|---------------|--------------|
| Min.    | 0.0   | 2020      | 4000     | 2859          | 0.00         |
| 1st Qu. | 151.5 | 2021      | 70000    | 62726         | 50.00        |
| Median  | 303.0 | 2022      | 115000   | 101570        | 100.00       |
| Mean    | 303.0 | 2021      | 324000   | 112298        | 70.92        |
| 3rd Qu. | 454.5 | 2022      | 165000   | 150000        | 100.00       |
| Max.    | 606.0 | 2022      | 30400000 | 600000        | 100.00       |

The remaining columns (experience_level, employment_type, job_title, salary_currency, employee_residence, company_location, company_size) are character columns, each reported as Length:607, Class:character, Mode:character.

Preprocessing

The two columns experience_level and employment_type contain categorical values, so we encode them into numerical values using label encoding.

# Handling Categorical Data
# Performing label encoding for 'experience_level' and 'employment_type'
# (experience_level is kept as text and a coded copy is added; employment_type is overwritten in place)
data$coded_experience_level <- as.integer(factor(data$experience_level))
data$employment_type <- as.integer(factor(data$employment_type))
print(head(data$coded_experience_level))
## [1] 3 4 4 3 4 1
print(head(data$employment_type))
## [1] 3 3 3 3 3 3
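
As a quick check of what the integer codes mean, the underlying factor levels can be inspected (a small added sketch, not part of the original output; it uses experience_level, which is still stored as text, whereas employment_type has already been overwritten):

# The integer codes follow the alphabetical order of the factor levels
# (consistent with EN = 1, MI = 3, SE = 4 in the output above)
levels(factor(data$experience_level))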

Visualizing categorical values
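
No plot code accompanies this heading; a minimal sketch of one way to visualize the categorical columns, counting records by experience level and company size, could look like this (an illustrative example, not the original figure):

# Bar chart of record counts by experience level, split by company size
ggplot(data, aes(x = experience_level, fill = company_size)) +
  geom_bar(position = "dodge") +
  labs(x = "Experience Level", y = "Count", fill = "Company Size",
       title = "Counts of Records by Experience Level and Company Size") +
  theme_minimal()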

Feature Engineering

We create a new column based on existing columns to enable a better analysis of the data: salary_per_year, the USD salary divided by the number of years between the reported work_year and 2024.

# Feature Engineering
# Creating new feature: Salary per year
data$salary_per_year <- data$salary_in_usd / (2024 - data$work_year)
DT::datatable(head(data, 5), rownames = FALSE, options = list(pageLength = 10, scrollX = TRUE))

Including Plots

We split the data set into training and testing sets and then visualize the relationships between the attributes.

# Splitting the Data Set
# Splitting the dataset into training and testing sets (e.g., 80% training, 20% testing)
train_indices <- sample(1:nrow(data), 0.8 * nrow(data))
train_data <- data[train_indices, ]
test_data <- data[-train_indices, ]

# Note: the data has only 607 rows, so indexing 1:1000 pads the subset with NA rows;
# ggplot2 drops these later (see the "Removed 393 rows" warnings below)
subset <- data[1:1000, ]

# Double Density Plots for Visualization
# Double density plot of salary by experience level
ggplot(data, aes(x = salary_in_usd, fill = experience_level)) +
  geom_density(alpha = 0.5) +
  labs(x = "Salary (USD)", y = "Density", fill = "Experience Level",
       title = "Double Density Plot of Salary by Experience Level")

ggplot(subset, aes(x = salary_in_usd, fill = experience_level)) +
  geom_density(alpha = 0.5) +
  labs(title = "Density Plot of Salary by Experience Level, Faceted by Company Size") +
  facet_wrap(~company_size, scales = "free") +
  theme_minimal()
## Warning: Removed 393 rows containing non-finite outside the scale range
## (`stat_density()`).

# Shadow histogram of salary by experience level
ggplot(data, aes(x = salary_in_usd, fill = experience_level)) +
  geom_histogram(alpha = 0.5, bins = 30, position = "identity") +
  labs(x = "Salary (USD)", y = "Frequency", fill = "Experience Level",
       title = "Shadow Histogram of Salary by Experience Level")

# Double density plot of salary by company size


ggplot(subset, aes(x = salary_in_usd, fill = company_size)) +
  geom_density(alpha = 0.5) +
  labs(title = "Density Plot of Salary by Company Size") +
  facet_wrap(~company_size, scales = "free") +
  theme_minimal()
## Warning: Removed 393 rows containing non-finite outside the scale range
## (`stat_density()`).

ggplot(data, aes(x = salary_in_usd, fill = company_size)) +
  geom_density(alpha = 0.5) +
  labs(x = "Salary (USD)", y = "Density", fill = "Company Size",
       title = "Density Plot of Salary by Company Size") +
  theme_minimal()

data$remote_ratio <- factor(data$remote_ratio)
# Double density plot of salary by remote work ratio
ggplot(data, aes(x = salary_in_usd, fill = remote_ratio)) +
  geom_density(alpha = 0.5) +
  labs(x = "Salary (USD)", y = "Density", fill = "Remote Work Ratio",
       title = "Double Density Plot of Salary by Remote Work Ratio")

Model Building and Testing

# Splitting data into features (X) and target variable (y)
# Subset the data frame to select specific columns
# Scaling numerical feature (salary_in_usd); as.numeric() drops the matrix attributes returned by scale()
data$scaled_salary <- as.numeric(scale(data$salary_in_usd))
names(data)
##  [1] "X"                      "work_year"              "experience_level"      
##  [4] "employment_type"        "job_title"              "salary"                
##  [7] "salary_currency"        "salary_in_usd"          "employee_residence"    
## [10] "remote_ratio"           "company_location"       "company_size"          
## [13] "coded_experience_level" "salary_per_year"        "scaled_salary"
X <- data[, c("work_year", "experience_level", "employment_type", "job_title",
              "salary", "salary_currency", "salary_in_usd", "employee_residence",
              "remote_ratio", "company_location", "company_size",
              "coded_experience_level", "scaled_salary", "salary_per_year")]

# Remove the "X" column from X
X <- X[, -1]  # Remove the first column

# Alternatively, you can directly subset the columns without the "X" column
X <- data[, c("work_year", "experience_level", "employment_type", "job_title",
              "salary", "salary_currency", "salary_in_usd", "employee_residence",
              "remote_ratio", "company_location", "company_size",
              "coded_experience_level", "scaled_salary", "salary_per_year")]


y <- data$salary_in_usd

# Split data into training and testing sets
train_indices <- sample(1:nrow(data), 0.8 * nrow(data))
X_train <- X[train_indices, ]
X_test <- X[-train_indices, ]
y_train <- y[train_indices]
y_test <- y[-train_indices]

# Train Random Forest model
model <- randomForest(y_train ~ ., data = X_train)

# Making predictions
predicted_salaries <- predict(model, X_test)

# Combining predicted salaries with the job titles of the test rows
# (use X_test, which comes from the same split as the predictions; test_data was built from the earlier, separate split)
predicted_data <- data.frame(job_title = X_test$job_title, predicted_salaries)

# Group by job title and concatenate predicted salaries horizontally
predicted_summary <- predicted_data %>%
  group_by(job_title) %>%
  summarize(predicted_salaries = paste(predicted_salaries, collapse = ", "))

# Printing the summary
print(predicted_summary)
## # A tibble: 28 × 2
##    job_title                          predicted_salaries                        
##    <chr>                              <chr>                                     
##  1 AI Scientist                       78834.9748                                
##  2 Applied Machine Learning Scientist 148820.245147619                          
##  3 BI Data Analyst                    65226.7033333333                          
##  4 Big Data Engineer                  132692.177466667, 357746.402194118        
##  5 Computer Vision Engineer           20493.6498666667, 41932.3100333333        
##  6 Computer Vision Software Engineer  100498.722738095                          
##  7 Data Analyst                       41594.8122666667, 237642.610766667, 11750…
##  8 Data Analytics Engineer            148263.209719048, 32555.7488666666, 15478…
##  9 Data Analytics Manager             90844.2951428571, 128886.210072222, 15060…
## 10 Data Architect                     10767.9357666667                          
## # ℹ 18 more rows

Visualizing Predicted Salaries by Job Title
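
No plot code accompanies this heading; a minimal sketch using the predicted_data frame built above, plotting the mean predicted salary per job title, could look like this (an illustrative example, not the original figure):

# Mean predicted salary per job title, shown as a horizontal bar chart
predicted_data %>%
  group_by(job_title) %>%
  summarize(mean_predicted = mean(predicted_salaries)) %>%
  ggplot(aes(x = reorder(job_title, mean_predicted), y = mean_predicted)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(x = "Job Title", y = "Mean Predicted Salary (USD)",
       title = "Mean Predicted Salary by Job Title") +
  theme_minimal()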

Calculating RMSE and R-squared

# Calculating RMSE
predictions <- predict(model, X_test)
rmse <- sqrt(mean((predictions - y_test)^2))

# Print RMSE
print(paste("Root Mean Squared Error (RMSE):", rmse))
## [1] "Root Mean Squared Error (RMSE): 8594.01161297899"
mean_y <- mean(y_test)
total_ss <- sum((y_test - mean_y)^2)
residual_ss <- sum((y_test - predictions)^2)
rsquared <- 1 - (residual_ss / total_ss)
print(paste("R-squared:", rsquared))
## [1] "R-squared: 0.983019338876961"

Geospatial Analysis

# Plotting a world map with average salary by country
# (the averages below are example values for a sample of countries)
average_salary_by_country <- data.frame(
  country = c("United Arab Emirates", "American Samoa", "Austria", "Australia", "Belgium", "Brazil", "Canada", "Switzerland", "Chile", "China"), # Country names
  avg_salary = c(100000, 18053, 72921, 108043, 85699, 18603, 99824, 64114, 40038, 71666) # Example average salaries
)

# Load world map data
world_map <- map_data("world")

# Merge average salary data with world map data.
# left_join() keeps the row order of world_map, which geom_polygon() needs to draw the
# country outlines correctly (base merge() re-sorts the rows and can scramble the polygons)
map_data_with_salary <- left_join(world_map, average_salary_by_country, by = c("region" = "country"))

# Plot average salary by country
ggplot(map_data_with_salary, aes(x = long, y = lat, group = group, fill = avg_salary)) +
  geom_polygon(color = "black") +
  scale_fill_gradient(low = "lightblue", high = "darkblue", name = "Average Salary (USD)") +
  labs(title = "Average Salary by Country") +
  theme_void()

Conclusion

Our project on data science job salaries provides valuable insights into the dynamics of the data science job market. Through exploratory data analysis (EDA) using R, we uncovered trends and patterns in salary distributions, job titles, work experience, company sizes, and remote work impact. Key findings include the identification of salary variations based on location, company size, and experience level. Our analysis also revealed the growing importance of remote work arrangements in influencing compensation. By transforming raw data into actionable insights, our project aims to empower data professionals and recruiters with the information needed to navigate and thrive in the evolving landscape of the data science industry.