This project aims to establish a quantitative assessment of the relative value of specific skills for data science professionals. We will achieve this by analyzing data extracted from job postings on relevant job boards. The analysis will focus on two key aspects of data scientist job postings: advertised salary and the frequency of specific skills mentioned in the job descriptions. By correlating these factors, we can develop a proxy measure to compare the relative value of various skills sought after in the data science job market.
# Remote CSV locations in the project's GitHub repository.
Postings_url <- "https://media.githubusercontent.com/media/RDLong718/data607-project3/main/Data/job_postings.csv"
Salries_url <- "https://media.githubusercontent.com/media/RDLong718/data607-project3/main/Data/job_details/salaries.csv"

# Read both files into memory; each call downloads the CSV from its URL.
Postings_raw <- readr::read_csv(file = Postings_url)
Salaries_raw <- readr::read_csv(file = Salries_url)
# Split the raw postings into the text fields used for skill matching and the
# salary fields used to estimate one annual salary figure per posting.
linkedin_postings <- Postings_raw[c("job_id", "title", "description")]
linkedin_salaries <- Postings_raw[
  c("job_id", "max_salary", "min_salary", "med_salary", "pay_period")
]

# Estimate each posting's salary as the midpoint of the advertised range.
# NOTE(review): this overwrites the med_salary values read from the file, so a
# posting that reports only med_salary (no min/max) ends up NA -- confirm that
# this is intended.
linkedin_salaries$med_salary <-
  (linkedin_salaries$max_salary + linkedin_salaries$min_salary) / 2

# Annualise hourly figures (presumably 40 hours/week x 52 weeks). which() drops
# NA comparisons, so postings with a missing pay_period are left untouched.
# NOTE(review): other pay periods are not rescaled here.
hourly_rows <- which(linkedin_salaries$pay_period == "HOURLY")
linkedin_salaries$med_salary[hourly_rows] <-
  linkedin_salaries$med_salary[hourly_rows] * 40 * 52

# Values below 1000 are scaled up by 1000 (presumably salaries quoted in
# thousands). The same index is used on both sides of the assignment: the
# previous code left NA entries in the right-hand side only, which misaligned
# the replacement values with the rows being replaced.
small_rows <- which(linkedin_salaries$med_salary < 1000)
linkedin_salaries$med_salary[small_rows] <-
  linkedin_salaries$med_salary[small_rows] * 1000

salaries_clean <- linkedin_salaries[c("job_id", "med_salary")]
#top_5 <- head(salaries_clean, 5)
#kable(top_5)
# Keep postings whose title mentions data science / data scientist
# (case-insensitive).
is_ds_title <- grepl(
  "data science|data scientist",
  linkedin_postings$title,
  ignore.case = TRUE
)
data_science_postings <- linkedin_postings[is_ds_title, ]

# Left join: every data science posting is kept, with or without a salary.
linkedin_clean <- merge(
  x = data_science_postings,
  y = salaries_clean,
  by = "job_id",
  all.x = TRUE
)

# Postings without a salary estimate receive the median of the known ones.
salary_missing <- is.na(linkedin_clean$med_salary)
linkedin_clean$med_salary[salary_missing] <-
  median(linkedin_clean$med_salary, na.rm = TRUE)
#top_5 <- head(linkedin_clean, 5)
#kable(top_5)
After cleaning up the data sets, we now have a combined data set containing the job title, description, and median expected salary for each position.
Here’s a template for searching for words/terms: `linkedin_clean$colName <- ifelse(grepl('term1|term2|term3', linkedin_clean$description, ignore.case = TRUE), 1, 0)`. A blank version to copy: `linkedin_clean$ <- ifelse(grepl('', linkedin_clean$description, ignore.case = TRUE), 1, 0)`
# One case-insensitive regular expression per skill. For every skill a 0/1
# column named after the skill is appended to linkedin_clean, set to 1 when the
# posting's description matches the pattern. The list order fixes the column
# order, which later code relies on when it selects columns by position.
skill_patterns <- list(
  # " r\\." matches a literal "R." only. The unescaped dot used previously
  # matched any character, so every word starting with "r" (" research",
  # " role", ...) counted as a mention of R. Likewise "/r\\b" requires the "r"
  # to end at a word boundary, so "/remote" no longer matches.
  Programming = "python| r | r/|/r/|/r\\b|\\(r,| r,| r\\.|c\\+\\+|sql",
  Machine_Learning = "machine learning| ml|algorithms|scikit learn|transformers|language model",
  Statistics = "data analysis|data analytics|statistical analysis|data-driven analysis",
  Cloud_Computing = "cloud computing| aws|\\(aws|/aws|amazon web|google cloud",
  Visualization = "visualization|presentation",
  Collaboration = "team work|with a team|collaborat|scrum team|agile team",
  Detail_Orientation = "detail oriented|meticulous|precise|thorough|attention to detail|accuracy",
  Critical_Thinking = "critical thinking|problem solving|analytical skills|logical analysis|creative problem solving|critical reasoning",
  Communication_Skills = "written and verbal communication|effective communication|articulate|persuasive|clear communication|concise communication|communication skills",
  Predictive_Analytics = "forecasting|predictive modeling|predictive analytics|statistical forecasting|data forecasting|forecasting techniques"
)

for (skill_name in names(skill_patterns)) {
  skill_mentioned <- grepl(
    skill_patterns[[skill_name]],
    linkedin_clean$description,
    ignore.case = TRUE
  )
  linkedin_clean[[skill_name]] <- ifelse(skill_mentioned, 1, 0)
}
# Columns 4 to 14 of linkedin_clean: med_salary followed by the skill flags.
skill_ranking <- linkedin_clean[, 4:14]

# Reshape to one row per (posting, skill) pair, then keep only the pairs where
# the skill flag is set; the flag column itself is dropped afterwards.
skill_ranking_long <- pivot_longer(
  skill_ranking,
  cols = -med_salary,
  names_to = "skill",
  values_to = "has_skill"
)
skill_ranking_long <- filter(skill_ranking_long, has_skill == 1)
skill_ranking_long <- select(skill_ranking_long, -has_skill)
# Number of postings mentioning each skill, plus a rank where 1 is the most
# frequently mentioned skill (ties share their average rank).
skill_count <- skill_ranking_long %>%
  count(skill, name = "count") %>%
  mutate(count_rank = rank(desc(count)))
# Median salary of the postings that mention each skill, one row per skill,
# ranked so that 1 is the highest-paying skill (ties share their average rank).
# The previous version never aggregated by skill: it ranked the distinct
# posting-level salaries instead, which left several rows per skill and made
# the join below fan out.
skill_median_salary <- skill_ranking %>%
  pivot_longer(
    cols = -med_salary,
    names_to = "skill",
    values_to = "has_skill"
  ) %>%
  filter(has_skill == 1) %>%
  group_by(skill) %>%
  summarize(med_salary = median(med_salary, na.rm = TRUE)) %>%
  arrange(desc(med_salary)) %>%
  mutate(
    median_salary_rank = rank(desc(med_salary))
  )

# Combine both rankings per skill; combined_rank is their plain average, so a
# lower value means a skill that is both common and well paid.
skill_ranking_summary <- inner_join(
  skill_count,
  skill_median_salary,
  by = "skill"
) %>%
  mutate(
    combined_rank = (count_rank + median_salary_rank) / 2
  ) %>%
  select(skill, count_rank, median_salary_rank, combined_rank)
# Horizontal bar chart of how often each skill is mentioned; bars are ordered
# by count and coloured per skill.
ggplot(
  data = skill_count,
  mapping = aes(x = reorder(skill, -count), y = count, fill = skill)
) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Number of Times Each Skill Shows Up",
    x = "Skill",
    y = "Count"
  ) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16),
    panel.background = element_rect(fill = "lightgray")
  )
### wordcloud (optional)
# Word cloud of the skill names, with text size driven by mention count.
library(ggwordcloud)
ggplot(
  data = skill_count,
  mapping = aes(label = skill, size = count)
) +
  geom_text_wordcloud() +
  theme_light()
We will now analyze the data to determine the top 5 skills that are most frequently mentioned in the job postings.
# Names of the skill indicator columns in linkedin_clean: every column except
# the posting identifiers, text fields and the salary. Selecting by name avoids
# the reversed sequence that 5:ncol() yields when there are fewer than five
# columns.
skill_names <- setdiff(
  colnames(linkedin_clean),
  c("job_id", "title", "description", "med_salary")
)

# Total number of postings mentioning each skill, keeping the five largest.
# all_of() is the supported way to select with an external character vector;
# across() and pivot_longer() replace the superseded summarise_all() and
# gather(). as.data.frame() keeps the plain data frame result (and printout)
# that the earlier pipeline produced.
top5_skills <- linkedin_clean |>
  summarise(across(all_of(skill_names), sum)) |>
  pivot_longer(
    cols = everything(),
    names_to = "skill",
    values_to = "total"
  ) |>
  arrange(desc(total)) |>
  head(5) |>
  as.data.frame()
top5_skills
## skill total
## 1 Programming 54
## 2 Collaboration 40
## 3 Statistics 31
## 4 Machine_Learning 30
## 5 Visualization 24
We see that the top 5 skills are: Programming, Collaboration, Statistics, Machine Learning, and Visualization. We will now create a bar plot to visualize the top 5 skills and their frequency in the job postings.
# Read a PNG file and wrap it in a raster grob placed at (x, y).
#
# filename: path of the PNG file to read.
# x, y: position of the grob; both default to the centre (0.5 npc).
# width, height: size of the grob; NULL is passed through to rasterGrob().
# gp: graphical parameters forwarded to rasterGrob().
#
# Returns the grob created by grid::rasterGrob(), with interpolation enabled.
get_png <- function(filename,
                    x = unit(0.5, "npc"),
                    y = unit(0.5, "npc"),
                    width = NULL,
                    height = NULL,
                    gp = grid::gpar()) {
  logo_image <- png::readPNG(filename)
  grid::rasterGrob(
    logo_image,
    x = x,
    y = y,
    width = width,
    height = height,
    interpolate = TRUE,
    gp = gp
  )
}
# Logo width, expressed in "lines" units below.
width <- 3

# Build the logo grob: half its own width left of x = 1.31 npc, at y = 0 lines.
logo <- get_png(
  filename = here("linkedin_logo.png"),
  x = unit(1.31, "npc") - unit(width / 2, "lines"),
  y = unit(0, "lines"),
  width = unit(width, "lines")
)
# Plot the Top 5 skills as a column chart with the totals printed above the
# bars.
#
# x: data frame with a `skill` column and a `total` column. It defaults to the
#    global `top5_skills`, which is what the function always plotted before it
#    honoured this argument, so calls without arguments keep working.
# width: not used by the plot; kept so existing calls that pass it still work.
#
# Returns a ggplot object. Clipping is turned off so that annotations placed
# outside the panel area remain visible.
top5_skills_plot <- function(x = top5_skills, width = 3) {
  ggplot(x, aes(
    x = reorder(skill, total),
    y = total,
    fill = skill
  )) +
    geom_col() +
    coord_cartesian(clip = "off") +
    theme(
      axis.text.x = element_blank(),
      axis.text.y = element_blank(),
      axis.ticks = element_blank()
    ) +
    geom_text(aes(label = total), vjust = -0.2) +
    labs(x = "", y = "", title = "Top 5 Skills on LinkedIn for Data Scientists")
}
# Render the top-5 chart with a manual fill palette and the logo overlaid.
top5_skills_plot(top5_skills) +
  scale_fill_manual(
    values = c("#caf0f8", "#00b4d8", "#03045e", "#0077b6", "#90e0ef")
  ) +
  annotation_custom(grob = logo)
# Skill indicator columns of linkedin_clean, selected by position (5 to 14).
cols <- names(linkedin_clean)[5:14]

# Average med_salary over the postings that mention each skill: the salary sum
# of the flagged postings divided by the number of flagged postings.
Skill_Salary <- data.frame(
  Skill = cols,
  Avg_Salary = vapply(
    cols,
    function(skill_col) {
      has_skill <- linkedin_clean[, skill_col] == 1
      sum(linkedin_clean$med_salary[has_skill]) / sum(has_skill)
    },
    numeric(1),
    USE.NAMES = FALSE
  )
)

# Show the skills from highest to lowest average salary.
salary_order <- order(Skill_Salary$Avg_Salary, decreasing = TRUE)
Skill_Salary[salary_order, ] %>%
  kable(row.names = FALSE) %>%
  kable_styling(full_width = FALSE)
| Skill | Avg_Salary |
|---|---|
| Cloud_Computing | 182090.9 |
| Statistics | 178434.3 |
| Programming | 177982.7 |
| Collaboration | 176426.6 |
| Critical_Thinking | 175000.0 |
| Machine_Learning | 174855.0 |
| Detail_Orientation | 173641.4 |
| Visualization | 172106.8 |
| Communication_Skills | 171963.6 |
| Predictive_Analytics | 168725.0 |
From this table we can see that the most monetarily valuable skill appears to be Cloud Computing: LinkedIn data science postings that mention it have an expected salary more than $3,000 higher than those mentioning the second-place skill, Statistics. However, the gap across all skills is smaller than might be expected, with the top-ranking skill offering an average salary only around 8% higher than the lowest-ranking skill, Predictive Analytics.
# Ten-colour blue palette, one entry per skill.
manual_colors <- c(
  "#caf0f8", "#90e0ef", "#48cae4", "#00b4d8", "#0096c7",
  "#0077b6", "#023e8a", "#03045e", "#001d3d", "#000080"
)

# Horizontal bar chart of the average salary per skill, ordered by salary.
ggplot(
  data = Skill_Salary,
  mapping = aes(x = reorder(Skill, -Avg_Salary), y = Avg_Salary, fill = Skill)
) +
  geom_col() +
  coord_flip() +
  scale_fill_manual(values = manual_colors) +
  labs(
    title = "Average Salary by Required Skill on LinkedIn for Data Scientist",
    x = "Skill",
    y = "Average Salary"
  ) +
  theme_minimal() +
  # Centre the title; added after theme_minimal() so it is not reset.
  theme(plot.title = element_text(hjust = 0.5))
# Bar chart of the combined (frequency + salary) rank per skill; skills are
# ordered by combined_rank.
combined_ranking_plot <- ggplot(
  data = skill_ranking_summary,
  mapping = aes(x = reorder(skill, combined_rank), y = combined_rank)
) +
  geom_col(aes(fill = skill)) +
  scale_fill_manual(values = manual_colors) +
  labs(
    title = "Combined Ranking of Skills",
    x = "Skill",
    y = "Combined Rank"
  ) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
    axis.title = element_text(size = 12),
    plot.title = element_text(size = 14, hjust = 0.5),
    panel.background = element_rect(fill = "lightgray")
  )
print(combined_ranking_plot)