Project 3 - Data Science Skills

Introduction

This project aims to establish a quantitative assessment of the relative value of specific skills for data science professionals. We will achieve this by analyzing data extracted from job postings on relevant job boards. The analysis will focus on two key aspects of data scientist job postings: advertised salary and the frequency of specific skills mentioned in the job descriptions. By correlating these factors, we can develop a proxy measure to compare the relative value of various skills sought after in the data science job market.

# Remote CSV locations for the LinkedIn job-postings data set.
# (`Salries_url` keeps its original spelling so any later reference still resolves.)
Postings_url <- "https://media.githubusercontent.com/media/RDLong718/data607-project3/main/Data/job_postings.csv"
Salries_url <- "https://media.githubusercontent.com/media/RDLong718/data607-project3/main/Data/job_details/salaries.csv"

# Download and parse both files.
Postings_raw <- readr::read_csv(file = Postings_url)
Salaries_raw <- readr::read_csv(file = Salries_url)

# Split the raw postings into a text table and a salary table, both keyed by job_id.
linkedin_postings <- Postings_raw[c("job_id", "title", "description")]
linkedin_salaries <- Postings_raw[c("job_id", "max_salary", "min_salary", "med_salary", "pay_period")]

# Replace med_salary with the midpoint of the advertised range
# (NA whenever either bound is missing).
# NOTE(review): this discards any med_salary value present in the raw data;
# confirm that is intended.
linkedin_salaries$med_salary <- (linkedin_salaries$max_salary + linkedin_salaries$min_salary) / 2

# Annualize hourly figures: 40 hours per week, 52 weeks per year.
# NOTE(review): only 'HOURLY' is converted; other non-annual pay periods, if
# present in the data, are left as-is.
is_hourly <- !is.na(linkedin_salaries$pay_period) & linkedin_salaries$pay_period == "HOURLY"
linkedin_salaries$med_salary[is_hourly] <- linkedin_salaries$med_salary[is_hourly] * 40 * 52

# Values below 1000 are scaled by 1000. The same NA-free mask is used on both
# sides of the assignment so each value is replaced by its own scaled value
# (indexing the right-hand side with a mask containing NA would add NA
# elements and shift the replacement values).
is_in_thousands <- !is.na(linkedin_salaries$med_salary) & linkedin_salaries$med_salary < 1000
linkedin_salaries$med_salary[is_in_thousands] <- linkedin_salaries$med_salary[is_in_thousands] * 1000

salaries_clean <- linkedin_salaries[c("job_id", "med_salary")]
#top_5 <- head(salaries_clean, 5)
#kable(top_5)

# Keep only postings whose title mentions data science / data scientist (case-insensitive).
is_ds_title <- grepl(
  "data science|data scientist",
  linkedin_postings$title,
  ignore.case = TRUE
)
data_science_postings <- linkedin_postings[is_ds_title, ]

# Left join the cleaned salaries onto the filtered postings.
linkedin_clean <- merge(
  x = data_science_postings,
  y = salaries_clean,
  by = "job_id",
  all.x = TRUE
)

# Postings without a salary receive the median of the salaries that are present.
salary_missing <- is.na(linkedin_clean$med_salary)
linkedin_clean$med_salary[salary_missing] <- median(linkedin_clean$med_salary, na.rm = TRUE)
#top_5 <- head(linkedin_clean, 5)
#kable(top_5)

After cleaning up the data sets, we now have a combined data set with the job titles, descriptions, and median expected salaries for each position.

Here’s a template for searching for words/terms: #linkedin_clean$colName <- ifelse(grepl('term1|term2|term3', linkedin_clean$description, ignore.case = TRUE), 1, 0) #linkedin_clean$ <- ifelse(grepl('', linkedin_clean$description, ignore.case = TRUE), 1, 0)

# Case-insensitive regular expressions used to flag each skill in a posting's
# description. The order of this vector is the order in which the flag columns
# are appended to linkedin_clean.
# In the Programming pattern the period after " r" is escaped so that it matches
# a literal "." (as in "... or R.") rather than any character, which would
# otherwise match every word beginning with "r".
skill_patterns <- c(
  Programming = "python| r | r/|/r/|/r|\\(r,| r,| r\\.|c\\+\\+|sql",
  Machine_Learning = "machine learning| ml|algorithms|scikit learn|transformers|language model",
  Statistics = "data analysis|data analytics|statistical analysis|data-driven analysis",
  Cloud_Computing = "cloud computing| aws|\\(aws|/aws|amazon web|google cloud",
  Visualization = "visualization|presentation",
  Collaboration = "team work|with a team|collaborat|scrum team|agile team",
  Detail_Orientation = "detail oriented|meticulous|precise|thorough|attention to detail|accuracy",
  Critical_Thinking = "critical thinking|problem solving|analytical skills|logical analysis|creative problem solving|critical reasoning",
  Communication_Skills = "written and verbal communication|effective communication|articulate|persuasive|clear communication|concise communication|communication skills",
  Predictive_Analytics = "forecasting|predictive modeling|predictive analytics|statistical forecasting|data forecasting|forecasting techniques"
)

# Add one numeric 0/1 column per skill: 1 when the description matches the pattern.
for (skill_col in names(skill_patterns)) {
  linkedin_clean[[skill_col]] <- as.numeric(
    grepl(skill_patterns[[skill_col]], linkedin_clean$description, ignore.case = TRUE)
  )
}

# Columns 4 to 14 of linkedin_clean: med_salary followed by the ten skill flags.
skill_ranking <- linkedin_clean[, 4:14]

# One row per (posting, skill) pair where the skill was flagged.
skill_ranking_long <- skill_ranking %>%
  pivot_longer(cols = -med_salary, names_to = "skill", values_to = "has_skill") %>%
  filter(has_skill == 1) %>%
  select(med_salary, skill)

# Number of postings mentioning each skill, with rank 1 for the most frequent.
skill_count <- skill_ranking_long %>%
  count(skill, name = "count") %>%
  mutate(count_rank = rank(desc(count)))

# Median salary of the postings that mention each skill, ranked so that the
# highest-paying skill gets rank 1. The data is grouped by skill before
# summarizing so the result has exactly one row per skill; without that step
# every distinct (salary, skill) pair would be ranked separately and the join
# below would return many rows per skill.
skill_median_salary <- skill_ranking %>%
  select(med_salary, Programming, Machine_Learning, Statistics, Cloud_Computing, Visualization, Collaboration, Detail_Orientation, Critical_Thinking, Communication_Skills, Predictive_Analytics) %>%
  pivot_longer(cols = -med_salary, names_to = "skill", values_to = "has_skill") %>%
  filter(has_skill == 1) %>%
  group_by(skill) %>%
  summarize(med_salary = median(med_salary, na.rm = TRUE)) %>%
  arrange(desc(med_salary)) %>%
  mutate(
    median_salary_rank = rank(desc(med_salary))
  )

# Combine the frequency rank and the salary rank into a single score per skill
# (simple mean of the two ranks; lower is better).
skill_ranking_summary <- inner_join(skill_count, skill_median_salary, by = "skill") %>%
  mutate(
    combined_rank = (count_rank + median_salary_rank) / 2
  ) %>%
  select(skill, count_rank, median_salary_rank, combined_rank)

# Horizontal bar chart of how many postings mention each skill,
# with skills ordered by descending count before the axes are flipped.
ggplot(
  data = skill_count,
  mapping = aes(x = reorder(skill, -count), y = count, fill = skill)
) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Number of Times Each Skill Shows Up",
    x = "Skill",
    y = "Count"
  ) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16),
    panel.background = element_rect(fill = "lightgray")
  )

### wordcloud (optional)
# Word cloud of the skills, with text size scaled by the number of mentions.
library(ggwordcloud)
ggplot(
  data = skill_count,
  mapping = aes(label = skill, size = count)
) +
  geom_text_wordcloud() +
  theme_light()

Analysis

We will now analyze the data to determine the top 5 skills that are most frequently mentioned in the job postings.

# Names of the skill flag columns in linkedin_clean: every column other than
# the identifying/text columns and the salary. Selecting by name rather than
# by position keeps this correct if the column layout changes.
skill_names <- setdiff(
  colnames(linkedin_clean),
  c("job_id", "title", "description", "med_salary")
)

# Total number of postings mentioning each skill, keeping the five most
# frequent. colSums() adds up the 0/1 flags per column; the result is a plain
# data frame with one row per skill.
skill_totals <- colSums(linkedin_clean[skill_names])

top5_skills <- data.frame(
  skill = names(skill_totals),
  total = unname(skill_totals)
) |>
  arrange(desc(total)) |>
  head(5)

top5_skills

##              skill total
## 1      Programming    54
## 2    Collaboration    40
## 3       Statistics    31
## 4 Machine_Learning    30
## 5    Visualization    24

We see that the top 5 skills are: Programming, Collaboration, Statistics, Machine Learning, and Visualization. We will now create a bar plot to visualize the top 5 skills and their frequency in the job postings.

# Read a PNG file and wrap it in a raster grob.
#
# filename:      path of the PNG file, passed to png::readPNG().
# x, y:          position of the grob; both default to the centre (0.5 npc).
# width, height: size of the grob; NULL is passed through to grid::rasterGrob().
# gp:            graphical parameters, defaulting to an empty grid::gpar().
# Returns the grob created by grid::rasterGrob() with interpolation enabled.
get_png <-
  function(filename,
           x = unit(0.5, "npc"),
           y = unit(0.5, "npc"),
           width = NULL,
           height = NULL,
           gp = grid::gpar()) {
    image_data <- png::readPNG(filename)
    grid::rasterGrob(
      image_data,
      x = x,
      y = y,
      width = width,
      height = height,
      interpolate = TRUE,
      gp = gp
    )
  }

# Logo width, in "lines" units.
width <- 3

# Build the logo grob from linkedin_logo.png (path resolved with here()).
# x is 1.31 npc minus half the logo width; y is 0 lines.
logo <- get_png(
  filename = here("linkedin_logo.png"),
  x = unit(1.31, "npc") - unit(width / 2, "lines"),
  y = unit(0, "lines"),
  width = unit(width, "lines")
)

# Plot the Top 5 skills
#
# Builds a column chart with one bar per skill, ordered by `total`, with the
# total printed above each bar. Axis text and ticks are hidden; skills are
# distinguished by fill colour.
#
# x:     data frame with a `skill` column and a numeric `total` column.
#        Defaults to the global `top5_skills` so that calls made without an
#        argument keep working; the function plots the data it is given
#        rather than always reading the global.
# width: not used by the plot; retained so existing calls remain valid.
#
# Returns the ggplot object (further layers can be added with `+`).
top5_skills_plot <- function(x = top5_skills, width = 3) {
  ggplot(x, aes(
    x = reorder(skill, total),
    y = total,
    fill = skill
  )) +
    geom_col() +
    # clip = "off" lets annotations extend beyond the panel area
    coord_cartesian(clip = "off") +
    theme(
      axis.text.x = element_blank(),
      axis.text.y = element_blank(),
      axis.ticks = element_blank()
    ) +
    geom_text(aes(label = total), vjust = -0.2) +
    labs(x = "", y = "", title = "Top 5 Skills on LinkedIn for Data Scientists")
}

# Draw the top-5 chart with a manual fill palette and the logo grob added as an annotation.
top5_skills_plot(top5_skills) +
  scale_fill_manual(
    values = c("#caf0f8", "#00b4d8", "#03045e", "#0077b6", "#90e0ef")
  ) +
  annotation_custom(logo)

# Skill flag columns (positions 5 to 14 of linkedin_clean).
cols <- names(linkedin_clean)[5:14]

# Average med_salary over the postings flagged with each skill:
# the sum of the salaries of flagged postings divided by the number of flagged postings.
Skill_Salary <- data.frame(
  Skill = cols,
  Avg_Salary = vapply(
    cols,
    function(flag_col) {
      has_skill <- linkedin_clean[, flag_col] == 1
      sum(linkedin_clean$med_salary[has_skill]) / sum(has_skill)
    },
    numeric(1),
    USE.NAMES = FALSE
  )
)

# Show the table sorted from the highest to the lowest average salary.
kable_styling(
  kable(
    Skill_Salary[order(Skill_Salary$Avg_Salary, decreasing = TRUE), ],
    row.names = FALSE
  ),
  full_width = FALSE
)

Skill	Avg_Salary
Cloud_Computing	182090.9
Statistics	178434.3
Programming	177982.7
Collaboration	176426.6
Critical_Thinking	175000.0
Machine_Learning	174855.0
Detail_Orientation	173641.4
Visualization	172106.8
Communication_Skills	171963.6
Predictive_Analytics	168725.0

From this we can see that the most monetarily valuable skill appears to be Cloud Computing: LinkedIn data science postings that mention this skill have an expected salary over $3,000 higher than second-place Statistics. However, the gap across all skills is smaller than might be expected, with the top-ranking skill offering an average salary only around 8% higher than the lowest-ranking skill, Predictive Analytics.

# Ten-colour blue palette used for the per-skill fills.
manual_colors <- c(
  "#caf0f8", "#90e0ef", "#48cae4", "#00b4d8", "#0096c7",
  "#0077b6", "#023e8a", "#03045e", "#001d3d", "#000080"
)

# Horizontal bar chart of the average salary per skill, with a centred title.
ggplot(
  data = Skill_Salary,
  mapping = aes(x = reorder(Skill, -Avg_Salary), y = Avg_Salary, fill = Skill)
) +
  geom_col() +
  theme_minimal() +
  labs(
    title = "Average Salary by Required Skill on LinkedIn for Data Scientist",
    x = "Skill",
    y = "Average Salary"
  ) +
  coord_flip() +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_fill_manual(values = manual_colors)

# Bar chart of the combined rank per skill, ordered by ascending combined_rank.
combined_ranking_plot <- ggplot(
  data = skill_ranking_summary,
  mapping = aes(x = reorder(skill, combined_rank), y = combined_rank)
) +
  geom_col(aes(fill = skill)) +
  scale_fill_manual(values = manual_colors) +
  labs(
    title = "Combined Ranking of Skills",
    x = "Skill",
    y = "Combined Rank"
  ) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
    axis.title = element_text(size = 12),
    plot.title = element_text(size = 14, hjust = 0.5),
    panel.background = element_rect(fill = "lightgray")
  )

print(combined_ranking_plot)

Project 3 - Data Science Skills

Brandon Cunningham¹

Rashad Long²

Biyag Dukuray³

Nikoleta Emanouilidi⁴

Markella Gialouris⁵

2024-03-17

Introduction

Analysis

Project 3 - Data Science Skills

Brandon Cunningham1

Rashad Long2

Biyag Dukuray3

Nikoleta Emanouilidi4

Markella Gialouris5

2024-03-17

Introduction

Analysis

Brandon Cunningham¹

Rashad Long²

Biyag Dukuray³

Nikoleta Emanouilidi⁴

Markella Gialouris⁵