Project 3 - Data Science Skills

Read the Data

job_postings_norm <- read_csv("https://raw.githubusercontent.com/gillianmcgovern0/cuny-data-607-project-3/refs/heads/main/job_postings_norm.csv", show_col_types = FALSE)
job_posting_with_skills <- read_csv("https://raw.githubusercontent.com/gillianmcgovern0/cuny-data-607-project-3/refs/heads/main/job_postings_with_skills.csv", show_col_types = FALSE)
sep_skills <- read_csv("https://raw.githubusercontent.com/gillianmcgovern0/cuny-data-607-project-3/refs/heads/main/sep_skills.csv", show_col_types = FALSE)

Filter out unrelated job titles:

head(job_posting_with_skills, 20)

## # A tibble: 20 × 16
##    job_link  last_processed_time last_status got_summary got_ner is_being_worked
##    <chr>     <dttm>              <chr>       <lgl>       <lgl>   <lgl>          
##  1 https://… 2024-01-21 08:08:48 Finished N… TRUE        TRUE    FALSE          
##  2 https://… 2024-01-20 04:02:12 Finished N… TRUE        TRUE    FALSE          
##  3 https://… 2024-01-21 08:08:31 Finished N… TRUE        TRUE    FALSE          
##  4 https://… 2024-01-20 15:30:55 Finished N… TRUE        TRUE    FALSE          
##  5 https://… 2024-01-21 08:08:58 Finished N… TRUE        TRUE    FALSE          
##  6 https://… 2024-01-21 07:14:11 Finished N… TRUE        TRUE    FALSE          
##  7 https://… 2024-01-21 07:14:09 Finished N… TRUE        TRUE    FALSE          
##  8 https://… 2024-01-21 07:39:58 Finished N… TRUE        TRUE    FALSE          
##  9 https://… 2024-01-21 07:14:50 Finished N… TRUE        TRUE    FALSE          
## 10 https://… 2024-01-21 07:40:40 Finished N… TRUE        TRUE    FALSE          
## 11 https://… 2024-01-21 07:15:01 Finished N… TRUE        TRUE    FALSE          
## 12 https://… 2024-01-20 07:01:01 Finished N… TRUE        TRUE    FALSE          
## 13 https://… 2024-01-21 05:31:32 Finished N… TRUE        TRUE    FALSE          
## 14 https://… 2024-01-21 07:45:17 Finished N… TRUE        TRUE    FALSE          
## 15 https://… 2024-01-21 07:24:26 Finished N… TRUE        TRUE    FALSE          
## 16 https://… 2024-01-21 07:15:37 Finished N… TRUE        TRUE    FALSE          
## 17 https://… 2024-01-21 00:27:40 Finished N… TRUE        TRUE    FALSE          
## 18 https://… 2024-01-21 07:16:26 Finished N… TRUE        TRUE    FALSE          
## 19 https://… 2024-01-21 07:34:51 Finished N… TRUE        TRUE    FALSE          
## 20 https://… 2024-01-21 07:34:56 Finished N… TRUE        TRUE    FALSE          
## # ℹ 10 more variables: job_title <chr>, company <chr>, job_location <chr>,
## #   first_seen <date>, search_city <chr>, search_country <chr>,
## #   search_position <chr>, job_level <chr>, job_type <chr>, job_skills <chr>

# Break up job title in all possible pairs of words
titles_broken_3_words <- job_posting_with_skills %>%
  unnest_tokens(title, job_title, token = "ngrams", n = 3) %>%
  filter(!is.na(title)) %>%
  count(title, sort = TRUE)
head(titles_broken_3_words, 50)

## # A tibble: 50 × 2
##    title                         n
##    <chr>                     <int>
##  1 senior data engineer        453
##  2 machine learning engineer   426
##  3 senior data analyst         321
##  4 senior data scientist       257
##  5 lead data engineer          252
##  6 with security clearance     166
##  7 bangkok based relocation    152
##  8 based relocation provided   152
##  9 senior machine learning     141
## 10 senior mlops engineer       138
## # ℹ 40 more rows

titles_broken_3_words <- titles_broken_3_words %>%
  mutate(
    # Replace "sr" with "senior" as a whole word
    title = str_replace_all(title, "\\bsr\\b", "senior"),
    # Recode specific variants to join similar titles
    title = case_when(
      title %in% c("data loss prevention", "loss prevention dlp") ~ "prevention dlp engineer",
      title %in% c("service representative data") ~ "customer service representative",
      TRUE ~ title
    )
  ) %>% 
  # Group by the recoded title and sum their counts
  group_by(title) %>%
  summarise(n = sum(n)) %>%  
  ungroup() %>%
  # Filter to include only titles that contain one of the keywords (case-insensitive)
  filter(str_detect(title, regex("analytic|model|engineer|data|machine", ignore_case = TRUE))) %>%
  arrange(desc(n))

popular_data_science_skills_vector <- titles_broken_3_words$title[1:20]
popular_data_science_skills_vector

##  [1] "senior data engineer"          "machine learning engineer"    
##  [3] "senior data analyst"           "senior data scientist"        
##  [5] "prevention dlp engineer"       "lead data engineer"           
##  [7] "senior machine learning"       "senior mlops engineer"        
##  [9] "staff machine learning"        "business data analyst"        
## [11] "manager data engineering"      "data entry clerk"             
## [13] "engineer series a"             "learning engineer series"     
## [15] "data analyst data"             "manager data center"          
## [17] "representative data analyst"   "analyst data entry"           
## [19] "senior database administrator" "data center construction"

# Refresh job_posting_with_skills_filtered using the updated popular titles
job_posting_with_skills_filtered <- job_posting_with_skills %>%
  mutate(job_title_duplicate = job_title) %>%
  unnest_tokens(title, job_title, token = "ngrams", n = 3)  %>%
  filter(title %in% popular_data_science_skills_vector) %>%
  left_join(titles_broken_3_words, by = "title") %>%
  group_by(job_title_duplicate) %>%
  arrange(desc(n)) %>%
  slice(1) %>%
  ungroup()

head(job_posting_with_skills_filtered, 5)

## # A tibble: 5 × 18
##   job_link   last_processed_time last_status got_summary got_ner is_being_worked
##   <chr>      <dttm>              <chr>       <lgl>       <lgl>   <lgl>          
## 1 https://w… 2024-01-19 14:29:15 Finished N… TRUE        TRUE    FALSE          
## 2 https://w… 2024-01-19 09:45:09 Finished N… TRUE        TRUE    FALSE          
## 3 https://w… 2024-01-19 09:45:09 Finished N… TRUE        TRUE    FALSE          
## 4 https://w… 2024-01-20 08:57:35 Finished N… TRUE        TRUE    FALSE          
## 5 https://w… 2024-01-19 22:38:30 Finished N… TRUE        TRUE    FALSE          
## # ℹ 12 more variables: company <chr>, job_location <chr>, first_seen <date>,
## #   search_city <chr>, search_country <chr>, search_position <chr>,
## #   job_level <chr>, job_type <chr>, job_skills <chr>,
## #   job_title_duplicate <chr>, title <chr>, n <int>

top_20_titles <- head(titles_broken_3_words, 20)
top_20_titles

## # A tibble: 20 × 2
##    title                             n
##    <chr>                         <int>
##  1 senior data engineer            559
##  2 machine learning engineer       426
##  3 senior data analyst             402
##  4 senior data scientist           343
##  5 prevention dlp engineer         335
##  6 lead data engineer              252
##  7 senior machine learning         154
##  8 senior mlops engineer           145
##  9 staff machine learning          130
## 10 business data analyst           124
## 11 manager data engineering        120
## 12 data entry clerk                113
## 13 engineer series a               101
## 14 learning engineer series        101
## 15 data analyst data                99
## 16 manager data center              87
## 17 representative data analyst      87
## 18 analyst data entry               86
## 19 senior database administrator    86
## 20 data center construction         82

Make data frame tidy, and get the count for each skill:

# Make the dataframe tidy - break up the job_skills variable so each observation is job title/single skill combo
tidy_top_skills <- job_posting_with_skills_filtered  %>%
  unnest_tokens(skill, job_skills, token = 'regex', pattern=",") %>%
  count(skill, sort = TRUE) # get the frequency

# Grab the top 20 skills
top_20_skills <- head(tidy_top_skills, 20)  

# Plot 
top_20_skills |>
  ggplot(aes(x = reorder(skill, n), y = n)) +
  geom_col(fill = "steelblue") + 
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  labs(title = "Top 20 Skills",
    x = "Skill",
    y = "Count")

Top Skills per job title:

#  For each title, break up the 'job_skills' column into individual skills and count them.
top_skills_by_job_title <- job_posting_with_skills_filtered |>
  group_by(title) |>  # Group by the data science job title 
  # Split the comma-separated skills into individual tokens
  unnest_tokens(skill, job_skills, token = "regex", pattern = ",") |>
  count(skill, sort = TRUE) |>
  # Optionally, limit to the top 10 skills per job title 
  group_by(title) |>
  slice_max(n, n = 10) |>
  ungroup() |>
  arrange(title, desc(n))

# View the resulting summary table
print(top_skills_by_job_title)

## # A tibble: 247 × 3
##    title                 skill                        n
##    <chr>                 <chr>                    <int>
##  1 business data analyst " sql"                      29
##  2 business data analyst " data analysis"            23
##  3 business data analyst " data visualization"       17
##  4 business data analyst " project management"       17
##  5 business data analyst " communication"            15
##  6 business data analyst " excel"                    14
##  7 business data analyst " business intelligence"    13
##  8 business data analyst " data governance"          13
##  9 business data analyst " data mining"              12
## 10 business data analyst " problem solving"          11
## # ℹ 237 more rows

# Create separate plots for each job title 
unique_titles <- unique(top_skills_by_job_title$title)
print(unique_titles)

##  [1] "business data analyst"         "data analyst data"            
##  [3] "data center construction"      "data entry clerk"             
##  [5] "lead data engineer"            "machine learning engineer"    
##  [7] "manager data center"           "manager data engineering"     
##  [9] "prevention dlp engineer"       "representative data analyst"  
## [11] "senior data analyst"           "senior data engineer"         
## [13] "senior data scientist"         "senior database administrator"
## [15] "senior machine learning"       "senior mlops engineer"        
## [17] "staff machine learning"

# Here we exclude all non-data related job titles and turn them into upper scales
pattern <- "analytic|model|engineer|data|machine"
unique_titles <- unique_titles[grepl(pattern, unique_titles, ignore.case = TRUE)]
#unique_titles <- toupper(unique_titles)
print(unique_titles)

##  [1] "business data analyst"         "data analyst data"            
##  [3] "data center construction"      "data entry clerk"             
##  [5] "lead data engineer"            "machine learning engineer"    
##  [7] "manager data center"           "manager data engineering"     
##  [9] "prevention dlp engineer"       "representative data analyst"  
## [11] "senior data analyst"           "senior data engineer"         
## [13] "senior data scientist"         "senior database administrator"
## [15] "senior machine learning"       "senior mlops engineer"        
## [17] "staff machine learning"

for (job in unique_titles) {
  p <- top_skills_by_job_title |>
    filter(title == job) |>
    ggplot(aes(x = reorder(skill, n), y = n)) +
    geom_bar(stat = "identity", fill = "steelblue") +
    coord_flip() +
    labs(title = paste("Top Skills for:", job),
         x = "Skill",
         y = "Count")
  print(p)
}

state_summary <- job_posting_with_skills_filtered %>%
  mutate(state = str_extract(job_location, "[A-Z]{2}$")) %>%
  # This excludes NA states
  filter(!is.na(state)) %>%
  group_by(state) %>%
  summarise(openings = n(), .groups = "drop") %>%
  arrange(desc(openings)) %>%
  slice_head(n = 10)  # keep only the top 10 states

# Display the summary table
knitr::kable(state_summary, caption = "Top 10 States by Job Openings")

Top 10 States by Job Openings
state	openings
CA	82
TX	48
VA	44
IL	36
NY	35
MA	22
FL	16
WA	16
AZ	14
GA	12

# Plot the top 10 states by job openings
ggplot(state_summary, aes(x = reorder(state, openings), y = openings)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(title = "Top 10 States by Job Openings",
       x = "State",
       y = "Number of Openings") +
  theme_minimal()

tidy_top_skills_ny <- job_posting_with_skills_filtered %>%
  unnest_tokens(skill, job_skills, token = 'regex', pattern=",") %>%
  mutate(state = str_extract(job_location, "[A-Z]{2}$")) %>%
  filter(state == "NY") %>%
  group_by(state) %>%
  count(skill, sort = TRUE) %>%
  slice(1:10)
head(tidy_top_skills_ny)

## # A tibble: 6 × 3
## # Groups:   state [1]
##   state skill                   n
##   <chr> <chr>               <int>
## 1 NY    " python"              20
## 2 NY    " sql"                 16
## 3 NY    " communication"       11
## 4 NY    " data analysis"       10
## 5 NY    " data modeling"       10
## 6 NY    " machine learning"    10

# Plot the top 10 skills in NY
ggplot(tidy_top_skills_ny, aes(x = reorder(skill, n), y = n)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(title = "Top 10 Skills in NY",
       x = "State",
       y = "Count") +
  theme_minimal()

tidy_top_skills_ca <- job_posting_with_skills_filtered %>%
  unnest_tokens(skill, job_skills, token = 'regex', pattern=",") %>%
  mutate(state = str_extract(job_location, "[A-Z]{2}$")) %>%
  filter(state == "CA") %>%
  group_by(state) %>%
  count(skill, sort = TRUE) %>%
  slice(1:10)
head(tidy_top_skills_ca)

## # A tibble: 6 × 3
## # Groups:   state [1]
##   state skill                   n
##   <chr> <chr>               <int>
## 1 CA    " python"              51
## 2 CA    " sql"                 41
## 3 CA    " machine learning"    30
## 4 CA    " pytorch"             24
## 5 CA    " tensorflow"          22
## 6 CA    " deep learning"       20

# Plot the top 10 skills in CA
ggplot(tidy_top_skills_ca, aes(x = reorder(skill, n), y = n)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(title = "Top 10 Skills in CA",
       x = "State",
       y = "Count") +
  theme_minimal()

tidy_top_skills_by_job_level <- job_posting_with_skills_filtered %>%
  unnest_tokens(skill, job_skills, token = 'regex', pattern=",") %>%
  group_by(job_level) %>%
  count(skill, sort = TRUE) %>%
  slice(1:5) %>%
  mutate(percent = n/sum(n))

print(tidy_top_skills_by_job_level)

## # A tibble: 10 × 4
## # Groups:   job_level [2]
##    job_level  skill                     n percent
##    <chr>      <chr>                 <int>   <dbl>
##  1 Associate  " sql"                   14   0.28 
##  2 Associate  " python"                12   0.24 
##  3 Associate  " data visualization"     9   0.18 
##  4 Associate  " data analysis"          8   0.16 
##  5 Associate  " aws"                    7   0.14 
##  6 Mid senior " python"               288   0.323
##  7 Mid senior " sql"                  227   0.254
##  8 Mid senior " machine learning"     137   0.153
##  9 Mid senior " communication"        129   0.144
## 10 Mid senior " data visualization"   112   0.125

# Plot the top 5 by job level
ggplot(tidy_top_skills_by_job_level, aes(x = reorder(skill, n), y = percent)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  facet_wrap(~tidy_top_skills_by_job_level$job_level) +
  coord_flip() +
  labs(title = "Top 5 Skills by Job Level",
       x = "Job Level",
       y = "Percent as Decimal") +
  theme_minimal()

Project 3 - Data Science Skills

Gillian McGovern, Aali John-Harry, Farhod Ibragimov

2025-03-20

Load the Libraries

Read the Data