library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(slackr)
library(dplyr)
library(tidytext)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
##
## The following object is masked from 'package:ggplot2':
##
## annotate
library(stringr)
library(topicmodels)
library(knitr)
job_postings_norm <- read_csv("https://raw.githubusercontent.com/gillianmcgovern0/cuny-data-607-project-3/refs/heads/main/job_postings_norm.csv", show_col_types = FALSE)
job_posting_with_skills <- read_csv("https://raw.githubusercontent.com/gillianmcgovern0/cuny-data-607-project-3/refs/heads/main/job_postings_with_skills.csv", show_col_types = FALSE)
sep_skills <- read_csv("https://raw.githubusercontent.com/gillianmcgovern0/cuny-data-607-project-3/refs/heads/main/sep_skills.csv", show_col_types = FALSE)
Filter out unrelated job titles:
head(job_posting_with_skills, 20)
## # A tibble: 20 × 16
## job_link last_processed_time last_status got_summary got_ner is_being_worked
## <chr> <dttm> <chr> <lgl> <lgl> <lgl>
## 1 https://… 2024-01-21 08:08:48 Finished N… TRUE TRUE FALSE
## 2 https://… 2024-01-20 04:02:12 Finished N… TRUE TRUE FALSE
## 3 https://… 2024-01-21 08:08:31 Finished N… TRUE TRUE FALSE
## 4 https://… 2024-01-20 15:30:55 Finished N… TRUE TRUE FALSE
## 5 https://… 2024-01-21 08:08:58 Finished N… TRUE TRUE FALSE
## 6 https://… 2024-01-21 07:14:11 Finished N… TRUE TRUE FALSE
## 7 https://… 2024-01-21 07:14:09 Finished N… TRUE TRUE FALSE
## 8 https://… 2024-01-21 07:39:58 Finished N… TRUE TRUE FALSE
## 9 https://… 2024-01-21 07:14:50 Finished N… TRUE TRUE FALSE
## 10 https://… 2024-01-21 07:40:40 Finished N… TRUE TRUE FALSE
## 11 https://… 2024-01-21 07:15:01 Finished N… TRUE TRUE FALSE
## 12 https://… 2024-01-20 07:01:01 Finished N… TRUE TRUE FALSE
## 13 https://… 2024-01-21 05:31:32 Finished N… TRUE TRUE FALSE
## 14 https://… 2024-01-21 07:45:17 Finished N… TRUE TRUE FALSE
## 15 https://… 2024-01-21 07:24:26 Finished N… TRUE TRUE FALSE
## 16 https://… 2024-01-21 07:15:37 Finished N… TRUE TRUE FALSE
## 17 https://… 2024-01-21 00:27:40 Finished N… TRUE TRUE FALSE
## 18 https://… 2024-01-21 07:16:26 Finished N… TRUE TRUE FALSE
## 19 https://… 2024-01-21 07:34:51 Finished N… TRUE TRUE FALSE
## 20 https://… 2024-01-21 07:34:56 Finished N… TRUE TRUE FALSE
## # ℹ 10 more variables: job_title <chr>, company <chr>, job_location <chr>,
## # first_seen <date>, search_city <chr>, search_country <chr>,
## # search_position <chr>, job_level <chr>, job_type <chr>, job_skills <chr>
# Break up job title in all possible pairs of words
titles_broken_3_words <- job_posting_with_skills %>%
unnest_tokens(title, job_title, token = "ngrams", n = 3) %>%
filter(!is.na(title)) %>%
count(title, sort = TRUE)
head(titles_broken_3_words, 50)
## # A tibble: 50 × 2
## title n
## <chr> <int>
## 1 senior data engineer 453
## 2 machine learning engineer 426
## 3 senior data analyst 321
## 4 senior data scientist 257
## 5 lead data engineer 252
## 6 with security clearance 166
## 7 bangkok based relocation 152
## 8 based relocation provided 152
## 9 senior machine learning 141
## 10 senior mlops engineer 138
## # ℹ 40 more rows
titles_broken_3_words <- titles_broken_3_words %>%
mutate(
# Replace "sr" with "senior" as a whole word
title = str_replace_all(title, "\\bsr\\b", "senior"),
# Recode specific variants to join similar titles
title = case_when(
title %in% c("data loss prevention", "loss prevention dlp") ~ "prevention dlp engineer",
title %in% c("service representative data") ~ "customer service representative",
TRUE ~ title
)
) %>%
# Group by the recoded title and sum their counts
group_by(title) %>%
summarise(n = sum(n)) %>%
ungroup() %>%
# Filter to include only titles that contain one of the keywords (case-insensitive)
filter(str_detect(title, regex("analytic|model|engineer|data|machine", ignore_case = TRUE))) %>%
arrange(desc(n))
popular_data_science_skills_vector <- titles_broken_3_words$title[1:20]
popular_data_science_skills_vector
## [1] "senior data engineer" "machine learning engineer"
## [3] "senior data analyst" "senior data scientist"
## [5] "prevention dlp engineer" "lead data engineer"
## [7] "senior machine learning" "senior mlops engineer"
## [9] "staff machine learning" "business data analyst"
## [11] "manager data engineering" "data entry clerk"
## [13] "engineer series a" "learning engineer series"
## [15] "data analyst data" "manager data center"
## [17] "representative data analyst" "analyst data entry"
## [19] "senior database administrator" "data center construction"
# Refresh job_posting_with_skills_filtered using the updated popular titles
job_posting_with_skills_filtered <- job_posting_with_skills %>%
mutate(job_title_duplicate = job_title) %>%
unnest_tokens(title, job_title, token = "ngrams", n = 3) %>%
filter(title %in% popular_data_science_skills_vector) %>%
left_join(titles_broken_3_words, by = "title") %>%
group_by(job_title_duplicate) %>%
arrange(desc(n)) %>%
slice(1) %>%
ungroup()
head(job_posting_with_skills_filtered, 5)
## # A tibble: 5 × 18
## job_link last_processed_time last_status got_summary got_ner is_being_worked
## <chr> <dttm> <chr> <lgl> <lgl> <lgl>
## 1 https://w… 2024-01-19 14:29:15 Finished N… TRUE TRUE FALSE
## 2 https://w… 2024-01-19 09:45:09 Finished N… TRUE TRUE FALSE
## 3 https://w… 2024-01-19 09:45:09 Finished N… TRUE TRUE FALSE
## 4 https://w… 2024-01-20 08:57:35 Finished N… TRUE TRUE FALSE
## 5 https://w… 2024-01-19 22:38:30 Finished N… TRUE TRUE FALSE
## # ℹ 12 more variables: company <chr>, job_location <chr>, first_seen <date>,
## # search_city <chr>, search_country <chr>, search_position <chr>,
## # job_level <chr>, job_type <chr>, job_skills <chr>,
## # job_title_duplicate <chr>, title <chr>, n <int>
top_20_titles <- head(titles_broken_3_words, 20)
top_20_titles
## # A tibble: 20 × 2
## title n
## <chr> <int>
## 1 senior data engineer 559
## 2 machine learning engineer 426
## 3 senior data analyst 402
## 4 senior data scientist 343
## 5 prevention dlp engineer 335
## 6 lead data engineer 252
## 7 senior machine learning 154
## 8 senior mlops engineer 145
## 9 staff machine learning 130
## 10 business data analyst 124
## 11 manager data engineering 120
## 12 data entry clerk 113
## 13 engineer series a 101
## 14 learning engineer series 101
## 15 data analyst data 99
## 16 manager data center 87
## 17 representative data analyst 87
## 18 analyst data entry 86
## 19 senior database administrator 86
## 20 data center construction 82
Make data frame tidy, and get the count for each skill:
# Make the dataframe tidy - break up the job_skills variable so each observation is job title/single skill combo
tidy_top_skills <- job_posting_with_skills_filtered %>%
unnest_tokens(skill, job_skills, token = 'regex', pattern=",") %>%
count(skill, sort = TRUE) # get the frequency
# Grab the top 20 skills
top_20_skills <- head(tidy_top_skills, 20)
# Plot
top_20_skills |>
ggplot(aes(x = reorder(skill, n), y = n)) +
geom_col(fill = "steelblue") +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
labs(title = "Top 20 Skills",
x = "Skill",
y = "Count")
Top Skills per job title:
# For each title, break up the 'job_skills' column into individual skills and count them.
top_skills_by_job_title <- job_posting_with_skills_filtered |>
group_by(title) |> # Group by the data science job title
# Split the comma-separated skills into individual tokens
unnest_tokens(skill, job_skills, token = "regex", pattern = ",") |>
count(skill, sort = TRUE) |>
# Optionally, limit to the top 10 skills per job title
group_by(title) |>
slice_max(n, n = 10) |>
ungroup() |>
arrange(title, desc(n))
# View the resulting summary table
print(top_skills_by_job_title)
## # A tibble: 247 × 3
## title skill n
## <chr> <chr> <int>
## 1 business data analyst " sql" 29
## 2 business data analyst " data analysis" 23
## 3 business data analyst " data visualization" 17
## 4 business data analyst " project management" 17
## 5 business data analyst " communication" 15
## 6 business data analyst " excel" 14
## 7 business data analyst " business intelligence" 13
## 8 business data analyst " data governance" 13
## 9 business data analyst " data mining" 12
## 10 business data analyst " problem solving" 11
## # ℹ 237 more rows
# Create separate plots for each job title
unique_titles <- unique(top_skills_by_job_title$title)
print(unique_titles)
## [1] "business data analyst" "data analyst data"
## [3] "data center construction" "data entry clerk"
## [5] "lead data engineer" "machine learning engineer"
## [7] "manager data center" "manager data engineering"
## [9] "prevention dlp engineer" "representative data analyst"
## [11] "senior data analyst" "senior data engineer"
## [13] "senior data scientist" "senior database administrator"
## [15] "senior machine learning" "senior mlops engineer"
## [17] "staff machine learning"
# Here we exclude all non-data related job titles and turn them into upper scales
pattern <- "analytic|model|engineer|data|machine"
unique_titles <- unique_titles[grepl(pattern, unique_titles, ignore.case = TRUE)]
#unique_titles <- toupper(unique_titles)
print(unique_titles)
## [1] "business data analyst" "data analyst data"
## [3] "data center construction" "data entry clerk"
## [5] "lead data engineer" "machine learning engineer"
## [7] "manager data center" "manager data engineering"
## [9] "prevention dlp engineer" "representative data analyst"
## [11] "senior data analyst" "senior data engineer"
## [13] "senior data scientist" "senior database administrator"
## [15] "senior machine learning" "senior mlops engineer"
## [17] "staff machine learning"
for (job in unique_titles) {
p <- top_skills_by_job_title |>
filter(title == job) |>
ggplot(aes(x = reorder(skill, n), y = n)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() +
labs(title = paste("Top Skills for:", job),
x = "Skill",
y = "Count")
print(p)
}
state_summary <- job_posting_with_skills_filtered %>%
mutate(state = str_extract(job_location, "[A-Z]{2}$")) %>%
# This excludes NA states
filter(!is.na(state)) %>%
group_by(state) %>%
summarise(openings = n(), .groups = "drop") %>%
arrange(desc(openings)) %>%
slice_head(n = 10) # keep only the top 10 states
# Display the summary table
knitr::kable(state_summary, caption = "Top 10 States by Job Openings")
state | openings |
---|---|
CA | 82 |
TX | 48 |
VA | 44 |
IL | 36 |
NY | 35 |
MA | 22 |
FL | 16 |
WA | 16 |
AZ | 14 |
GA | 12 |
# Plot the top 10 states by job openings
ggplot(state_summary, aes(x = reorder(state, openings), y = openings)) +
geom_col(fill = "steelblue") +
coord_flip() +
labs(title = "Top 10 States by Job Openings",
x = "State",
y = "Number of Openings") +
theme_minimal()
tidy_top_skills_ny <- job_posting_with_skills_filtered %>%
unnest_tokens(skill, job_skills, token = 'regex', pattern=",") %>%
mutate(state = str_extract(job_location, "[A-Z]{2}$")) %>%
filter(state == "NY") %>%
group_by(state) %>%
count(skill, sort = TRUE) %>%
slice(1:10)
head(tidy_top_skills_ny)
## # A tibble: 6 × 3
## # Groups: state [1]
## state skill n
## <chr> <chr> <int>
## 1 NY " python" 20
## 2 NY " sql" 16
## 3 NY " communication" 11
## 4 NY " data analysis" 10
## 5 NY " data modeling" 10
## 6 NY " machine learning" 10
# Plot the top 10 skills in NY
ggplot(tidy_top_skills_ny, aes(x = reorder(skill, n), y = n)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() +
labs(title = "Top 10 Skills in NY",
x = "State",
y = "Count") +
theme_minimal()
tidy_top_skills_ca <- job_posting_with_skills_filtered %>%
unnest_tokens(skill, job_skills, token = 'regex', pattern=",") %>%
mutate(state = str_extract(job_location, "[A-Z]{2}$")) %>%
filter(state == "CA") %>%
group_by(state) %>%
count(skill, sort = TRUE) %>%
slice(1:10)
head(tidy_top_skills_ca)
## # A tibble: 6 × 3
## # Groups: state [1]
## state skill n
## <chr> <chr> <int>
## 1 CA " python" 51
## 2 CA " sql" 41
## 3 CA " machine learning" 30
## 4 CA " pytorch" 24
## 5 CA " tensorflow" 22
## 6 CA " deep learning" 20
# Plot the top 10 skills in CA
ggplot(tidy_top_skills_ca, aes(x = reorder(skill, n), y = n)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() +
labs(title = "Top 10 Skills in CA",
x = "State",
y = "Count") +
theme_minimal()
tidy_top_skills_by_job_level <- job_posting_with_skills_filtered %>%
unnest_tokens(skill, job_skills, token = 'regex', pattern=",") %>%
group_by(job_level) %>%
count(skill, sort = TRUE) %>%
slice(1:5) %>%
mutate(percent = n/sum(n))
print(tidy_top_skills_by_job_level)
## # A tibble: 10 × 4
## # Groups: job_level [2]
## job_level skill n percent
## <chr> <chr> <int> <dbl>
## 1 Associate " sql" 14 0.28
## 2 Associate " python" 12 0.24
## 3 Associate " data visualization" 9 0.18
## 4 Associate " data analysis" 8 0.16
## 5 Associate " aws" 7 0.14
## 6 Mid senior " python" 288 0.323
## 7 Mid senior " sql" 227 0.254
## 8 Mid senior " machine learning" 137 0.153
## 9 Mid senior " communication" 129 0.144
## 10 Mid senior " data visualization" 112 0.125
# Plot the top 5 by job level
ggplot(tidy_top_skills_by_job_level, aes(x = reorder(skill, n), y = percent)) +
geom_bar(stat = "identity", fill = "steelblue") +
facet_wrap(~tidy_top_skills_by_job_level$job_level) +
coord_flip() +
labs(title = "Top 5 Skills by Job Level",
x = "Job Level",
y = "Percent as Decimal") +
theme_minimal()