# Create IDs + clean base
data_science_posts_clean_df <- data_science_posts_df |>
mutate(
posting_id = row_number(),
company_id = as.integer(factor(company))
) |>
relocate(posting_id, company_id)
# Company df creation to match ERD
company_df <- data_science_posts_clean_df |>
distinct(company_id, .keep_all = TRUE) |>
select(company_id, industry, ownership, headquarter, company_size, revenue)
# Job Posting df creation to match ERD
job_posting_df <- data_science_posts_clean_df |>
select(
posting_id,
company_id,
seniority_level,
work_status = status,
location,
post_date,
salary_range = salary
)
# Posting Skill df - cleaning skills field to prep for mapping
posting_skill_df <- data_science_posts_clean_df |>
select(posting_id, skills) |>
mutate(skills = str_remove_all(skills, "\\[|\\]|'")) |>
separate_rows(skills, sep = ",") |>
mutate(
skills = str_trim(skills),
skills = str_to_lower(skills)
) |>
rename(skill_name = skills) |>
mutate(
skill_name = case_when(
skill_name %in% c("sklearn", "scikit learn") ~ "scikit-learn",
skill_name %in% c("amazon", "amazon web services") ~ "aws",
TRUE ~ skill_name
),
skill_name = str_to_title(skill_name)
) |>
filter(skill_name != "", !is.na(skill_name))
# Posting skill mapping to O*Net skill sources
skill_crosswalk <- tibble::tribble(
~skill_id, ~job_skill, ~onet_skill_name,
1, "Spark", "Apache Spark",
2, "R", "R",
3, "Python", "Python",
4, "Scala", "Scala",
5, "Machine Learning", NA,
6, "Tensorflow", "TensorFlow",
7, "Sql", "Structured query language SQL",
8, "Aws", "Amazon Web Services AWS software",
9, "Git", "Git",
10, "Docker", "Docker",
11, "Gcp", "Google Cloud software",
12, "Kubernetes", "Kubernetes",
13, "Deep Learning", NA,
14, "Scikit-Learn", "Scikit-learn",
15, "Pytorch", "PyTorch",
16, "Keras", NA,
17, "Java", "Oracle Java",
18, "Pandas", "pandas",
19, "Powerbi", "Microsoft Power BI",
20, "Tableau", "Tableau",
21, "Hadoop", "Apache Hadoop",
22, "Azure", "Microsoft Azure software",
23, "Airflow", "Apache Airflow",
24, "Linux", "Linux",
25, "Bash", "Bash",
26, "Numpy", "NumPy",
27, "Neural Network", NA,
28, "Matplotlib", NA,
29, "Database", NA,
30, "Scipy", NA,
31, "Opencv", NA
)
# Posting skills remapping
posting_skill_df <- posting_skill_df |>
left_join(skill_crosswalk, by = c("skill_name" = "job_skill")) |>
mutate(skill_name = coalesce(onet_skill_name, skill_name)) |>
select(posting_id, skill_name)
# Skill Category creation to match ERD
skill_category_df <- data_science_skills_df |>
rename(section = Section, category_name = Category, skill_name = Example) |>
distinct(section, category_name, skill_name) |>
arrange(section, category_name, skill_name) |>
mutate(category_id = row_number()) |>
relocate(category_id)
# Prepping skill for eventual mapping
skill <- data_science_demand_df |>
rename(skill_name = Technology.Skill,
is_hot_technology = Hot.Technology,
demand_percentage = Percentage
)
# Building master skill table from all unique skill names from all 3 sources
all_skills_df <- bind_rows(
skill_category_df |> select(skill_name),
skill |> select(skill_name),
posting_skill_df |> select(skill_name)
) |>
distinct(skill_name) |>
arrange(skill_name)
# Using skill_demand_df as the base for demand fields
skill <- all_skills_df |>
left_join(
skill |>
select(skill_name, demand_percentage, is_hot_technology),
by = "skill_name"
) |>
mutate(is_hot_technology = if_else(is.na(is_hot_technology) | is_hot_technology == "", "No",is_hot_technology)) |>
arrange(skill_name) |>
mutate(skill_id = row_number()) |>
select(skill_id, skill_name, demand_percentage, is_hot_technology)
# Skill df Category join
skill <- skill |>
left_join(
skill_category_df |>
select(skill_name, category_id),
by = "skill_name"
) |>
relocate(skill_id, category_id)
# Skill category clean
skill_category_df <- skill_category_df |>
select(category_id, section, category_name) |>
distinct()
# Posting skill_id join and clean
posting_skill_df <- posting_skill_df |>
left_join(
skill |> select(skill_id, skill_name),
by = "skill_name"
) |>
select(posting_id, skill_id, skill_name)
# Skill demand creation
skill_demand_df <- skill |>
select(skill_id, skill_name, demand_percentage) |>
filter(!is.na(demand_percentage)) |>
mutate(
demand_id = row_number(),
source = "O*Net"
) |>
select(demand_id, skill_id, demand_percentage, source)