DATA607_Project

Project 3 - Data Science Skills

Which are the most valued data science skills?

Introduction

This project aims to identify the most valued data science skills by finding the most requested data science skills in job listings; this doubles as our motivation, to find out how to best prepare ourselves for furthering our careers or joining in the data science field. Our team found a set of data on Kaggle called Data Science Job Postings & Skills that provides Data Science job listings from LinkedIn for 2024. This data set includes three csv files:
- job_postings.csv: job_link, job_title, company, job_location, job_level, job_type
- job_skills.csv: job_link, job_skills
- job_summary.csv: job_link, job_summary

We will focus our analysis on data from job_postings.csv and job_skills.csv.

First, we will clean up our data from both files. Then, we will normalize the data to prepare it. Finally, we will analyze the data to find out which is the most asked for data science skill in LinkedIn job listings.

Cleaning up data

In this part of the project we will load the data into R data frames and clean up the observations so that we can put them into normalized tables.

library(tidyverse)

Read csv files

# Loading job_skills.csv into the dataframe job_skills
file <-"job_skills.csv"
job_skills <- read.csv(file, sep=',')

# Loading job_postings.csv into the dataframe job_postings
file <- "job_postings.csv"
job_postings <- read.csv(file, sep=',')

Cleaning up job_skills table Make sure that job_skills has a job_id column and associated skills; and that the skills column does not include any white spaces nor any null values.

# Create a function to extract the ID numbers from the job_link:
id <- function(df) {
  stringr::str_extract(df, "(\\d)+$")
}

# Transform and rename job_link to job_ID:
job_skills$job_link<-id(job_skills$job_link)
names(job_skills)[names(job_skills)=='job_link'] <-"job_ID"

# Remove duplicate job_ID
job_skills <- job_skills %>% 
  group_by(job_ID) %>% 
  slice(1) %>% 
  ungroup()

# Separate the skills:
job_skills_clean <- job_skills %>%
  separate_wider_delim(job_skills, delim=',',
                       names_sep = '_',
                       too_few = 'align_start')

# Pivot_longer to put all skills into one column:
job_skills_clean <- job_skills_clean %>%
  pivot_longer(cols = starts_with("job_skills"),
               names_to = 'Sets',
               values_to = 'Skills',
               values_drop_na = TRUE)

# Cleaning up the data in skills
# Trim white space
job_skills_clean$Skills <- trimws(job_skills_clean$Skills)

# Set empty spaces to null
job_skills_clean$Skills <- na_if(job_skills_clean$Skills, "")

# Change all cases to upper case so when we group by case is not a factor
job_skills_clean$Skills <- toupper(job_skills_clean$Skills)

# Drop the sets column
job_skills_clean <- job_skills_clean %>% select(-c(Sets))

head(job_skills_clean)

## # A tibble: 6 × 2
##   job_ID     Skills          
##   <chr>      <chr>           
## 1 2690930489 COMPUTER VISION 
## 2 2690930489 MACHINE LEARNING
## 3 2690930489 ROBOTICS        
## 4 2690930489 PERCEPTION      
## 5 2690930489 DECISIONMAKING  
## 6 2690930489 DEEP LEARNING

Cleaning up the job_postings table

# Using the function created above to convert job_link to job_"ID
job_postings$job_link <- id(job_postings$job_link)
names(job_postings)[names(job_postings)=='job_link'] <-"job_ID"

# Extract the columns we need
job_postings<-job_postings[-c(3:6, 10:11)]

# Remove duplicate job_ID
job_postings <- job_postings %>% 
  group_by(job_ID) %>% 
  slice(1) %>% 
  ungroup()

Separate job_location

#  Set of pattern for Countries and States
Countries <- c("Australia", "Canada", "Italy", "Mexico", "United Kingdom", "United States")
Countries <- paste(Countries, collapse = "|")
States <- c("DC" , "AL" , "AK" , "AS" , "AZ" , "AR" , "CA" , "CO" , "CT" , "DE" , "FL" , "GA" , "GU" ,
            "HI" , "ID" , "IL" , "IN" , "IA" , "KS" , "KY" , "LA" , "ME" , "MD" , "MA" , "MI" , "MN" , 
            "MS" , "MO" , "MT" , "NE" , "NV" , "NH" , "NJ" , "NM" , "NY" , "NC" , "ND" , "OH" , "OK" , 
            "OR" , "PA" , "PR" , "RI" , "SC" , "SD" , "TN" , "TX" , "UT" , "VT" , "VA" , "WA" , "WV" , 
            "WI" , "WY")
States <- paste(States, collapse = "|")

#-------------------------------------------------------------------
# job_location can be read as City, State/Province, and Country separated by ","
# but this column has entries where it's missing either entry so we first have
# to split the job_location by deliminator then check:
# -Country can be listed out in either State/Province or City
# -State_or_Province may contain only US States without listing Country
# -City may only contain the name of the Country
#-------------------------------------------------------------------

# Split job_location to City, State/Province, and Country
job_postings_clean<- job_postings %>%
  separate_wider_delim(cols = job_location,
                       delim=",",
                       names=c("Ci", "SoP", "Co"),
                       too_few = 'align_start',
                       too_many = 'merge')

# Cleaning up the data in job_posting
job_postings_clean <- job_postings_clean %>% 
# trim white space for City, State/Province, and Country
  mutate(Ci = str_trim(Ci), 
      SoP = str_trim(SoP),
      Co = str_trim(Co)) %>% 
  # set empty spaces to null
  mutate(Ci = na_if(Ci,""), 
      SoP = na_if(SoP,""),
      Co = na_if(Co,""))
  
# Helper columns to see if city or state contains country
job_postings_clean <- job_postings_clean %>% 
  mutate(
    # check if City contains country
    matchCity = sapply(Ci, function(x) {
      matches <- regmatches(x, regexpr(Countries, x))
      if (length(matches) > 0) matches else NA
        }
      ),
    # check if State contains country
    matchSoP1 = sapply(SoP, function(x) {
      matches <- regmatches(x, regexpr(Countries, x))
      if (length(matches) > 0) matches else NA
        }
    ),
    # check if States contains the 50 US states
    matchSoP2 = sapply(SoP, function(x) {
      matches <- regmatches(x, regexpr(States, x))
      if (length(matches) > 0) "United States" else NA
        }
    ),
    matSoP = coalesce(matchSoP1, matchSoP2)
  )

job_postings_clean <- job_postings_clean %>%
  mutate(
# Check for cases when Country is empty  
    Country = case_when(
      !is.na(Co) ~ Co,
      !is.na(matSoP) ~ matSoP,
      !is.na(matchCity) ~ matchCity,
      is.na(Ci) ~ NA,
      TRUE ~ NA
    ),
# Check the cases for when State or Province does not have Country name
    State_or_Province = ifelse(is.na(matchSoP1),SoP, NA),

# Check the cases for when City does not contain State or Country
    City = case_when(
      !is.na(State_or_Province) ~ Ci,
      is.na(matchCity) ~ Ci,
      Country != matchCity ~ Ci,
      TRUE ~ NA
      )
  ) %>% 
  # Remove helper columns
  select(-c(Ci, SoP, Co, matchCity,matchSoP1,matchSoP2,matSoP))

# Extract date from last_processing column:
job_postings_clean$last_process_date<- format(as.Date(job_postings_clean$last_processed_time), format = "%m-%d-%Y") 

job_postings_clean <- job_postings_clean%>% 
  select(-c(last_processed_time))

head(job_postings_clean)

## # A tibble: 6 × 11
##   job_ID     job_title company search_country search_position job_level job_type
##   <chr>      <chr>     <chr>   <chr>          <chr>           <chr>     <chr>   
## 1 2690930489 Senior S… Waymo   United States  Agricultural-R… Mid seni… Onsite  
## 2 2721624310 Software… Anysca… United States  Mechanical Res… Mid seni… Onsite  
## 3 2740117798 Sr Staff… Aurora  United States  Set-Key Driver  Mid seni… Onsite  
## 4 2811830681 Senior J… Global… United States  Architect       Mid seni… Onsite  
## 5 2815549932 Senior D… Brooks… United States  Computer Syste… Mid seni… Onsite  
## 6 2830014073 Enterpri… The Ti… United States  Charter         Mid seni… Onsite  
## # ℹ 4 more variables: Country <chr>, State_or_Province <chr>, City <chr>,
## #   last_process_date <chr>

Normalization

Based on our Normalized Relational Diagram we will normalize our data into 4 tables. We want to organize our tables to represent that a company can have multiple locations and multiple job listings (1 to many) and each listing may require multiple skills.

Relation Entity Diagram

# Normalized skills table with skill_ID assigned
Skills_Normalized <- job_skills_clean %>% 
  drop_na(Skills) %>% 
  arrange(Skills) %>% 
  distinct(Skills) %>% 
  mutate(Skill_ID = 1:n())

Skills_Normalized$Skills<- gsub("[\"*]","", Skills_Normalized$Skills)
head(Skills_Normalized)

## # A tibble: 6 × 2
##   Skills                          Skill_ID
##   <chr>                              <int>
## 1 MISSION CRITICAL UPTIME MINDSET        1
## 2 R                                      2
## 3 SHOW PROCESSLIST                       3
## 4 TRUSTWORTHY AI                         4
## 5 $100000  $150000 SALARY RANGE          5
## 6 $20M+ ANNUAL SALES QUOTA               6

# Normalized company table with Company_ID assigned
Company_Normalized <- job_postings_clean %>%
  # Removing duplicate rows across company, city, and state_or_province or company per location ID creation
  distinct(company, City, State_or_Province, Country, .keep_all = TRUE) %>%
  mutate(Company_id = row_number()) %>%
  relocate(Company_id, .before = 1) %>% 
  select(Company_id, Company = company, City, State_or_Province, Country)
head(Company_Normalized)

## # A tibble: 6 × 5
##   Company_id Company                   City          State_or_Province Country  
##        <int> <chr>                     <chr>         <chr>             <chr>    
## 1          1 Waymo                     San Francisco CA                United S…
## 2          2 Anyscale                  San Francisco CA                United S…
## 3          3 Aurora                    San Francisco CA                United S…
## 4          4 Global Placement Firm     Dayton        OH                United S…
## 5          5 Brooksource               Fort Mill     SC                United S…
## 6          6 The Timberline Group, LLC St Louis      MO                United S…

# Normalized Job_Listings table
Job_Listing_Normalized <- job_postings_clean %>%
  left_join(Company_Normalized, join_by(company == Company, City== City, State_or_Province==State_or_Province, Country==Country), relationship = "many-to-many") %>% 
  select(job_ID, job_title, job_type, job_level, Company_id)
head(Job_Listing_Normalized)

## # A tibble: 6 × 5
##   job_ID     job_title                             job_type job_level Company_id
##   <chr>      <chr>                                 <chr>    <chr>          <int>
## 1 2690930489 Senior Software Engineer, Perception… Onsite   Mid seni…          1
## 2 2721624310 Software Engineer (Machine Learning)  Onsite   Mid seni…          2
## 3 2740117798 Sr Staff Software Engineer, ML Accel… Onsite   Mid seni…          3
## 4 2811830681 Senior Java Data Architect            Onsite   Mid seni…          4
## 5 2815549932 Senior Data Engineer-- Data Governan… Onsite   Mid seni…          5
## 6 2830014073 Enterprise Data Warehouse Lead        Onsite   Mid seni…          6

# Normalized Listing_Skills table
Listing_Skills_Normalized <- job_skills_clean %>% 
  left_join(Skills_Normalized)

## Joining with `by = join_by(Skills)`

## Warning in left_join(., Skills_Normalized): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 287 of `x` matches multiple rows in `y`.
## ℹ Row 37754 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

Listing_Skills_Normalized <- Listing_Skills_Normalized %>% 
  select(job_ID, Skill_ID)
head(Listing_Skills_Normalized)

## # A tibble: 6 × 2
##   job_ID     Skill_ID
##   <chr>         <int>
## 1 2690930489    14168
## 2 2690930489    37754
## 3 2690930489    52588
## 4 2690930489    45632
## 5 2690930489    21196
## 6 2690930489    21247

Analyze

# Further cleaning of the strings in the table:
job_skills_clean$Skills<- trimws(gsub("SKILLS$", "", job_skills_clean$Skills))
job_skills_clean$Skills<- gsub("[\"*]","", job_skills_clean$Skills)

Top_ten_skills<- job_skills_clean %>%
  count(Skills) %>%
  slice_max(n, n=10)

ggplot(Top_ten_skills, aes(x=reorder(Skills, n), y = n, fill=n))+
  geom_bar(stat='identity') +
  labs(title='Top 10 data science skills from LinkedIn job posting 2024',
       x="", y='Number of appearance in job posting')+
  theme(axis.text.x = element_text(angle=47, hjust = 1),
        legend.position='') +
  coord_flip()

Which 5 states in the US have the most DS job postings:

job_num <- job_postings_clean %>%
  drop_na(State_or_Province) %>% 
  filter(Country=='United States') %>% 
  count(State_or_Province) %>% 
  slice_max(n, n=5)

ggplot(job_num, aes(x=reorder(State_or_Province, n), y=n))+
  geom_bar(stat='identity', aes(fill=n)) +
  labs(title = 'Top 5 States in the US that has the most DS job postings',
       y='Number of jobs',
       x='States') +
  theme(legend.position = '')

asso_skills <- job_postings_clean %>% 
  filter(job_level=='Associate') %>% 
  merge(job_skills_clean, 'job_ID') %>% 
  count(Skills) %>% 
  slice_max(n, n=10)

ggplot(asso_skills, aes(x=reorder(Skills, n), y=n))+
  geom_bar(stat='identity') +
  coord_flip() +
  labs(title = "Associate level top 10 skills",
       y='Skill frequency',
       x='')

senior_skills <- job_postings_clean %>% 
  filter(job_level=='Mid senior') %>% 
  merge(job_skills_clean, 'job_ID') %>% 
  count(Skills) %>% 
  slice_max(n, n=10)

ggplot(senior_skills, aes(x=reorder(Skills, n), y=n)) +
  geom_bar(stat='identity') +
  coord_flip() +
  labs(title = "Mid Senior level top 10 skills",
       y='Skill frequency',
       x='')

DATA607_Project_3

Brandon Chung, Stephanie Chiang, Ying Fang Lee, Chi Hang(Philip) Cheung

2024-10-27