607 Project 3: Webscraping Indeed.com

Introduction

In this project we are tasked with determining which skills are most valued for a data scientist. To do this, we decided to web scrape Indeed.com for data scientist job postings. After scraping this information, we transformed and analyzed the data to see which skills occur most often in the Indeed data set. We scraped 1,436 job listings that had been posted in the past week. Our data set was not filtered for job location.

Web Scraping

library(tidyverse)
library(rvest)
library(xml2)
library(stringr)

# Search results for "data scientist", sorted by date, posted in the last
# 7 days (per the query string).
url <- "https://www.indeed.com/jobs?q=data+scientist&sort=date&fromage=7"
page <- xml2::read_html(url)

# Pull the total number of postings out of the "... of N jobs" counter.
# The parsed document is serialised to a string explicitly: str_match() only
# accepts atomic vectors and otherwise warns while coercing the document.
# The capture group is limited to digits and thousands separators so that an
# unrelated "of ... jobs" phrase earlier in the markup is less likely to be
# picked up and turned into NA by as.numeric().
job_num <- str_match(as.character(page), pattern = "of ([0-9,]+) jobs")

# Drop the thousands separator before converting, e.g. "1,436" -> 1436.
job_num <- as.numeric(gsub(",", "", job_num[, 2]))

job_num

## [1] 863

# Fields scraped from the first results page. Each value is read from the
# nodes selected by the XPath expression, searched from the page's <div>
# (or <span>) nodes.

# Posting titles, taken from the "title" attribute.
job_title <- rvest::html_attr(
  rvest::html_nodes(
    rvest::html_nodes(page, "div"),
    xpath = '//*[@data-tn-element = "jobTitle"]'
  ),
  "title"
)

# Posting locations, taken from the "data-rc-loc" attribute.
job_location <- rvest::html_attr(
  rvest::html_nodes(
    rvest::html_nodes(page, "div"),
    xpath = '//*[@class = "recJobLoc"]'
  ),
  "data-rc-loc"
)

# Company names: node text with surrounding whitespace trimmed.
job_company <- stringi::stri_trim_both(
  rvest::html_text(
    rvest::html_nodes(
      rvest::html_nodes(page, "span"),
      xpath = '//*[@data-tn-element = "companyName"]'
    )
  )
)

# Company ratings: node text with surrounding whitespace trimmed.
job_company_rating <- stringi::stri_trim_both(
  rvest::html_text(
    rvest::html_nodes(
      rvest::html_nodes(page, "span"),
      xpath = '//*[@class = "ratingsContent"]'
    )
  )
)

# Relative links to the individual posting pages ("href" attribute).
job_descr_links <- rvest::html_attr(
  rvest::html_nodes(
    rvest::html_nodes(page, "div"),
    xpath = '//*[@class="jobtitle turnstileLink "]'
  ),
  "href"
)
  
  
  # Fetch the description text behind every posting link found on the first
  # results page. Each page's text is appended to job_descr; assigning the
  # scraped text straight to job_descr inside the loop would keep only the
  # description of the last posting.
  indeed <- "https://www.indeed.com/"
  job_descr <- c()
  for (i in seq_along(job_descr_links)) {
    # Resolve the scraped href against the site root. url_absolute() avoids
    # the doubled slash that plain concatenation produces for hrefs that
    # already start with "/".
    job_descr_url <- xml2::url_absolute(job_descr_links[i], indeed)
    pa <- xml2::read_html(job_descr_url)
    jd <- pa %>%
      rvest::html_nodes("ul") %>%
      rvest::html_nodes(xpath = '//*[@class="jobsearch-jobDescriptionText"]') %>%
      rvest::html_text() %>%
      stringi::stri_trim_both()
    job_descr <- c(job_descr, jd)
  }

  
  
  
# Offsets used for the "start=" query parameter of the follow-up result
# pages: 10, 20, ..., 100. The first page was scraped separately above.
result_start <- 10
result_end <- 100
results <- seq(from = result_start, to = result_end, by = 10)
#results

indeed_df <- data.frame()

for (i in seq_along(results)) {

  url <- "https://www.indeed.com/jobs?q=data+scientist&sort=date&fromage=7&start="
  # Use the offset itself, not the loop index: with the index the requests
  # would be start=1, 2, ..., 10 and every page would overlap the first one.
  iurl <- paste0(url, results[i])
  p <- xml2::read_html(iurl)
  # Pause between result pages to avoid hammering the server.
  Sys.sleep(1)

  # Posting titles ("title" attribute).
  jt <- p %>%
    rvest::html_nodes("div") %>%
    rvest::html_nodes(xpath = '//*[@data-tn-element = "jobTitle"]') %>%
    rvest::html_attr("title")
  job_title <- c(job_title, jt)

  # Posting locations ("data-rc-loc" attribute).
  jl <- p %>%
    rvest::html_nodes("div") %>%
    rvest::html_nodes(xpath = '//*[@class = "recJobLoc"]') %>%
    rvest::html_attr("data-rc-loc")
  job_location <- c(job_location, jl)

  # Company names (trimmed node text).
  jc <- p %>%
    rvest::html_nodes("span") %>%
    rvest::html_nodes(xpath = '//*[@data-tn-element = "companyName"]') %>%
    rvest::html_text() %>%
    stringi::stri_trim_both()
  job_company <- c(job_company, jc)

  # Company ratings (trimmed node text).
  jcr <- p %>%
    rvest::html_nodes("span") %>%
    rvest::html_nodes(xpath = '//*[@class = "ratingsContent"]') %>%
    rvest::html_text() %>%
    stringi::stri_trim_both()
  job_company_rating <- c(job_company_rating, jcr)

  # Relative links to the individual posting pages.
  jdl <- p %>%
    rvest::html_nodes("div") %>%
    rvest::html_nodes(xpath = '//*[@class="jobtitle turnstileLink "]') %>%
    rvest::html_attr("href")
  job_descr_links <- c(job_descr_links, jdl)

  # Follow every link on this page and collect the description text.
  # A separate index (j) is used so the page index (i) is not shadowed.
  indeed <- "https://www.indeed.com/"
  for (j in seq_along(jdl)) {
    # url_absolute() resolves the href against the site root without
    # producing a doubled slash.
    job_descr_url <- xml2::url_absolute(jdl[j], indeed)
    pa <- xml2::read_html(job_descr_url)
    jd <- pa %>%
      rvest::html_nodes("ul") %>%
      rvest::html_nodes(xpath = '//*[@class="jobsearch-jobDescriptionText"]') %>%
      rvest::html_text() %>%
      stringi::stri_trim_both()
    job_descr <- c(job_descr, jd)
  }

  #  df <- data.frame(job_title, job_company_rating,  job_location, job_company, job_descr)
  #  indeed_df <- rbind(indeed_df, df)
}


#job_company_rating
#job_location
#job_company
#head(job_title,50)
#head(job_company,50)
#head(job_descr,20)




# Persist the scraped descriptions as a one-row CSV: the description vector
# is reshaped into a single-row matrix before writing.
utils::write.csv(
  x = matrix(data = job_descr, nrow = 1),
  row.names = FALSE,
  file = "INDEED_DESCR.csv"
)

After saving the job descriptions into a vector titled job_descr, we will then save the other variables (company, title, etc.) in their own vectors in the event they are needed for further analysis.

Analysis

Next, we used the links below to identify potential skills and phrases that would be in potential job listings for data scientists.

https://towardsdatascience.com/the-most-in-demand-skills-for-data-scientists-4a4a8db896db

https://www.thebalancecareers.com/list-of-data-scientist-skills-2062381

https://www.cio.com/article/3263790/the-essential-skills-and-traits-of-an-expert-data-scientist.html

https://www.simplilearn.com/what-skills-do-i-need-to-become-a-data-scientist-article

library(tidyverse)
library(rvest)
library(xml2)
library(stringr)
library(sqldf)

## Warning: package 'sqldf' was built under R version 3.6.3

## Warning: package 'gsubfn' was built under R version 3.6.3

## Warning: package 'proto' was built under R version 3.6.3

## Warning: package 'RSQLite' was built under R version 3.6.3

# Pair every scraped job description with a sequential integer id.
job_des <- as.list(job_descr)
lds <- length(job_des)
# seq_len() yields an empty vector when there are no descriptions, whereas
# 1:lds would produce c(1, 0) and create two bogus ids.
id <- seq_len(lds)
# The same data frame is also bound to `df`, as in the original chained
# assignment.
jd_df <- df <- data.frame(job_descr, id)


# Skill dictionaries. Every entry is used as a regular-expression pattern and
# matched against the upper-cased job descriptions, so entries must be upper
# case and spelled the way they appear in postings.

# General / domain skills.
# Fixes: "INFORMATION RETREIVAL" was misspelled and could never match.
# "NATURAL LANGUAGE PROCESSING" is added because "NLP" had only been expanded
# as neuro-linguistic programming; the original expansions are kept.
GENERAL_SKILLS <- c(
  "ANALYSIS", "MACHINE LEARNING", "STATISTICS", "COMPUTER SCIENCE",
  "MATHEMATICS", "VISUALIZATION", "AI", "ARTIFICIAL INTELLIGENCE",
  "DEEP LEARNING", "NLP", "NATURAL LANGUAGE PROCESSING",
  "NEURO-LINGUISTIC PROGRAMMING", "NEURO LINGUISTIC PROGRAMMING",
  "SOFTWARE DEVELOPMENT", "NEURAL NETWORKS", "PROJECT MANAGEMENT",
  "SOFTWARE ENGINEER", "DATA ENGINEERING", "ANALYTICAL SKILLS", "BIG DATA",
  "PREDICTIVE MODELS", "PREDICTIVE MODELING", "DATA ANALYTICS",
  "DATA MANIPULATION", "DATA WRANGLING", "DATA MINING", "DATA SCIENCE TOOLS",
  "INTERPRETING DATA", "METRICS", "RISK MODELING", "HYPOTHESIS TESTING",
  "ADAPTABILITY", "DECISION MAKING", "DECISION TREES", "ALGORITHMS",
  "INFORMATION RETRIEVAL", "LINEAR ALGEBRA", "MULTIVARIABLE CALCULUS",
  "STATISTICAL MODELING", "CLOUD SERVICES", "CRITICAL THINKING",
  "DATA ARCHITECTURE", "DATA ARCHITECT", "RISK ANALYSIS",
  "PROCESS IMPROVEMENT", "SYSTEMS ENGINEER", "SYSTEMS ENGINEERING",
  "UNSTRUCTURED DATA", "DATA INTUITION"
)

# Soft skills / personality traits.
# Fix: "WRITEN COMMUNICATION" was misspelled and could never match.
PERSONALITY_SKILLS <- c(
  "CRITICAL THINKING", "CREATIVITY", "FAST PACED ENVIRONMENT",
  "FAST-PACED ENVIRONMENT", "LOGICAL THINKING", "PROBLEM SOLVING",
  "ASSERTIVE", "ASSERTIVENESS", "COLLABORATION", "CONSULTING", "DOCUMENTING",
  "DOCUMENT", "DRAWING CONSENSUS", "BUILDING CONSENSUS",
  "FACILITATE MEETINGS", "FACILITATING MEETINGS", "LEADERSHIP", "MENTOR",
  "MENTORING", "PRESENTATION", "REPORTING", "SUPERVISORY SKILLS",
  "VERBAL COMMUNICATION", "WRITTEN COMMUNICATION", "CURIOSITY", "INNOVATION",
  "INNOVATOR"
)

# Tools and technologies. Single-letter languages are padded with spaces so
# they only match as stand-alone words.
# Fix: "SICKIT-LEARN" was misspelled and could never match.
# The C++ pattern stays disabled as before; enabling it needs doubled
# backslashes in an R string: "C\\+\\+".
TECH_SKILLS <- c(
  "PYTHON", " R ", "SQL", "HADOOP", "SPARK", "JAVA", " SAS ", "TABLEAU",
  "HIVE", "SCALA", "AWS",
  "MATLAB", "TENSORFLOW", " C ", "EXCEL", "NOSQL", "LINUX", "AZURE",
  "SCIKIT-LEARN", "PERL", "APPENGINE", "COUCHDB", "JS", "ECL", "HBASE",
  "SAAS", "PIG"
)

# Long-format result table: one row per (job posting, matched skill). The
# column of the skill's category holds the skill; the other two category
# columns hold the placeholder string "na".
job_attr_df <- data.frame(
  id = integer(),
  descr = character(),
  general = character(),
  personality = character(),
  tech = character(),
  stringsAsFactors = FALSE
)

# Skill patterns are upper case, so upper-case the descriptions before matching.
jd_df$job_descr <- toupper(jd_df$job_descr)

#jd_df$job_descr[1]

#length(GENERAL_SKILLS)

# Find every skill pattern that occurs in every job description.
#
# jobs:     data frame with columns `id` and `job_descr`.
# skills:   character vector of regular-expression patterns.
# category: which output column receives the matched skill; one of
#           "general", "personality" or "tech".
# Returns a data frame with the columns of job_attr_df, ordered by posting
# and then by position of the skill in `skills`.
#
# NOTE(review): matching is plain substring/regex matching, so short patterns
# also hit inside longer words (e.g. "AI" in "TRAINING", "JAVA" in
# "JAVASCRIPT"). Left as is because the reported findings were produced with
# these semantics; consider word-boundary patterns.
find_skill_matches <- function(jobs, skills, category) {
  stopifnot(category %in% c("general", "personality", "tech"))
  descriptions <- as.character(jobs$job_descr)
  # For each description, the subset of skills whose pattern matches it.
  # grepl() returns FALSE for an NA description.
  matched <- lapply(descriptions, function(text) {
    skills[vapply(skills, grepl, logical(1), x = text, USE.NAMES = FALSE)]
  })
  hits_per_job <- lengths(matched)
  total_hits <- sum(hits_per_job)
  out <- data.frame(
    id = rep(jobs$id, hits_per_job),
    descr = rep(descriptions, hits_per_job),
    general = rep("na", total_hits),
    personality = rep("na", total_hits),
    tech = rep("na", total_hits),
    stringsAsFactors = FALSE
  )
  out[[category]] <- as.character(unlist(matched))
  out
}

# Iterate over the rows that actually exist instead of a hard-coded
# 1436 * length(skills) bound, and bind the three result sets once rather
# than growing the table row by row.
job_attr_df <- rbind(
  job_attr_df,
  find_skill_matches(jd_df, GENERAL_SKILLS, "general"),
  find_skill_matches(jd_df, PERSONALITY_SKILLS, "personality"),
  find_skill_matches(jd_df, TECH_SKILLS, "tech")
)
rownames(job_attr_df) <- NULL
  


# Distinct (posting id, skill) pairs across all three categories; rows holding
# the "na" placeholder for a category are excluded from that category.
skills_all <- sqldf(x = "select distinct * from(select id,general as skill from job_attr_df where general <> 'na' 
      union all
      select id,personality as skill from job_attr_df where personality <> 'na' 
      union all
      select id,tech as skill from job_attr_df where tech <> 'na') a")
write.csv(x = skills_all, row.names = FALSE, file = "INDEED_skills_all.csv")

# Distinct postings with one column per skill category.
skills_cat <- sqldf(x = "select distinct id,general,personality,tech from job_attr_df ")
write.csv(x = skills_cat, row.names = FALSE, file = "INDEED_skills_cat.csv")

# Distinct (id, description) pairs. This rebinds job_descr from the scraped
# character vector to a data frame.
job_descr <- sqldf(x = "select distinct  id,descr from job_attr_df")
write.csv(x = job_descr, row.names = FALSE, file = "INDEED_job_descr.csv")

Plots and Findings

As one might expect, AI, Machine Learning and Statistics were the most valued general skills, and Python, R and SQL were among the most commonly requested tech skills. It is noteworthy that presentation and innovation were among the top soft skills for job posters on Indeed.com.

library(dplyr)
library(RCurl)
library(magrittr)
library(tidyverse)
library(ggplot2)

# Load the per-posting skill table from the project's GitHub repository; the
# first row of the file supplies the column names.
skills_cat <- read_csv(
  file = "https://raw.githubusercontent.com/agersowitz/607-Project-3-ACC/master/INDEED_skills_cat.csv",
  col_names = TRUE
)

## Parsed with column specification:
## cols(
##   id = col_double(),
##   general = col_character(),
##   personality = col_character(),
##   tech = col_character()
## )

# Keep only the rows that carry a general skill; "na" is a placeholder string
# in the data, not a missing value.
sub_general <- subset(skills_cat, general != "na")
# Refer to the column by name inside aes(): ggplot2 looks it up in the data
# and discourages `data$column` there.
ggplot(sub_general, aes(x = factor(general))) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(
    title = "General Skills for Data Scientist Role",
    x = "General Skills",
    y = "Frequency of Occurrence"
  )

# Keep only the rows that carry a personality trait; "na" is a placeholder
# string in the data, not a missing value.
sub_personality <- subset(skills_cat, personality != "na")

# Fold spelling variants of the same trait into one label.
sub_personality$personality[sub_personality$personality %in% c("DOCUMENT", "DOCUMENTING")] <- "DOCUMENTATION"
sub_personality$personality[sub_personality$personality %in% c("INNOVATOR")] <- "INNOVATION"
sub_personality$personality[sub_personality$personality %in% c("MENTOR")] <- "MENTORING"

# A posting matched by both variants (e.g. "DOCUMENTING" also contains
# "DOCUMENT") now has two identical rows; drop the duplicates so each posting
# is counted once per trait.
sub_personality <- distinct(sub_personality)

# Refer to the column by name inside aes(): ggplot2 looks it up in the data
# and discourages `data$column` there.
ggplot(sub_personality, aes(x = factor(personality))) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(
    title = "Personality Traits for Data Scientist Role",
    x = "Personality traits",
    y = "Frequency of Occurrence"
  )

# Keep only the rows that carry a tech skill; "na" is a placeholder string in
# the data, not a missing value.
sub_tech <- subset(skills_cat, tech != "na")
# Refer to the column by name inside aes(): ggplot2 looks it up in the data
# and discourages `data$column` there.
ggplot(sub_tech, aes(x = factor(tech))) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(
    title = "Tech Skills for Data Scientist Role",
    x = "Tech skills",
    y = "Frequency of Occurrence"
  )

607 Project 3: Webscraping Indeed.com

Adam Gersowitz; Chitrart Kaushik; Christopher Bloome

3/14/2020

Introduction

Web Scraping

Analysis

Plots and Findings