For our project we set out to answer the question: Which are the most valued data science skills? We began by searching for data sources and explored several options. We looked through a couple of job posting datasets on kaggle.com, but once we filtered the data down to postings for data science positions there wasn't much left to work with. One dataset with over 20,000 job postings had only 8 whose titles contained 'Data Scientist' or 'Data Science'. We also attempted to scrape LinkedIn, but ran into trouble with API restrictions and with scraping the HTML of the search results. We settled on indeed.com as our data source and approached scraping the data in two different ways.

Approach 1 to scraping indeed.com

When you search indeed.com, the resulting URL includes all of the criteria specified in the search. This approach concatenates search-result URLs with different specifications: we combined each skill we were interested in with the term 'data scientist', specified a city of interest, and used a radius of 50 miles. We then iterated through the URLs we created and collected the number of job postings (count) returned by each search.

library(xml2)
library(XML)
library(stringr)
library(knitr)

# Build an indeed.com search URL for a given skill, base search term, city, and radius
get_indeed_url <- function(skill, base_term, city, radius){
  indeed_url <- paste("http://www.indeed.com/jobs?q=", trimws(skill), "+", trimws(base_term),
                      "&l=", city, "&radius=", radius, sep = "")
  # encode spaces (e.g. in "Data Scientist" or "New York") so readLines() receives a valid URL
  return(URLencode(indeed_url))
}
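As a quick illustration (these particular arguments are hypothetical and not part of the search grid built below), the function returns a URL of the form:

get_indeed_url("Python", "Data Scientist", "New York,NY", 50)
## [1] "http://www.indeed.com/jobs?q=Python+Data%20Scientist&l=New%20York,NY&radius=50"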

# Read a search-result page and pull the total job count from its meta description tag
get_count <- function(indeed_url){
  indeed_web_result <- readLines(indeed_url)

  # the total number of jobs appears in the <meta name="description"> line
  find_jobcount_text <- grep("<meta name=\"description\"", indeed_web_result)
  job_count_text <- indeed_web_result[find_jobcount_text]

  # drop thousands separators, then extract the first run of digits
  job_count_text <- gsub(",", "", job_count_text)
  regexp <- "[[:digit:]]+"
  job_count <- as.numeric(str_extract(job_count_text, regexp))

  return(job_count)
}
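To make the extraction step concrete, here is what it does to a made-up meta description line (the content string is invented; only its shape matters):

sample_line <- '<meta name="description" content="1,234 Data Scientist Python jobs available in New York, NY on Indeed.com.">'
as.numeric(str_extract(gsub(",", "", sample_line), "[[:digit:]]+"))
## [1] 1234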

# Build the full grid of (base skill x skill term x city x radius) searches,
# collect the job count for each generated URL, and write the results to CSV
write_indeed_results <- function(base_skill_list, skill_terms, cities, radius_list){
  grid <- expand.grid(base_skill_list, skill_terms, cities, radius_list)
  dfx <- data.frame(grid)
  colnames(dfx) <- c("Base_Skill", "Skill_Term", "City", "radius")

  dfx$indeed_url <- mapply(get_indeed_url, dfx$Skill_Term, dfx$Base_Skill, dfx$City, dfx$radius)
  dfx$jobs_count <- mapply(get_count, dfx$indeed_url)

  write.csv(dfx, file = "Indeed_Job_Search_Results.csv", row.names = FALSE, na = "")

  kable(head(dfx, n = 20))
}

# Load the skill and city lists, then run the full set of indeed.com searches
run_indeed_job_search <- function(){
  base_skill <- c('Data Scientist')

  skill_terms.df <- read.csv("indeed_skills.csv", header = TRUE)
  skill_terms <- trimws(skill_terms.df$SKILL)

  cities.df <- read.csv("indeed_city_state.csv", header = TRUE)
  cities <- paste(trimws(cities.df$CITY), trimws(cities.df$STATE), sep = ",")

  radius_list <- c(50)

  write_indeed_results(trimws(base_skill), skill_terms, cities, radius_list)
}

run_indeed_job_search()

Approach 2 to scraping indeed.com

For our second approach we decided to focus on one location, New York, NY. We also approached scraping indeed.com slightly differently: instead of changing the parameters of the search-result URL, we kept the search the same, looking only for 'data scientist' in New York, NY. The search results page displays 10 results per page, so we created URLs for each sequential page of the results by increasing the 'start' parameter by 10.

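The library calls for this chunk are not echoed above; the packages the Approach 2 code appears to rely on (rvest for read_html/html_nodes/html_attr, plus stringr, dplyr, tidyr, and knitr) can be loaded as follows:

library(rvest)
library(stringr)
library(dplyr)
library(tidyr)
library(knitr)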

Creating the base URL and the URL for the first page of search results, to see how many data scientist jobs are listed in New York, NY.

base_url <- 'https://www.indeed.com/'
search_url <- paste0(base_url, 'jobs?q=data+scientist&l=New+York%2c+NY&start=0')

Reading the HTML to get the total job count for data scientists in New York, NY.

first_page <- read_html(search_url)

job_count <- unlist(strsplit(first_page %>% 
                               html_node("#searchCount") %>%
                               html_text(), split = ' ')) 

job_count <- as.numeric(str_replace_all(job_count[length(job_count)-1],',',''))
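The parsing above can be illustrated with a hypothetical "#searchCount" string (the real text is read from the page at run time):

sample_count_text <- "Page 1 of 2,431 jobs"
parts <- unlist(strsplit(sample_count_text, split = ' '))
as.numeric(str_replace_all(parts[length(parts) - 1], ',', ''))
## [1] 2431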

Creating the sequential search result page links.

# this only selects the 10 organic jobs listed on each page; sponsored jobs have different nodes
links <- first_page %>%
  html_nodes("h2 a") %>%
  html_attr('href')

# create the list of search result page links, stepping the 'start' parameter by 10
page_links <- paste0(paste0(base_url,'jobs?q=data+scientist&l=New+York%2c+NY&start='), seq(0, job_count, 10))
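The first couple of entries look like the following (the total number of pages depends on job_count at run time):

head(page_links, 2)
## [1] "https://www.indeed.com/jobs?q=data+scientist&l=New+York%2c+NY&start=0"
## [2] "https://www.indeed.com/jobs?q=data+scientist&l=New+York%2c+NY&start=10"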

Creating a vector of words to search for in the job descriptions, and an empty data frame to store the counts in.

kws <- c('Python',
         '\\w{0,10} ?SQL',
         '\\bR\\b',
         'Spark',
         'SAS',
         'Excel',
         '\\bAWS\\b',
         'Java',
         'Tableau',
         'Looker',
         'Hadoop',
         'PHP',
         'Ruby',
         'Matlab',
         'C\\+\\+',
         regex('communication skills', ignore_case = T),
         'inquisitive',
         regex('presentation skills',ignore_case = T),
         regex('manage|(team management)',ignore_case = T),
         regex('team leader', ignore_case = T),
         regex('teamwork|team work', ignore_case = T),
         regex('business acumen', ignore_case = T),
         'remote(ly)?',
         regex('Bachelors| BA | BS ', ignore_case = T),
         regex('Masters| MA | MS ', ignore_case = T),
         'PhD',
         regex('Computer Science', ignore_case = T),
         regex('Computer Engineering', ignore_case = T),
         regex('Statistics|Stats', ignore_case = T),
         regex('Math|(Applied Math)', ignore_case = T)
         )

# vector of clean names to use for the data frame row names
key_names <- c('Python',
               'SQL',
               'R',
               'Spark',
               'SAS',
               'Excel',
               'AWS',
               'Java',
               'Tableau',
               'Looker',
               'Hadoop',
               'PHP',
               'Ruby',
               'Matlab',
               'C++',
               'communication skills',
               'inquisitive',
               'presentation skills',
               'manage',
               'team leader',
               'teamwork',
               'business acumen',
               'remote',
               'Bachelors',
               'Masters',
               'PhD',
               'Computer Science',
               'Computer Engineering',
               'Statistics',
               'Math'
               )

type <-c(rep('hard',15),rep('soft',8),rep('education',7))
type
##  [1] "hard"      "hard"      "hard"      "hard"      "hard"     
##  [6] "hard"      "hard"      "hard"      "hard"      "hard"     
## [11] "hard"      "hard"      "hard"      "hard"      "hard"     
## [16] "soft"      "soft"      "soft"      "soft"      "soft"     
## [21] "soft"      "soft"      "soft"      "education" "education"
## [26] "education" "education" "education" "education" "education"
# empty data frame
kw_count <- data.frame(type = type, key_words = key_names, count = rep(0, length(key_names))) 

# job count
njobs <- 0

search_results <- list('key_word_count' = kw_count, 'njobs' = njobs)

Scraping function:

A function to iterate through the job links on a search page. On its own it will go through a single page of 10 jobs, search each job's description for a mention of each specified skill, and add to the counts in the data frame. It also keeps count of the number of job links opened and processed.

scrape <- function(searchResults, job_links){
  for(i in seq_along(job_links)){
    job_url <- paste0(base_url, job_links[i])
    
    # pause briefly between requests to be polite to the server
    Sys.sleep(.5)
    cat('Reading job ', i, ' \n')
    
    tryCatch({
      html <- read_html(job_url)
      text <- html_text(html)
      # 1 if the keyword pattern appears anywhere in the posting text, 0 otherwise
      df <- data.frame(key_word = kws, count = ifelse(str_detect(text, kws), 1, 0))
      searchResults$key_word_count$count <- searchResults$key_word_count$count + df$count
      searchResults$njobs <- searchResults$njobs + 1
    }, error=function(e){cat("ERROR :", conditionMessage(e), "\n")})
  }
  return(searchResults)
}
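As a sketch of how the function could be run on its own (first_page_results is an illustrative name; this uses the links vector scraped from the first results page above and makes live requests to indeed.com):

# process only the job links from the first results page
first_page_results <- scrape(search_results, links)
first_page_results$njobs                   # number of postings successfully read
head(first_page_results$key_word_count)    # keyword counts after one page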

Creating a for loop to iterate through all of the page links and apply the scrape function to each page.

for(i in seq_along(page_links)){
  print('on to the next 10 jobs')
  
  next_page <- read_html(page_links[i])
  
  links <- next_page %>%
    html_nodes("h2 a") %>%
    html_attr('href')
  
  search_results <- scrape(search_results, links)
  
  cat('number of job listings processed: ', search_results$njobs, ' \n')
}

approach2_search_results <- search_results
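The resulting list holds the keyword counts and the number of postings processed. One quick way to see which skills turn up most often (shown here as an illustration, not part of the scrape itself) is to sort the counts:

# sort the keyword counts to surface the most frequently mentioned skills
approach2_search_results$key_word_count %>%
  arrange(desc(count)) %>%
  head(10) %>%
  kable()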