Indeed

library(rvest)
## Loading required package: xml2
library(stringr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
library(tidyverse)
## ── Attaching packages ────────────────────────────────── tidyverse 1.2.1 ──
## ✔ tibble  1.4.2     ✔ purrr   0.2.4
## ✔ readr   1.1.1     ✔ forcats 0.3.0
## ── Conflicts ───────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()         masks stats::filter()
## ✖ readr::guess_encoding() masks rvest::guess_encoding()
## ✖ dplyr::lag()            masks stats::lag()
## ✖ purrr::pluck()          masks rvest::pluck()
library(dplyr)
library(RCurl)
## Loading required package: bitops
## 
## Attaching package: 'RCurl'
## The following object is masked from 'package:tidyr':
## 
##     complete
ping <- function(x, stderr = FALSE, stdout = FALSE, ...){
    pingvec <- system2("ping", x,
                       stderr = FALSE,
                       stdout = FALSE,...)
    if (pingvec == 0) TRUE else FALSE
}
# Search Words
job_title <- "Data+Scientist"
location <- "united+states"
# Set URLs
url_short <- 'https://www.indeed.com'
url_full <- paste0('https://www.indeed.com/jobs?q=Data+Scientist&l=united+states&start=10')
cat(url_full)
## https://www.indeed.com/jobs?q=Data+Scientist&l=united+states&start=10
# get the html file from search url
main_page <- read_html(url_full)
# get the total number of job posting from keywords
total_job_posting <- unlist(strsplit(main_page %>%
                              html_node("#searchCount") %>%
                              html_text(), split = ' '))
total_job_posting <- as.numeric(str_replace_all(total_job_posting[length(total_job_posting)-1],',',''))
cat('Total number of job posting: ', total_job_posting)
## Total number of job posting:  27242
# Setting up main page web scraping
links <- main_page %>%
 html_nodes("h2 a") %>%
 html_attr('href')
# Set page search sequence
page_seq <- paste0("https://www.indeed.com/jobs?q=Data+Scientist&l=united+states&start=", seq(10, 60, 10 ))
  
  
kw_ln <- c('Hadoop','Python','\\bSQL', 'NoSQL','\\bR\\b', 'Spark', 'SAS', 'Excel\\b', 'Hive', '\\bC\\b', 'Java', 'Tableau')
kw_edu <- c('(\\bB[\\.| ]?A\\.?\\b)|(\\bB[\\.| ]?S\\.?\\b)|\1.?\2|\2.?\1|Bachelor',
            '(\\bM[\\.| ]?A\\.?\\b)|(\\bM[\\.| ]?S\\.?\\b)|\1.?\2|\2.?\1|Master',
            'Ph[\\.| ]?D|Doctorate' )
# Raw html cleaning; removing commas, tabs and etc  
clean.text <- function(text)
{
 str_replace_all(text, regex('\r\n|\n|\t|\r|,|/|<|>|\\.|[:space:]'), ' ')
}
# Scrape web page and compute running total
scrape_web <- function(res, page_seq ){
 for(i in 1:length(page_seq)){
   job.url <- paste0(url_short,page_seq [i])
   
   Sys.sleep(1)
   cat(paste0('Reading job ', i, '\n'))
   
   tryCatch({
     html <- read_html(job.url)
     text <- html_text(html)
     text <- clean.text(text)
     df <- data.frame(skill = kw_ln, count = ifelse(str_detect(text, kw_ln), 1, 0))
     res$running$count <- res$running$count + df$count
     res$num_jobs <- res$num_jobs + 1
   }, error=function(e){cat("ERROR :",conditionMessage(e), "\n")})
 }
 return(res)
}
scrape_web_edu <- function(res, page_seq ){
 for(i in 1:length(page_seq)){
   job.url <- paste0(url_short,page_seq [i])
   
   Sys.sleep(1)
   cat(paste0('Reading job ', i, '\n'))
   
   tryCatch({
     html <- read_html(job.url)
     text <- html_text(html)
     text <- clean.text(text)
     df <- data.frame(skill = kw_edu, count = ifelse(str_detect(text, kw_edu), 1, 0))
     res$running$count <- res$running$count + df$count
     res$num_jobs <- res$num_jobs + 1
   }, error=function(e){cat("ERROR :",conditionMessage(e), "\n")})
 }
 return(res)
}
# Remove \\b for ggplot visualization
kw_ln_ggplot <- c('Hadoop','Python','SQL', 'NoSQL','R', 'Spark', 'SAS', 'Excel', 'Hive', 'C', 'Java', 'Tableau')
kw_edu_ggplot  <- c('bachelor','Master','PhD' )
# Get the running total
running <- data.frame(skill = kw_ln_ggplot, count = rep(0, length(kw_ln_ggplot)))
running_edu <- data.frame(Education = kw_edu_ggplot, count = rep(0, length(kw_edu_ggplot)))
# Since the indeed only display max of 20 pages from search result, we cannot use total_job_posting but need to track by creating a num_jobs
num_jobs <- 0
# running total
print(arrange(results$running, -count))
##      skill count
## 1   Python    44
## 2        R    35
## 3      SQL    29
## 4    Spark    23
## 5     Java    20
## 6   Hadoop    18
## 7      SAS    16
## 8        C    13
## 9     Hive    12
## 10 Tableau    11
## 11   NoSQL     6
## 12   Excel     3
# running total count as percentage
results$running$count<-results$running$count/results$num_jobs
# Reformat the Job Title and Location to readable form
jt <- str_replace_all(job_title, '\\+|\\\"', ' ')
loc <- str_replace_all(location, '\\%2C+|\\+',' ')
# Visualization
#p <- ggplot(results$running, aes(reorder(skill,-count), count)) + geom_bar(stat="identity") +
# labs(x = 'Language', y = 'Frequency (%)', title = paste0('Language (%) for ', jt, ' in ', loc)) 
#p + scale_y_continuous(labels = scales::percent, breaks = seq(0,1,0.1))
print(arrange(results_edu$running, -count))
##   Education count
## 1    Master    23
## 2  bachelor    21
## 3       PhD    18
# running total count as percentage
results_edu$running$count<-results_edu$running$count/results_edu$num_jobs
# Reformat the Job Title and Location to readable form
jt <- str_replace_all(job_title, '\\+|\\\"', ' ')
loc <- str_replace_all(location, '\\%2C+|\\+',' ')
# Visualization
# RW... commenting this out so that we just read the pre-saved out .csv results right below
# p <- ggplot(results_edu$running, aes(reorder(Education,-count), count)) + geom_bar(stat="identity") +
# labs(x = 'Education', y = 'Frequency (%)', title = paste0('Education (%) for ', jt, ' in ', loc)) 
# p + scale_y_continuous(labels = scales::percent, breaks = seq(0,1,0.1))

Text Analysis and Visualization

connect to mysql

dbListFields(mydb, 'indeed_skills')
## [1] "indeed_skill_id" "skill"           "count"           "job_count"
x<-dbGetQuery(mydb, 'select * from indeed_skills;')
head(x)
##   indeed_skill_id  skill count job_count
## 1               1 hadoop    58        60
## 2               2 python   152        60
## 3               3    sql   108        60
## 4               4  nosql    15        60
## 5               5      r   120        60
## 6               6  spark    57        60

Get the table that contains the skill and count for each skill

indeed_skill<-dbGetQuery(mydb, 'select skill, count from indeed_skills;')
head(indeed_skill)
##    skill count
## 1 hadoop    58
## 2 python   152
## 3    sql   108
## 4  nosql    15
## 5      r   120
## 6  spark    57
indeed_edu<-dbGetQuery(mydb, 'select education_level as education, count from indeed_edu;')
head(indeed_edu)
##   education count
## 1  bachelor    83
## 2    master   104
## 3       phd    80
zip_skill<-dbGetQuery(mydb, 'select skill, count, percent from zip_skills;')
## Warning in .local(conn, statement, ...): Decimal MySQL column 2 imported as
## numeric
head(zip_skill)
##    skill count percent
## 1 hadoop   185    0.49
## 2 python   156    0.41
## 3    sql   209    0.55
## 4  nosql    71    0.19
## 5      r    80    0.21
## 6  spark   151    0.40
zip_edu<-dbGetQuery(mydb, 'select education_level as education, count, percent from zip_edu;')
## Warning in .local(conn, statement, ...): Decimal MySQL column 2 imported as
## numeric
head(zip_edu)
##   education count percent
## 1  bachelor   230    0.61
## 2    master   157    0.41
## 3       phd    19    0.05
reddit_skill<-dbGetQuery(mydb, 'select skill, count, percent from reddit_skills;')
## Warning in .local(conn, statement, ...): Decimal MySQL column 2 imported as
## numeric
head(reddit_skill)
##    skill count percent
## 1 python    80    0.64
## 2    sql    49    0.39
## 3  nosql     8    0.06
## 4      r    75    0.60
## 5  spark    27    0.22
## 6    sas    14    0.11
reddit_edu<-dbGetQuery(mydb, 'select education_level as education, count, percent from reddit_edu;')
## Warning in .local(conn, statement, ...): Decimal MySQL column 2 imported as
## numeric
head(reddit_edu)
##   education count percent
## 1  bachelor    39    0.31
## 2    master    83    0.66
## 3       phd    46    0.37

Bar plots - Skills

# Indeed
p <- ggplot(indeed_skill, aes(reorder(skill,-count),fill = skill,  count)) + geom_bar(stat="identity") +
 labs(x = 'Language', y = 'Indeed - Frequency ', title = paste0('Language for Data Scientist in the United States')) 
p #+ scale_y_continuous(labels = scales::percent, breaks = seq(0,1,0.1))

# Zip Recruiter
p <- ggplot(zip_skill, aes(reorder(skill,-count), fill = skill, count)) + geom_bar(stat="identity") +
 labs(x = 'Language', y = 'Zip Recruiter - Frequency ', title = paste0('Language for Data Scientist in the United States')) 
p #+ scale_y_continuous(labels = scales::percent, breaks = seq(0,1,0.1))

# Reddit
p <- ggplot(reddit_skill, aes(reorder(skill,-count),fill = skill,  count)) + geom_bar(stat="identity") +
 labs(x = 'Language', y = 'Reddit - Frequency ', title = paste0('Language for Data Scientist in the United States')) 
p #+ scale_y_continuous(labels = scales::percent, breaks = seq(0,1,0.1))

Bar plots - Education

# Indeed
p <- ggplot(indeed_edu, aes(reorder(education,-count), fill = education, count)) + geom_bar(stat="identity") +
 labs(x = 'Language', y = 'Indeed - Frequency', title = paste0('Education for Data Scientist in the United States')) 
p #+ scale_y_continuous(labels = scales::percent, breaks = seq(0,1,0.1))

# Zip Recruiter
p <- ggplot(zip_edu, aes(reorder(education,-count), fill = education, count)) + geom_bar(stat="identity") +
 labs(x = 'Language', y = 'Zip Recruiter - Frequency', title = paste0('Education for Data Scientist in the United States')) 
p #+ scale_y_continuous(labels = scales::percent, breaks = seq(0,1,0.1))

# Reddit
p <- ggplot(reddit_edu, aes(reorder(education,-count), fill = education, count)) + geom_bar(stat="identity") +
 labs(x = 'Language', y = 'Reddit - Frequency', title = paste0('Education for Data Scientist in the United States')) 
p #+ scale_y_continuous(labels = scales::percent, breaks = seq(0,1,0.1))

Word clouds

Indeed - Skills

set.seed(1234)
wordcloud(words=indeed_skill$skill, freq = indeed_skill$count, min.freq = 1,
          max.words=200, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"));

Zip Recruiter - Skills

set.seed(1234)
wordcloud(words=zip_skill$skill, freq = zip_skill$count, min.freq = 1,
          max.words=200, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"));

Reddit - Skills

set.seed(1234)
wordcloud(words=reddit_skill$skill, freq = reddit_skill$count, min.freq = 1,
          max.words=200, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"));

Indeed - Education

set.seed(1234)
wordcloud(words=indeed_edu$education, freq = indeed_edu$count, min.freq = 1,
          max.words=200, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"));

Zip Recruiter - Education

set.seed(1234)
wordcloud(words=zip_edu$education, freq = zip_edu$count, min.freq = 1,
          max.words=200, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"));

Reddit - Education

set.seed(1234)
wordcloud(words=reddit_edu$education, freq = reddit_edu$count, min.freq = 1,
          max.words=200, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"))

We examined the education and skill requirements from indeed.com, ziprecruiter.com, and reddit.com. We illustrate the top skills required through the use of word clouds built from the number of times a skill appears in different job postings. We also examine the skill and education by percentage of job postings. We also collected full job descriptions and compiled them into text documents.

We can analyze these three documents and perform tf-idf to determine if additional inishgts regarding the most in demand data skills can be obtained.

That portion of the anaysis can be found here http://rpubs.com/ejd01/374331