library(rvest)
## Loading required package: xml2
library(stringr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
library(tidyverse)
## ── Attaching packages ────────────────────────────────── tidyverse 1.2.1 ──
## ✔ tibble 1.4.2 ✔ purrr 0.2.4
## ✔ readr 1.1.1 ✔ forcats 0.2.0
## ── Conflicts ───────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ readr::guess_encoding() masks rvest::guess_encoding()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::pluck() masks rvest::pluck()
library(dplyr)
library(RCurl)
## Loading required package: bitops
##
## Attaching package: 'RCurl'
## The following object is masked from 'package:tidyr':
##
## complete
ping <- function(x, stderr = FALSE, stdout = FALSE, ...) {
  # Returns TRUE when the ping command exits with status 0 (host reachable)
  pingvec <- system2("ping", x,
                     stderr = stderr,
                     stdout = stdout, ...)
  pingvec == 0
}
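As a usage sketch (this call is not in the original write-up; the "-c 1" flag is a Unix-style assumption so that ping terminates after one packet):
# Returns TRUE if www.indeed.com answers a single ping
ping(c("-c", "1", "www.indeed.com"))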
# Search Words
job_title <- "Data+Scientist"
location <- "united+states"
# Set URLs
url_short <- 'https://www.indeed.com'
url_full <- paste0(url_short, '/jobs?q=', job_title, '&l=', location, '&start=10')
cat(url_full)
## https://www.indeed.com/jobs?q=Data+Scientist&l=united+states&start=10
# get the html file from search url
main_page <- read_html(url_full)
# get the total number of job posting from keywords
total_job_posting <- unlist(strsplit(main_page %>%
                                       html_node("#searchCount") %>%
                                       html_text(), split = ' '))
total_job_posting <- as.numeric(str_replace_all(total_job_posting[length(total_job_posting) - 1], ',', ''))
cat('Total number of job posting: ', total_job_posting)
## Total number of job posting: 27028
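To see how the token extraction above works, here is an illustrative check with made-up node text (the exact wording of Indeed's "#searchCount" node is an assumption):
x <- unlist(strsplit("Page 2 of 27,028 jobs", split = ' '))   # assumed format
as.numeric(str_replace_all(x[length(x) - 1], ',', ''))        # 27028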
# Setting up main page web scraping
links <- main_page %>%
html_nodes("h2 a") %>%
html_attr('href')
# Set page search sequence
page_seq <- paste0(url_short, '/jobs?q=', job_title, '&l=', location, '&start=', seq(10, 60, 10))
kw_ln <- c('Hadoop','Python','\\bSQL', 'NoSQL','\\bR\\b', 'Spark', 'SAS', 'Excel\\b', 'Hive', '\\bC\\b', 'Java', 'Tableau')
kw_edu <- c('(\\bB[\\.| ]?A\\.?\\b)|(\\bB[\\.| ]?S\\.?\\b)|\\1.?\\2|\\2.?\\1|Bachelor',
            '(\\bM[\\.| ]?A\\.?\\b)|(\\bM[\\.| ]?S\\.?\\b)|\\1.?\\2|\\2.?\\1|Master',
            'Ph[\\.| ]?D|Doctorate')
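A quick sanity check of the skill patterns against an illustrative sentence (not scraped data):
str_detect('Experience with Python, R, and SQL required', kw_ln)
# TRUE for the 'Python', '\\bSQL', and '\\bR\\b' patterns; FALSE for the rest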
# Raw html cleaning: replace line breaks, tabs, commas, and other separators with spaces
clean.text <- function(text) {
  str_replace_all(text, regex('\r\n|\n|\t|\r|,|/|<|>|\\.|[:space:]'), ' ')
}
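For example, on an illustrative string (not scraped text):
clean.text('Data Scientist, NYC\nSQL/Python required.')
# each comma, slash, period, and line break above becomes a space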
# Scrape web page and compute running total
scrape_web <- function(res, page_seq){
  for(i in 1:length(page_seq)){
    # page_seq holds job links relative to url_short
    job.url <- paste0(url_short, page_seq[i])
    Sys.sleep(1)
    cat(paste0('Reading job ', i, '\n'))
    tryCatch({
      html <- read_html(job.url)
      text <- html_text(html)
      text <- clean.text(text)
      # One row per skill keyword: 1 if the pattern appears in the posting, else 0
      df <- data.frame(skill = kw_ln, count = ifelse(str_detect(text, kw_ln), 1, 0))
      res$running$count <- res$running$count + df$count
      res$num_jobs <- res$num_jobs + 1
    }, error = function(e){cat("ERROR :", conditionMessage(e), "\n")})
  }
  return(res)
}
# Same as scrape_web, but tallies education keywords instead of skills
scrape_web_edu <- function(res, page_seq){
  for(i in 1:length(page_seq)){
    job.url <- paste0(url_short, page_seq[i])
    Sys.sleep(1)
    cat(paste0('Reading job ', i, '\n'))
    tryCatch({
      html <- read_html(job.url)
      text <- html_text(html)
      text <- clean.text(text)
      df <- data.frame(skill = kw_edu, count = ifelse(str_detect(text, kw_edu), 1, 0))
      res$running$count <- res$running$count + df$count
      res$num_jobs <- res$num_jobs + 1
    }, error = function(e){cat("ERROR :", conditionMessage(e), "\n")})
  }
  return(res)
}
# Remove \\b for ggplot visualization
kw_ln_ggplot <- c('Hadoop','Python','SQL', 'NoSQL','R', 'Spark', 'SAS', 'Excel', 'Hive', 'C', 'Java', 'Tableau')
kw_edu_ggplot <- c('bachelor','Master','PhD' )
# Get the running total
running <- data.frame(skill = kw_ln_ggplot, count = rep(0, length(kw_ln_ggplot)))
running_edu <- data.frame(Education = kw_edu_ggplot, count = rep(0, length(kw_edu_ggplot)))
# Since Indeed only displays a maximum of 20 pages of search results, we cannot use total_job_posting; instead we track the number of jobs actually read in num_jobs
num_jobs <- 0
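The calls that populate results and results_edu do not appear in the knitted output; a minimal sketch of how they would be produced from the pieces above (which links were passed is an assumption):
# `links` stands for the job links gathered from the search result pages
results <- scrape_web(list(running = running, num_jobs = num_jobs), links)
results_edu <- scrape_web_edu(list(running = running_edu, num_jobs = num_jobs), links)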
# running total
print(arrange(results$running, -count))
## skill count
## 1 Python 41
## 2 R 35
## 3 SQL 28
## 4 SAS 20
## 5 Java 20
## 6 Hadoop 19
## 7 Spark 17
## 8 C 17
## 9 Tableau 14
## 10 Hive 9
## 11 NoSQL 6
## 12 Excel 2
# running total count as percentage
results$running$count <- results$running$count / results$num_jobs
# Reformat the Job Title and Location to readable form
jt <- str_replace_all(job_title, '\\+|\\\"', ' ')
loc <- str_replace_all(location, '\\%2C+|\\+',' ')
# Visualization
p <- ggplot(results$running, aes(reorder(skill,-count), count)) + geom_bar(stat="identity") +
labs(x = 'Language', y = 'Frequency (%)', title = paste0('Language (%) for ', jt, ' in ', loc))
p + scale_y_continuous(labels = scales::percent, breaks = seq(0,1,0.1))
print(arrange(results_edu$running, -count))
## Education count
## 1 Master 20
## 2 bachelor 19
## 3 PhD 19
# running total count as percentage
results_edu$running$count <- results_edu$running$count / results_edu$num_jobs
# Reformat the Job Title and Location to readable form
jt <- str_replace_all(job_title, '\\+|\\\"', ' ')
loc <- str_replace_all(location, '\\%2C+|\\+',' ')
# Visualization
p <- ggplot(results_edu$running, aes(reorder(Education,-count), count)) + geom_bar(stat="identity") +
labs(x = 'Education', y = 'Frequency (%)', title = paste0('Education (%) for ', jt, ' in ', loc))
p + scale_y_continuous(labels = scales::percent, breaks = seq(0,1,0.1))
Text Analysis and Visualization
# Install
#install.packages("tm") # for text mining
#install.packages("SnowballC") # for text stemming
#install.packages("wordcloud") # word-cloud generator
#install.packages("RColorBrewer") # color palettes
# Load
library("tm")
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library("SnowballC")
library("wordcloud")
## Loading required package: RColorBrewer
library("RColorBrewer")
Connect to MySQL
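The connection code is not shown in the knitted output; a minimal sketch using RMySQL (credentials and database name are placeholders, not the originals):
library(RMySQL)
mydb <- dbConnect(MySQL(), user = 'user', password = 'password',
                  dbname = 'jobs_db', host = 'localhost')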
Get the table that contains the skill and count for each skill
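The queries that populate rs (skills) and ws (education) are likewise not shown; a sketch assuming the Indeed results were stored in tables analogous to the zip_* and reddit_* tables queried below (table names are assumptions):
rs <- dbGetQuery(mydb, 'select skill, count from indeed_skills;')
ws <- dbGetQuery(mydb, 'select education_level as education, count from indeed_edu;')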
Generate a word cloud
set.seed(1234)
wordcloud(words=rs$skill, freq = rs$count, min.freq = 1,
max.words=200, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"));
set.seed(1234)
wordcloud(words=ws$education, freq = ws$count, min.freq = 0,
max.words=200, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"));
barplot(rs$count, las = 2, names.arg = rs$skill,
col ="lightblue", main ="Most Frequent Skills",
ylab = "Skill Frequencies");
barplot(ws$count, las = 2, names.arg = ws$education,
col ="lightblue", main ="Most Frequent Education Level",
ylab = "Education Frequencies")
qs<-dbGetQuery(mydb, 'select skill, count, percent from zip_skills;')
## Warning in .local(conn, statement, ...): Decimal MySQL column 2 imported as
## numeric
head(qs)
## skill count percent
## 1 hadoop 185 0.49
## 2 python 156 0.41
## 3 sql 209 0.55
## 4 nosql 71 0.19
## 5 r 80 0.21
## 6 spark 151 0.40
ps<-dbGetQuery(mydb, 'select education_level as education, count, percent from zip_edu;')
## Warning in .local(conn, statement, ...): Decimal MySQL column 2 imported as
## numeric
head(ps)
## education count percent
## 1 bachelor 230 0.61
## 2 master 157 0.41
## 3 phd 19 0.05
Additional Visualization for ZipRecruiter jobs
set.seed(1234)
wordcloud(words=qs$skill, freq = qs$count, min.freq = 1,
max.words=200, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"));
set.seed(1234)
wordcloud(words=ps$education, freq = ps$count, min.freq = 0,
max.words=200, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"))
# Visualization
ggplot(qs, aes(reorder(skill, -percent), percent)) + geom_bar(stat="identity") +
  labs(x = 'Language', y = 'Frequency (%)', title = 'Language (%) for Data Scientist in the United States') +
  scale_y_continuous(labels = scales::percent, breaks = seq(0, 1, 0.1))
ggplot(ps, aes(reorder(education, -percent), percent)) + geom_bar(stat="identity") +
  labs(x = 'Education', y = 'Frequency (%)', title = 'Education (%) for Data Scientist in the United States') +
  scale_y_continuous(labels = scales::percent, breaks = seq(0, 1, 0.1))
Get Reddit results from the database
hs<-dbGetQuery(mydb, 'select skill, count, percent from reddit_skills;')
## Warning in .local(conn, statement, ...): Decimal MySQL column 2 imported as
## numeric
head(hs)
## skill count percent
## 1 python 80 0.64
## 2 sql 49 0.39
## 3 nosql 8 0.06
## 4 r 75 0.60
## 5 spark 27 0.22
## 6 sas 14 0.11
gs<-dbGetQuery(mydb, 'select education_level as education, count, percent from reddit_edu;')
## Warning in .local(conn, statement, ...): Decimal MySQL column 2 imported as
## numeric
head(gs)
## education count percent
## 1 bachelor 39 0.31
## 2 master 83 0.66
## 3 phd 46 0.37
Additional Visualization for Reddit posts
set.seed(1234)
wordcloud(words=gs$education, freq = gs$count, min.freq = 1,
max.words=200, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"));
set.seed(1234)
wordcloud(words=hs$skill, freq = hs$count, min.freq = 0,
max.words=200, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"))
# Visualization
ggplot(hs, aes(reorder(skill, -percent), percent)) + geom_bar(stat="identity") +
  labs(x = 'Language', y = 'Frequency (%)', title = 'Language (%) for Data Scientist in the United States') +
  scale_y_continuous(labels = scales::percent, breaks = seq(0, 1, 0.1))
ggplot(gs, aes(reorder(education, -percent), percent)) + geom_bar(stat="identity") +
  labs(x = 'Education', y = 'Frequency (%)', title = 'Education (%) for Data Scientist in the United States') +
  scale_y_continuous(labels = scales::percent, breaks = seq(0, 1, 0.1))
We examined the education and skill requirements in job postings from indeed.com, ziprecruiter.com, and reddit.com. We illustrate the top skills required with word clouds built from the number of times each skill appears across postings, and we also examine skills and education as a percentage of job postings. In addition, we collected full job descriptions and compiled them into text documents.
We can analyze these three documents and perform tf-idf to determine whether additional insights into the most in-demand data skills can be obtained.
This portion will be done in a different R Markdown found here: http://rpubs.com/ejd01/374188
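As a minimal sketch of how the tf-idf step could look with the tm package loaded above (the directory of compiled documents is a placeholder; the actual analysis lives in the linked R Markdown):
# One compiled text file per source site (indeed, ziprecruiter, reddit); path is hypothetical
docs <- VCorpus(DirSource('job_descriptions/'))
dtm <- DocumentTermMatrix(docs,
                          control = list(removePunctuation = TRUE,
                                         stopwords = TRUE,
                                         weighting = weightTfIdf))
# Terms with the largest total tf-idf weight across the three documents
head(sort(colSums(as.matrix(dtm)), decreasing = TRUE), 10)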
Discussion/Limitations:
Timing: even in subsequent searches on the same sites, we would see varying results. This gives some indication that there may be a temporal factor in the data (time of day or even season) in which some words show up more frequently.
Variation among sites: There was significant variation in the results from the analysis of each of the sites. For instance, Reddit results might show a high score for PhD whereas ZipRecruiter may not. We have not established the source of this variation.
Though job listings are a good starting point for thinking about what employers are looking for in candidates, it is not necessarily true that a job description accurately articulates the "most important" skills for a data scientist. This could take at least two forms. For instance, a job listing may specify bare minimum requirements (say, BA/BS) when in fact less commonly listed requirements (perhaps a PhD) are actually most important. Also, job listings are short descriptions that focus on skills that can be described in brief (and often administrative) language. As many job-searchers might feel, the most important job skills may not be outlined in the posting itself.
Our job listing search gives each job listing the same weight. Perhaps we should weight listings from top companies differently than others. That is, perhaps Google/Amazon/Facebook/etc. may have better specifications of the "most important" data science skills in their postings.
A search like ours could be more powerful, possibly, if there were a way to rate outcomes in association with the skills (such as which companies' listings were associated with the greatest productivity or success, or something along these lines), or to weight the scraped results in some other manner. For instance, we looked at results from Reddit posts but did not rank them according to individual upvote scores. Perhaps the most upvoted comments could be given more weight in the analysis than ones with lower scores or more downvotes.
In searching for single words, "SQL experience is not necessary" will have the same impact as "SQL experience is necessary". We would need more robust natural language processing to disambiguate such cases.
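To illustrate (the strings and the mitigation pattern are illustrative only, not part of the analysis):
str_detect('SQL experience is not necessary', '\\bSQL')   # TRUE
str_detect('SQL experience is necessary', '\\bSQL')        # TRUE, same hit
# One crude mitigation: flag matches followed within a few words by a negation
str_detect('SQL experience is not necessary', 'SQL(\\W+\\w+){0,3}\\W+not\\b')   # TRUE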
Finally, in one of our analyses, we evaluated the distribution of skills and educational requirements that are commonly understood to be important for data scientists. A further project might use data scraping to inform this initial keyword list, allowing for more robust results.