library(rvest)
## Loading required package: xml2
library(stringr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
library(tidyverse)
## ── Attaching packages ────────────────────────────────── tidyverse 1.2.1 ──
## ✔ tibble 1.4.2 ✔ purrr 0.2.4
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ───────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ readr::guess_encoding() masks rvest::guess_encoding()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::pluck() masks rvest::pluck()
library(RCurl)
## Loading required package: bitops
##
## Attaching package: 'RCurl'
## The following object is masked from 'package:tidyr':
##
## complete
# Check whether a host responds to ping (TRUE when the exit status is 0)
ping <- function(x, stderr = FALSE, stdout = FALSE, ...){
  pingvec <- system2("ping", x,
                     stderr = stderr,
                     stdout = stdout, ...)
  pingvec == 0
}
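For instance, we can verify that the site is reachable before launching the scrape. A minimal check, assuming Unix-style ping flags ('-c 1' sends a single packet):
# Returns TRUE when the host answers
ping(c('-c', '1', 'www.indeed.com'))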
# Search Words
job_title <- "Data+Scientist"
location <- "united+states"
# Set URLs, building the search URL from the keywords above
url_short <- 'https://www.indeed.com'
url_full <- paste0(url_short, '/jobs?q=', job_title, '&l=', location, '&start=10')
cat(url_full)
## https://www.indeed.com/jobs?q=Data+Scientist&l=united+states&start=10
# get the html file from search url
main_page <- read_html(url_full)
# get the total number of job postings for the keywords
total_job_posting <- unlist(strsplit(main_page %>%
                                       html_node("#searchCount") %>%
                                       html_text(), split = ' '))
total_job_posting <- as.numeric(str_replace_all(total_job_posting[length(total_job_posting) - 1], ',', ''))
cat('Total number of job postings: ', total_job_posting)
## Total number of job postings: 27242
# Setting up main page web scraping
links <- main_page %>%
html_nodes("h2 a") %>%
html_attr('href')
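These hrefs are typically relative paths, so the full posting URLs are formed by prepending url_short; for example:
# Build absolute URLs for the individual postings
job_links <- paste0(url_short, links)
head(job_links)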
# Set page search sequence; paths are kept relative so url_short can be prepended in the scrapers below
page_seq <- paste0("/jobs?q=", job_title, "&l=", location, "&start=", seq(10, 60, 10))
kw_ln <- c('Hadoop','Python','\\bSQL', 'NoSQL','\\bR\\b', 'Spark', 'SAS', 'Excel\\b', 'Hive', '\\bC\\b', 'Java', 'Tableau')
kw_edu <- c('(\\bB[\\.| ]?A\\.?\\b)|(\\bB[\\.| ]?S\\.?\\b)|Bachelor',
            '(\\bM[\\.| ]?A\\.?\\b)|(\\bM[\\.| ]?S\\.?\\b)|Master',
            'Ph[\\.| ]?D|Doctorate')
# Raw html cleaning: replace newlines, tabs, commas, and other separators with spaces
clean.text <- function(text){
  str_replace_all(text, regex('\r\n|\n|\t|\r|,|/|<|>|\\.|[:space:]'), ' ')
}
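A quick sanity check of the patterns against an illustrative snippet (the sample sentence is made up):
sample_text <- clean.text("Requirements: M.S. in Statistics; experience with Python, R, and SQL.")
str_detect(sample_text, kw_ln)   # one logical per skill pattern
str_detect(sample_text, kw_edu)  # one logical per education pattern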
# Scrape each search-results page and compute the running total per skill keyword
scrape_web <- function(res, page_seq){
  for(i in 1:length(page_seq)){
    job.url <- paste0(url_short, page_seq[i])
    Sys.sleep(1)  # pause between requests to avoid hammering the server
    cat(paste0('Reading job ', i, '\n'))
    tryCatch({
      html <- read_html(job.url)
      text <- clean.text(html_text(html))
      # 1 if the page mentions the skill at least once, 0 otherwise
      df <- data.frame(skill = kw_ln, count = ifelse(str_detect(text, kw_ln), 1, 0))
      res$running$count <- res$running$count + df$count
      res$num_jobs <- res$num_jobs + 1
    }, error = function(e){cat("ERROR :", conditionMessage(e), "\n")})
  }
  return(res)
}
# Same loop as scrape_web, but accumulating counts for the education-level patterns
scrape_web_edu <- function(res, page_seq){
  for(i in 1:length(page_seq)){
    job.url <- paste0(url_short, page_seq[i])
    Sys.sleep(1)
    cat(paste0('Reading job ', i, '\n'))
    tryCatch({
      html <- read_html(job.url)
      text <- clean.text(html_text(html))
      df <- data.frame(skill = kw_edu, count = ifelse(str_detect(text, kw_edu), 1, 0))
      res$running$count <- res$running$count + df$count
      res$num_jobs <- res$num_jobs + 1
    }, error = function(e){cat("ERROR :", conditionMessage(e), "\n")})
  }
  return(res)
}
# Remove \\b for ggplot visualization
kw_ln_ggplot <- c('Hadoop','Python','SQL', 'NoSQL','R', 'Spark', 'SAS', 'Excel', 'Hive', 'C', 'Java', 'Tableau')
kw_edu_ggplot <- c('bachelor','Master','PhD' )
# Get the running total
running <- data.frame(skill = kw_ln_ggplot, count = rep(0, length(kw_ln_ggplot)))
running_edu <- data.frame(Education = kw_edu_ggplot, count = rep(0, length(kw_edu_ggplot)))
# Indeed displays at most 20 pages of search results, so we cannot rely on total_job_posting; instead we track the number of pages actually scraped in num_jobs
num_jobs <- 0
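The results and results_edu objects printed below come from invoking the scrapers on the page sequence; a minimal sketch of that call (in the knitted document the totals appear to have been restored from a previous run rather than re-scraped):
results <- scrape_web(list(running = running, num_jobs = num_jobs), page_seq)
results_edu <- scrape_web_edu(list(running = running_edu, num_jobs = num_jobs), page_seq)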
# running total
print(arrange(results$running, -count))
## skill count
## 1 Python 44
## 2 R 35
## 3 SQL 29
## 4 Spark 23
## 5 Java 20
## 6 Hadoop 18
## 7 SAS 16
## 8 C 13
## 9 Hive 12
## 10 Tableau 11
## 11 NoSQL 6
## 12 Excel 3
# running total count as a percentage of pages scraped
results$running$count <- results$running$count/results$num_jobs
# Reformat the Job Title and Location to readable form
jt <- str_replace_all(job_title, '\\+|\\\"', ' ')
loc <- str_replace_all(location, '\\%2C+|\\+',' ')
# Visualization
#p <- ggplot(results$running, aes(reorder(skill,-count), count)) + geom_bar(stat="identity") +
# labs(x = 'Language', y = 'Frequency (%)', title = paste0('Language (%) for ', jt, ' in ', loc))
#p + scale_y_continuous(labels = scales::percent, breaks = seq(0,1,0.1))
print(arrange(results_edu$running, -count))
## Education count
## 1 Master 23
## 2 bachelor 21
## 3 PhD 18
# running total count as a percentage of pages scraped
results_edu$running$count <- results_edu$running$count/results_edu$num_jobs
# Reformat the Job Title and Location to readable form
jt <- str_replace_all(job_title, '\\+|\\\"', ' ')
loc <- str_replace_all(location, '\\%2C+|\\+',' ')
# Visualization
# Commented out so that we just read the pre-saved .csv results right below
# p <- ggplot(results_edu$running, aes(reorder(Education,-count), count)) + geom_bar(stat="identity") +
# labs(x = 'Education', y = 'Frequency (%)', title = paste0('Education (%) for ', jt, ' in ', loc))
# p + scale_y_continuous(labels = scales::percent, breaks = seq(0,1,0.1))
Text Analysis and Visualization
Connect to MySQL
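The mydb connection object used below is created roughly as follows; the database name, host, and credentials here are placeholders, not the actual values:
library(RMySQL)
# Placeholder credentials; substitute your own
mydb <- dbConnect(MySQL(), user = 'user', password = 'password',
                  dbname = 'jobs', host = 'localhost')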
dbListFields(mydb, 'indeed_skills')
## [1] "indeed_skill_id" "skill" "count" "job_count"
x<-dbGetQuery(mydb, 'select * from indeed_skills;')
head(x)
## indeed_skill_id skill count job_count
## 1 1 hadoop 58 60
## 2 2 python 152 60
## 3 3 sql 108 60
## 4 4 nosql 15 60
## 5 5 r 120 60
## 6 6 spark 57 60
Get the table that contains the count for each skill
indeed_skill<-dbGetQuery(mydb, 'select skill, count from indeed_skills;')
head(indeed_skill)
## skill count
## 1 hadoop 58
## 2 python 152
## 3 sql 108
## 4 nosql 15
## 5 r 120
## 6 spark 57
indeed_edu<-dbGetQuery(mydb, 'select education_level as education, count from indeed_edu;')
head(indeed_edu)
## education count
## 1 bachelor 83
## 2 master 104
## 3 phd 80
zip_skill<-dbGetQuery(mydb, 'select skill, count, percent from zip_skills;')
## Warning in .local(conn, statement, ...): Decimal MySQL column 2 imported as
## numeric
head(zip_skill)
## skill count percent
## 1 hadoop 185 0.49
## 2 python 156 0.41
## 3 sql 209 0.55
## 4 nosql 71 0.19
## 5 r 80 0.21
## 6 spark 151 0.40
zip_edu<-dbGetQuery(mydb, 'select education_level as education, count, percent from zip_edu;')
## Warning in .local(conn, statement, ...): Decimal MySQL column 2 imported as
## numeric
head(zip_edu)
## education count percent
## 1 bachelor 230 0.61
## 2 master 157 0.41
## 3 phd 19 0.05
reddit_skill<-dbGetQuery(mydb, 'select skill, count, percent from reddit_skills;')
## Warning in .local(conn, statement, ...): Decimal MySQL column 2 imported as
## numeric
head(reddit_skill)
## skill count percent
## 1 python 80 0.64
## 2 sql 49 0.39
## 3 nosql 8 0.06
## 4 r 75 0.60
## 5 spark 27 0.22
## 6 sas 14 0.11
reddit_edu<-dbGetQuery(mydb, 'select education_level as education, count, percent from reddit_edu;')
## Warning in .local(conn, statement, ...): Decimal MySQL column 2 imported as
## numeric
head(reddit_edu)
## education count percent
## 1 bachelor 39 0.31
## 2 master 83 0.66
## 3 phd 46 0.37
Bar plots - Skills
# Indeed
p <- ggplot(indeed_skill, aes(x = reorder(skill, -count), y = count, fill = skill)) +
  geom_bar(stat = "identity") +
  labs(x = 'Language', y = 'Indeed - Frequency', title = 'Language for Data Scientist in the United States')
p #+ scale_y_continuous(labels = scales::percent, breaks = seq(0,1,0.1))
# Zip Recruiter
p <- ggplot(zip_skill, aes(x = reorder(skill, -count), y = count, fill = skill)) +
  geom_bar(stat = "identity") +
  labs(x = 'Language', y = 'Zip Recruiter - Frequency', title = 'Language for Data Scientist in the United States')
p #+ scale_y_continuous(labels = scales::percent, breaks = seq(0,1,0.1))
# Reddit
p <- ggplot(reddit_skill, aes(x = reorder(skill, -count), y = count, fill = skill)) +
  geom_bar(stat = "identity") +
  labs(x = 'Language', y = 'Reddit - Frequency', title = 'Language for Data Scientist in the United States')
p #+ scale_y_continuous(labels = scales::percent, breaks = seq(0,1,0.1))
Bar plots - Education
# Indeed
p <- ggplot(indeed_edu, aes(x = reorder(education, -count), y = count, fill = education)) +
  geom_bar(stat = "identity") +
  labs(x = 'Education', y = 'Indeed - Frequency', title = 'Education for Data Scientist in the United States')
p #+ scale_y_continuous(labels = scales::percent, breaks = seq(0,1,0.1))
# Zip Recruiter
p <- ggplot(zip_edu, aes(x = reorder(education, -count), y = count, fill = education)) +
  geom_bar(stat = "identity") +
  labs(x = 'Education', y = 'Zip Recruiter - Frequency', title = 'Education for Data Scientist in the United States')
p #+ scale_y_continuous(labels = scales::percent, breaks = seq(0,1,0.1))
# Reddit
p <- ggplot(reddit_edu, aes(x = reorder(education, -count), y = count, fill = education)) +
  geom_bar(stat = "identity") +
  labs(x = 'Education', y = 'Reddit - Frequency', title = 'Education for Data Scientist in the United States')
p #+ scale_y_continuous(labels = scales::percent, breaks = seq(0,1,0.1))
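To compare the three sources side by side, the per-source tables can also be stacked and faceted; a sketch under the column names returned by the queries above:
# Combine the three education tables with a source label, then facet by source
edu_all <- bind_rows(
  mutate(indeed_edu, source = 'Indeed'),
  mutate(zip_edu[, c('education', 'count')], source = 'ZipRecruiter'),
  mutate(reddit_edu[, c('education', 'count')], source = 'Reddit')
)
ggplot(edu_all, aes(reorder(education, -count), count, fill = education)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ source, scales = 'free_y') +
  labs(x = 'Education', y = 'Frequency', title = 'Education for Data Scientist in the United States')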
Word clouds
Indeed - Skills
library(wordcloud)      # provides wordcloud()
library(RColorBrewer)   # provides brewer.pal()
set.seed(1234)
wordcloud(words = indeed_skill$skill, freq = indeed_skill$count, min.freq = 1,
          max.words = 200, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
Zip Recruiter - Skills
set.seed(1234)
wordcloud(words = zip_skill$skill, freq = zip_skill$count, min.freq = 1,
          max.words = 200, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
Reddit - Skills
set.seed(1234)
wordcloud(words = reddit_skill$skill, freq = reddit_skill$count, min.freq = 1,
          max.words = 200, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
Indeed - Education
set.seed(1234)
wordcloud(words = indeed_edu$education, freq = indeed_edu$count, min.freq = 1,
          max.words = 200, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
Zip Recruiter - Education
set.seed(1234)
wordcloud(words = zip_edu$education, freq = zip_edu$count, min.freq = 1,
          max.words = 200, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
Reddit - Education
set.seed(1234)
wordcloud(words = reddit_edu$education, freq = reddit_edu$count, min.freq = 1,
          max.words = 200, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
We examined the education and skill requirements in postings from indeed.com, ziprecruiter.com, and reddit.com. We illustrate the top skills through word clouds built from the number of times each skill appears across job postings, and we also examine skills and education as a percentage of job postings. In addition, we collected full job descriptions and compiled them into text documents.
We can analyze these three documents with tf-idf to determine whether additional insights about the most in-demand data skills can be obtained.
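A minimal sketch of that tf-idf computation with the tidytext package; the file names for the three compiled documents are hypothetical placeholders:
library(tidytext)
# Hypothetical file names for the compiled job-description documents
docs <- tibble(source = c('indeed', 'ziprecruiter', 'reddit'),
               text = c(readr::read_file('indeed_jobs.txt'),
                        readr::read_file('ziprecruiter_jobs.txt'),
                        readr::read_file('reddit_posts.txt')))
docs %>%
  unnest_tokens(word, text) %>%          # one row per token per source
  count(source, word, sort = TRUE) %>%   # term frequencies
  bind_tf_idf(word, source, n) %>%       # tf-idf, treating each source as a document
  arrange(desc(tf_idf))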
That portion of the analysis can be found here: http://rpubs.com/ejd01/374331