Indeed

library(rvest)
## Loading required package: xml2
library(stringr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
library(tidyverse)
## ── Attaching packages ────────────────────────────────── tidyverse 1.2.1 ──
## ✔ tibble  1.4.2     ✔ purrr   0.2.4
## ✔ readr   1.1.1     ✔ forcats 0.2.0
## ── Conflicts ───────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()         masks stats::filter()
## ✖ readr::guess_encoding() masks rvest::guess_encoding()
## ✖ dplyr::lag()            masks stats::lag()
## ✖ purrr::pluck()          masks rvest::pluck()
library(RCurl)
## Loading required package: bitops
## 
## Attaching package: 'RCurl'
## The following object is masked from 'package:tidyr':
## 
##     complete
# Check whether a host is reachable (ping exit status 0 means success)
ping <- function(x, stderr = FALSE, stdout = FALSE, ...){
    pingvec <- system2("ping", x,
                       stderr = stderr,
                       stdout = stdout, ...)
    pingvec == 0
}
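For example, one could verify connectivity before scraping (a hypothetical usage; '-c 1' sends a single ping on Linux/macOS, while Windows uses '-n 1'):

# hypothetical check: abort early if the site is unreachable
if (!ping(c('-c', '1', 'www.indeed.com'))) stop('indeed.com is unreachable')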
# Search Words
job_title <- "Data+Scientist"
location <- "united+states"
# Set URLs
url_short <- 'https://www.indeed.com'
url_full <- paste0(url_short, '/jobs?q=', job_title, '&l=', location, '&start=10')
cat(url_full)
## https://www.indeed.com/jobs?q=Data+Scientist&l=united+states&start=10
# get the html file from search url
main_page <- read_html(url_full)
# get the total number of job postings for the keywords
total_job_posting <- unlist(strsplit(main_page %>%
                              html_node("#searchCount") %>%
                              html_text(), split = ' '))
total_job_posting <- as.numeric(str_replace_all(total_job_posting[length(total_job_posting)-1],',',''))
cat('Total number of job postings: ', total_job_posting)
## Total number of job postings:  27028
# Setting up main page web scraping
links <- main_page %>%
 html_nodes("h2 a") %>%
 html_attr('href')
# Set page search sequence
page_seq <- paste0("/jobs?q=", job_title, "&l=", location, "&start=", seq(10, 60, 10))

kw_ln <- c('Hadoop','Python','\\bSQL', 'NoSQL','\\bR\\b', 'Spark', 'SAS', 'Excel\\b', 'Hive', '\\bC\\b', 'Java', 'Tableau')
kw_edu <- c('(\\bB[\\. ]?A\\.?\\b)|(\\bB[\\. ]?S\\.?\\b)|Bachelor',
            '(\\bM[\\. ]?A\\.?\\b)|(\\bM[\\. ]?S\\.?\\b)|Master',
            'Ph[\\. ]?D|Doctorate')
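As a quick sanity check of these patterns (the sample sentence is made up for illustration):

# str_detect() is vectorized over patterns, so one sample string tests every keyword at once
sample_text <- 'Requires a B.S. or Master degree and experience with Python, R and SQL'
kw_ln[str_detect(sample_text, kw_ln)]   # "Python"  "\\bSQL"  "\\bR\\b"
str_detect(sample_text, kw_edu)         # TRUE TRUE FALSE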
# Raw html cleaning: replace newlines, tabs, commas, slashes, etc. with spaces
clean.text <- function(text)
{
 str_replace_all(text, regex('\r\n|\n|\t|\r|,|/|<|>|\\.|[:space:]'), ' ')
}
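For instance, on a made-up snippet:

clean.text('R,\r\n\tPython/SQL')   # commas, newlines, tabs and slashes all become spaces: "R   Python SQL"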
# Scrape web page and compute running total
scrape_web <- function(res, page_seq ){
 for(i in 1:length(page_seq)){
   job.url <- paste0(url_short, page_seq[i])
   
   Sys.sleep(1)
   cat(paste0('Reading job ', i, '\n'))
   
   tryCatch({
     html <- read_html(job.url)
     text <- html_text(html)
     text <- clean.text(text)
     df <- data.frame(skill = kw_ln, count = ifelse(str_detect(text, kw_ln), 1, 0))
     res$running$count <- res$running$count + df$count
     res$num_jobs <- res$num_jobs + 1
   }, error=function(e){cat("ERROR :",conditionMessage(e), "\n")})
 }
 return(res)
}
scrape_web_edu <- function(res, page_seq ){
 for(i in 1:length(page_seq)){
   job.url <- paste0(url_short, page_seq[i])
   
   Sys.sleep(1)
   cat(paste0('Reading job ', i, '\n'))
   
   tryCatch({
     html <- read_html(job.url)
     text <- html_text(html)
     text <- clean.text(text)
     df <- data.frame(skill = kw_edu, count = ifelse(str_detect(text, kw_edu), 1, 0))
     res$running$count <- res$running$count + df$count
     res$num_jobs <- res$num_jobs + 1
   }, error=function(e){cat("ERROR :",conditionMessage(e), "\n")})
 }
 return(res)
}
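The two scrapers above differ only in the keyword vector they count, so a single parameterized version (a sketch, equivalent to the pair above but not used below) could replace both:

# sketch: one scraper for any keyword vector
scrape_web_kw <- function(res, page_seq, keywords){
 for(i in seq_along(page_seq)){
   job.url <- paste0(url_short, page_seq[i])
   Sys.sleep(1)                       # be polite: pause between requests
   cat(paste0('Reading job ', i, '\n'))
   tryCatch({
     text <- clean.text(html_text(read_html(job.url)))
     res$running$count <- res$running$count + ifelse(str_detect(text, keywords), 1, 0)
     res$num_jobs <- res$num_jobs + 1
   }, error = function(e){cat('ERROR :', conditionMessage(e), '\n')})
 }
 return(res)
}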
# Remove \\b for ggplot visualization
kw_ln_ggplot <- c('Hadoop','Python','SQL', 'NoSQL','R', 'Spark', 'SAS', 'Excel', 'Hive', 'C', 'Java', 'Tableau')
kw_edu_ggplot <- c('Bachelor', 'Master', 'PhD')
# Initialize the running totals
running <- data.frame(skill = kw_ln_ggplot, count = rep(0, length(kw_ln_ggplot)))
running_edu <- data.frame(Education = kw_edu_ggplot, count = rep(0, length(kw_edu_ggplot)))
# Indeed displays at most 20 pages of results per search, so we cannot rely on total_job_posting; instead we track the number of pages actually read in num_jobs
num_jobs <- 0
# Scrape the result pages and accumulate the running totals
results <- scrape_web(list(running = running, num_jobs = num_jobs), page_seq)
results_edu <- scrape_web_edu(list(running = running_edu, num_jobs = num_jobs), page_seq)
# running total
print(arrange(results$running, -count))
##      skill count
## 1   Python    41
## 2        R    35
## 3      SQL    28
## 4      SAS    20
## 5     Java    20
## 6   Hadoop    19
## 7    Spark    17
## 8        C    17
## 9  Tableau    14
## 10    Hive     9
## 11   NoSQL     6
## 12   Excel     2
# running total count as percentage
results$running$count <- results$running$count / results$num_jobs
# Reformat the Job Title and Location to readable form
jt <- str_replace_all(job_title, '\\+|\\\"', ' ')
loc <- str_replace_all(location, '\\%2C+|\\+',' ')
# Visualization
p <- ggplot(results$running, aes(reorder(skill,-count), count)) + geom_bar(stat="identity") +
 labs(x = 'Language', y = 'Frequency (%)', title = paste0('Language (%) for ', jt, ' in ', loc)) 
p + scale_y_continuous(labels = scales::percent, breaks = seq(0,1,0.1))

print(arrange(results_edu$running, -count))
##   Education count
## 1    Master    20
## 2  Bachelor    19
## 3       PhD    19
# running total count as percentage
results_edu$running$count <- results_edu$running$count / results_edu$num_jobs
# Reformat the Job Title and Location to readable form
jt <- str_replace_all(job_title, '\\+|\\\"', ' ')
loc <- str_replace_all(location, '\\%2C+|\\+',' ')
# Visualization
p <- ggplot(results_edu$running, aes(reorder(Education,-count), count)) + geom_bar(stat="identity") +
 labs(x = 'Education', y = 'Frequency (%)', title = paste0('Education (%) for ', jt, ' in ', loc)) 
p + scale_y_continuous(labels = scales::percent, breaks = seq(0,1,0.1))

Text Analysis and Visualization

# Install
#install.packages("tm")  # for text mining
#install.packages("SnowballC") # for text stemming
#install.packages("wordcloud") # word-cloud generator 
#install.packages("RColorBrewer") # color palettes
# Load
library("tm")
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library("SnowballC")
library("wordcloud")
## Loading required package: RColorBrewer
library("RColorBrewer")

Connect to MySQL.
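A minimal connection sketch; the host, credentials, and schema name below are placeholders:

library(DBI)
# placeholder credentials -- substitute your own MySQL host, user, password and schema
mydb <- dbConnect(RMySQL::MySQL(), dbname = 'jobs', host = 'localhost',
                  user = 'user', password = 'password')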

Get the tables that contain the count and percentage for each skill and education level.
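A sketch of the queries; the table names indeed_skills and indeed_edu are assumptions mirroring the zip_* and reddit_* tables queried later:

# assumed Indeed tables, analogous to zip_skills/zip_edu below
rs <- dbGetQuery(mydb, 'select skill, count, percent from indeed_skills;')
ws <- dbGetQuery(mydb, 'select education_level as education, count, percent from indeed_edu;')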

Generate word clouds for the skills and education levels.

set.seed(1234)
wordcloud(words=rs$skill, freq = rs$count, min.freq = 1,
          max.words=200, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"))

set.seed(1234)
wordcloud(words=ws$education, freq = ws$count, min.freq = 0,
          max.words=200, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"))

barplot(rs$count, las = 2, names.arg = rs$skill,
        col ="lightblue", main ="Most Frequent Skills",
        ylab = "Skill Frequencies");

barplot(ws$count, las = 2, names.arg = ws$education,
        col ="lightblue", main ="Most Frequent Education Level",
        ylab = "Education Frequencies")

ZipRecruiter

qs<-dbGetQuery(mydb, 'select skill, count, percent from zip_skills;')
## Warning in .local(conn, statement, ...): Decimal MySQL column 2 imported as
## numeric
head(qs)
##    skill count percent
## 1 hadoop   185    0.49
## 2 python   156    0.41
## 3    sql   209    0.55
## 4  nosql    71    0.19
## 5      r    80    0.21
## 6  spark   151    0.40
ps<-dbGetQuery(mydb, 'select education_level as education, count, percent from zip_edu;')
## Warning in .local(conn, statement, ...): Decimal MySQL column 2 imported as
## numeric
head(ps)
##   education count percent
## 1  bachelor   230    0.61
## 2    master   157    0.41
## 3       phd    19    0.05

Additional Visualization for ZipRecruiter jobs

set.seed(1234)

wordcloud(words=qs$skill, freq = qs$count, min.freq = 1,
          max.words=200, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"))

set.seed(1234)
wordcloud(words=ps$education, freq = ps$count, min.freq = 0,
          max.words=200, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"))

# Visualization
ggplot(qs, aes(reorder(skill,-percent), percent)) + geom_bar(stat="identity") +
  labs(x = 'Language', y = 'Frequency (%)', title = 'Language (%) for Data Scientist in the United States') +
  scale_y_continuous(labels = scales::percent, breaks = seq(0,1,0.1))

ggplot(ps, aes(reorder(education,-percent), percent)) + geom_bar(stat="identity") +
  labs(x = 'Education', y = 'Frequency (%)', title = 'Education (%) for Data Scientist in the United States') +
  scale_y_continuous(labels = scales::percent, breaks = seq(0,1,0.1))

Reddit

Get the Reddit results from the database.

hs<-dbGetQuery(mydb, 'select skill, count, percent from reddit_skills;')
## Warning in .local(conn, statement, ...): Decimal MySQL column 2 imported as
## numeric
head(hs)
##    skill count percent
## 1 python    80    0.64
## 2    sql    49    0.39
## 3  nosql     8    0.06
## 4      r    75    0.60
## 5  spark    27    0.22
## 6    sas    14    0.11
gs<-dbGetQuery(mydb, 'select education_level as education, count, percent from reddit_edu;')
## Warning in .local(conn, statement, ...): Decimal MySQL column 2 imported as
## numeric
head(gs)
##   education count percent
## 1  bachelor    39    0.31
## 2    master    83    0.66
## 3       phd    46    0.37

Additional Visualization for Reddit posts

set.seed(1234)
wordcloud(words=gs$education, freq = gs$count, min.freq = 1,
          max.words=200, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"))

set.seed(1234)

wordcloud(words=hs$skill, freq = hs$count, min.freq = 0,
          max.words=200, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"))

# Visualization
ggplot(hs, aes(reorder(skill,-percent), percent)) + geom_bar(stat="identity") +
  labs(x = 'Language', y = 'Frequency (%)', title = 'Language (%) for Data Scientist in the United States') +
  scale_y_continuous(labels = scales::percent, breaks = seq(0,1,0.1))

ggplot(gs, aes(reorder(education,-percent), percent)) + geom_bar(stat="identity") +
  labs(x = 'Education', y = 'Frequency (%)', title = 'Education (%) for Data Scientist in the United States') +
  scale_y_continuous(labels = scales::percent, breaks = seq(0,1,0.1))

We examined the education and skill requirements for data scientists from indeed.com, ziprecruiter.com, and reddit.com. We illustrate the top skills with word clouds built from the number of times each skill appears across job postings, and we also examine skills and education levels as a percentage of postings. In addition, we collected full job descriptions and compiled them into text documents.

We can analyze these three documents with TF-IDF to determine whether additional insights about the most in-demand data skills can be obtained.

This portion will be done in a different R Markdown found here: http://rpubs.com/ejd01/374188
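As a brief illustration of what that analysis involves, a minimal TF-IDF sketch using the tm package loaded above (the job_descriptions folder is a placeholder for wherever the three compiled text documents are stored):

# build a corpus from the text documents and weight terms by TF-IDF
corpus <- VCorpus(DirSource('job_descriptions'))   # placeholder path
dtm <- DocumentTermMatrix(corpus, control = list(weighting = weightTfIdf))
inspect(dtm)   # preview the weighted document-term matrix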

Discussion/Limitations