Indeed.com offered a fairly open way to harvest information, with no blocking, so we started there. We load several libraries for XML and HTML processing, plus some newer ones for text mining that proved to be very useful.
library(rvest)
## Loading required package: xml2
library(RCurl)
## Loading required package: bitops
library(plyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:RCurl':
##
## complete
library(ggplot2)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(wordcloud)
## Loading required package: RColorBrewer
library(tidytext)
library(xtable)
library(readr)
##
## Attaching package: 'readr'
## The following object is masked from 'package:rvest':
##
## guess_encoding
library(tidytext)
library(knitr)
## Warning: package 'knitr' was built under R version 3.3.3
library(RColorBrewer)
library(SnowballC)
I found a great function that packages much of the text-mining workflow and produces most of the outputs of the analysis.
# Load the external word-cloud helper script (presumably defines the
# rquery.wordcloud() function called later in this report).
# NOTE(review): this executes remote code fetched over plain HTTP on every
# run; consider vendoring a reviewed local copy of the script.
source(file = "http://www.sthda.com/upload/rquery_wordcloud.r")
Then I run the search for “hot” cities for data science jobs, limiting the number of hits requested from the source. After that, I create a data frame to “catch” the extracted information and relevant fields.
# "Hot" cities for data science openings; "+" stands in for spaces because
# these values are pasted straight into the search URL.
city.set <- c("New+York+NY", "Seattle+WA", "Washington+DC", "Atlanta+GA",
              "Boston+MA")
# The search term we will be looking for.
target.job <- "data+scientist"
# Indeed offers a fairly open way to harvest information, although the tags
# get confusing.
base.url <- "https://www.indeed.com/"
# Upper bound on the number of search hits requested per city.
max.results <- 50
# Create an empty data frame to hold everything that we collect. The names
# are listed one per column ("links" and "summary.full" used to be fused into
# a single string) and match the columns produced by the scraping loop.
n <- c("city", "job.title", "company.name", "job.location",
       "summary.short", "job.salary", "links", "summary.full")
jobs.data <- data.frame(matrix(ncol = length(n), nrow = 0))
colnames(jobs.data) <- n
# Harvest the search results for every city and parse them with XPath
# queries to extract the interesting fields. For each city this collects all
# result pages, follows every posting link to fetch the full summary, and
# appends one row per posting to jobs.data.
for (city in city.set) {
  print(paste("Downloading data for: ", city))

  # Per-city accumulators; each results page appends to them so that earlier
  # pages are not overwritten by later ones.
  links <- character(0)
  job.title <- character(0)
  company.name <- character(0)
  job.location <- character(0)
  summary.short <- character(0)

  # seq() produces the page offsets 0, 10, 20, ... below max.results.
  # (range() only returns c(min, max) of its arguments, i.e. two offsets.)
  for (start in seq(0, max(max.results - 1, 0), by = 10)) {
    url <- paste0(base.url, "jobs?q=", target.job, "&l=", city,
                  "&start=", start)
    page <- read_html(url)
    Sys.sleep(1)

    # The job-title elements carry both the posting link and the title.
    title.nodes <- html_nodes(page, xpath = '//*[@data-tn-element="jobTitle"]')
    links <- c(links, html_attr(title.nodes, "href"))
    job.title <- c(job.title, html_attr(title.nodes, "title"))

    # get the company name
    company.name <- c(
      company.name,
      trimws(html_text(html_nodes(page, xpath = '//*[@class="company"]')))
    )
    # get the job location
    job.location <- c(
      job.location,
      trimws(html_text(html_nodes(page, xpath = '//*[@class="location"]')))
    )
    # get the short summary
    summary.short <- c(
      summary.short,
      trimws(html_text(html_nodes(page, xpath = '//*[@class="summary"]')))
    )
  }

  # create a structure to hold our full summaries
  summary.full <- rep(NA_character_, length(links))
  # fill in the job data
  job.city <- rep(city, length(links))
  # add a place-holder for the salary
  job.salary <- rep(0, length(links))

  # iterate over the links that we collected; seq_along() is a no-op when no
  # links were found, whereas 1:length(links) would iterate over c(1, 0)
  for (i in seq_along(links)) {
    if (is.na(links[i])) {
      next
    }
    # build the link; base.url already ends in "/", so drop a leading slash
    # from the relative link to avoid "//" in the URL
    link <- paste0(base.url, sub("^/", "", links[i]))
    # pull the link; a posting that cannot be fetched keeps its NA summary
    # instead of aborting the whole harvest
    detail.page <- tryCatch(
      read_html(link),
      error = function(e) {
        warning("Could not read ", link, ": ", conditionMessage(e),
                call. = FALSE)
        NULL
      }
    )
    if (is.null(detail.page)) {
      next
    }
    # get the full summary
    s.full <- trimws(html_text(
      html_nodes(detail.page, xpath = '//*[@class="summary"]')
    ))
    # check to make sure we got some data and if so, store it, as expired
    # postings return an empty result; only the first match fits in the slot
    if (length(s.full) > 0) {
      summary.full[i] <- s.full[1]
    }
  }

  # add the newly collected data to jobs.data; job.city (not the scalar city)
  # keeps the column lengths consistent even when no postings were found
  jobs.data <- rbind(jobs.data,
                     data.frame(city = job.city,
                                job.title,
                                company.name,
                                job.location,
                                summary.short,
                                job.salary,
                                links,
                                summary.full,
                                stringsAsFactors = FALSE))
}
## [1] "Downloading data for: New+York+NY"
## [1] "Downloading data for: Seattle+WA"
## [1] "Downloading data for: Washington+DC"
## [1] "Downloading data for: Atlanta+GA"
## [1] "Downloading data for: Boston+MA"
Then we write the resulting data frame to a CSV file.
# Persist the collected postings as MyData.csv in the working directory.
write.csv(x = jobs.data, file = "MyData.csv")
Now we start to analyze the information.
This is a word cloud that tells us a bit of the story about soft versus hard skills.
# Build one text document per job posting from the free-text columns.
# NOTE(review): rquery.wordcloud() with type = "text" appears to hand its
# input to tm's VectorSource (confirm against the sourced script). Given a
# whole data frame, that makes every COLUMN a single document, which mixes in
# cities and links and leaves only a handful of documents for the
# term-document matrix, so term associations become meaningless.
text.cols <- c("job.title", "summary.short", "summary.full")
text.fields <- lapply(jobs.data[text.cols], function(field) {
  field <- as.character(field)
  # postings without a full summary hold NA; do not turn that into a word
  field[is.na(field)] <- ""
  field
})
posting.text <- trimws(do.call(paste, unname(text.fields)))

# taking out words we don't want to be part of the cloud because they're not
# relevant (search terms and city names)
res <- rquery.wordcloud(posting.text, type = c("text"),
                        lang = "english",
                        excludeWords = c("new", "york", "austin", "sanfrancisco", "san", "francisco", "newyorkny", "seattlewa", "austintx", "seattle", "boston", "data", "scientist",
                                         "atlantaga", "atlanta", "washingtondc", "sanfranciscoca", "washington", "bostonma"),
                        textStemming = FALSE, colorPalette = "Dark2",
                        max.words = 200)
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words =
## max.words, : optimization could not be fit on page. It will not be plotted.
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words =
## max.words, : quantitative could not be fit on page. It will not be plotted.
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words =
## max.words, : amazoncom could not be fit on page. It will not be plotted.
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words =
## max.words, : benefits could not be fit on page. It will not be plotted.
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words =
## max.words, : decision could not be fit on page. It will not be plotted.
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words =
## max.words, : design could not be fit on page. It will not be plotted.
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words =
## max.words, : developing could not be fit on page. It will not be plotted.
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words =
## max.words, : experts could not be fit on page. It will not be plotted.
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words =
## max.words, : facebook could not be fit on page. It will not be plotted.
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words =
## max.words, : financial could not be fit on page. It will not be plotted.
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words =
## max.words, : focused could not be fit on page. It will not be plotted.
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words =
## max.words, : group could not be fit on page. It will not be plotted.
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words =
## max.words, : industry could not be fit on page. It will not be plotted.
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words =
## max.words, : internship could not be fit on page. It will not be plotted.
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words =
## max.words, : market could not be fit on page. It will not be plotted.
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words =
## max.words, : mathematical could not be fit on page. It will not be plotted.
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words =
## max.words, : minimum could not be fit on page. It will not be plotted.
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words =
## max.words, : national could not be fit on page. It will not be plotted.
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words =
## max.words, : operations could not be fit on page. It will not be plotted.
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words =
## max.words, : pipelines could not be fit on page. It will not be plotted.
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words =
## max.words, : production could not be fit on page. It will not be plotted.
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words
## = max.words, : professionals could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words =
## max.words, : prototype could not be fit on page. It will not be plotted.
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words =
## max.words, : services could not be fit on page. It will not be plotted.
Next, we show a table of the most frequently mentioned words.
# Pull the term-document matrix and the word-frequency table out of the
# word-cloud result.
tdm <- res[["tdm"]]
freqTable <- res[["freqTable"]]
# Display the ten most frequent words together with their counts
head(freqTable, n = 10)
## word freq
## machine machine 32
## analytics analytics 29
## learning learning 28
## experience experience 22
## scientists scientists 19
## analysis analysis 18
## mining mining 17
## will will 17
## area area 15
## work work 14
# Draw the frequencies of the first ten rows of the frequency table as a bar
# chart, labelling each bar with its word (las = 2 turns the labels
# perpendicular to the axis).
with(
  freqTable[1:10, ],
  barplot(
    height = freq,
    names.arg = word,
    las = 2,
    col = "lightblue",
    main = "Most frequent words",
    ylab = "Word frequencies"
  )
)
Next, we show the frequent terms in the job postings and, as an example, a method for quantifying how strongly other terms are associated with a given term, in this case “learning”.
# List the terms that occur at least 15 times in the term-document matrix.
findFreqTerms(x = tdm, lowfreq = 15)
## [1] "analysis" "analytics" "area" "experience" "learning"
## [6] "machine" "mining" "scientists" "will"
# The association (correlation) between terms can be analyzed as well: list
# the words whose correlation with "learning" is at least 0.3.
findAssocs(x = tdm, terms = "learning", corlimit = 0.3)
## $learning
## analytical analytics engineering
## 1.00 1.00 1.00
## machine modeling science
## 1.00 1.00 1.00
## team abilities ability
## 1.00 0.99 0.99
## academic achieved acquisition
## 0.99 0.99 0.99
## across actionable actuarial
## 0.99 0.99 0.99
## additional additionally address
## 0.99 0.99 0.99
## adhoc advantage agencysponsored
## 0.99 0.99 0.99
## agenda agile air
## 0.99 0.99 0.99
## algorithm algorithms allowing
## 0.99 0.99 0.99
## amazon amendments amounts
## 0.99 0.99 0.99
## analyses analysis analysts
## 0.99 0.99 0.99
## analyze analyzing api
## 0.99 0.99 0.99
## application apply applying
## 0.99 0.99 0.99
## architect areas arm
## 0.99 0.99 0.99
## artificial ask assist
## 0.99 0.99 0.99
## assistance assurance ats
## 0.99 0.99 0.99
## attendance audience automate
## 0.99 0.99 0.99
## automation background based
## 0.99 0.99 0.99
## become bias bioinformatician
## 0.99 0.99 0.99
## biologist boosting bottomline
## 0.99 0.99 0.99
## bring build building
## 0.99 0.99 0.99
## built businessdriven candidate
## 0.99 0.99 0.99
## capable cases chain
## 0.99 0.99 0.99
## challenge challenges changes
## 0.99 0.99 0.99
## clearly clia client
## 0.99 0.99 0.99
## clients clinical closely
## 0.99 0.99 0.99
## cloud code coding
## 0.99 0.99 0.99
## codingdevelopment collaborate collaboration
## 0.99 0.99 0.99
## collected collecting combination
## 0.99 0.99 0.99
## combine commensurate communicate
## 0.99 0.99 0.99
## companys compensation competitive
## 0.99 0.99 0.99
## complex compliance comprehensive
## 0.99 0.99 0.99
## computational computer computers
## 0.99 0.99 0.99
## conclusions conduct conducting
## 0.99 0.99 0.99
## consulting continuing contribute
## 0.99 0.99 0.99
## converse convincing correction
## 0.99 0.99 0.99
## create creation credibility
## 0.99 0.99 0.99
## culture custom customer
## 0.99 0.99 0.99
## customers daily datacentric
## 0.99 0.99 0.99
## datadriven decision deep
## 0.99 0.99 0.99
## degree deliver demonstrate
## 0.99 0.99 0.99
## demonstrated deploy design
## 0.99 0.99 0.99
## desire desktop detailed
## 0.99 0.99 0.99
## detecting develop developing
## 0.99 0.99 0.99
## development digging direction
## 0.99 0.99 0.99
## discovering disparate diverse
## 0.99 0.99 0.99
## dollar downstream drive
## 0.99 0.99 0.99
## driven dynamic ecological
## 0.99 0.99 0.99
## economics education effective
## 0.99 0.99 0.99
## efficient efficiently else
## 0.99 0.99 0.99
## emphasis employ enabling
## 0.99 0.99 0.99
## encompassing engaging enhancing
## 0.99 0.99 0.99
## ensure ensuring entirety
## 0.99 0.99 0.99
## environment equally equivalent
## 0.99 0.99 0.99
## establish examine example
## 0.99 0.99 0.99
## exceptions execution existing
## 0.99 0.99 0.99
## experience experiences experiments
## 0.99 0.99 0.99
## expertise experts explore
## 0.99 0.99 0.99
## extensive external extraction
## 0.99 0.99 0.99
## fake fashion faster
## 0.99 0.99 0.99
## fastpaced feature field
## 0.99 0.99 0.99
## find firm fluently
## 0.99 0.99 0.99
## focus focused folks
## 0.99 0.99 0.99
## following forecasting forest
## 0.99 0.99 0.99
## four fuel fulfillment
## 0.99 0.99 0.99
## full functions gather
## 0.99 0.99 0.99
## general generating generation
## 0.99 0.99 0.99
## give gleaned globally
## 0.99 0.99 0.99
## goal gradient growing
## 0.99 0.99 0.99
## guide handling hands
## 0.99 0.99 0.99
## hardware hate haul
## 0.99 0.99 0.99
## help high highend
## 0.99 0.99 0.99
## highly identify impact
## 0.99 0.99 0.99
## implementation implementing improvement
## 0.99 0.99 0.99
## include includes including
## 0.99 0.99 0.99
## incredible industry inferential
## 0.99 0.99 0.99
## information infrastructure initiate
## 0.99 0.99 0.99
## insights integrating interest
## 0.99 0.99 0.99
## interns interpret involve
## 0.99 0.99 0.99
## issues knowledge laboratory
## 0.99 0.99 0.99
## language languages large
## 0.99 0.99 0.99
## lead leaders leading
## 0.99 0.99 0.99
## leadingedge learningai learningpredictive
## 0.99 0.99 0.99
## least leverages leveraging
## 0.99 0.99 0.99
## line linux longterm
## 0.99 0.99 0.99
## looking loves mac
## 0.99 0.99 0.99
## maintenance make management
## 0.99 0.99 0.99
## managers managing manipulation
## 0.99 0.99 0.99
## manufacturing marketing marketingdriven
## 0.99 0.99 0.99
## masters mathematical mathematics
## 0.99 0.99 0.99
## matter maximizes may
## 0.99 0.99 0.99
## measuring medical members
## 0.99 0.99 0.99
## mentor methodologies methods
## 0.99 0.99 0.99
## metrics mine minimum
## 0.99 0.99 0.99
## mining mission model
## 0.99 0.99 0.99
## modelling models monitoring
## 0.99 0.99 0.99
## multidisciplinary must nas
## 0.99 0.99 0.99
## natural needed needs
## 0.99 0.99 0.99
## negative network news
## 0.99 0.99 0.99
## next novel offer
## 0.99 0.99 0.99
## office open operating
## 0.99 0.99 0.99
## opportunities optimization optimize
## 0.99 0.99 0.99
## orchestrating order organizational
## 0.99 0.99 0.99
## package parsing part
## 0.99 0.99 0.99
## partner parts passionate
## 0.99 0.99 0.99
## pattern patterns perform
## 0.99 0.99 0.99
## performance performing pipeline
## 0.99 0.99 0.99
## pipelines place plans
## 0.99 0.99 0.99
## platform policies position
## 0.99 0.99 0.99
## possibilities powerful prediction
## 0.99 0.99 0.99
## predictive preparation present
## 0.99 0.99 0.99
## presentation preventing prior
## 0.99 0.99 0.99
## prioritization problem problems
## 0.99 0.99 0.99
## procedures process processing
## 0.99 0.99 0.99
## product professionals programming
## 0.99 0.99 0.99
## project projects proofed
## 0.99 0.99 0.99
## proper proposing proprietary
## 0.99 0.99 0.99
## protocols prototype provide
## 0.99 0.99 0.99
## pursuit python quality
## 0.99 0.99 0.99
## quantitative questionnaire questions
## 0.99 0.99 0.99
## random ranging rapid
## 0.99 0.99 0.99
## rapidly reducing regression
## 0.99 0.99 0.99
## related relevant researchers
## 0.99 0.99 0.99
## responsibilities responsible results
## 0.99 0.99 0.99
## retrieval retrieve right
## 0.99 0.99 0.99
## role sas scale
## 0.99 0.99 0.99
## scientific scientists scripting
## 0.99 0.99 0.99
## seasoned segmentation selection
## 0.99 0.99 0.99
## sensitive servers service
## 0.99 0.99 0.99
## serving sets setting
## 0.99 0.99 0.99
## shape sharing shipment
## 0.99 0.99 0.99
## sideside similar skills
## 0.99 0.99 0.99
## small smart software
## 0.99 0.99 0.99
## solid solution solving
## 0.99 0.99 0.99
## something sophisticated source
## 0.99 0.99 0.99
## sources sourcingcollection space
## 0.99 0.99 0.99
## specialists specific speech
## 0.99 0.99 0.99
## sql stages statistical
## 0.99 0.99 0.99
## statistician statistics strategic
## 0.99 0.99 0.99
## streaming streams strong
## 0.99 0.99 0.99
## students study subject
## 0.99 0.99 0.99
## supply support surveys
## 0.99 0.99 0.99
## system tackle take
## 0.99 0.99 0.99
## talented tasks teams
## 0.99 0.99 0.99
## technical techniques technologists
## 0.99 0.99 0.99
## telecom throughout tools
## 0.99 0.99 0.99
## traffic transportation trees
## 0.99 0.99 0.99
## two uncover understanding
## 0.99 0.99 0.99
## upgrades usage use
## 0.99 0.99 0.99
## used users uses
## 0.99 0.99 0.99
## using utilizes utilizing
## 0.99 0.99 0.99
## value variety various
## 0.99 0.99 0.99
## versed visualization visualizations
## 0.99 0.99 0.99
## visualizing want wants
## 0.99 0.99 0.99
## web well wetland
## 0.99 0.99 0.99
## whether wide will
## 0.99 0.99 0.99
## within work working
## 0.99 0.99 0.99
## workload world write
## 0.99 0.99 0.99
## writing year years
## 0.99 0.99 0.99
## advanced engineers applied
## 0.97 0.97 0.95
## benefits collection market
## 0.95 0.95 0.95
## operations production big
## 0.95 0.95 0.92
## senior research control
## 0.92 0.91 0.86
## solutions intelligence business
## 0.81 0.80 0.77
## dataquantitative environmental growth
## 0.77 0.77 0.77
## multiple platts positions
## 0.77 0.77 0.77
## pricing retail risk
## 0.77 0.77 0.77
## validation bizcredit systems
## 0.77 0.62 0.62
## thomas global internship
## 0.62 0.57 0.49
## lab nlp services
## 0.49 0.49 0.49
## technologies engineer intern
## 0.44 0.41 0.35
## junior company
## 0.35 0.31