Harvesting Information from Indeed.com

Indeed.com offered a fairly open way to harvest information, with no blocking, so we started there. We load several libraries for XML and HTML processing, plus some additional ones for text mining that proved to be very useful.

library(rvest)
## Loading required package: xml2
library(RCurl)
## Loading required package: bitops
library(plyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:RCurl':
## 
##     complete
library(ggplot2)
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(wordcloud)
## Loading required package: RColorBrewer
library(tidytext)
library(xtable)
library(readr)
## 
## Attaching package: 'readr'
## The following object is masked from 'package:rvest':
## 
##     guess_encoding
library(tidytext)
library(knitr)
## Warning: package 'knitr' was built under R version 3.3.3
library(RColorBrewer)
library(SnowballC)

I found a great function that packages much of the text-mining workflow and produces most of the outputs of the analysis.

# Load the helper script from sthda.com; presumably this is what provides
# rquery.wordcloud(), which is called later and is not defined in this file.
# NOTE(review): this downloads and executes remote code over plain HTTP on
# every run -- consider keeping a reviewed local copy of the script instead.
source('http://www.sthda.com/upload/rquery_wordcloud.r')

Then I run the search for “hot” cities for data science jobs and limit the number of hits requested from the source. After that, I create a data frame to “catch” the relevant fields extracted from the results.

# "Hot" cities for data science openings, written the way they are pasted
# into the search URL ("+" in place of spaces).
city.set <- c("New+York+NY", "Seattle+WA", "Washington+DC", "Atlanta+GA",
              "Boston+MA")

# Search term we will be querying for (also pasted into the URL as-is).
target.job <- "data+scientist"

# Root of the site; search and posting URLs are built by appending to it.
base.url <- "https://www.indeed.com/"

# Upper bound used when paging through the search results of each city.
max.results <- 50

# Empty data frame that will hold everything we collect.
# The names are listed one per column: the original vector had
# "links,summary.full" as a single string (7 names for 8 collected fields)
# and "salary" where the collected column is called "job.salary".
n <- c("city", "job.title", "company.name", "job.location", "summary.short",
       "job.salary", "links", "summary.full")
jobs.data <- data.frame(matrix(ncol = length(n), nrow = 0))
colnames(jobs.data) <- n



# Harvest data for all the cities: page through the search results, parse each
# page with XPath selectors, then visit every posting for its full summary.

# Trimmed text of every node matched by `xpath`, selected starting from the
# `tag` nodes of `page` (the same two-step selection used for every field).
node.text <- function(page, tag, xpath) {
  page %>%
    html_nodes(tag) %>%
    html_nodes(xpath = xpath) %>%
    html_text() %>%
    trimws()
}

# Parse one search-results page into a data frame with one row per posting.
# Returns NULL when the page has no postings, or (with a warning) when the
# extracted fields have different lengths and so cannot be lined up row-wise.
parse.results.page <- function(page) {
  title.nodes <- page %>%
    html_nodes("div") %>%
    html_nodes(xpath = '//*[@data-tn-element="jobTitle"]')

  fields <- list(
    job.title = html_attr(title.nodes, "title"),
    company.name = node.text(page, "span", '//*[@class="company"]'),
    job.location = node.text(page, "span", '//*[@class="location"]'),
    summary.short = node.text(page, "span", '//*[@class="summary"]'),
    links = html_attr(title.nodes, "href")
  )

  if (length(unique(lengths(fields))) > 1) {
    warning("Skipping a results page whose fields have different lengths",
            call. = FALSE)
    return(NULL)
  }
  if (length(fields$links) == 0) {
    return(NULL)
  }
  as.data.frame(fields)
}

# Download one posting and return its full summary as a single string.
# Returns NA when the download fails (with a warning) or when the page has no
# summary node, which is what expired postings give us.
fetch.full.summary <- function(link) {
  posting <- tryCatch(
    read_html(link),
    error = function(e) {
      warning("Could not download ", link, ": ", conditionMessage(e),
              call. = FALSE)
      NULL
    }
  )
  if (is.null(posting)) {
    return(NA_character_)
  }

  s.full <- node.text(posting, "span", '//*[@class="summary"]')
  if (length(s.full) == 0) {
    return(NA_character_)
  }
  # Several nodes can match; keep all of their text in one value.
  paste(s.full, collapse = " ")
}

# Offsets of the result pages: 0, 10, 20, ... below max.results.
# seq() is required here: range(0, max.results, 10) is just
# c(0, max.results), so only two offsets were ever requested.
# NOTE(review): the step of 10 assumes 10 postings per results page -- confirm.
page.starts <- seq(0, max(max.results - 1, 0), by = 10)

# One data frame per city, bound together once at the end.
city.results <- vector("list", length(city.set))

for (city.index in seq_along(city.set)) {
  city <- city.set[city.index]
  print(paste("Downloading data for: ", city))

  # Collect every results page instead of keeping only the last one.
  page.results <- vector("list", length(page.starts))
  for (page.index in seq_along(page.starts)) {
    url <- paste0(base.url, "jobs?q=", target.job, "&l=", city,
                  "&start=", page.starts[page.index])
    page <- read_html(url)
    Sys.sleep(1)

    parsed <- parse.results.page(page)
    # Assigning NULL into a list would delete the slot, so guard it.
    if (!is.null(parsed)) {
      page.results[[page.index]] <- parsed
    }
  }

  city.jobs <- do.call(rbind, page.results)
  if (is.null(city.jobs)) {
    # Nothing was found for this city.
    next
  }

  links <- as.character(city.jobs$links)

  # Full summaries, NA until a posting yields one.
  summary.full <- rep(NA_character_, length(links))
  for (link.index in seq_along(links)) {
    if (is.na(links[link.index])) {
      next
    }
    # base.url already ends in "/", so drop the leading "/" of the href.
    link <- paste0(base.url, sub("^/+", "", links[link.index]))
    summary.full[link.index] <- fetch.full.summary(link)
  }

  city.results[[city.index]] <- data.frame(
    city = rep(city, length(links)),
    job.title = city.jobs$job.title,
    company.name = city.jobs$company.name,
    job.location = city.jobs$job.location,
    summary.short = city.jobs$summary.short,
    # place-holder for the salary
    job.salary = rep(0, length(links)),
    links = city.jobs$links,
    summary.full = summary.full
  )
}

# Add the newly collected data to jobs.data in a single bind.
jobs.data <- rbind(jobs.data, do.call(rbind, city.results))
## [1] "Downloading data for:  New+York+NY"
## [1] "Downloading data for:  Seattle+WA"
## [1] "Downloading data for:  Washington+DC"
## [1] "Downloading data for:  Atlanta+GA"
## [1] "Downloading data for:  Boston+MA"

Then write the resulting dataframe into a CSV File

# Save the collected postings to disk as a CSV file.
write.csv(x = jobs.data, file = "MyData.csv")

We start to analyze the information

This is a word cloud that tells us a bit of the story about soft and hard skills.

# Build the word cloud. The excluded words are the search term and the city
# names in their various spellings, which are not relevant to the skills story.
# NOTE(review): the whole data frame is passed as the text, so non-text
# columns such as city and links also feed the cloud -- confirm this is
# intended.
res <- rquery.wordcloud(
  jobs.data,
  type = "text",
  lang = "english",
  excludeWords = c(
    "new", "york", "austin", "sanfrancisco", "san", "francisco",
    "newyorkny", "seattlewa", "austintx", "seattle", "boston",
    "data", "scientist", "atlantaga", "atlanta", "washingtondc",
    "sanfranciscoca", "washington", "bostonma"
  ),
  textStemming = FALSE,
  colorPalette = "Dark2",
  max.words = 200
)
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words =
## max.words, : optimization could not be fit on page. It will not be plotted.
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words =
## max.words, : quantitative could not be fit on page. It will not be plotted.
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words =
## max.words, : amazoncom could not be fit on page. It will not be plotted.
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words =
## max.words, : benefits could not be fit on page. It will not be plotted.
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words =
## max.words, : decision could not be fit on page. It will not be plotted.
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words =
## max.words, : design could not be fit on page. It will not be plotted.
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words =
## max.words, : developing could not be fit on page. It will not be plotted.
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words =
## max.words, : experts could not be fit on page. It will not be plotted.
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words =
## max.words, : facebook could not be fit on page. It will not be plotted.
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words =
## max.words, : financial could not be fit on page. It will not be plotted.
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words =
## max.words, : focused could not be fit on page. It will not be plotted.
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words =
## max.words, : group could not be fit on page. It will not be plotted.
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words =
## max.words, : industry could not be fit on page. It will not be plotted.
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words =
## max.words, : internship could not be fit on page. It will not be plotted.
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words =
## max.words, : market could not be fit on page. It will not be plotted.
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words =
## max.words, : mathematical could not be fit on page. It will not be plotted.
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words =
## max.words, : minimum could not be fit on page. It will not be plotted.
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words =
## max.words, : national could not be fit on page. It will not be plotted.
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words =
## max.words, : operations could not be fit on page. It will not be plotted.
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words =
## max.words, : pipelines could not be fit on page. It will not be plotted.
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words =
## max.words, : production could not be fit on page. It will not be plotted.
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words
## = max.words, : professionals could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words =
## max.words, : prototype could not be fit on page. It will not be plotted.
## Warning in wordcloud(d$word, d$freq, min.freq = min.freq, max.words =
## max.words, : services could not be fit on page. It will not be plotted.

Now we show a table of the most frequently mentioned words.

# Unpack the two components of the word-cloud result that are used below.
tdm <- res[["tdm"]]
freqTable <- res[["freqTable"]]

# Print the ten most frequent words together with their counts.
head(freqTable, n = 10)
##                  word freq
## machine       machine   32
## analytics   analytics   29
## learning     learning   28
## experience experience   22
## scientists scientists   19
## analysis     analysis   18
## mining         mining   17
## will             will   17
## area             area   15
## work             work   14
# Plot the counts of the first ten rows of the frequency table as bars,
# labelled with the corresponding words (las = 2 turns the labels sideways).
barplot(
  freqTable$freq[1:10],
  las = 2,
  names.arg = freqTable$word[1:10],
  col = "lightblue",
  main = "Most frequent words",
  ylab = "Word frequencies"
)

Next we list the frequent terms in the job postings and, to exemplify a method for quantifying how strongly terms are associated with each other, look at the terms associated with “learning”.

# List the terms that occur at least 15 times.
findFreqTerms(x = tdm, lowfreq = 15)
## [1] "analysis"   "analytics"  "area"       "experience" "learning"  
## [6] "machine"    "mining"     "scientists" "will"
# The association (correlation) between terms can be analyzed as well.
# Here we list the words associated with "learning", keeping only those
# whose correlation is at least 0.3.
findAssocs(x = tdm, terms = "learning", corlimit = 0.3)
## $learning
##         analytical          analytics        engineering 
##               1.00               1.00               1.00 
##            machine           modeling            science 
##               1.00               1.00               1.00 
##               team          abilities            ability 
##               1.00               0.99               0.99 
##           academic           achieved        acquisition 
##               0.99               0.99               0.99 
##             across         actionable          actuarial 
##               0.99               0.99               0.99 
##         additional       additionally            address 
##               0.99               0.99               0.99 
##              adhoc          advantage    agencysponsored 
##               0.99               0.99               0.99 
##             agenda              agile                air 
##               0.99               0.99               0.99 
##          algorithm         algorithms           allowing 
##               0.99               0.99               0.99 
##             amazon         amendments            amounts 
##               0.99               0.99               0.99 
##           analyses           analysis           analysts 
##               0.99               0.99               0.99 
##            analyze          analyzing                api 
##               0.99               0.99               0.99 
##        application              apply           applying 
##               0.99               0.99               0.99 
##          architect              areas                arm 
##               0.99               0.99               0.99 
##         artificial                ask             assist 
##               0.99               0.99               0.99 
##         assistance          assurance                ats 
##               0.99               0.99               0.99 
##         attendance           audience           automate 
##               0.99               0.99               0.99 
##         automation         background              based 
##               0.99               0.99               0.99 
##             become               bias   bioinformatician 
##               0.99               0.99               0.99 
##          biologist           boosting         bottomline 
##               0.99               0.99               0.99 
##              bring              build           building 
##               0.99               0.99               0.99 
##              built     businessdriven          candidate 
##               0.99               0.99               0.99 
##            capable              cases              chain 
##               0.99               0.99               0.99 
##          challenge         challenges            changes 
##               0.99               0.99               0.99 
##            clearly               clia             client 
##               0.99               0.99               0.99 
##            clients           clinical            closely 
##               0.99               0.99               0.99 
##              cloud               code             coding 
##               0.99               0.99               0.99 
##  codingdevelopment        collaborate      collaboration 
##               0.99               0.99               0.99 
##          collected         collecting        combination 
##               0.99               0.99               0.99 
##            combine       commensurate        communicate 
##               0.99               0.99               0.99 
##          company’s       compensation        competitive 
##               0.99               0.99               0.99 
##            complex         compliance      comprehensive 
##               0.99               0.99               0.99 
##      computational           computer          computers 
##               0.99               0.99               0.99 
##        conclusions            conduct         conducting 
##               0.99               0.99               0.99 
##         consulting         continuing         contribute 
##               0.99               0.99               0.99 
##           converse         convincing         correction 
##               0.99               0.99               0.99 
##             create           creation        credibility 
##               0.99               0.99               0.99 
##            culture             custom           customer 
##               0.99               0.99               0.99 
##          customers              daily        datacentric 
##               0.99               0.99               0.99 
##         datadriven           decision               deep 
##               0.99               0.99               0.99 
##             degree            deliver        demonstrate 
##               0.99               0.99               0.99 
##       demonstrated             deploy             design 
##               0.99               0.99               0.99 
##             desire            desktop           detailed 
##               0.99               0.99               0.99 
##          detecting            develop         developing 
##               0.99               0.99               0.99 
##        development            digging          direction 
##               0.99               0.99               0.99 
##        discovering          disparate            diverse 
##               0.99               0.99               0.99 
##             dollar         downstream              drive 
##               0.99               0.99               0.99 
##             driven            dynamic         ecological 
##               0.99               0.99               0.99 
##          economics          education          effective 
##               0.99               0.99               0.99 
##          efficient        efficiently               else 
##               0.99               0.99               0.99 
##           emphasis             employ           enabling 
##               0.99               0.99               0.99 
##       encompassing           engaging          enhancing 
##               0.99               0.99               0.99 
##             ensure           ensuring           entirety 
##               0.99               0.99               0.99 
##        environment            equally         equivalent 
##               0.99               0.99               0.99 
##          establish            examine            example 
##               0.99               0.99               0.99 
##         exceptions          execution           existing 
##               0.99               0.99               0.99 
##         experience        experiences        experiments 
##               0.99               0.99               0.99 
##          expertise            experts            explore 
##               0.99               0.99               0.99 
##          extensive           external         extraction 
##               0.99               0.99               0.99 
##               fake            fashion             faster 
##               0.99               0.99               0.99 
##          fastpaced            feature              field 
##               0.99               0.99               0.99 
##               find               firm           fluently 
##               0.99               0.99               0.99 
##              focus            focused              folks 
##               0.99               0.99               0.99 
##          following        forecasting             forest 
##               0.99               0.99               0.99 
##               four               fuel        fulfillment 
##               0.99               0.99               0.99 
##               full          functions             gather 
##               0.99               0.99               0.99 
##            general         generating         generation 
##               0.99               0.99               0.99 
##               give            gleaned           globally 
##               0.99               0.99               0.99 
##               goal           gradient            growing 
##               0.99               0.99               0.99 
##              guide           handling              hands 
##               0.99               0.99               0.99 
##           hardware               hate               haul 
##               0.99               0.99               0.99 
##               help               high            highend 
##               0.99               0.99               0.99 
##             highly           identify             impact 
##               0.99               0.99               0.99 
##     implementation       implementing        improvement 
##               0.99               0.99               0.99 
##            include           includes          including 
##               0.99               0.99               0.99 
##         incredible           industry        inferential 
##               0.99               0.99               0.99 
##        information     infrastructure           initiate 
##               0.99               0.99               0.99 
##           insights        integrating           interest 
##               0.99               0.99               0.99 
##            interns          interpret            involve 
##               0.99               0.99               0.99 
##             issues          knowledge         laboratory 
##               0.99               0.99               0.99 
##           language          languages              large 
##               0.99               0.99               0.99 
##               lead            leaders            leading 
##               0.99               0.99               0.99 
##        leadingedge         learningai learningpredictive 
##               0.99               0.99               0.99 
##              least          leverages         leveraging 
##               0.99               0.99               0.99 
##               line              linux           longterm 
##               0.99               0.99               0.99 
##            looking              loves                mac 
##               0.99               0.99               0.99 
##        maintenance               make         management 
##               0.99               0.99               0.99 
##           managers           managing       manipulation 
##               0.99               0.99               0.99 
##      manufacturing          marketing    marketingdriven 
##               0.99               0.99               0.99 
##           master’s       mathematical        mathematics 
##               0.99               0.99               0.99 
##             matter          maximizes                may 
##               0.99               0.99               0.99 
##          measuring            medical            members 
##               0.99               0.99               0.99 
##             mentor      methodologies            methods 
##               0.99               0.99               0.99 
##            metrics               mine            minimum 
##               0.99               0.99               0.99 
##             mining            mission              model 
##               0.99               0.99               0.99 
##          modelling             models         monitoring 
##               0.99               0.99               0.99 
##  multidisciplinary               must                nas 
##               0.99               0.99               0.99 
##            natural             needed              needs 
##               0.99               0.99               0.99 
##           negative            network               news 
##               0.99               0.99               0.99 
##               next              novel              offer 
##               0.99               0.99               0.99 
##             office               open          operating 
##               0.99               0.99               0.99 
##      opportunities       optimization           optimize 
##               0.99               0.99               0.99 
##      orchestrating              order     organizational 
##               0.99               0.99               0.99 
##            package            parsing               part 
##               0.99               0.99               0.99 
##            partner              parts         passionate 
##               0.99               0.99               0.99 
##            pattern           patterns            perform 
##               0.99               0.99               0.99 
##        performance         performing           pipeline 
##               0.99               0.99               0.99 
##          pipelines              place              plans 
##               0.99               0.99               0.99 
##           platform           policies           position 
##               0.99               0.99               0.99 
##      possibilities           powerful         prediction 
##               0.99               0.99               0.99 
##         predictive        preparation            present 
##               0.99               0.99               0.99 
##       presentation         preventing              prior 
##               0.99               0.99               0.99 
##     prioritization            problem           problems 
##               0.99               0.99               0.99 
##         procedures            process         processing 
##               0.99               0.99               0.99 
##            product      professionals        programming 
##               0.99               0.99               0.99 
##            project           projects            proofed 
##               0.99               0.99               0.99 
##             proper          proposing        proprietary 
##               0.99               0.99               0.99 
##          protocols          prototype            provide 
##               0.99               0.99               0.99 
##            pursuit             python            quality 
##               0.99               0.99               0.99 
##       quantitative      questionnaire          questions 
##               0.99               0.99               0.99 
##             random            ranging              rapid 
##               0.99               0.99               0.99 
##            rapidly           reducing         regression 
##               0.99               0.99               0.99 
##            related           relevant        researchers 
##               0.99               0.99               0.99 
##   responsibilities        responsible            results 
##               0.99               0.99               0.99 
##          retrieval           retrieve              right 
##               0.99               0.99               0.99 
##               role                sas              scale 
##               0.99               0.99               0.99 
##         scientific         scientists          scripting 
##               0.99               0.99               0.99 
##           seasoned       segmentation          selection 
##               0.99               0.99               0.99 
##          sensitive            servers            service 
##               0.99               0.99               0.99 
##            serving               sets            setting 
##               0.99               0.99               0.99 
##              shape            sharing           shipment 
##               0.99               0.99               0.99 
##           sideside            similar             skills 
##               0.99               0.99               0.99 
##              small              smart           software 
##               0.99               0.99               0.99 
##              solid           solution            solving 
##               0.99               0.99               0.99 
##          something      sophisticated             source 
##               0.99               0.99               0.99 
##            sources sourcingcollection              space 
##               0.99               0.99               0.99 
##        specialists           specific             speech 
##               0.99               0.99               0.99 
##                sql             stages        statistical 
##               0.99               0.99               0.99 
##       statistician         statistics          strategic 
##               0.99               0.99               0.99 
##          streaming            streams             strong 
##               0.99               0.99               0.99 
##           students              study            subject 
##               0.99               0.99               0.99 
##             supply            support            surveys 
##               0.99               0.99               0.99 
##             system             tackle               take 
##               0.99               0.99               0.99 
##           talented              tasks              teams 
##               0.99               0.99               0.99 
##          technical         techniques      technologists 
##               0.99               0.99               0.99 
##            telecom         throughout              tools 
##               0.99               0.99               0.99 
##            traffic     transportation              trees 
##               0.99               0.99               0.99 
##                two            uncover      understanding 
##               0.99               0.99               0.99 
##           upgrades              usage                use 
##               0.99               0.99               0.99 
##               used              users               uses 
##               0.99               0.99               0.99 
##              using           utilizes          utilizing 
##               0.99               0.99               0.99 
##              value            variety            various 
##               0.99               0.99               0.99 
##             versed      visualization     visualizations 
##               0.99               0.99               0.99 
##        visualizing               want              wants 
##               0.99               0.99               0.99 
##                web               well            wetland 
##               0.99               0.99               0.99 
##            whether               wide               will 
##               0.99               0.99               0.99 
##             within               work            working 
##               0.99               0.99               0.99 
##           workload              world              write 
##               0.99               0.99               0.99 
##            writing               year              years 
##               0.99               0.99               0.99 
##           advanced          engineers            applied 
##               0.97               0.97               0.95 
##           benefits         collection             market 
##               0.95               0.95               0.95 
##         operations         production                big 
##               0.95               0.95               0.92 
##             senior           research            control 
##               0.92               0.91               0.86 
##          solutions       intelligence           business 
##               0.81               0.80               0.77 
##   dataquantitative      environmental             growth 
##               0.77               0.77               0.77 
##           multiple             platts          positions 
##               0.77               0.77               0.77 
##            pricing             retail               risk 
##               0.77               0.77               0.77 
##         validation          bizcredit            systems 
##               0.77               0.62               0.62 
##             thomas             global         internship 
##               0.62               0.57               0.49 
##                lab                nlp           services 
##               0.49               0.49               0.49 
##       technologies           engineer             intern 
##               0.44               0.41               0.35 
##             junior            company 
##               0.35               0.31