library("tidyverse")
library("rvest")
library("stringi")
library("xml2")
library("kableExtra")
library(RCurl)
library(plyr)
library(RColorBrewer)
library(dplyr)
library(ggplot2)
library(tm)
library(wordcloud)
library(tidytext)
library(xtable)
library(readr)
library(tidytext)
library(knitr)
library(phrasemachine)
library(quanteda)
library(tidyr)
library(scales)
library(forcats)

New final web scraping code

Extract location URLs

# Indeed search results for full-time data scientist positions.
url <- "https://www.indeed.com/jobs?q=data+scientist&jt=fulltime"
page <- read_html(url)

# Collect the href of every element carrying rel="nofollow".
# NOTE(review): the XPath begins with `//`, i.e. it is anchored at the document
# root, so the preceding "li" selection does not appear to narrow the match --
# confirm before relying on it.
location <- html_attr(
  html_nodes(html_nodes(page, "li"), xpath = '//*[@rel="nofollow"]'),
  "href"
)

# Keep the location links by position.
# NOTE(review): the positions are hard-coded and presumably tied to the page
# layout at the time of scraping -- verify against the live page.
# location2 <- location[c(8:12)]   # top 5 locations only
location2 <- location[8:21]        # all locations

Data Cleaning

# Read fullDf.csv (path is relative to the working directory).
fullDf <- read.csv("fullDf.csv")

# Re-encode the two free-text columns from Windows-1252 to UTF-8.
# NOTE(review): the printed word-frequency table contains mojibake-looking
# tokens, which may mean part of the input is already UTF-8 -- confirm the
# encoding of the CSV.
fullDf$jobTitle <- iconv(fullDf$jobTitle, from = "WINDOWS-1252", to = "UTF-8")
fullDf$job_description <- iconv(
  fullDf$job_description,
  from = "WINDOWS-1252",
  to = "UTF-8"
)

Text transformation

The transformation is performed with the tm_map() function, which replaces or removes unneeded words, numbers, and punctuation.
# Content transformer that replaces every match of `pattern` with one space.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

# Build a cleaned tm corpus from a vector of documents.
#
# The same cleaning steps are applied to every text column, so they live in
# one place instead of being copy-pasted per column.
#
# Args:
#   text: vector of documents; handed to VectorSource().
#
# Returns:
#   A VCorpus after these steps, in order: "/", "@" and "|" replaced by a
#   space, lower-casing, number removal, English stop-word removal,
#   punctuation removal, whitespace collapsing.
#
# NOTE(review): removePunctuation deletes characters rather than replacing
# them with a space, so words separated only by punctuation get fused (compare
# tokens such as "analystsenior" in the printed frequency table) -- confirm
# whether that is intended.
clean_text_corpus <- function(text) {
  VCorpus(VectorSource(text)) %>%
    tm_map(toSpace, "/") %>%
    tm_map(toSpace, "@") %>%
    tm_map(toSpace, "\\|") %>%
    tm_map(content_transformer(tolower)) %>%
    tm_map(removeNumbers) %>%
    # Stop words are removed before punctuation so that entries containing an
    # apostrophe can still match the stop-word list.
    tm_map(removeWords, stopwords("english")) %>%
    tm_map(removePunctuation) %>%
    tm_map(stripWhitespace)
}

# Cleaned corpora for the job descriptions and the job titles.
jobdesc <- clean_text_corpus(fullDf$job_description)
jobtitle <- clean_text_corpus(fullDf$jobTitle)

# Total occurrences of each term across all job titles, most frequent first.
# Rows of the term-document matrix are terms, so the row sums are the counts.
title_word_freq <- sort(
  rowSums(as.matrix(TermDocumentMatrix(jobtitle))),
  decreasing = TRUE
)

# Two-column frequency table (term, count); printed for inspection.
wf_df <- data.frame(word = names(title_word_freq), freq = title_word_freq)
wf_df
##                                          word freq
## scientist                           scientist 2202
## data                                     data 2055
## engineer                             engineer  437
## senior                                 senior  367
## analyst                               analyst  319
## research                             research  289
## learning                             learning  214
## machine                               machine  202
## associate                           associate  166
## analytics                           analytics  127
## science                               science  106
## manager                               manager  100
## software                             software   91
## developer                           developer   87
## staff                                   staff   81
## product                               product   75
## –                                       –   68
## healthcare                         healthcare   68
## principal                           principal   68
## processing                         processing   68
## consultant                         consultant   63
## business                             business   62
## analysis                             analysis   61
## computer                             computer   58
## lead                                     lead   56
## operations                         operations   56
## development                       development   52
## language                             language   49
## multiple                             multiple   49
## natural                               natural   49
## intern                                 intern   48
## biology                               biology   46
## intelligence                     intelligence   46
## statistician                     statistician   45
## quantitative                     quantitative   44
## statistical                       statistical   43
## bioinformatics                 bioinformatics   41
## team                                     team   41
## applied                               applied   40
## director                             director   40
## claims                                 claims   39
## junior                                 junior   37
## pharmaceutics                   pharmaceutics   36
## remote                                 remote   36
## engineering                       engineering   35
## level                                   level   35
## systems                               systems   33
## discovery                           discovery   32
## services                             services   32
## technical                           technical   31
## technology                         technology   30
## wetland                               wetland   30
## clinical                             clinical   29
## customer                             customer   29
## risk                                     risk   29
## sales                                   sales   29
## programmer                         programmer   28
## visualization                   visualization   28
## social                                 social   27
## new                                       new   26
## nlp                                       nlp   26
## python                                 python   26
## iii                                       iii   25
## informatics                       informatics   25
## specialist                         specialist   25
## cell                                     cell   24
## graduate                             graduate   23
## insurance                           insurance   23
## positions                           positions   23
## assistant                           assistant   22
## levels                                 levels   22
## measurement                       measurement   22
## strategy                             strategy   22
## support                               support   22
## analytic                             analytic   21
## management                         management   21
## process                               process   21
## university                         university   21
## york                                     york   21
## biomarker                           biomarker   20
## engineers                           engineers   20
## solution                             solution   20
## acenterprise                     acenterprise   19
## alexa                                   alexa   19
## genetics                             genetics   19
## geospatial                         geospatial   19
## labs                                     labs   19
## platform                             platform   19
## sciences                             sciences   19
## summer                                 summer   19
## trainee                               trainee   19
## ace                                       ace   18
## computational                   computational   18
## implementation                 implementation   18
## javascript                         javascript   18
## market                                 market   18
## prn                                       prn   18
## santa                                   santa   18
## startup                               startup   18
## system                                 system   18
## xrd                                       xrd   18
## xrr                                       xrr   18
## yield                                   yield   18
## ambulance                           ambulance   17
## cardiology                         cardiology   17
## mid                                       mid   17
## paramedic                           paramedic   17
## sensor                                 sensor   17
## signal                                 signal   17
## solaria                               solaria   17
## canton                                 canton   16
## finance                               finance   16
## internship                         internship   16
## modeling                             modeling   16
## advanced                             advanced   15
## marketing                           marketing   15
## analytical                         analytical   14
## application                       application   14
## architect                           architect   14
## bioinformatician             bioinformatician   14
## chain                                   chain   14
## coordinator                       coordinator   14
## decision                             decision   14
## modeler                               modeler   14
## qlik                                     qlik   14
## qlikview                             qlikview   14
## sense                                   sense   14
## supply                                 supply   14
## advisor                               advisor   13
## cloud                                   cloud   13
## devops                                 devops   13
## digital                               digital   13
## required                             required   13
## researcher                         researcher   13
## deep                                     deep   12
## google                                 google   12
## health                                 health   12
## program                               program   12
## amazon                                 amazon   11
## brokerage                           brokerage   11
## consumer                             consumer   11
## global                                 global   11
## intermediate                     intermediate   11
## molecular                           molecular   11
## role                                     role   11
## security                             security   11
## transitions                       transitions   11
## vision                                 vision   11
## atlanta                               atlanta   10
## big                                       big   10
## entry                                   entry   10
## optimization                     optimization   10
## rna                                       rna   10
## tech                                     tech   10
## voice                                   voice   10
## algorithms                         algorithms    9
## cellular                             cellular    9
## designer                             designer    9
## drug                                     drug    9
## experience                         experience    9
## facing                                 facing    9
## midlevel                             midlevel    9
## quality                               quality    9
## client                                 client    8
## database                             database    8
## experienced                       experienced    8
## full                                     full    8
## groundwater                       groundwater    8
## hydrogeologist                 hydrogeologist    8
## insights                             insights    8
## models                                 models    8
## performance                       performance    8
## phd                                       phd    8
## project                               project    8
## chemistry                           chemistry    7
## chief                                   chief    7
## design                                 design    7
## experimentation               experimentation    7
## expert                                 expert    7
## fellow                                 fellow    7
## financial                           financial    7
## gec                                       gec    7
## information                       information    7
## lab                                       lab    7
## librarian                           librarian    7
## line                                     line    7
## manufacturing                   manufacturing    7
## payer                                   payer    7
## personalization               personalization    7
## policy                                 policy    7
## pricing                               pricing    7
## solutions                           solutions    7
## strategic                           strategic    7
## adtech                                 adtech    6
## applications                     applications    6
## artificial                         artificial    6
## bwh                                       bwh    6
## emt                                       emt    6
## epic                                     epic    6
## forecasting                       forecasting    6
## oncology                             oncology    6
## postdoctoral                     postdoctoral    6
## sas                                       sas    6
## test                                     test    6
## advertising                       advertising    5
## biologist                           biologist    5
## brain                                   brain    5
## cambridge                           cambridge    5
## care                                     care    5
## configuration                   configuration    5
## detection                           detection    5
## device                                 device    5
## dmpk                                     dmpk    5
## early                                   early    5
## employee                             employee    5
## environmental                   environmental    5
## fixedterm                           fixedterm    5
## food                                     food    5
## government                         government    5
## growth                                 growth    5
## identity                             identity    5
## innovation                         innovation    5
## integration                       integration    5
## medical                               medical    5
## microbiology                     microbiology    5
## nlu                                       nlu    5
## patient                               patient    5
## payment                               payment    5
## planning                             planning    5
## quantumblack                     quantumblack    5
## reporting                           reporting    5
## resident                             resident    5
## scientific                         scientific    5
## scientistcell                   scientistcell    5
## scientists                         scientists    5
## shopping                             shopping    5
## success                               success    5
## telecom                               telecom    5
## training                             training    5
## unit                                     unit    5
## validation                         validation    5
## vudu                                     vudu    5
## analystsenior                   analystsenior    4
## anomaly                               anomaly    4
## automated                           automated    4
## automation                         automation    4
## backend                               backend    4
## center                                 center    4
## cientã­fico                       cientã­fico    4
## community                           community    4
## computing                           computing    4
## conference                         conference    4
## contact                               contact    4
## coop                                     coop    4
## core                                     core    4
## datos                                   datos    4
## department                         department    4
## dir                                       dir    4
## enterprise                         enterprise    4
## governance                         governance    4
## improvement                       improvement    4
## investment                         investment    4
## mechanical                         mechanical    4
## midcareer                           midcareer    4
## national                             national    4
## network                               network    4
## nsbe                                     nsbe    4
## office                                 office    4
## openings                             openings    4
## people                                 people    4
## pharmacology                     pharmacology    4
## platforms                           platforms    4
## premium                               premium    4
## products                             products    4
## protein                               protein    4
## public                                 public    4
## purchasing                         purchasing    4
## resources                           resources    4
## safety                                 safety    4
## san                                       san    4
## technician                         technician    4
## telecommute                       telecommute    4
## testing                               testing    4
## trainer                               trainer    4
## transportation                 transportation    4
## trial                                   trial    4
## uber                                     uber    4
## ubereverything                 ubereverything    4
## video                                   video    4
## warehouse                           warehouse    4
## abq                                       abq    3
## abstractor                         abstractor    3
## administration                 administration    3
## administrative                 administrative    3
## administrator                   administrator    3
## aerodynamics                     aerodynamics    3
## asr                                       asr    3
## assoc                                   assoc    3
## assurance                           assurance    3
## autonomy                             autonomy    3
## benchmarking                     benchmarking    3
## boston                                 boston    3
## cancer                                 cancer    3
## cardiovascular                 cardiovascular    3
## ccsd                                     ccsd    3
## cipher                                 cipher    3
## clearance                           clearance    3
## clinicogenomics               clinicogenomics    3
## content                               content    3
## controls                             controls    3
## days                                     days    3
## diego                                   diego    3
## dna                                       dna    3
## economist                           economist    3
## edge                                     edge    3
## emergency                           emergency    3
## eosl                                     eosl    3
## evaluation                         evaluation    3
## excellence                         excellence    3
## expansion                           expansion    3
## forensic                             forensic    3
## gene                                     gene    3
## gis                                       gis    3
## head                                     head    3
## hours                                   hours    3
## hpw                                       hpw    3
## immunooncology                 immunooncology    3
## instructor                         instructor    3
## journeyman                         journeyman    3
## leader                                 leader    3
## ltc                                       ltc    3
## markets                               markets    3
## material                             material    3
## matter                                 matter    3
## medicine                             medicine    3
## mgr                                       mgr    3
## neurology                           neurology    3
## next                                     next    3
## nonphd                                 nonphd    3
## owner                                   owner    3
## payments                             payments    3
## perception                         perception    3
## personal                             personal    3
## php                                       php    3
## physiology                         physiology    3
## practice                             practice    3
## predictive                         predictive    3
## presales                             presales    3
## prime                                   prime    3
## privacy                               privacy    3
## production                         production    3
## programming                       programming    3
## programs                             programs    3
## recovery                             recovery    3
## redtech                               redtech    3
## relationship                     relationship    3
## retail                                 retail    3
## robotic                               robotic    3
## search                                 search    3
## seattle                               seattle    3
## seller                                 seller    3
## service                               service    3
## silicon                               silicon    3
## simulation                         simulation    3
## small                                   small    3
## snl                                       snl    3
## ssc                                       ssc    3
## stack                                   stack    3
## start                                   start    3
## states                                 states    3
## statistics                         statistics    3
## strategist                         strategist    3
## subject                               subject    3
## supervisor                         supervisor    3
## technologies                     technologies    3
## therapy                               therapy    3
## time                                     time    3
## trading                               trading    3
## translational                   translational    3
## tso                                       tso    3
## undergraduate                   undergraduate    3
## united                                 united    3
## valuation                           valuation    3
## vcsel                                   vcsel    3
## vitro                                   vitro    3
## water                                   water    3
## web                                       web    3
## windows                               windows    3
## accelerator                       accelerator    2
## account                               account    2
## acoustics                           acoustics    2
## acquisition                       acquisition    2
## actuarial                           actuarial    2
## ads                                       ads    2
## aeronautical                     aeronautical    2
## aide                                     aide    2
## air                                       air    2
## aircraft                             aircraft    2
## allocation                         allocation    2
## amrd                                     amrd    2
## analystassociate             analystassociate    2
## antibodies                         antibodies    2
## antibody                             antibody    2
## antifraud                           antifraud    2
## appeals                               appeals    2
## apps                                     apps    2
## architecture                     architecture    2
## askhr                                   askhr    2
## asset                                   asset    2
## austin                                 austin    2
## basic                                   basic    2
## behavioral                         behavioral    2
## bellevue                             bellevue    2
## biochemistry                     biochemistry    2
## biological                         biological    2
## boarding                             boarding    2
## boeing                                 boeing    2
## candidates                         candidates    2
## career                                 career    2
## carnegie                             carnegie    2
## centennial                         centennial    2
## chemical                             chemical    2
## childrens                           childrens    2
## cluster                               cluster    2
## cnn                                       cnn    2
## coding                                 coding    2
## cognitive                           cognitive    2
## commercial                         commercial    2
## complement                         complement    2
## compound                             compound    2
## consultants                       consultants    2
## consulting                         consulting    2
## contracting                       contracting    2
## control                               control    2
## cooper                                 cooper    2
## coops                                   coops    2
## credentialed                     credentialed    2
## culture                               culture    2
## customers                           customers    2
## day                                       day    2
## deidentification             deidentification    2
## dependency                         dependency    2
## diagnostics                       diagnostics    2
## diet                                     diet    2
## distinguished                   distinguished    2
## distribution                     distribution    2
## diversity                           diversity    2
## dynamics                             dynamics    2
## ecommerce                           ecommerce    2
## editing                               editing    2
## education                           education    2
## electromagnetic               electromagnetic    2
## electrophysics                 electrophysics    2
## encoding                             encoding    2
## engineerentry                   engineerentry    2
## english                               english    2
## everything                         everything    2
## experiencepresbyterian experiencepresbyterian    2
## experimentalist               experimentalist    2
## fellowship                         fellowship    2
## fermentation                     fermentation    2
## field                                   field    2
## first                                   first    2
## fixed                                   fixed    2
## flatiron                             flatiron    2
## foodservice                       foodservice    2
## formulations                     formulations    2
## fraud                                   fraud    2
## fullstack                           fullstack    2
## generalist                         generalist    2
## geochemistry                     geochemistry    2
## globalgiving                     globalgiving    2
## grad                                     grad    2
## grievance                           grievance    2
## hadoop                                 hadoop    2
## healthrules                       healthrules    2
## high                                     high    2
## human                                   human    2
## icl                                       icl    2
## image                                   image    2
## imaging                               imaging    2
## immunology                         immunology    2
## inclusion                           inclusion    2
## infrastructure                 infrastructure    2
## institution                       institution    2
## international                   international    2
## interns                               interns    2
## investigator                     investigator    2
## investments                       investments    2
## java                                     java    2
## job                                       job    2
## jpal                                     jpal    2
## laser                                   laser    2
## loads                                   loads    2
## managerassociate             managerassociate    2
## managercare                       managercare    2
## managerinformatics         managerinformatics    2
## managersr                           managersr    2
## managing                             managing    2
## map                                       map    2
## maps                                     maps    2
## marine                                 marine    2
## mathematical                     mathematical    2
## mba                                       mba    2
## mechanic                             mechanic    2
## member                                 member    2
## metrics                               metrics    2
## mexico                                 mexico    2
## mgmt                                     mgmt    2
## mobile                                 mobile    2
## music                                   music    2
## neuroscience                     neuroscience    2
## north                                   north    2
## nyc                                       nyc    2
## officer                               officer    2
## ops                                       ops    2
## paranoids                           paranoids    2
## partner                               partner    2
## partners                             partners    2
## payor                                   payor    2
## peacekeeping                     peacekeeping    2
## pharmacovigilance           pharmacovigilance    2
## pharmacy                             pharmacy    2
## physics                               physics    2
## plan                                     plan    2
## point                                   point    2
## portfolio                           portfolio    2
## postdoc                               postdoc    2
## president                           president    2
## principle                           principle    2
## procure                               procure    2
## professor                           professor    2
## promotions                         promotions    2
## propulsion                         propulsion    2
## qualitative                       qualitative    2
## quotes                                 quotes    2
## recruiter                           recruiter    2
## region                                 region    2
## repair                                 repair    2
## reports                               reports    2
## representative                 representative    2
## residency                           residency    2
## review                                 review    2
## sale                                     sale    2
## sandia                                 sandia    2
## school                                 school    2
## sci                                       sci    2
## science—postdoctoral science—postdoctoral    2
## sea                                       sea    2
## sensory                               sensory    2
## series                                 series    2
## server                                 server    2
## shared                                 shared    2
## shiny                                   shiny    2
## slm                                       slm    2
## spectroscopy                     spectroscopy    2
## speech                                 speech    2
## sql                                       sql    2
## storage                               storage    2
## structural                         structural    2
## structurebased                 structurebased    2
## student                               student    2
## students                             students    2
## studies                               studies    2
## systemoperations             systemoperations    2
## talent                                 talent    2
## technologytransform       technologytransform    2
## threat                                 threat    2
## tmt                                       tmt    2
## travel                                 travel    2
## urgent                                 urgent    2
## vehicle                               vehicle    2
## vice                                     vice    2
## vpquality                           vpquality    2
## weather                               weather    2
## wound                                   wound    2
## writer                                 writer    2
## aav                                       aav    1
## acceleration                     acceleration    1
## accountant                         accountant    1
## accounting                         accounting    1
## aco                                       aco    1
## adme                                     adme    1
## admin                                   admin    1
## admitting                           admitting    1
## advancement                       advancement    1
## advertiser                         advertiser    1
## advice                                 advice    1
## advocate                             advocate    1
## aerospace                           aerospace    1
## afc                                       afc    1
## affairs                               affairs    1
## agfs                                     agfs    1
## agl                                       agl    1
## algorithmic                       algorithmic    1
## ambulatory                         ambulatory    1
## america                               america    1
## aml                                       aml    1
## analysist                           analysist    1
## analystexpert                   analystexpert    1
## analystintermediate       analystintermediate    1
## analystjunior                   analystjunior    1
## android                               android    1
## antifinancial                   antifinancial    1
## antimoney                           antimoney    1
## apis                                     apis    1
## app                                       app    1
## archaeologist                   archaeologist    1
## archi                                   archi    1
## aritysenior                       aritysenior    1
## array                                   array    1
## arrays                                 arrays    1
## art                                       art    1
## assay                                   assay    1
## assays                                 assays    1
## assignment                         assignment    1
## assisitance                       assisitance    1
## assistance                         assistance    1
## asst                                     asst    1
## atas                                     atas    1
## atl                                       atl    1
## ats                                       ats    1
## auditor                               auditor    1
## autoimmunity                     autoimmunity    1
## aws                                       aws    1
## back                                     back    1
## banking                               banking    1
## based                                   based    1
## bed                                       bed    1
## behavior                             behavior    1
## benefits                             benefits    1
## bie                                       bie    1
## bikes                                   bikes    1
## bilingual                           bilingual    1
## bioassay                             bioassay    1
## bioassays                           bioassays    1
## biologics                           biologics    1
## biophotonics                     biophotonics    1
## bioprocess                         bioprocess    1
## biosensing                         biosensing    1
## bioviasoftware                 bioviasoftware    1
## blm                                       blm    1
## bomoda                                 bomoda    1
## bone                                     bone    1
## bos                                       bos    1
## brumback                             brumback    1
## busi                                     busi    1
## camera                                 camera    1
## capital                               capital    1
## case                                     case    1
## cataloging                         cataloging    1
## ccds                                     ccds    1
## cdc                                       cdc    1
## central                               central    1
## change                                 change    1
## channel                               channel    1
## chemist                               chemist    1
## child                                   child    1
## chinese                               chinese    1
## cimd                                     cimd    1
## clarity                               clarity    1
## clerical                             clerical    1
## clinic                                 clinic    1
## cloudscale                         cloudscale    1
## collaborative                   collaborative    1
## college                               college    1
## commissioning                   commissioning    1
## compensation                     compensation    1
## compliance                         compliance    1
## computation                       computation    1
## computingmachine             computingmachine    1
## consciousness                   consciousness    1
## contract                             contract    1
## coordination                     coordination    1
## cost                                     cost    1
## counsel                               counsel    1
## credit                                 credit    1
## crime                                   crime    1
## cryoem                                 cryoem    1
## cto                                       cto    1
## curator                               curator    1
## cyber                                   cyber    1
## cybersecurity                   cybersecurity    1
## cycle                                   cycle    1
## decisionsource                 decisionsource    1
## deparmtent                         deparmtent    1
## deployments                       deployments    1
## des                                       des    1
## desk                                     desk    1
## develoment                         develoment    1
## devices                               devices    1
## dinetah                               dinetah    1
## disaster                             disaster    1
## disease                               disease    1
## diseases                             diseases    1
## disincentives                   disincentives    1
## disorders                           disorders    1
## dissolution                       dissolution    1
## division                             division    1
## docsis                                 docsis    1
## domain                                 domain    1
## downstream                         downstream    1
## driving                               driving    1
## dsme                                     dsme    1
## ebi                                       ebi    1
## ecology                               ecology    1
## economics                           economics    1
## economy                               economy    1
## ecosystem                           ecosystem    1
## educator                             educator    1
## efficacy                             efficacy    1
## eicoff                                 eicoff    1
## electrical                         electrical    1
## electrician                       electrician    1
## eligibi                               eligibi    1
## eligibility                       eligibility    1
## elint                                   elint    1
## emerging                             emerging    1
## emobility                           emobility    1
## end                                       end    1
## energy                                 energy    1
## eng                                       eng    1
## engagement                         engagement    1
## engineerintermediate     engineerintermediate    1
## engineermobility             engineermobility    1
## engineerperception         engineerperception    1
## enrollment                         enrollment    1
## entrylevel                         entrylevel    1
## etl                                       etl    1
## euv                                       euv    1
## excel                                   excel    1
## exoplanet                           exoplanet    1
## exp                                       exp    1
## experimental                     experimental    1
## expertise                           expertise    1
## exploring                           exploring    1
## exports                               exports    1
## external                             external    1
## facilities                         facilities    1
## fall                                     fall    1
## federal                               federal    1
## fees                                     fees    1
## fema                                     fema    1
## fidelity                             fidelity    1
## films                                   films    1
## finished                             finished    1
## firmware                             firmware    1
## fishing                               fishing    1
## flavormint                         flavormint    1
## flext                                   flext    1
## flight                                 flight    1
## fluent                                 fluent    1
## flyer                                   flyer    1
## foundation                         foundation    1
## foundations                       foundations    1
## framework                           framework    1
## franciscomember               franciscomember    1
## free                                     free    1
## frm                                       frm    1
## gca                                       gca    1
## general                               general    1
## generation                         generation    1
## genome                                 genome    1
## genomic                               genomic    1
## genomics                             genomics    1
## geoinnovation                   geoinnovation    1
## geologist                           geologist    1
## geophysics                         geophysics    1
## gig                                       gig    1
## glycochemistry                 glycochemistry    1
## gmc                                       gmc    1
## gnf                                       gnf    1
## goods                                   goods    1
## groupmolecular                 groupmolecular    1
## grp                                       grp    1
## gsp                                       gsp    1
## hardware                             hardware    1
## highly                                 highly    1
## hiv                                       hiv    1
## house                                   house    1
## howard                                 howard    1
## hplc                                     hplc    1
## hrs                                       hrs    1
## hurricane                           hurricane    1
## hvac                                     hvac    1
## idea                                     idea    1
## iiiprogram                         iiiprogram    1
## illegal                               illegal    1
## immunogenomics                 immunogenomics    1
## impurity                             impurity    1
## industrial                         industrial    1
## industry                             industry    1
## infections                         infections    1
## infectious                         infectious    1
## informational                   informational    1
## institute                           institute    1
## institutional                   institutional    1
## integrated                         integrated    1
## integrator                         integrator    1
## integrity                           integrity    1
## internet                             internet    1
## interpretation                 interpretation    1
## intl                                     intl    1
## inventory                           inventory    1
## invest                                 invest    1
## ios                                       ios    1
## iot                                       iot    1
## iss                                       iss    1
## jolla                                   jolla    1
## journalist                         journalist    1
## journey                               journey    1
## juã±ior                               juã±ior    1
## knowledge                           knowledge    1
## laboratory                         laboratory    1
## laundering                         laundering    1
## licensing                           licensing    1
## lifecyle                             lifecyle    1
## limited                               limited    1
## linux                                   linux    1
## louis                                   louis    1
## maintenance                       maintenance    1
## managemt                             managemt    1
## marketplace                       marketplace    1
## marrow                                 marrow    1
## master                                 master    1
## mediamonitors                   mediamonitors    1
## medium                                 medium    1
## metabolic                           metabolic    1
## metabolism                         metabolism    1
## metadata                             metadata    1
## method                                 method    1
## mgt                                       mgt    1
## microbiome                         microbiome    1
## midwest                               midwest    1
## migration                           migration    1
## mission                               mission    1
## mls                                       mls    1
## mobility                             mobility    1
## molecule                             molecule    1
## molecules                           molecules    1
## mortgages                           mortgages    1
## mso                                       mso    1
## must                                     must    1
## networks                             networks    1
## neurodegeneration           neurodegeneration    1
## neurodevelopmental         neurodevelopmental    1
## neuropsychology               neuropsychology    1
## nga                                       nga    1
## nih                                       nih    1
## non                                       non    1
## nonclinical                       nonclinical    1
## novartis                             novartis    1
## nrsa                                     nrsa    1
## nsg                                       nsg    1
## numerical                           numerical    1
## offers                                 offers    1
## officersoftware               officersoftware    1
## offset                                 offset    1
## online                                 online    1
## open                                     open    1
## optical                               optical    1
## optimize                             optimize    1
## orders                                 orders    1
## osse                                     osse    1
## packaging                           packaging    1
## painter                               painter    1
## pandey                                 pandey    1
## peer                                     peer    1
## per                                       per    1
## permitted                           permitted    1
## pharmacistabq                   pharmacistabq    1
## physical                             physical    1
## planner                               planner    1
## plannerentry                     plannerentry    1
## pmg                                       pmg    1
## poly                                     poly    1
## population                         population    1
## populations                       populations    1
## position                             position    1
## ppmo                                     ppmo    1
## practica                             practica    1
## precision                           precision    1
## preclinical                       preclinical    1
## predoctoral                       predoctoral    1
## preformulation                 preformulation    1
## prinicpal                           prinicpal    1
## processor                           processor    1
## procurement                       procurement    1
## professional                     professional    1
## professionals                   professionals    1
## projects                             projects    1
## protections                       protections    1
## proteogenomics                 proteogenomics    1
## proteomic                           proteomic    1
## proteostasis                     proteostasis    1
## psychological                   psychological    1
## purification                     purification    1
## pva                                       pva    1
## qualcomm                             qualcomm    1
## quant                                   quant    1
## quantum                               quantum    1
## quip                                     quip    1
## rankings                             rankings    1
## receiving                           receiving    1
## recombinant                       recombinant    1
## recommendation                 recommendation    1
## reference                           reference    1
## registrar                           registrar    1
## regulatory                         regulatory    1
## rep                                       rep    1
## reporter                             reporter    1
## repository                         repository    1
## resource                             resource    1
## respiratory                       respiratory    1
## retrofit                             retrofit    1
## revenue                               revenue    1
## richmond                             richmond    1
## robotics                             robotics    1
## room                                     room    1
## runner                                 runner    1
## sagemaker                           sagemaker    1
## satellite                           satellite    1
## scala                                   scala    1
## scientist—demand         scientist—demand    1
## scientistchromosome       scientistchromosome    1
## scientistcross                 scientistcross    1
## scientisthux                     scientisthux    1
## scooters                             scooters    1
## scrum                                   scrum    1
## sde                                       sde    1
## seã±ior                               seã±ior    1
## secret                                 secret    1
## secretary                           secretary    1
## sectorcognitive               sectorcognitive    1
## seniorbig                           seniorbig    1
## sequencing                         sequencing    1
## serv                                     serv    1
## side                                     side    1
## simulations                       simulations    1
## single                                 single    1
## siri                                     siri    1
## sleep                                   sleep    1
## som                                       som    1
## sound                                   sound    1
## space                                   space    1
## spec                                     spec    1
## specialistfluent             specialistfluent    1
## sponsored                           sponsored    1
## stafffulltime                   stafffulltime    1
## stat                                     stat    1
## stem                                     stem    1
## sterilization                   sterilization    1
## stl                                       stl    1
## strain                                 strain    1
## strategies                         strategies    1
## structure                           structure    1
## subcontract                       subcontract    1
## supervisory                       supervisory    1
## supporting                         supporting    1
## survey                                 survey    1
## sys                                       sys    1
## systematic                         systematic    1
## tcr                                       tcr    1
## technologist                     technologist    1
## techstudentjunior           techstudentjunior    1
## techtelcomedia                 techtelcomedia    1
## telecommunications         telecommunications    1
## temporary                           temporary    1
## tess                                     tess    1
## texas                                   texas    1
## thin                                     thin    1
## things                                 things    1
## tigl                                     tigl    1
## tool                                     tool    1
## tools                                   tools    1
## top                                       top    1
## trader                                 trader    1
## transfer                             transfer    1
## transformation                 transformation    1
## transiting                         transiting    1
## transparency                     transparency    1
## transplant                         transplant    1
## treatment                           treatment    1
## trms                                     trms    1
## undergrad                           undergrad    1
## upstream                             upstream    1
## vaccines                             vaccines    1
## vector                                 vector    1
## vii                                       vii    1
## virology                             virology    1
## vivo                                     vivo    1
## washington                         washington    1
## webex                                   webex    1
## week                                     week    1
## welfare                               welfare    1
## westside                             westside    1
## winter                                 winter    1
## without                               without    1
## work                                     work    1
## workforce                           workforce    1
## world                                   world    1
## zillow                                 zillow    1
## zoro                                     zoro    1
# Horizontal bar chart of the first 40 rows of the job-title word-frequency
# table; bars are ordered and shaded by frequency.
head(wf_df, 40) %>%
  ggplot(aes(x = reorder(word, freq), y = freq, fill = freq)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(
    title = "Frequency of Indeed Data Scientist Job title",
    x = "Words",
    y = "Frequency"
  )

# Mark postings whose title contains one of the data-science keywords
# (case-insensitive); the flag is appended as the logical column `DS_title`.
fullDf$DS_title <- grepl(
  "(data|science|machine|analytics|scientist|engineer)",
  fullDf$jobTitle,
  ignore.case = TRUE
)

# How many titles matched / did not match.
table(fullDf$DS_title)
## 
## FALSE  TRUE 
##   566  2881

Technical/General skill analysis

Identify whether each skill is mentioned in the job description

# Flag, per posting, whether each skill keyword occurs in `job_description`.
# Every flag is a logical column. The 32 columns are appended in exactly this
# order, and downstream code selects them as one contiguous run, so keep the
# order and the count stable when editing.
#
# Matching is case-insensitive (except the literal "C++"). Short keywords are
# anchored with \\b so they do not fire inside longer words
# ("perl" in "properly", "aws" in "laws", "excel" in "excellent",
# "scala" in "scalable").
#
# NOTE(review): the single-letter patterns for R and C still match things like
# "R&D" or the "C" of "C++" / "C#"; tighten them if that matters.
fullDf <- fullDf %>%
  mutate(
    # ---- Technical skills ----
    R = grepl("\\bR\\b", job_description, ignore.case = TRUE),
    python = grepl("python", job_description, ignore.case = TRUE),
    # Also matches NoSQL / MySQL etc., which contain "sql".
    SQL = grepl("SQL", job_description, ignore.case = TRUE),
    hadoop = grepl("hadoop", job_description, ignore.case = TRUE),
    perl = grepl("\\bperl\\b", job_description, ignore.case = TRUE),
    C = grepl("\\bC\\b", job_description, ignore.case = TRUE),
    aws = grepl("\\baws\\b", job_description, ignore.case = TRUE),
    excel = grepl("\\bexcel\\b", job_description, ignore.case = TRUE),
    nosql = grepl("nosql", job_description, ignore.case = TRUE),
    linux = grepl("linux", job_description, ignore.case = TRUE),
    azure = grepl("Azure", job_description, ignore.case = TRUE),
    sas = grepl("\\bsas\\b", job_description, ignore.case = TRUE),
    # "+" is a regex metacharacter, so match the literal string.
    Cplusplus = grepl("C++", job_description, fixed = TRUE),
    VB = grepl("VB", job_description, ignore.case = TRUE),
    # Trailing \\b keeps "javascript" out of the Java count.
    java = grepl("java\\b", job_description, ignore.case = TRUE),
    # No \\b after "#": "#" is not a word character, so a trailing boundary
    # would only match when a letter or digit directly follows "C#".
    csharp = grepl("\\bc#", job_description, ignore.case = TRUE),
    scala = grepl("\\bscala\\b", job_description, ignore.case = TRUE),
    tensorflow = grepl("tensorflow|\\btf\\b", job_description,
                       ignore.case = TRUE),
    javascript = grepl("javascript", job_description, ignore.case = TRUE),
    spark = grepl("spark", job_description, ignore.case = TRUE),
    bi = grepl("(\\bbi\\b|business intelligence)", job_description,
               ignore.case = TRUE),
    # ---- General skills ----
    ml = grepl("(\\bml\\b|machine learning)", job_description,
               ignore.case = TRUE),
    stat = grepl("statis", job_description, ignore.case = TRUE),
    visual = grepl("visual", job_description, ignore.case = TRUE),
    deep_learn = grepl("(deep learning|neural net)", job_description,
                       ignore.case = TRUE),
    # Was "nature language ", a typo that never matched "natural language".
    nlp = grepl("(nlp|natural language)", job_description,
                ignore.case = TRUE),
    math = grepl("(mathematics)", job_description, ignore.case = TRUE),
    AI = grepl("(artificial intelligence|\\bai\\b)", job_description,
               ignore.case = TRUE),
    software_dev = grepl("software development|software engineer",
                         job_description, ignore.case = TRUE),
    analysis = grepl("(analytics|critical thinking)", job_description,
                     ignore.case = TRUE),
    project_management = grepl("project management", job_description,
                               ignore.case = TRUE),
    data_engineer = grepl("data engineering", job_description,
                          ignore.case = TRUE)
  )
Skill Frequency
# Long table of skill flags for data-science postings: one row per
# posting x skill, `skills` being a factor in column order and `Number` the
# logical flag. Rows are filtered before the skill columns are selected, and
# the columns are picked by name (R ... data_engineer) rather than by
# position, so the code does not depend on how many columns precede them.
skill_unlist <- fullDf %>%
  filter(DS_title == TRUE) %>%
  select(R:data_engineer) %>%
  gather(skills, Number, factor_key = TRUE)

# Mean of a logical flag = share of postings that mention the skill.
skill_ranking <- aggregate(
  list(perc = skill_unlist$Number),
  by = list(skills = skill_unlist$skills),
  FUN = mean
)

# Horizontal bar chart of the shares, labelled with the rounded value.
# No position adjustment on the labels: there is one label per bar, so
# dodging had nothing to separate and only raised a warning.
ggplot(skill_ranking, aes(x = reorder(skills, perc), y = perc, fill = perc)) +
  geom_bar(stat = "identity") +
  labs(title = "Frequency of Skills Indeed Data Scientist Job Postings",
       x = "skills", y = "Frequency / total posting") +
  coord_flip() +
  geom_text(aes(label = round(perc, 2)), size = 3, hjust = -0.1)

# Word cloud of the skills, weighted by the share of postings that mention
# each one. The seed is fixed so the layout is reproducible.
set.seed(1234)
wordcloud(
  words = skill_ranking$skills,
  freq = skill_ranking$perc,
  colors = brewer.pal(8, "Dark2"),
  rot.per = 0.35,
  random.order = FALSE
)

Salary

Salary Analysis

# Parse the free-text `jobsalary` field into a lower bound, an upper bound and
# a pay unit.
#
# The text is split on "-" (literal match). A posting with a single figure has
# only one piece, which is then used for both bounds. An empty salary string
# splits into zero pieces and yields NA for both bounds, so the result always
# has one row per posting.
salary_pieces <- strsplit(as.character(fullDf$jobsalary), "-", fixed = TRUE)

# i-th piece of one split salary string, recycling when fewer pieces exist.
pick_salary_piece <- function(pieces, i) {
  if (length(pieces) == 0L) {
    return(NA_character_)
  }
  pieces[(i - 1L) %% length(pieces) + 1L]
}

# Two-column character matrix: lower-bound text, upper-bound text.
salary_ls <- cbind(
  vapply(salary_pieces, pick_salary_piece, character(1), i = 1L),
  vapply(salary_pieces, pick_salary_piece, character(1), i = 2L)
)

# NOTE(review): the pattern keeps only the first run of digits after "$", so
# "$100,000" parses as 100 and "$45.50" as 45. The unification step further
# down multiplies yearly figures by 1000, so yearly pay is effectively
# truncated to whole thousands. Text that does not reduce to a number becomes
# NA (with a coercion warning).
fullDf["salary_lower"] <- as.numeric(
  gsub("\\$([0-9]+).*$", "\\1", salary_ls[, 1])
)
fullDf["salary_higher"] <- as.numeric(
  gsub("\\$([0-9]+).*$", "\\1", salary_ls[, 2])
)

# Pay unit: "year" or "hour" when the word occurs in the text, otherwise NA.
fullDf["salary_unit"] <- ifelse(
  grepl("\\byear\\b", fullDf$jobsalary, ignore.case = TRUE),
  "year",
  ifelse(grepl("\\bhour\\b", fullDf$jobsalary, ignore.case = TRUE),
         "hour", NA)
)

Unify salary unit

# Put both salary bounds on a common scale. Figures tagged "year" are
# multiplied by 1000, figures tagged "hour" by 37.5 * 52; any other unit
# (including NA) gives NA.
fullDf <- local({
  unit_factor <- ifelse(
    fullDf$salary_unit == "year",
    1000,
    ifelse(fullDf$salary_unit == "hour", 37.5 * 52, NA)
  )
  unified <- fullDf
  unified$salary_lower_unified <-
    as.numeric(unified$salary_lower) * unit_factor
  unified$salary_higher_unified <-
    as.numeric(unified$salary_higher) * unit_factor
  unified
})

Calculate average unified salary

# Midpoint of the unified salary range; NA when either bound is NA.
fullDf$mean_salary_unified <-
  (fullDf$salary_lower_unified + fullDf$salary_higher_unified) / 2

Salary distribution

# Summary statistics of the unified mean salary, restricted to postings
# flagged as data-science titles.
fullDf %>%
  filter(DS_title == TRUE) %>%
  select(mean_salary_unified) %>%
  summary()
##  mean_salary_unified
##  Min.   : 62500     
##  1st Qu.:100000     
##  Median :118500     
##  Mean   :125581     
##  3rd Qu.:135000     
##  Max.   :275000     
##  NA's   :2617
# Histogram of the unified mean salary for data-science postings that have a
# salary, in bins of 10,000.
#
# `fill` is set as a constant on the geom. Inside aes() the string "white"
# is treated as data: the bars get the default palette colour and a legend
# entry labelled "white". A black outline keeps the white bars visible.
#
# Optional trims that were left disabled:
#   mean_salary_unified != 275000
#   mean_salary_unified >= 60000
fullDf %>%
  filter(!is.na(mean_salary_unified) & DS_title == TRUE) %>%
  ggplot(aes(x = mean_salary_unified)) +
  geom_histogram(binwidth = 10000, fill = "white", colour = "black",
                 alpha = .5, position = "identity") +
  labs(title = "Data Scientist Salary distribution",
       x = "Salary", y = "Frequency")

Average salary for each skill

# One row per (data-science posting, skill it mentions) with that posting's
# unified mean salary. Columns are selected by name (the skill flags
# R ... data_engineer plus mean_salary_unified) instead of by position, and
# rows are filtered on the DS_title column before it is dropped.
skill_salary <- fullDf %>%
  filter(DS_title == TRUE) %>%
  select(R:data_engineer, mean_salary_unified) %>%
  gather(skills, Number, R:data_engineer, factor_key = TRUE) %>%
  # Keep mentioned skills with a known salary; 275000 is excluded as the
  # salary outlier.
  filter(Number == TRUE &
           !is.na(mean_salary_unified) &
           mean_salary_unified != 275000) %>%
  select(skills, mean_salary_unified)
head(skill_salary)
##   skills mean_salary_unified
## 1      R              141500
## 2      R              141500
## 3      R              141500
## 4      R              131000
## 5      R              118500
## 6      R              135000

Salary distribution by skills

Salary boxplot

# Horizontal box plots of salary per skill, ordered by each skill's mean
# salary. Columns are referenced by bare name inside aes(); `df$col` there
# bypasses the plot's data and is discouraged by ggplot2.
p <- ggplot(
  skill_salary,
  aes(x = reorder(skills, mean_salary_unified, FUN = mean),
      y = mean_salary_unified,
      color = skills)
) +
  geom_boxplot() +
  coord_flip() +
  xlab("Skills") +
  ylab("Salary")
p

Mean Salary Ranking

# Mean unified salary per skill, highest first. Output columns are named
# directly in the aggregate() call: `skills`, `Mean_salary`.
skill_mean_salary <- aggregate(
  list(Mean_salary = skill_salary$mean_salary_unified),
  by = list(skills = skill_salary$skills),
  FUN = mean
) %>%
  arrange(desc(Mean_salary))

# Horizontal bar chart of the means, each bar labelled with the rounded value.
# No position adjustment on the labels: there is one label per bar, so
# dodging had nothing to separate and only raised a warning.
ggplot(skill_mean_salary,
       aes(x = reorder(skills, Mean_salary), y = Mean_salary,
           fill = Mean_salary)) +
  geom_bar(stat = "identity") +
  labs(title = "Average Salary by Skillsets",
       x = "skills", y = "Average Salary") +
  coord_flip() +
  geom_text(aes(label = round(Mean_salary)), size = 3)

Demand vs. Salary

# Join the per-skill mean salary with the per-skill posting share.
demand_salary <- merge(skill_mean_salary, skill_ranking, by = "skills")

# Scatter of demand (share of postings) against mean salary, one labelled
# point per skill. The blue lines mark the median of each axis and split the
# plot into four quadrants.
# There is no line layer: with colour mapped to `skills` every group holds a
# single point, so a line geom draws nothing and only emits a message. The
# labels need no position adjustment for the same reason.
ggplot(demand_salary, aes(x = Mean_salary, y = perc, color = skills)) +
  geom_point() +
  geom_text(aes(label = skills), size = 3, hjust = -0.15) +
  geom_hline(yintercept = median(demand_salary$perc), color = "blue") +
  geom_vline(xintercept = median(demand_salary$Mean_salary), color = "blue") +
  xlab("Average Salary") +
  ylab("Demand")

Conclusion

  1. For an entry- to mid-level data scientist, the most valued skills are Python, R, SQL, machine learning, and statistics.
  2. From a salary perspective, software development, cluster computing, and deep learning are also valued as bonus skills.