library("tidyverse")
library("rvest")
library("stringi")
library("xml2")
library("kableExtra")
library(RCurl)
library(plyr)
library(RColorBrewer)
library(dplyr)
library(ggplot2)
library(tm)
library(wordcloud)
library(tidytext)
library(xtable)
library(readr)
library(tidytext)
library(knitr)
library(phrasemachine)
library(quanteda)
library(tidyr)
library(scales)
library(forcats)
# Fetch the Indeed search-results page for full-time data scientist positions.
url <- "https://www.indeed.com/jobs?q=data+scientist&jt=fulltime"
page <- read_html(url)

# Extract the href of every rel="nofollow" element (links on the left side of
# the page), starting from the page's <li> nodes.
# NOTE(review): the XPath starts with `//`, which is an absolute path; confirm
# whether the <li> step actually narrows the result before touching it, because
# the index range below relies on the current result order.
location <- html_attr(
  html_nodes(
    html_nodes(page, "li"),
    xpath = '//*[@rel="nofollow"]'
  ),
  "href"
)

# Select location urls by position: 8:12 would be the top 5 only,
# 8:21 keeps all locations.
# location2 <- location[c(8:12)]
location2 <- location[8:21] ### All locations
# Load the saved postings, then convert both free-text columns from
# Windows-1252 to UTF-8 in a single assignment.
fullDf <- read.csv("fullDf.csv")
fullDf[c("job_description", "jobTitle")] <- lapply(
  fullDf[c("job_description", "jobTitle")],
  iconv,
  from = "WINDOWS-1252",
  to = "UTF-8"
)
# Content transformer that replaces every match of `pattern` with one space.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

# Apply the shared text-cleaning pipeline to a tm corpus and return the result.
#
# The same eight steps were previously written out twice (once for job
# descriptions, once for job titles); keeping them in one place guarantees
# both corpora are cleaned identically. The step order is unchanged.
#
# corpus: a tm corpus (here built with VCorpus(VectorSource(...))).
clean_corpus <- function(corpus) {
  corpus %>%
    tm_map(toSpace, "/") %>%
    tm_map(toSpace, "@") %>%
    tm_map(toSpace, "\\|") %>%
    tm_map(content_transformer(tolower)) %>%      # transform to lower case
    tm_map(removeNumbers) %>%                     # remove numbers
    tm_map(removeWords, stopwords("english")) %>% # remove common English stopwords
    tm_map(removePunctuation) %>%                 # remove punctuation
    tm_map(stripWhitespace)                       # collapse extra white space
}

# Cleaned corpora for the job descriptions and the job titles.
jobdesc <- clean_corpus(VCorpus(VectorSource(fullDf$job_description)))
jobtitle <- clean_corpus(VCorpus(VectorSource(fullDf$jobTitle)))
# Total count of each term across all job titles, most frequent first.
title_word_freq <- sort(
  rowSums(as.matrix(TermDocumentMatrix(jobtitle))),
  decreasing = TRUE
)

# Same counts as a two-column data frame (term, frequency); printed below.
wf_df <- data.frame(
  word = names(title_word_freq),
  freq = title_word_freq
)
wf_df
## word freq
## scientist scientist 2202
## data data 2055
## engineer engineer 437
## senior senior 367
## analyst analyst 319
## research research 289
## learning learning 214
## machine machine 202
## associate associate 166
## analytics analytics 127
## science science 106
## manager manager 100
## software software 91
## developer developer 87
## staff staff 81
## product product 75
## – – 68
## healthcare healthcare 68
## principal principal 68
## processing processing 68
## consultant consultant 63
## business business 62
## analysis analysis 61
## computer computer 58
## lead lead 56
## operations operations 56
## development development 52
## language language 49
## multiple multiple 49
## natural natural 49
## intern intern 48
## biology biology 46
## intelligence intelligence 46
## statistician statistician 45
## quantitative quantitative 44
## statistical statistical 43
## bioinformatics bioinformatics 41
## team team 41
## applied applied 40
## director director 40
## claims claims 39
## junior junior 37
## pharmaceutics pharmaceutics 36
## remote remote 36
## engineering engineering 35
## level level 35
## systems systems 33
## discovery discovery 32
## services services 32
## technical technical 31
## technology technology 30
## wetland wetland 30
## clinical clinical 29
## customer customer 29
## risk risk 29
## sales sales 29
## programmer programmer 28
## visualization visualization 28
## social social 27
## new new 26
## nlp nlp 26
## python python 26
## iii iii 25
## informatics informatics 25
## specialist specialist 25
## cell cell 24
## graduate graduate 23
## insurance insurance 23
## positions positions 23
## assistant assistant 22
## levels levels 22
## measurement measurement 22
## strategy strategy 22
## support support 22
## analytic analytic 21
## management management 21
## process process 21
## university university 21
## york york 21
## biomarker biomarker 20
## engineers engineers 20
## solution solution 20
## acenterprise acenterprise 19
## alexa alexa 19
## genetics genetics 19
## geospatial geospatial 19
## labs labs 19
## platform platform 19
## sciences sciences 19
## summer summer 19
## trainee trainee 19
## ace ace 18
## computational computational 18
## implementation implementation 18
## javascript javascript 18
## market market 18
## prn prn 18
## santa santa 18
## startup startup 18
## system system 18
## xrd xrd 18
## xrr xrr 18
## yield yield 18
## ambulance ambulance 17
## cardiology cardiology 17
## mid mid 17
## paramedic paramedic 17
## sensor sensor 17
## signal signal 17
## solaria solaria 17
## canton canton 16
## finance finance 16
## internship internship 16
## modeling modeling 16
## advanced advanced 15
## marketing marketing 15
## analytical analytical 14
## application application 14
## architect architect 14
## bioinformatician bioinformatician 14
## chain chain 14
## coordinator coordinator 14
## decision decision 14
## modeler modeler 14
## qlik qlik 14
## qlikview qlikview 14
## sense sense 14
## supply supply 14
## advisor advisor 13
## cloud cloud 13
## devops devops 13
## digital digital 13
## required required 13
## researcher researcher 13
## deep deep 12
## google google 12
## health health 12
## program program 12
## amazon amazon 11
## brokerage brokerage 11
## consumer consumer 11
## global global 11
## intermediate intermediate 11
## molecular molecular 11
## role role 11
## security security 11
## transitions transitions 11
## vision vision 11
## atlanta atlanta 10
## big big 10
## entry entry 10
## optimization optimization 10
## rna rna 10
## tech tech 10
## voice voice 10
## algorithms algorithms 9
## cellular cellular 9
## designer designer 9
## drug drug 9
## experience experience 9
## facing facing 9
## midlevel midlevel 9
## quality quality 9
## client client 8
## database database 8
## experienced experienced 8
## full full 8
## groundwater groundwater 8
## hydrogeologist hydrogeologist 8
## insights insights 8
## models models 8
## performance performance 8
## phd phd 8
## project project 8
## chemistry chemistry 7
## chief chief 7
## design design 7
## experimentation experimentation 7
## expert expert 7
## fellow fellow 7
## financial financial 7
## gec gec 7
## information information 7
## lab lab 7
## librarian librarian 7
## line line 7
## manufacturing manufacturing 7
## payer payer 7
## personalization personalization 7
## policy policy 7
## pricing pricing 7
## solutions solutions 7
## strategic strategic 7
## adtech adtech 6
## applications applications 6
## artificial artificial 6
## bwh bwh 6
## emt emt 6
## epic epic 6
## forecasting forecasting 6
## oncology oncology 6
## postdoctoral postdoctoral 6
## sas sas 6
## test test 6
## advertising advertising 5
## biologist biologist 5
## brain brain 5
## cambridge cambridge 5
## care care 5
## configuration configuration 5
## detection detection 5
## device device 5
## dmpk dmpk 5
## early early 5
## employee employee 5
## environmental environmental 5
## fixedterm fixedterm 5
## food food 5
## government government 5
## growth growth 5
## identity identity 5
## innovation innovation 5
## integration integration 5
## medical medical 5
## microbiology microbiology 5
## nlu nlu 5
## patient patient 5
## payment payment 5
## planning planning 5
## quantumblack quantumblack 5
## reporting reporting 5
## resident resident 5
## scientific scientific 5
## scientistcell scientistcell 5
## scientists scientists 5
## shopping shopping 5
## success success 5
## telecom telecom 5
## training training 5
## unit unit 5
## validation validation 5
## vudu vudu 5
## analystsenior analystsenior 4
## anomaly anomaly 4
## automated automated 4
## automation automation 4
## backend backend 4
## center center 4
## cientãfico cientãfico 4
## community community 4
## computing computing 4
## conference conference 4
## contact contact 4
## coop coop 4
## core core 4
## datos datos 4
## department department 4
## dir dir 4
## enterprise enterprise 4
## governance governance 4
## improvement improvement 4
## investment investment 4
## mechanical mechanical 4
## midcareer midcareer 4
## national national 4
## network network 4
## nsbe nsbe 4
## office office 4
## openings openings 4
## people people 4
## pharmacology pharmacology 4
## platforms platforms 4
## premium premium 4
## products products 4
## protein protein 4
## public public 4
## purchasing purchasing 4
## resources resources 4
## safety safety 4
## san san 4
## technician technician 4
## telecommute telecommute 4
## testing testing 4
## trainer trainer 4
## transportation transportation 4
## trial trial 4
## uber uber 4
## ubereverything ubereverything 4
## video video 4
## warehouse warehouse 4
## abq abq 3
## abstractor abstractor 3
## administration administration 3
## administrative administrative 3
## administrator administrator 3
## aerodynamics aerodynamics 3
## asr asr 3
## assoc assoc 3
## assurance assurance 3
## autonomy autonomy 3
## benchmarking benchmarking 3
## boston boston 3
## cancer cancer 3
## cardiovascular cardiovascular 3
## ccsd ccsd 3
## cipher cipher 3
## clearance clearance 3
## clinicogenomics clinicogenomics 3
## content content 3
## controls controls 3
## days days 3
## diego diego 3
## dna dna 3
## economist economist 3
## edge edge 3
## emergency emergency 3
## eosl eosl 3
## evaluation evaluation 3
## excellence excellence 3
## expansion expansion 3
## forensic forensic 3
## gene gene 3
## gis gis 3
## head head 3
## hours hours 3
## hpw hpw 3
## immunooncology immunooncology 3
## instructor instructor 3
## journeyman journeyman 3
## leader leader 3
## ltc ltc 3
## markets markets 3
## material material 3
## matter matter 3
## medicine medicine 3
## mgr mgr 3
## neurology neurology 3
## next next 3
## nonphd nonphd 3
## owner owner 3
## payments payments 3
## perception perception 3
## personal personal 3
## php php 3
## physiology physiology 3
## practice practice 3
## predictive predictive 3
## presales presales 3
## prime prime 3
## privacy privacy 3
## production production 3
## programming programming 3
## programs programs 3
## recovery recovery 3
## redtech redtech 3
## relationship relationship 3
## retail retail 3
## robotic robotic 3
## search search 3
## seattle seattle 3
## seller seller 3
## service service 3
## silicon silicon 3
## simulation simulation 3
## small small 3
## snl snl 3
## ssc ssc 3
## stack stack 3
## start start 3
## states states 3
## statistics statistics 3
## strategist strategist 3
## subject subject 3
## supervisor supervisor 3
## technologies technologies 3
## therapy therapy 3
## time time 3
## trading trading 3
## translational translational 3
## tso tso 3
## undergraduate undergraduate 3
## united united 3
## valuation valuation 3
## vcsel vcsel 3
## vitro vitro 3
## water water 3
## web web 3
## windows windows 3
## accelerator accelerator 2
## account account 2
## acoustics acoustics 2
## acquisition acquisition 2
## actuarial actuarial 2
## ads ads 2
## aeronautical aeronautical 2
## aide aide 2
## air air 2
## aircraft aircraft 2
## allocation allocation 2
## amrd amrd 2
## analystassociate analystassociate 2
## antibodies antibodies 2
## antibody antibody 2
## antifraud antifraud 2
## appeals appeals 2
## apps apps 2
## architecture architecture 2
## askhr askhr 2
## asset asset 2
## austin austin 2
## basic basic 2
## behavioral behavioral 2
## bellevue bellevue 2
## biochemistry biochemistry 2
## biological biological 2
## boarding boarding 2
## boeing boeing 2
## candidates candidates 2
## career career 2
## carnegie carnegie 2
## centennial centennial 2
## chemical chemical 2
## childrens childrens 2
## cluster cluster 2
## cnn cnn 2
## coding coding 2
## cognitive cognitive 2
## commercial commercial 2
## complement complement 2
## compound compound 2
## consultants consultants 2
## consulting consulting 2
## contracting contracting 2
## control control 2
## cooper cooper 2
## coops coops 2
## credentialed credentialed 2
## culture culture 2
## customers customers 2
## day day 2
## deidentification deidentification 2
## dependency dependency 2
## diagnostics diagnostics 2
## diet diet 2
## distinguished distinguished 2
## distribution distribution 2
## diversity diversity 2
## dynamics dynamics 2
## ecommerce ecommerce 2
## editing editing 2
## education education 2
## electromagnetic electromagnetic 2
## electrophysics electrophysics 2
## encoding encoding 2
## engineerentry engineerentry 2
## english english 2
## everything everything 2
## experiencepresbyterian experiencepresbyterian 2
## experimentalist experimentalist 2
## fellowship fellowship 2
## fermentation fermentation 2
## field field 2
## first first 2
## fixed fixed 2
## flatiron flatiron 2
## foodservice foodservice 2
## formulations formulations 2
## fraud fraud 2
## fullstack fullstack 2
## generalist generalist 2
## geochemistry geochemistry 2
## globalgiving globalgiving 2
## grad grad 2
## grievance grievance 2
## hadoop hadoop 2
## healthrules healthrules 2
## high high 2
## human human 2
## icl icl 2
## image image 2
## imaging imaging 2
## immunology immunology 2
## inclusion inclusion 2
## infrastructure infrastructure 2
## institution institution 2
## international international 2
## interns interns 2
## investigator investigator 2
## investments investments 2
## java java 2
## job job 2
## jpal jpal 2
## laser laser 2
## loads loads 2
## managerassociate managerassociate 2
## managercare managercare 2
## managerinformatics managerinformatics 2
## managersr managersr 2
## managing managing 2
## map map 2
## maps maps 2
## marine marine 2
## mathematical mathematical 2
## mba mba 2
## mechanic mechanic 2
## member member 2
## metrics metrics 2
## mexico mexico 2
## mgmt mgmt 2
## mobile mobile 2
## music music 2
## neuroscience neuroscience 2
## north north 2
## nyc nyc 2
## officer officer 2
## ops ops 2
## paranoids paranoids 2
## partner partner 2
## partners partners 2
## payor payor 2
## peacekeeping peacekeeping 2
## pharmacovigilance pharmacovigilance 2
## pharmacy pharmacy 2
## physics physics 2
## plan plan 2
## point point 2
## portfolio portfolio 2
## postdoc postdoc 2
## president president 2
## principle principle 2
## procure procure 2
## professor professor 2
## promotions promotions 2
## propulsion propulsion 2
## qualitative qualitative 2
## quotes quotes 2
## recruiter recruiter 2
## region region 2
## repair repair 2
## reports reports 2
## representative representative 2
## residency residency 2
## review review 2
## sale sale 2
## sandia sandia 2
## school school 2
## sci sci 2
## science—postdoctoral science—postdoctoral 2
## sea sea 2
## sensory sensory 2
## series series 2
## server server 2
## shared shared 2
## shiny shiny 2
## slm slm 2
## spectroscopy spectroscopy 2
## speech speech 2
## sql sql 2
## storage storage 2
## structural structural 2
## structurebased structurebased 2
## student student 2
## students students 2
## studies studies 2
## systemoperations systemoperations 2
## talent talent 2
## technologytransform technologytransform 2
## threat threat 2
## tmt tmt 2
## travel travel 2
## urgent urgent 2
## vehicle vehicle 2
## vice vice 2
## vpquality vpquality 2
## weather weather 2
## wound wound 2
## writer writer 2
## aav aav 1
## acceleration acceleration 1
## accountant accountant 1
## accounting accounting 1
## aco aco 1
## adme adme 1
## admin admin 1
## admitting admitting 1
## advancement advancement 1
## advertiser advertiser 1
## advice advice 1
## advocate advocate 1
## aerospace aerospace 1
## afc afc 1
## affairs affairs 1
## agfs agfs 1
## agl agl 1
## algorithmic algorithmic 1
## ambulatory ambulatory 1
## america america 1
## aml aml 1
## analysist analysist 1
## analystexpert analystexpert 1
## analystintermediate analystintermediate 1
## analystjunior analystjunior 1
## android android 1
## antifinancial antifinancial 1
## antimoney antimoney 1
## apis apis 1
## app app 1
## archaeologist archaeologist 1
## archi archi 1
## aritysenior aritysenior 1
## array array 1
## arrays arrays 1
## art art 1
## assay assay 1
## assays assays 1
## assignment assignment 1
## assisitance assisitance 1
## assistance assistance 1
## asst asst 1
## atas atas 1
## atl atl 1
## ats ats 1
## auditor auditor 1
## autoimmunity autoimmunity 1
## aws aws 1
## back back 1
## banking banking 1
## based based 1
## bed bed 1
## behavior behavior 1
## benefits benefits 1
## bie bie 1
## bikes bikes 1
## bilingual bilingual 1
## bioassay bioassay 1
## bioassays bioassays 1
## biologics biologics 1
## biophotonics biophotonics 1
## bioprocess bioprocess 1
## biosensing biosensing 1
## bioviasoftware bioviasoftware 1
## blm blm 1
## bomoda bomoda 1
## bone bone 1
## bos bos 1
## brumback brumback 1
## busi busi 1
## camera camera 1
## capital capital 1
## case case 1
## cataloging cataloging 1
## ccds ccds 1
## cdc cdc 1
## central central 1
## change change 1
## channel channel 1
## chemist chemist 1
## child child 1
## chinese chinese 1
## cimd cimd 1
## clarity clarity 1
## clerical clerical 1
## clinic clinic 1
## cloudscale cloudscale 1
## collaborative collaborative 1
## college college 1
## commissioning commissioning 1
## compensation compensation 1
## compliance compliance 1
## computation computation 1
## computingmachine computingmachine 1
## consciousness consciousness 1
## contract contract 1
## coordination coordination 1
## cost cost 1
## counsel counsel 1
## credit credit 1
## crime crime 1
## cryoem cryoem 1
## cto cto 1
## curator curator 1
## cyber cyber 1
## cybersecurity cybersecurity 1
## cycle cycle 1
## decisionsource decisionsource 1
## deparmtent deparmtent 1
## deployments deployments 1
## des des 1
## desk desk 1
## develoment develoment 1
## devices devices 1
## dinetah dinetah 1
## disaster disaster 1
## disease disease 1
## diseases diseases 1
## disincentives disincentives 1
## disorders disorders 1
## dissolution dissolution 1
## division division 1
## docsis docsis 1
## domain domain 1
## downstream downstream 1
## driving driving 1
## dsme dsme 1
## ebi ebi 1
## ecology ecology 1
## economics economics 1
## economy economy 1
## ecosystem ecosystem 1
## educator educator 1
## efficacy efficacy 1
## eicoff eicoff 1
## electrical electrical 1
## electrician electrician 1
## eligibi eligibi 1
## eligibility eligibility 1
## elint elint 1
## emerging emerging 1
## emobility emobility 1
## end end 1
## energy energy 1
## eng eng 1
## engagement engagement 1
## engineerintermediate engineerintermediate 1
## engineermobility engineermobility 1
## engineerperception engineerperception 1
## enrollment enrollment 1
## entrylevel entrylevel 1
## etl etl 1
## euv euv 1
## excel excel 1
## exoplanet exoplanet 1
## exp exp 1
## experimental experimental 1
## expertise expertise 1
## exploring exploring 1
## exports exports 1
## external external 1
## facilities facilities 1
## fall fall 1
## federal federal 1
## fees fees 1
## fema fema 1
## fidelity fidelity 1
## films films 1
## finished finished 1
## firmware firmware 1
## fishing fishing 1
## flavormint flavormint 1
## flext flext 1
## flight flight 1
## fluent fluent 1
## flyer flyer 1
## foundation foundation 1
## foundations foundations 1
## framework framework 1
## franciscomember franciscomember 1
## free free 1
## frm frm 1
## gca gca 1
## general general 1
## generation generation 1
## genome genome 1
## genomic genomic 1
## genomics genomics 1
## geoinnovation geoinnovation 1
## geologist geologist 1
## geophysics geophysics 1
## gig gig 1
## glycochemistry glycochemistry 1
## gmc gmc 1
## gnf gnf 1
## goods goods 1
## groupmolecular groupmolecular 1
## grp grp 1
## gsp gsp 1
## hardware hardware 1
## highly highly 1
## hiv hiv 1
## house house 1
## howard howard 1
## hplc hplc 1
## hrs hrs 1
## hurricane hurricane 1
## hvac hvac 1
## idea idea 1
## iiiprogram iiiprogram 1
## illegal illegal 1
## immunogenomics immunogenomics 1
## impurity impurity 1
## industrial industrial 1
## industry industry 1
## infections infections 1
## infectious infectious 1
## informational informational 1
## institute institute 1
## institutional institutional 1
## integrated integrated 1
## integrator integrator 1
## integrity integrity 1
## internet internet 1
## interpretation interpretation 1
## intl intl 1
## inventory inventory 1
## invest invest 1
## ios ios 1
## iot iot 1
## iss iss 1
## jolla jolla 1
## journalist journalist 1
## journey journey 1
## juã±ior juã±ior 1
## knowledge knowledge 1
## laboratory laboratory 1
## laundering laundering 1
## licensing licensing 1
## lifecyle lifecyle 1
## limited limited 1
## linux linux 1
## louis louis 1
## maintenance maintenance 1
## managemt managemt 1
## marketplace marketplace 1
## marrow marrow 1
## master master 1
## mediamonitors mediamonitors 1
## medium medium 1
## metabolic metabolic 1
## metabolism metabolism 1
## metadata metadata 1
## method method 1
## mgt mgt 1
## microbiome microbiome 1
## midwest midwest 1
## migration migration 1
## mission mission 1
## mls mls 1
## mobility mobility 1
## molecule molecule 1
## molecules molecules 1
## mortgages mortgages 1
## mso mso 1
## must must 1
## networks networks 1
## neurodegeneration neurodegeneration 1
## neurodevelopmental neurodevelopmental 1
## neuropsychology neuropsychology 1
## nga nga 1
## nih nih 1
## non non 1
## nonclinical nonclinical 1
## novartis novartis 1
## nrsa nrsa 1
## nsg nsg 1
## numerical numerical 1
## offers offers 1
## officersoftware officersoftware 1
## offset offset 1
## online online 1
## open open 1
## optical optical 1
## optimize optimize 1
## orders orders 1
## osse osse 1
## packaging packaging 1
## painter painter 1
## pandey pandey 1
## peer peer 1
## per per 1
## permitted permitted 1
## pharmacistabq pharmacistabq 1
## physical physical 1
## planner planner 1
## plannerentry plannerentry 1
## pmg pmg 1
## poly poly 1
## population population 1
## populations populations 1
## position position 1
## ppmo ppmo 1
## practica practica 1
## precision precision 1
## preclinical preclinical 1
## predoctoral predoctoral 1
## preformulation preformulation 1
## prinicpal prinicpal 1
## processor processor 1
## procurement procurement 1
## professional professional 1
## professionals professionals 1
## projects projects 1
## protections protections 1
## proteogenomics proteogenomics 1
## proteomic proteomic 1
## proteostasis proteostasis 1
## psychological psychological 1
## purification purification 1
## pva pva 1
## qualcomm qualcomm 1
## quant quant 1
## quantum quantum 1
## quip quip 1
## rankings rankings 1
## receiving receiving 1
## recombinant recombinant 1
## recommendation recommendation 1
## reference reference 1
## registrar registrar 1
## regulatory regulatory 1
## rep rep 1
## reporter reporter 1
## repository repository 1
## resource resource 1
## respiratory respiratory 1
## retrofit retrofit 1
## revenue revenue 1
## richmond richmond 1
## robotics robotics 1
## room room 1
## runner runner 1
## sagemaker sagemaker 1
## satellite satellite 1
## scala scala 1
## scientist—demand scientist—demand 1
## scientistchromosome scientistchromosome 1
## scientistcross scientistcross 1
## scientisthux scientisthux 1
## scooters scooters 1
## scrum scrum 1
## sde sde 1
## seã±ior seã±ior 1
## secret secret 1
## secretary secretary 1
## sectorcognitive sectorcognitive 1
## seniorbig seniorbig 1
## sequencing sequencing 1
## serv serv 1
## side side 1
## simulations simulations 1
## single single 1
## siri siri 1
## sleep sleep 1
## som som 1
## sound sound 1
## space space 1
## spec spec 1
## specialistfluent specialistfluent 1
## sponsored sponsored 1
## stafffulltime stafffulltime 1
## stat stat 1
## stem stem 1
## sterilization sterilization 1
## stl stl 1
## strain strain 1
## strategies strategies 1
## structure structure 1
## subcontract subcontract 1
## supervisory supervisory 1
## supporting supporting 1
## survey survey 1
## sys sys 1
## systematic systematic 1
## tcr tcr 1
## technologist technologist 1
## techstudentjunior techstudentjunior 1
## techtelcomedia techtelcomedia 1
## telecommunications telecommunications 1
## temporary temporary 1
## tess tess 1
## texas texas 1
## thin thin 1
## things things 1
## tigl tigl 1
## tool tool 1
## tools tools 1
## top top 1
## trader trader 1
## transfer transfer 1
## transformation transformation 1
## transiting transiting 1
## transparency transparency 1
## transplant transplant 1
## treatment treatment 1
## trms trms 1
## undergrad undergrad 1
## upstream upstream 1
## vaccines vaccines 1
## vector vector 1
## vii vii 1
## virology virology 1
## vivo vivo 1
## washington washington 1
## webex webex 1
## week week 1
## welfare welfare 1
## westside westside 1
## winter winter 1
## without without 1
## work work 1
## workforce workforce 1
## world world 1
## zillow zillow 1
## zoro zoro 1
# Horizontal bar chart of the 40 most frequent job-title words,
# ordered by frequency and shaded by frequency.
ggplot(
  head(wf_df, 40),
  aes(x = reorder(word, freq), y = freq, fill = freq)
) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(
    title = "Frequency of Indeed Data Scientist Job title",
    x = "Words",
    y = "Frequency"
  )
# Flag postings whose title contains any of the data-science keywords
# (case-insensitive), then tabulate how many titles match.
fullDf$DS_title <- grepl(
  "(data|science|machine|analytics|scientist|engineer)",
  fullDf$jobTitle,
  ignore.case = TRUE
)
table(fullDf$DS_title)
##
## FALSE TRUE
## 566 2881
# Add one logical column per skill, TRUE when the job description mentions it.
# Column names and their order are unchanged, because later code selects these
# columns by position.
#
# Pattern fixes relative to the previous version:
#   * csharp: the old pattern ended with a word boundary after "#", which cannot
#     match when "#" is followed by a space, punctuation or the end of the
#     text, so "C#" was practically never detected.
#   * nlp: "nature language " was a typo for "natural language".
#   * perl / excel / scala / aws: anchored with word boundaries so they no
#     longer match inside "properly", "excellent", "scalable", "laws", etc.
#   * C: a standalone "C" directly followed by "+" or "#" (i.e. "C++" / "C#")
#     is no longer counted as plain C.
fullDf <- fullDf %>%
  mutate(
    #### Technical skills
    R = grepl("\\bR\\b", job_description, ignore.case = TRUE),
    python = grepl("python", job_description, ignore.case = TRUE),
    SQL = grepl("SQL", job_description, ignore.case = TRUE),
    hadoop = grepl("hadoop", job_description, ignore.case = TRUE),
    perl = grepl("\\bperl\\b", job_description, ignore.case = TRUE),
    C = grepl("\\bc\\b([^+#]|$)", job_description, ignore.case = TRUE),
    aws = grepl("\\baws\\b", job_description, ignore.case = TRUE),
    excel = grepl("\\bexcel\\b", job_description, ignore.case = TRUE),
    nosql = grepl("nosql", job_description, ignore.case = TRUE),
    linux = grepl("linux", job_description, ignore.case = TRUE),
    azure = grepl("Azure", job_description, ignore.case = TRUE),
    sas = grepl("\\bsas\\b", job_description, ignore.case = TRUE),
    # fixed = TRUE so that "+" is taken literally (case-sensitive match).
    Cplusplus = grepl("C++", job_description, fixed = TRUE),
    VB = grepl("VB", job_description, ignore.case = TRUE),
    # Trailing boundary keeps "javascript" from counting as java.
    java = grepl("java\\b", job_description, ignore.case = TRUE),
    csharp = grepl("\\bc#", job_description, ignore.case = TRUE),
    scala = grepl("\\bscala\\b", job_description, ignore.case = TRUE),
    tensorflow = grepl("tensorflow|\\btf\\b", job_description, ignore.case = TRUE),
    javascript = grepl("javascript", job_description, ignore.case = TRUE),
    spark = grepl("spark", job_description, ignore.case = TRUE),
    bi = grepl("(\\bbi\\b|business intelligence)", job_description, ignore.case = TRUE),
    #### General skills
    ml = grepl("(\\bml\\b|machine learning)", job_description, ignore.case = TRUE),
    stat = grepl("statis", job_description, ignore.case = TRUE),
    visual = grepl("visual", job_description, ignore.case = TRUE),
    deep_learn = grepl("(deep learning|neural net)", job_description, ignore.case = TRUE),
    nlp = grepl("(nlp|natural language)", job_description, ignore.case = TRUE),
    math = grepl("(mathematics)", job_description, ignore.case = TRUE),
    AI = grepl("(artificial intelligence|\\bai\\b)", job_description, ignore.case = TRUE),
    software_dev = grepl("software development|software engineer", job_description, ignore.case = TRUE),
    analysis = grepl("(analytics|critical thinking)", job_description, ignore.case = TRUE),
    project_management = grepl("project management", job_description, ignore.case = TRUE),
    data_engineer = grepl("data engineering", job_description, ignore.case = TRUE)
  )
# Long-format skill flags (columns 9 to 40 of fullDf) for postings whose title
# was flagged as data-science related: one row per posting/skill pair.
skill_unlist <- fullDf[, 9:40] %>%
  filter(fullDf$DS_title == TRUE) %>%
  gather(skills, Number, 1:32, factor_key = TRUE)

# Mean of the logical flag per skill, i.e. the share of those postings
# that mention the skill.
skill_ranking <- setNames(
  aggregate(skill_unlist$Number, by = list(skill_unlist$skills), FUN = mean),
  c("skills", "perc")
)
# Horizontal bar chart of the share of postings mentioning each skill, with the
# rounded share printed next to each bar.
# The labels are drawn without position_dodge(): there is one label per bar, so
# dodging moved nothing and only raised "position_dodge requires
# non-overlapping x intervals". A stray empty argument in aes() is removed too.
ggplot(skill_ranking, aes(x = reorder(skills, perc), y = perc, fill = perc)) +
  geom_bar(stat = "identity") +
  labs(title = "Frequency of Skills Indeed Data Scientist Job Postings",
       x = "skills", y = "Frequency / total posting") +
  coord_flip() +
  geom_text(aes(label = round(perc, 2)), size = 3, hjust = -0.1)
# Word cloud of skills sized by their share of postings; the seed is fixed
# before drawing so the layout is reproducible.
set.seed(1234)
wordcloud(
  words = skill_ranking$skills,
  freq = skill_ranking$perc,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2")
)
# Split each salary string on "-" into a lower and an upper part (one matrix
# row per posting).
salary_ls <- do.call(
  rbind,
  strsplit(as.character(fullDf$jobsalary), "-", fixed = TRUE)
)

# Keep the run of digits that directly follows "$" in each part.
# NOTE(review): the capture stops at the first non-digit, so "$62,500" yields
# 62 and "$37.50" yields 37; the x1000 factor for yearly figures below
# presumably compensates for the dropped thousands group -- confirm.
fullDf["salary_lower"] <- as.numeric(
  gsub("\\$([0-9]+).*$", "\\1", salary_ls[, 1])
)
fullDf["salary_higher"] <- as.numeric(
  gsub("\\$([0-9]+).*$", "\\1", salary_ls[, 2])
)

# Pay period named in the salary string: "year", "hour", or NA when neither
# word appears.
fullDf["salary_unit"] <- ifelse(
  grepl("\\byear\\b", fullDf$jobsalary, ignore.case = TRUE),
  "year",
  ifelse(
    grepl("\\bhour\\b", fullDf$jobsalary, ignore.case = TRUE),
    "hour",
    NA
  )
)

# Put both bounds on one scale (yearly values x 1000, hourly values x 37.5 * 52)
# and take their midpoint. The three columns are appended in this order.
fullDf <- fullDf %>%
  mutate(
    salary_lower_unified = salary_lower *
      ifelse(salary_unit == "year", 1000,
             ifelse(salary_unit == "hour", 37.5 * 52, NA)),
    salary_higher_unified = salary_higher *
      ifelse(salary_unit == "year", 1000,
             ifelse(salary_unit == "hour", 37.5 * 52, NA)),
    mean_salary_unified = (salary_lower_unified + salary_higher_unified) / 2
  )
# Summary statistics of the midpoint salary for postings with a
# data-science title.
fullDf %>%
  filter(DS_title == TRUE) %>%
  select(mean_salary_unified) %>%
  summary()
## mean_salary_unified
## Min. : 62500
## 1st Qu.:100000
## Median :118500
## Mean :125581
## 3rd Qu.:135000
## Max. :275000
## NA's :2617
# Histogram (bin width 10000) of the midpoint salary for data-science postings
# that have a salary.
# The fill colour is now set on the layer instead of inside aes(): inside aes()
# the string "white" was treated as a data value, which produced a legend entry
# named "white" rather than white bars. A black outline is added so the white
# bars stay visible.
# Optional extra filters that were tried and left disabled:
#   mean_salary_unified != 275000
#   mean_salary_unified >= 60000
ggplot(
  fullDf %>% filter(!is.na(mean_salary_unified) & DS_title == TRUE),
  aes(x = mean_salary_unified)
) +
  geom_histogram(binwidth = 10000, alpha = .5, position = "identity",
                 fill = "white", colour = "black") +
  labs(title = "Data Scientist Salary distribution",
       x = "Salary", y = "Frequency")
# One row per (posting, skill) pair for data-science postings: skill flags
# (columns 9 to 40) reshaped to long format next to the midpoint salary
# (column 46). Only pairs where the skill is mentioned and a salary is present
# are kept.
skill_salary <- fullDf[, c(9:40, 46)] %>%
  filter(fullDf$DS_title == TRUE) %>%
  gather(skills, Number, 1:32, factor_key = TRUE) %>%
  filter(
    Number == TRUE,
    !is.na(mean_salary_unified),
    mean_salary_unified != 275000 ### Remove salary outlier
  ) %>%
  select(skills, mean_salary_unified)
head(skill_salary)
## skills mean_salary_unified
## 1 R 141500
## 2 R 141500
## 3 R 141500
## 4 R 131000
## 5 R 118500
## 6 R 135000
# Box plot of salary per skill, skills ordered by their mean salary.
# Columns are referenced by bare name inside aes(); the previous
# `skill_salary$...` form bypasses ggplot2's data argument, which ggplot2
# discourages.
p <- ggplot(
  skill_salary,
  aes(x = reorder(skills, mean_salary_unified, FUN = mean),
      y = mean_salary_unified,
      color = skills)
) +
  geom_boxplot() +
  coord_flip() +
  xlab("Skills") +
  ylab("Salary")
p
# Mean salary per skill, sorted from highest to lowest, with readable
# column names.
skill_mean_salary <- setNames(
  arrange(
    aggregate(
      skill_salary$mean_salary_unified,
      by = list(skill_salary$skills),
      FUN = mean
    ),
    desc(x)
  ),
  c("skills", "Mean_salary")
)
# Horizontal bar chart of the average salary per skill, with the rounded
# average printed on each bar.
# The labels are drawn without position_dodge(): there is one label per bar, so
# dodging moved nothing and only raised "position_dodge requires
# non-overlapping x intervals".
ggplot(
  skill_mean_salary,
  aes(x = reorder(skills, Mean_salary), y = Mean_salary, fill = Mean_salary)
) +
  geom_bar(stat = "identity") +
  labs(title = "Average Salary by Skillsets",
       x = "skills", y = "Average Salary") +
  coord_flip() +
  geom_text(aes(label = round(Mean_salary)), size = 3)
# Join average salary and share of postings per skill.
demand_salary <- merge(skill_mean_salary, skill_ranking, by = "skills")

# Scatter plot of demand (share of postings) against average salary, one
# labelled point per skill; the blue lines mark the median of each axis.
# geom_line() was dropped: with colour mapped to skill every group holds a
# single point, so no line could be drawn and ggplot2 only reported "Each group
# consists of only one observation". position_dodge() on the labels was dropped
# for the same reason (one label per group, nothing to dodge).
ggplot(demand_salary, aes(x = Mean_salary, y = perc, color = skills)) +
  geom_point() +
  geom_text(aes(label = skills), size = 3, hjust = -0.15) +
  geom_hline(yintercept = median(demand_salary$perc), color = "blue") +
  geom_vline(xintercept = median(demand_salary$Mean_salary), color = "blue") +
  xlab("Average Salary") +
  ylab("Demand")