Create functions to scrape the data using the rvest library.
Function to get the salary:
# Get Salary
#
# Follows the salary link found in each search result and collects the text
# of the salary figure shown on the linked page.
#
# Args:
#   sessions: the results page to search; anything html_nodes() accepts.
#   URLlink:  prefix pasted in front of every scraped href to build the
#             salary page URL.
#
# Returns:
#   A character vector of salary strings with "$" and all whitespace removed.
#   Entries that read "NoData" after that cleanup are reported as "0".
getTheSalaries <- function(sessions, URLlink) {
  salary_links <- html_nodes(sessions, css = "#resultsCol li:nth-child(2) a") %>%
    html_attr("href")
  salary_links <- paste0(URLlink, salary_links)

  # read_html() is used instead of the deprecated html(), matching the way
  # the job pages are loaded elsewhere in this file.
  salaries <- lapply(salary_links, function(link) {
    read_html(link) %>%
      html_nodes("#salary_display_table .salary") %>%
      html_text()
  })

  # unlist() flattens the per-page results; a page with no matching node
  # contributes nothing, so the result can be shorter than salary_links.
  salary <- unlist(salaries)
  salary <- gsub("\\$", "", salary)
  salary <- gsub("\\s", "", salary)
  salary[salary == "NoData"] <- "0"
  salary
}
Function to get job title:
# Extract a cleaned-up job title for every node marked itemprop=title.
#
# Args:
#   htmlSession: the results page to search; anything html_nodes() accepts.
#
# Returns:
#   A character vector of titles after these steps, in order:
#   everything up to and including the last comma is removed, everything from
#   the first hyphen onwards is removed, one trailing whitespace character is
#   stripped, and all "/" characters are deleted.
getJobTitle <- function(htmlSession) {
  title_nodes <- html_nodes(htmlSession, "[itemprop=title]")
  titles <- html_text(title_nodes)

  # Greedy ".*," consumes up to the LAST comma in the title.
  titles <- sub(".*,", "", titles)
  # Cut the title at its first hyphen.
  titles <- sub("-.*", "", titles)
  # Removing the hyphenated tail usually leaves one trailing blank.
  titles <- gsub("\\s$", "", titles)

  gsub("/", "", titles, fixed = TRUE)
}
Function to get the Company:
# Extract the company name for every node marked itemprop=hiringOrganization.
#
# Args:
#   htmlSession: the results page to search; anything html_nodes() accepts.
#
# Returns:
#   A character vector of company names with newlines and every other
#   whitespace character removed (so multi-word names are run together).
getCompany <- function(htmlSession) {
  company_nodes <- html_nodes(htmlSession, "[itemprop=hiringOrganization]")
  raw_names <- html_text(company_nodes)

  without_newlines <- gsub("\n", "", raw_names)
  gsub("\\s", "", without_newlines)
}
Function to get the Location:
# Extract the location text for every node marked itemprop=addressLocality.
#
# Args:
#   htmlSession: the results page to search; anything html_nodes() accepts.
#
# Returns:
#   A character vector with the unmodified text of each matching node.
getLocation <- function(htmlSession) {
  location_nodes <- html_nodes(htmlSession, "[itemprop=addressLocality]")
  html_text(location_nodes)
}
Function to get the job description:
# Extract the description text for every node marked itemprop=description.
#
# Args:
#   htmlSession: the results page to search; anything html_nodes() accepts.
#
# Returns:
#   A character vector of descriptions with newline characters removed.
getJobDescription <- function(htmlSession) {
  description_nodes <- html_nodes(htmlSession, "[itemprop=description]")
  descriptions <- html_text(description_nodes)
  gsub("\n", "", descriptions)
}
Function to get the indeed job link:
# Build the Indeed URL of every job posting found on a results page.
#
# Args:
#   htmlSession: the results page to search; anything html_nodes() accepts.
#   pretty:      when equal to 1 the links are returned as markdown
#                "[Link](url)" strings; any other value returns plain URLs.
#
# Returns:
#   A character vector with one link per itemprop=title node, built by
#   prefixing the node's href with "https://www.indeed.com".
getIndeedLink <- function(htmlSession, pretty) {
  hrefs <- html_attr(html_nodes(htmlSession, "[itemprop=title]"), "href")

  plain_links <- paste0("https://www.indeed.com", hrefs)
  markdown_links <- paste0("[Link](https://www.indeed.com", hrefs, ")")

  if (pretty == 1) {
    return(markdown_links)
  }
  plain_links
}
Functions to detect which data science skills and job requirements are mentioned in each job posting
# Extract design skills.
#
# Flags a page as requiring design skills when any <li> element contains the
# text "modeling" or the text "models" (case-sensitive substring match).
#
# Args:
#   passParm: the parsed page to search; anything html_nodes() accepts.
#
# Returns:
#   1 if at least one matching list item exists, otherwise 0.
extractDesignStrings <- function(passParm) {
  modeling_hits <- html_text(
    html_nodes(passParm, xpath = '//li[contains(.,"modeling")]')
  )
  models_hits <- html_text(
    html_nodes(passParm, xpath = '//li[contains(.,"models")]')
  )

  found <- !identical(modeling_hits, character(0)) ||
    !identical(models_hits, character(0))

  if (found) 1 else 0
}
# Extract all other skills.
#
# Flags a page when any <li> element contains the given text
# (case-sensitive substring match via XPath contains()).
#
# Args:
#   passParm:      the parsed page to search; anything html_nodes() accepts.
#   stringTomatch: the text to look for. It is pasted between single quotes
#                  in the XPath expression.
#
# Returns:
#   1 if at least one matching list item exists, otherwise 0.
extractStrings <- function(passParm, stringTomatch) {
  xpath_query <- paste0("//li[contains(.,'", stringTomatch, "')]")
  hits <- html_text(html_nodes(passParm, xpath = xpath_query))

  if (!identical(hits, character(0))) {
    return(1)
  }
  0
}
# Apply extractDesignStrings() and extractStrings() to every job page and
# collect the resulting 0/1 flags for 15 data science skills in a data frame.
#
# Args:
#   URLlinkArray: character vector of job posting URLs.
#
# Returns:
#   A data frame with one row per page that could be read and the numeric
#   columns design, idea, strg, eng, com, al, ml, st, pb, spark, shiny, sql,
#   hadoop, py, r (in that order). Pages that fail to load or parse are
#   skipped with a warning, so the result can have fewer rows than
#   URLlinkArray has elements. Empty input returns a 0-row data frame with
#   the same 15 columns.
extractMatchingStrings <- function(URLlinkArray) {
  # Column name -> text searched for in the page's <li> elements.
  # The matches are case-sensitive substring matches, so "R," also matches
  # any word ending in "R" that is followed by a comma.
  skill_terms <- c(
    idea = "ideas",
    strg = "strategy",
    eng = "engineering",
    com = "communication",
    al = "algorithms",
    ml = "machine learning",
    st = "statistics",
    pb = "probability",
    spark = "Spark",
    shiny = "Shiny",
    sql = "SQL",
    hadoop = "Hadoop",
    py = "Python",
    r = "R,"
  )
  column_order <- c("design", names(skill_terms))

  # seq_along() visits exactly the supplied URLs (no extra NA request).
  rows <- vector("list", length(URLlinkArray))
  for (i in seq_along(URLlinkArray)) {
    url <- URLlinkArray[i]
    rows[[i]] <- tryCatch(
      {
        page <- read_html(url)
        flags <- vapply(
          skill_terms,
          function(term) extractStrings(page, term),
          numeric(1)
        )
        # The row is only kept when all 15 flags were computed, so every
        # column always ends up with the same length.
        c(design = extractDesignStrings(page), flags)
      },
      error = function(e) {
        warning(
          "Skipping job page ", i, " (", url, "): ", conditionMessage(e),
          call. = FALSE
        )
        NULL
      }
    )
  }

  rows <- rows[!vapply(rows, is.null, logical(1))]
  if (length(rows) == 0) {
    empty <- as.data.frame(
      matrix(numeric(0), nrow = 0, ncol = length(column_order))
    )
    names(empty) <- column_order
    return(empty)
  }

  tempdf <- as.data.frame(do.call(rbind, rows))
  tempdf[, column_order, drop = FALSE]
}
# Pad a data frame with all-zero rows so it has as many rows as the data
# frame it will be joined with.
#
# Args:
#   rowToAdd: number of zero rows to append. Values that are missing, not a
#             single number, or smaller than 1 leave df unchanged.
#   df:       data frame with numeric columns to pad.
#
# Returns:
#   df with rowToAdd rows of zeros appended at the end; the column names are
#   kept and the zero rows are as wide as df.
replaceMissingRow <- function(rowToAdd, df) {
  # Guard first: 1:rowToAdd would count down (1, 0) and add rows for 0.
  if (length(rowToAdd) != 1L || is.na(rowToAdd) || rowToAdd < 1) {
    return(df)
  }

  # Build the whole zero block once instead of calling rbind() per row.
  zeros <- as.data.frame(matrix(0, nrow = rowToAdd, ncol = ncol(df)))
  names(zeros) <- names(df)
  rbind(df, zeros)
}
Get the data science job information for 500 posted jobs, in batches of 100 posts.
# Scrape the search results page by page and accumulate one vector per field.
# NOTE(review): getform(), submit_form2() and `session` are not defined in
# this section; presumably they are set up earlier in the file (confirm).
# Set the maximum job results to get from indeed
maxResults<-500
# Set the job results per page
nResultsPerpage<-100
# Accumulators; each one is extended with c() once per results page
Salary<-NULL
JobTitle<-NULL
Company<-NULL
Location<-NULL
JobDescription<-NULL
# IndeedLink holds plain URLs, getLink holds markdown "[Link](url)" strings
IndeedLink<-NULL
getLink<-NULL
# With the settings above i takes the values 0, 100, 200, 300, 400
for(i in seq(0, (maxResults-nResultsPerpage), nResultsPerpage)){
# Build the search form for this page size and offset, then submit it
form<-getform(nResultsPerpage,i)
indeed_sessions <- submit_form2(session, form)
# salary (the second argument is the prefix pasted in front of each salary href)
temp_data<-getTheSalaries(indeed_sessions,"http://www.indeed.com/")
Salary<-c(Salary,temp_data)
# job title
temp_data<-getJobTitle(indeed_sessions)
JobTitle<-c(JobTitle,temp_data)
# company
temp_data<-getCompany(indeed_sessions)
Company<-c(Company,temp_data)
# location
temp_data<-getLocation(indeed_sessions)
Location<-c(Location,temp_data)
# job description
temp_data<-getJobDescription(indeed_sessions)
JobDescription<-c(JobDescription,temp_data)
# plain links (pretty = 0)
temp_data<-getIndeedLink(indeed_sessions,0)
IndeedLink<-c(IndeedLink,temp_data)
# markdown links (pretty = 1)
temp_data<-getIndeedLink(indeed_sessions,1)
getLink<-c(getLink,temp_data)
}
# Number of job links collected across all result pages
length(IndeedLink)
## [1] 500
Create a data frame with the available job posting information.
# Display version of the data: the last column uses getLink, i.e. the
# markdown "[Link](url)" strings
Indeeddf<-data.frame(JobTitle,Company,Salary,JobDescription,Location,getLink)
# Show the first rows as an HTML table
kable(head(Indeeddf),format = "html")
|
JobTitle
|
Company
|
Salary
|
JobDescription
|
Location
|
getLink
|
|
Analytics & Data Science Intern
|
BettermentLLC
|
30,000
|
Pursuing a Masters in Data Science or similar. We are looking for a highly capable and adaptable student to work closely with our Director of Analytics & Data…
|
New York, NY 10013 (Tribeca area)
|
Link
|
|
Data Scientist
|
MassMutualFinancialGroup
|
109,000
|
In this role, you will perform data-driven research, problem solving, and algorithm development through the systematic application of mathematics, statistics…
|
New York, NY
|
Link
|
|
Data Scientist Data Science
|
CorporateTechnology
|
109,000
|
JOB DESCRIPTION DATA SCIENTIST. Python, C, or C++ SQL, PostgreSQL Experience with data gathering, data wrangling, cleaning, transforming and development of…
|
New York, NY
|
Link
|
|
High Frequency Trading
|
TwoSigmaInvestments,LLC.
|
106,000
|
Applying tick-level data analysis and real-world trading experimentation to define strategy decision-making….
|
New York, NY
|
Link
|
|
Data Scientist
|
SapientGlobalMarkets
|
109,000
|
The Senior Associate Data Science role is to not only be. And objectives into data driven solutions. At Sapient Global Markets, we are quite literally on the….
|
New York, NY
|
Link
|
|
Data Science
|
Koko
|
113,000
|
Lead ALL aspects of data science at Koko. As the owner of Data Science for Koko, you’ll be in a position to profoundly shape our product and the future of our…
|
New York, NY 10032 (Washington Heights area)
|
Link
|
# Show the last rows of the same data frame as an HTML table
kable(tail(Indeeddf),format = "html")
|
|
JobTitle
|
Company
|
Salary
|
JobDescription
|
Location
|
getLink
|
|
495
|
Data Scientist Client Insights & Analytics
|
MorganStanley
|
190,000
|
Morgan Stanley is looking for an exceptional data scientist with deep experience to work with a team of data integrators, data scientists and application…
|
New York, NY 10032 (Washington Heights area)
|
Link
|
|
496
|
SENIOR DATA ANALYST
|
MountSinaiHealthSystem
|
119,000
|
Standardization, data enrichment operations, data validations, data security. May access data in the Data Warehouse, as….
|
New York, NY 10029 (Yorkville area)
|
Link
|
|
497
|
Senior Business Analyst
|
CONSUMERAFFAIRS
|
124,000
|
The operation of data processing hardware or consoles. A master’s degree in computer science from an accredited college and three years of progressively more…
|
Manhattan, NY
|
Link
|
|
498
|
Business Resiliency
|
JPMorganChase
|
139,000
|
The Resiliency Data Management System (RDMS). Owner of the RDMS, GENT platforms and associated data management structures….
|
Jersey City, NJ
|
Link
|
|
499
|
2017 Summer Intern
|
Healthfirst
|
40,000
|
Aspiring data scientist working towards an advanced degree in computer science, mathematics or economics, biostatistics or other quantitative discipline Fluency…
|
New York, NY
|
Link
|
|
500
|
Database DeveloperAnalyst
|
TAXI&LIMOUSINECOMMISSION
|
115,000
|
Appropriately parsing incoming raw data files. Assisting users with accessing complex data structures. Optimally loading said data based on access requirements….
|
New York, NY
|
Link
|
# Rebuild Indeeddf with the plain URLs (IndeedLink) in place of the markdown
# links (getLink)
Indeeddf<-data.frame(JobTitle,Company,Salary,JobDescription,Location,IndeedLink)
Extract the skill requirements, assigning “1” if a skill is mentioned in the posting and “0” if it is not.
# Visit every job link and build the 0/1 skills data frame (one column per skill)
Skillsdf<-extractMatchingStrings(IndeedLink)
# Show the first rows
kable(head(Skillsdf))
| 1 |
0 |
0 |
0 |
1 |
1 |
1 |
1 |
1 |
1 |
0 |
1 |
1 |
1 |
0 |
| 1 |
0 |
0 |
0 |
1 |
1 |
1 |
0 |
0 |
0 |
0 |
1 |
0 |
1 |
0 |
| 0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
| 0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
| 0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
| 0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
# Show the last rows of the skills data frame
kable(tail(Skillsdf))
| 483 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
| 484 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
| 485 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
| 486 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
| 487 |
1 |
0 |
0 |
0 |
0 |
0 |
1 |
1 |
0 |
0 |
0 |
1 |
0 |
1 |
1 |
| 488 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
# extractMatchingStrings() skips pages that raise an error, so Skillsdf can
# have fewer rows than Indeeddf; pad it with all-zero rows to the same height.
# The zero rows are appended at the end, so after padding row k of Skillsdf
# does not necessarily describe row k of Indeeddf.
# Note: this variable shadows base::diff() for the rest of the script.
diff<-nrow(Indeeddf)-nrow(Skillsdf)
if(diff>0){
Skillsdf<-replaceMissingRow(diff,Skillsdf)
}
Analysis
Plot the sum of every skill column to find the most in-demand skills.
# Number of postings that mention each skill (column totals of the 0/1 flags)
SkillsSum<-colSums(Skillsdf, na.rm = TRUE)
# Sort from most to least frequently mentioned
SkillsSum<-SkillsSum[order(-SkillsSum)]
# Print the sorted totals
SkillsSum
## py sql com design ml st hadoop r eng al
## 133 131 119 105 68 67 60 60 59 50
## spark strg idea pb shiny
## 48 37 31 10 0
# Horizontal bar chart of the sorted skill totals
barplot(SkillsSum, main="Skills Distribution", horiz=TRUE,space= 2.5, col ="lightblue", ylab = "Skills")

Store data
Write the data frame to a MySQL database schema
# Connect to the local MySQL schema "dscience" and write the table
# "data_science", replacing any existing table of that name (overwrite = TRUE).
# NOTE(review): `data_science` and `myPassword` are not defined in this
# section; confirm they are created earlier in the file.
# NOTE(review): no dbDisconnect(con) is visible in this section; confirm the
# connection is closed after the write.
drv = dbDriver("MySQL")
con <- dbConnect(drv, user='root', password = myPassword, dbname='dscience',host="localhost",client.flag=CLIENT_MULTI_STATEMENTS)
dbWriteTable(con,"data_science",data_science, overwrite = TRUE)
## [1] TRUE