knitr::opts_chunk$set(echo = TRUE)

Intorduction

This is a project for your entire class section to work on together, since being able to work effectively on a virtual team is a key “soft skill” for data scientists. Please note especially the requirement about making a presentation during our first meetup after the project is due.

Procedure

create functions to do the following;

Load Html Libraries

Load libraries for web scraping html pages

library(xml2)
library(rvest)
library(knitr)
library(XML)
library(bitops)
library(RCurl)
library(htmltab)
library(stringr)
library(curl)
library(DBI)
library(RMySQL)
library(tidyr)
library(dplyr)

Create functions to scrape the data using rvest library

Function to submit form

submit_form2 <- function(session, form){
  library(XML)
  url <- XML::getRelativeURL(form$url, session$url)
  url <- paste(url,'?',sep='')
  values <- as.vector(rvest:::submit_request(form)$values)
  att <- names(values)
  if (tail(att, n=1) == "NULL"){
    values <- values[1:length(values)-1]
    att <- att[1:length(att)-1]
  }
  q <- paste(att,values,sep='=')
  q <- paste(q, collapse = '&')
  q <- gsub(" ", "+", q)
  url <- paste(url, q, sep = '')
  html_session(url)
}

Function to concatenate a string into an html form:

# get the setion 
session <- html_session("http://www.indeed.com")
getform<-function(PageLimit,PageStart)
{
  query = "data science"
  loc = paste0("New York&limit=",PageLimit,"&start=",PageStart)
    myForm <- html_form(session)[[1]]
  myForm <- set_values(myForm, q = query, l = loc)
  
  return (myForm)
}

General purpose function to extract html text using CSS:

gethtmldata<-function(itemporp_Value,htmlSession)
{
  
  htmlnode<-paste0("[itemprop=",itemporp_Value,"]")
  htmldata<- htmlSession %>% 
  html_nodes(htmlnode) %>%
  html_text()
  

  return(htmldata)
}

Fuction to get Salary:

# Get Salary
getTheSalaries<-function(sessions,URLlink)
{
  salary_links <- html_nodes(sessions, css = "#resultsCol li:nth-child(2) a") %>% html_attr("href")
  salary_links <- paste(URLlink, salary_links, sep='')
  salaries <- lapply(salary_links, . %>% html() %>% html_nodes("#salary_display_table .salary") %>% html_text())
  salary <- unlist(salaries)
  salary<-gsub("\\$", "", salary)
  salary<-gsub("\\s", "", salary)
  salary[salary=="NoData"]<-"0"

  return(salary)
}

Function to get job title:

getJobTitle<-function(htmlSession)
{
    htmldata<- htmlSession %>% 
    html_nodes("[itemprop=title]") %>%
    html_text()
  
    htmldata<-sub(".*,","",htmldata)
    #htmldata<-gsub("\\s$", "",htmldata)
    htmldata<-sub('-.*',"",htmldata)
    htmldata<-gsub("\\s$", "",htmldata)
    htmldata <- gsub("/", "", htmldata, fixed=TRUE) 

  return(htmldata)
}

Function to get the Company:

getCompany<-function(htmlSession)
{

  htmldata<- htmlSession %>% 
    html_nodes("[itemprop=hiringOrganization]") %>%
    html_text()
  
  htmldata<-gsub("\n", "", htmldata)
  htmldata<-gsub("\\s", "", htmldata)
  
  return(htmldata)
}

Function to get the Location:

getLocation<-function(htmlSession)
{
  
  htmldata<- htmlSession %>% 
    html_nodes("[itemprop=addressLocality]") %>%
    html_text()

  return(htmldata)
}

Function to get the job description:

getJobDescription<-function(htmlSession)
{
  
  htmldata<- htmlSession %>% 
    html_nodes("[itemprop=description]") %>%
    html_text()
  
  htmldata<-gsub("\n", "", htmldata)

    return(htmldata)
}

Functions to find the skills and the job requirements based on data science skills and requirement needed

# extract design skills
extractDesignStrings <- function(passParm)
{
  
  temp1<-passParm %>% html_nodes(xpath='//li[contains(.,"modeling")]') %>% html_text()
  temp2<-passParm %>% html_nodes(xpath='//li[contains(.,"models")]') %>% html_text()
  
  if(identical(temp1,character(0)) && identical(temp2,character(0))){
    tempdata=0
  }else{
    tempdata=1
  }
  
  return(tempdata)
}

# extract all other skills
extractStrings<-function(passParm,stringTomatch)
{
  StringTomatch<-paste0("//li[contains(.,'",stringTomatch,"')]")
  temp<-passParm %>% html_nodes(xpath=StringTomatch) %>% html_text()

  if(identical(temp,character(0))){ 
    tempdata=0 
  }else{ 
    tempdata=1 
    }
  

  return(tempdata)
}


# apply the two previous functions to find 15 data science skills and store them into a dataframe
extractMatchingStrings<-function(URLlinkArray)
{
  r<-NULL; py<-NULL
  hadoop<-NULL; sql<-NULL
  shiny<-NULL; spark<-NULL
  pb<-NULL; st<-NULL
  ml<-NULL; al<-NULL
  eng<-NULL; com<-NULL
  strg<-NULL; idea<-NULL
  design<-NULL
  

lengthOfString=length(URLlinkArray)+1

  for(i in 1:lengthOfString) 
  {
    tryCatch(
      {
        url <- URLlinkArray[i]
        tmp <- url %>% read_html 
        # design Skills
        design <- c(design,extractDesignStrings(tmp))

        # idea requirement
        idea <- c(idea,extractStrings(tmp,"ideas"))

        # strategy requirement 
        strg <- c(strg,extractStrings(tmp,"strategy"))

        # engineering requirement 
        eng <- c(eng,extractStrings(tmp,"engineering"))

        # communication requiremen
        com <- c(com,extractStrings(tmp,"communication"))

        # algorithms requirement 
        al <- c(al,extractStrings(tmp,"algorithms"))

        # machine learning requirement
        ml <- c(ml,extractStrings(tmp,"machine learning"))

        # statistics requirement 
        st <- c(st,extractStrings(tmp,"statistics"))

        # probability requirement 
        pb <- c(pb,extractStrings(tmp,"probability"))

        # spark programming skills
        spark <- c(spark,extractStrings(tmp,"Spark"))

        # shiny programming skills
        shiny <- c(shiny,extractStrings(tmp,"Shiny"))

        # sql programming skills
        sql <- c(sql,extractStrings(tmp,"SQL"))

        # hadoop programming Skills
        hadoop <- c(hadoop,extractStrings(tmp,"Hadoop"))

        # py programming Skills
        py <- c(py,extractStrings(tmp,"Python"))

        # r programming Skills
        r <- c(r,extractStrings(tmp,"R,"))

        

      }, error=function(e){})
  }
  
  tempdf<-data.frame(design,idea,strg,eng,com,al,ml,st,pb,spark,shiny,sql,hadoop,py,r)
  
  return(tempdf)
}

# replace eliminated raws due to NA by zeros to be able join dataframe
replaceMissingRow<-function(rowToAdd,df)
{
  newrow<-vector('numeric',15)
  for(i in 1:rowToAdd){
    df= rbind(df,newrow)
  }
  return(df)
}

Get the data science Job information for 500 posted jobs with patch of 100 posts.

# Set the maximum job results to get from indeed
maxResults<-500

# Set the job results per page
nResultsPerpage<-100

Salary<-NULL
JobTitle<-NULL
Company<-NULL
Location<-NULL
JobDescription<-NULL
IndeedLink<-NULL
getLink<-NULL

for(i in seq(0, (maxResults-nResultsPerpage), nResultsPerpage)){
  
form<-getform(nResultsPerpage,i)
indeed_sessions <- submit_form2(session, form)

# salary
temp_data<-getTheSalaries(indeed_sessions,"http://www.indeed.com/")
Salary<-c(Salary,temp_data)

temp_data<-getJobTitle(indeed_sessions)
JobTitle<-c(JobTitle,temp_data)

temp_data<-getCompany(indeed_sessions)
Company<-c(Company,temp_data)

temp_data<-getLocation(indeed_sessions)
Location<-c(Location,temp_data)

temp_data<-getJobDescription(indeed_sessions)
JobDescription<-c(JobDescription,temp_data)

temp_data<-getIndeedLink(indeed_sessions,0)
IndeedLink<-c(IndeedLink,temp_data)

temp_data<-getIndeedLink(indeed_sessions,1)
getLink<-c(getLink,temp_data)
}
# the length of the data
length(IndeedLink)
## [1] 500

create a data frame with the available job posting information

Indeeddf<-data.frame(JobTitle,Company,Salary,JobDescription,Location,getLink)
kable(head(Indeeddf),format = "html")
JobTitle Company Salary JobDescription Location getLink
Analytics & Data Science Intern BettermentLLC 30,000 Pursuing a Masters in Data Science or similar. We are looking for a highly capable and adaptable student to work closely with our Director of Analytics & Data… New York, NY 10013 (Tribeca area) Link
Data Scientist MassMutualFinancialGroup 109,000 In this role, you will perform data-driven research, problem solving, and algorithm development through the systematic application of mathematics, statistics… New York, NY Link
Data Scientist Data Science CorporateTechnology 109,000 JOB DESCRIPTION – DATA SCIENTIST. Python, C, or C++ SQL, PostgreSQL Experience with data gathering, data wrangling, cleaning, transforming and development of… New York, NY Link
High Frequency Trading TwoSigmaInvestments,LLC. 106,000 Applying tick-level data analysis and real-world trading experimentation to define strategy decision-making…. New York, NY Link
Data Scientist SapientGlobalMarkets 109,000 The Senior Associate Data Science role is to not only be. And objectives into data driven solutions. At Sapient Global Markets, we are quite literally on the…. New York, NY Link
Data Science Koko 113,000 Lead ALL aspects of data science at Koko. As the owner of Data Science for Koko, you’ll be in a position to profoundly shape our product and the future of our… New York, NY 10032 (Washington Heights area) Link
kable(tail(Indeeddf),format = "html")
JobTitle Company Salary JobDescription Location getLink
495 Data Scientist – Client Insights & Analytics MorganStanley 190,000 Morgan Stanley is looking for an exceptional data scientist with deep experience to work with a team of data integrators, data scientists and application… New York, NY 10032 (Washington Heights area) Link
496 SENIOR DATA ANALYST MountSinaiHealthSystem 119,000 Standardization, data enrichment operations, data validations, data security. May access data in the Data Warehouse, as…. New York, NY 10029 (Yorkville area) Link
497 Senior Business Analyst CONSUMERAFFAIRS 124,000 The operation of data processing hardware or consoles. A master’s degree in computer science from an accredited college and three years of progressively more… Manhattan, NY Link
498 Business Resiliency JPMorganChase 139,000 The Resiliency Data Management System (RDMS). Owner of the RDMS, GENT platforms and associated data management structures…. Jersey City, NJ Link
499 2017 Summer Intern Healthfirst 40,000 Aspiring data scientist working towards an advanced degree in computer science, mathematics or economics, biostatistics or other quantitative discipline Fluency… New York, NY Link
500 Database DeveloperAnalyst TAXI&LIMOUSINECOMMISSION 115,000 Appropriately parsing incoming raw data files. Assisting users with accessing complex data structures. Optimally loading said data based on access requirements…. New York, NY Link
Indeeddf<-data.frame(JobTitle,Company,Salary,JobDescription,Location,IndeedLink)

Extract skills requirement and assign “1” if skills posted and “0” if not posted.

Skillsdf<-extractMatchingStrings(IndeedLink)
kable(head(Skillsdf))
design idea strg eng com al ml st pb spark shiny sql hadoop py r
1 0 0 0 1 1 1 1 1 1 0 1 1 1 0
1 0 0 0 1 1 1 0 0 0 0 1 0 1 0
0 0 1 0 0 0 0 0 0 0 0 0 0 1 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
kable(tail(Skillsdf))
design idea strg eng com al ml st pb spark shiny sql hadoop py r
483 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
484 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
485 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
486 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
487 1 0 0 0 0 0 1 1 0 0 0 1 0 1 1
488 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
diff<-nrow(Indeeddf)-nrow(Skillsdf)
if(diff>0){
Skillsdf<-replaceMissingRow(diff,Skillsdf)
}

Analysis

Plot the sum of every skills column to find the most demanded skills

SkillsSum<-colSums(Skillsdf, na.rm = TRUE)
SkillsSum<-SkillsSum[order(-SkillsSum)]
SkillsSum
##     py    sql    com design     ml     st hadoop      r    eng     al 
##    133    131    119    105     68     67     60     60     59     50 
##  spark   strg   idea     pb  shiny 
##     48     37     31     10      0
barplot(SkillsSum, main="Skills Distribution", horiz=TRUE,space= 2.5, col ="lightblue", ylab = "Skills")

Store data

Combine the information and the skills dataframe

# bind dataframes
data_science<-cbind(Indeeddf,Skillsdf)

# name columns
colnames(data_science) <- c("Title","Company","Salary","Description","Location","Link","Design","Idea","Strategy",
                            "Engineering","Communication","Algorithm","Machine_Learning","Statistic","Probability","Spark","Shiny","Sql","Hadoop","Python","R")


kable(head(data_science),format = "html")
Title Company Salary Description Location Link Design Idea Strategy Engineering Communication Algorithm Machine_Learning Statistic Probability Spark Shiny Sql Hadoop Python R
Analytics & Data Science Intern BettermentLLC 30,000 Pursuing a Masters in Data Science or similar. We are looking for a highly capable and adaptable student to work closely with our Director of Analytics & Data… New York, NY 10013 (Tribeca area) https://www.indeed.com/rc/clk?jk=175187f69b6dffe8&fccid=8cf32c3f09e899f7 1 0 0 0 1 1 1 1 1 1 0 1 1 1 0
Data Scientist MassMutualFinancialGroup 109,000 In this role, you will perform data-driven research, problem solving, and algorithm development through the systematic application of mathematics, statistics… New York, NY https://www.indeed.com/rc/clk?jk=81f97ec28cc30ed1&fccid=acf786d7ca32d66d 1 0 0 0 1 1 1 0 0 0 0 1 0 1 0
Data Scientist Data Science CorporateTechnology 109,000 JOB DESCRIPTION – DATA SCIENTIST. Python, C, or C++ SQL, PostgreSQL Experience with data gathering, data wrangling, cleaning, transforming and development of… New York, NY https://www.indeed.com/rc/clk?jk=2c63b1fa0b8cdfe5&fccid=02eb5ab96a4e158f 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0
High Frequency Trading TwoSigmaInvestments,LLC. 106,000 Applying tick-level data analysis and real-world trading experimentation to define strategy decision-making…. New York, NY https://www.indeed.com/rc/clk?jk=9cd4de9cbbb8956d&fccid=567a0cc944322669 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Data Scientist SapientGlobalMarkets 109,000 The Senior Associate Data Science role is to not only be. And objectives into data driven solutions. At Sapient Global Markets, we are quite literally on the…. New York, NY https://www.indeed.com/rc/clk?jk=35b1f290ec162233&fccid=42acc3216fdd8871 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Data Science Koko 113,000 Lead ALL aspects of data science at Koko. As the owner of Data Science for Koko, you’ll be in a position to profoundly shape our product and the future of our… New York, NY 10032 (Washington Heights area) https://www.indeed.com/rc/clk?jk=0808a96dd97470f4&fccid=a6950248401dbeeb 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
kable(tail(data_science),format = "html")
Title Company Salary Description Location Link Design Idea Strategy Engineering Communication Algorithm Machine_Learning Statistic Probability Spark Shiny Sql Hadoop Python R
495 Data Scientist – Client Insights & Analytics MorganStanley 190,000 Morgan Stanley is looking for an exceptional data scientist with deep experience to work with a team of data integrators, data scientists and application… New York, NY 10032 (Washington Heights area) https://www.indeed.com/rc/clk?jk=902c2bf4a0198c35&fccid=0c39fb2c91742dcf 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
496 SENIOR DATA ANALYST MountSinaiHealthSystem 119,000 Standardization, data enrichment operations, data validations, data security. May access data in the Data Warehouse, as…. New York, NY 10029 (Yorkville area) https://www.indeed.com/rc/clk?jk=35ed103a21348d79&fccid=c007936ceb766fe5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
497 Senior Business Analyst CONSUMERAFFAIRS 124,000 The operation of data processing hardware or consoles. A master’s degree in computer science from an accredited college and three years of progressively more… Manhattan, NY https://www.indeed.com/rc/clk?jk=67535c9f2abe90c4&fccid=cd1b7cbd1a92d087 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
498 Business Resiliency JPMorganChase 139,000 The Resiliency Data Management System (RDMS). Owner of the RDMS, GENT platforms and associated data management structures…. Jersey City, NJ https://www.indeed.com/rc/clk?jk=42d89f37d503fd12&fccid=c46d0116f6e69eae 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
499 2017 Summer Intern Healthfirst 40,000 Aspiring data scientist working towards an advanced degree in computer science, mathematics or economics, biostatistics or other quantitative discipline Fluency… New York, NY https://www.indeed.com/rc/clk?jk=4bc0866a99c0aafb&fccid=33b2d0072564c18e 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
500 Database DeveloperAnalyst TAXI&LIMOUSINECOMMISSION 115,000 Appropriately parsing incoming raw data files. Assisting users with accessing complex data structures. Optimally loading said data based on access requirements…. New York, NY https://www.indeed.com/rc/clk?jk=a595199b2c991474&fccid=72d17c8e1b4a2d6f 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# clean data from unvalid utf8 error
data_science$Company<- iconv(data_science$Company, "latin1", "UTF-8")
data_science$Description<- iconv(data_science$Description, "latin1", "UTF-8")
data_science$Description<- iconv(data_science$Description, "latin1", "UTF-8")
data_science$Title<- iconv(data_science$Title, "latin1", "UTF-8")

Write the data frame to a MySQL database schema

drv = dbDriver("MySQL")
con <- dbConnect(drv, user='root', password = myPassword, dbname='dscience',host="localhost",client.flag=CLIENT_MULTI_STATEMENTS)

dbWriteTable(con,"data_science",data_science, overwrite = TRUE) 
## [1] TRUE

Conclusion

The data for data science jobs were scraped form indeed web pages using rvest and hlmt. Data was separated, cleaned, stored into data frame and then load them into MySQL, the relational database.
The plot shows that Python, SQL, design, and communication are the most needed skills in data sciense job.