project 3

knitr::opts_chunk$set(echo = TRUE)

Intorduction

This is a project for your entire class section to work on together, since being able to work effectively on a virtual team is a key “soft skill” for data scientists. Please note especially the requirement about making a presentation during our first meetup after the project is due.

You will need to determine what tool(s) you’ll use as a group to effectively collaborate, share code and any project documentation (such as motivation, approach, findings).
You will have to determine what data to collect, where the data can be found, and how to load it.
The data that you decide to collect should reside in a relational database, in a set of normalized tables.
You should perform any needed tidying, transformations, and exploratory data analysis in R.
Your deliverable should include all code, results, and documentation of your motivation, approach, and findings.
As a group, you should appoint (at least) three people to lead parts of the presentation.
While you are strongly encouraged (and will hopefully find it fun) to try out statistics and data models, your grade will not be affected by the statistical analysis and modeling performed (since this is a semester one course on Data Acquisition and Management).
Every student must be prepared to explain how the data was collected, loaded, transformed, tidied, and analyzed for outliers, etc. in our Meetup. This is the only way I’ll have to determine that everyone actively participated in the process, so you need to hold yourself responsible for understanding what your class-size team did! If you are unable to attend the meet up, then you need to either present to me one-on-one before the meetup presentation, or post a 3 to 5 minute video (e.g. on YouTube) explaining the process. Individual students will not be responsible for explaining any forays into statistical analysis, modeling, data mining, regression, decision trees, etc.

Procedure

create functions to do the following;

Scrape data science jobs from indeed.com
Clean and select useful information
Gather collected data science job info into data frame
Load data frame to MySQL
analysis

Load Html Libraries

Load libraries for web scraping html pages

library(xml2)
library(rvest)
library(knitr)
library(XML)
library(bitops)
library(RCurl)
library(htmltab)
library(stringr)
library(curl)
library(DBI)
library(RMySQL)
library(tidyr)
library(dplyr)

Create functions to scrape the data using rvest library

Function to submit form

submit_form2 <- function(session, form){
  library(XML)
  url <- XML::getRelativeURL(form$url, session$url)
  url <- paste(url,'?',sep='')
  values <- as.vector(rvest:::submit_request(form)$values)
  att <- names(values)
  if (tail(att, n=1) == "NULL"){
    values <- values[1:length(values)-1]
    att <- att[1:length(att)-1]
  }
  q <- paste(att,values,sep='=')
  q <- paste(q, collapse = '&')
  q <- gsub(" ", "+", q)
  url <- paste(url, q, sep = '')
  html_session(url)
}

Function to concatenate a string into an html form:

# get the setion 
session <- html_session("http://www.indeed.com")
getform<-function(PageLimit,PageStart)
{
  query = "data science"
  loc = paste0("New York&limit=",PageLimit,"&start=",PageStart)
    myForm <- html_form(session)[[1]]
  myForm <- set_values(myForm, q = query, l = loc)
  
  return (myForm)
}

General purpose function to extract html text using CSS:

gethtmldata<-function(itemporp_Value,htmlSession)
{
  
  htmlnode<-paste0("[itemprop=",itemporp_Value,"]")
  htmldata<- htmlSession %>% 
  html_nodes(htmlnode) %>%
  html_text()
  

  return(htmldata)
}

Fuction to get Salary:

# Get Salary
getTheSalaries<-function(sessions,URLlink)
{
  salary_links <- html_nodes(sessions, css = "#resultsCol li:nth-child(2) a") %>% html_attr("href")
  salary_links <- paste(URLlink, salary_links, sep='')
  salaries <- lapply(salary_links, . %>% html() %>% html_nodes("#salary_display_table .salary") %>% html_text())
  salary <- unlist(salaries)
  salary<-gsub("\\$", "", salary)
  salary<-gsub("\\s", "", salary)
  salary[salary=="NoData"]<-"0"

  return(salary)
}

Function to get job title:

getJobTitle<-function(htmlSession)
{
    htmldata<- htmlSession %>% 
    html_nodes("[itemprop=title]") %>%
    html_text()
  
    htmldata<-sub(".*,","",htmldata)
    #htmldata<-gsub("\\s$", "",htmldata)
    htmldata<-sub('-.*',"",htmldata)
    htmldata<-gsub("\\s$", "",htmldata)
    htmldata <- gsub("/", "", htmldata, fixed=TRUE) 

  return(htmldata)
}

Function to get the Company:

getCompany<-function(htmlSession)
{

  htmldata<- htmlSession %>% 
    html_nodes("[itemprop=hiringOrganization]") %>%
    html_text()
  
  htmldata<-gsub("\n", "", htmldata)
  htmldata<-gsub("\\s", "", htmldata)
  
  return(htmldata)
}

Function to get the Location:

getLocation<-function(htmlSession)
{
  
  htmldata<- htmlSession %>% 
    html_nodes("[itemprop=addressLocality]") %>%
    html_text()

  return(htmldata)
}

Function to get the job description:

getJobDescription<-function(htmlSession)
{
  
  htmldata<- htmlSession %>% 
    html_nodes("[itemprop=description]") %>%
    html_text()
  
  htmldata<-gsub("\n", "", htmldata)

    return(htmldata)
}

Function to get the indeed job link:

getIndeedLink<-function(htmlSession,pretty)
{

 htmldata<- htmlSession %>% 
    html_nodes("[itemprop=title]")%>%
    html_attr("href")

  links<- paste('https://www.indeed.com', htmldata, sep='')
  htmldata <- paste('[Link](https://www.indeed.com', htmldata, sep='')
  htmldata <- paste(htmldata, ')', sep='')
  
  if(pretty==1){
    return(htmldata)
  }
  else 
  {
    return(links)
    
  }
}

Functions to find the skills and the job requirements based on data science skills and requirement needed

# extract design skills
extractDesignStrings <- function(passParm)
{
  
  temp1<-passParm %>% html_nodes(xpath='//li[contains(.,"modeling")]') %>% html_text()
  temp2<-passParm %>% html_nodes(xpath='//li[contains(.,"models")]') %>% html_text()
  
  if(identical(temp1,character(0)) && identical(temp2,character(0))){
    tempdata=0
  }else{
    tempdata=1
  }
  
  return(tempdata)
}

# extract all other skills
extractStrings<-function(passParm,stringTomatch)
{
  StringTomatch<-paste0("//li[contains(.,'",stringTomatch,"')]")
  temp<-passParm %>% html_nodes(xpath=StringTomatch) %>% html_text()

  if(identical(temp,character(0))){ 
    tempdata=0 
  }else{ 
    tempdata=1 
    }
  

  return(tempdata)
}


# apply the two previous functions to find 15 data science skills and store them into a dataframe
extractMatchingStrings<-function(URLlinkArray)
{
  r<-NULL; py<-NULL
  hadoop<-NULL; sql<-NULL
  shiny<-NULL; spark<-NULL
  pb<-NULL; st<-NULL
  ml<-NULL; al<-NULL
  eng<-NULL; com<-NULL
  strg<-NULL; idea<-NULL
  design<-NULL
  

lengthOfString=length(URLlinkArray)+1

  for(i in 1:lengthOfString) 
  {
    tryCatch(
      {
        url <- URLlinkArray[i]
        tmp <- url %>% read_html 
        # design Skills
        design <- c(design,extractDesignStrings(tmp))

        # idea requirement
        idea <- c(idea,extractStrings(tmp,"ideas"))

        # strategy requirement 
        strg <- c(strg,extractStrings(tmp,"strategy"))

        # engineering requirement 
        eng <- c(eng,extractStrings(tmp,"engineering"))

        # communication requiremen
        com <- c(com,extractStrings(tmp,"communication"))

        # algorithms requirement 
        al <- c(al,extractStrings(tmp,"algorithms"))

        # machine learning requirement
        ml <- c(ml,extractStrings(tmp,"machine learning"))

        # statistics requirement 
        st <- c(st,extractStrings(tmp,"statistics"))

        # probability requirement 
        pb <- c(pb,extractStrings(tmp,"probability"))

        # spark programming skills
        spark <- c(spark,extractStrings(tmp,"Spark"))

        # shiny programming skills
        shiny <- c(shiny,extractStrings(tmp,"Shiny"))

        # sql programming skills
        sql <- c(sql,extractStrings(tmp,"SQL"))

        # hadoop programming Skills
        hadoop <- c(hadoop,extractStrings(tmp,"Hadoop"))

        # py programming Skills
        py <- c(py,extractStrings(tmp,"Python"))

        # r programming Skills
        r <- c(r,extractStrings(tmp,"R,"))

        

      }, error=function(e){})
  }
  
  tempdf<-data.frame(design,idea,strg,eng,com,al,ml,st,pb,spark,shiny,sql,hadoop,py,r)
  
  return(tempdf)
}

# replace eliminated raws due to NA by zeros to be able join dataframe
replaceMissingRow<-function(rowToAdd,df)
{
  newrow<-vector('numeric',15)
  for(i in 1:rowToAdd){
    df= rbind(df,newrow)
  }
  return(df)
}

Get the data science Job information for 500 posted jobs with patch of 100 posts.

# Set the maximum job results to get from indeed
maxResults<-500

# Set the job results per page
nResultsPerpage<-100

Salary<-NULL
JobTitle<-NULL
Company<-NULL
Location<-NULL
JobDescription<-NULL
IndeedLink<-NULL
getLink<-NULL

for(i in seq(0, (maxResults-nResultsPerpage), nResultsPerpage)){
  
form<-getform(nResultsPerpage,i)
indeed_sessions <- submit_form2(session, form)

# salary
temp_data<-getTheSalaries(indeed_sessions,"http://www.indeed.com/")
Salary<-c(Salary,temp_data)

temp_data<-getJobTitle(indeed_sessions)
JobTitle<-c(JobTitle,temp_data)

temp_data<-getCompany(indeed_sessions)
Company<-c(Company,temp_data)

temp_data<-getLocation(indeed_sessions)
Location<-c(Location,temp_data)

temp_data<-getJobDescription(indeed_sessions)
JobDescription<-c(JobDescription,temp_data)

temp_data<-getIndeedLink(indeed_sessions,0)
IndeedLink<-c(IndeedLink,temp_data)

temp_data<-getIndeedLink(indeed_sessions,1)
getLink<-c(getLink,temp_data)
}
# the length of the data
length(IndeedLink)

## [1] 500

create a data frame with the available job posting information

Indeeddf<-data.frame(JobTitle,Company,Salary,JobDescription,Location,getLink)
kable(head(Indeeddf),format = "html")

JobTitle	Company	Salary	JobDescription	Location	getLink
Analytics & Data Science Intern	BettermentLLC	30,000	Pursuing a Masters in Data Science or similar. We are looking for a highly capable and adaptable student to work closely with our Director of Analytics & Data…	New York, NY 10013 (Tribeca area)	Link
Data Scientist	MassMutualFinancialGroup	109,000	In this role, you will perform data-driven research, problem solving, and algorithm development through the systematic application of mathematics, statistics…	New York, NY	Link
Data Scientist Data Science	CorporateTechnology	109,000	JOB DESCRIPTION DATA SCIENTIST. Python, C, or C++ SQL, PostgreSQL Experience with data gathering, data wrangling, cleaning, transforming and development of…	New York, NY	Link
High Frequency Trading	TwoSigmaInvestments,LLC.	106,000	Applying tick-level data analysis and real-world trading experimentation to define strategy decision-making….	New York, NY	Link
Data Scientist	SapientGlobalMarkets	109,000	The Senior Associate Data Science role is to not only be. And objectives into data driven solutions. At Sapient Global Markets, we are quite literally on the….	New York, NY	Link
Data Science	Koko	113,000	Lead ALL aspects of data science at Koko. As the owner of Data Science for Koko, you’ll be in a position to profoundly shape our product and the future of our…	New York, NY 10032 (Washington Heights area)	Link

kable(tail(Indeeddf),format = "html")

	JobTitle	Company	Salary	JobDescription	Location	getLink
495	Data Scientist Client Insights & Analytics	MorganStanley	190,000	Morgan Stanley is looking for an exceptional data scientist with deep experience to work with a team of data integrators, data scientists and application…	New York, NY 10032 (Washington Heights area)	Link
496	SENIOR DATA ANALYST	MountSinaiHealthSystem	119,000	Standardization, data enrichment operations, data validations, data security. May access data in the Data Warehouse, as….	New York, NY 10029 (Yorkville area)	Link
497	Senior Business Analyst	CONSUMERAFFAIRS	124,000	The operation of data processing hardware or consoles. A master’s degree in computer science from an accredited college and three years of progressively more…	Manhattan, NY	Link
498	Business Resiliency	JPMorganChase	139,000	The Resiliency Data Management System (RDMS). Owner of the RDMS, GENT platforms and associated data management structures….	Jersey City, NJ	Link
499	2017 Summer Intern	Healthfirst	40,000	Aspiring data scientist working towards an advanced degree in computer science, mathematics or economics, biostatistics or other quantitative discipline Fluency…	New York, NY	Link
500	Database DeveloperAnalyst	TAXI&LIMOUSINECOMMISSION	115,000	Appropriately parsing incoming raw data files. Assisting users with accessing complex data structures. Optimally loading said data based on access requirements….	New York, NY	Link

Indeeddf<-data.frame(JobTitle,Company,Salary,JobDescription,Location,IndeedLink)

Extract skills requirement and assign “1” if skills posted and “0” if not posted.

Skillsdf<-extractMatchingStrings(IndeedLink)
kable(head(Skillsdf))

design	strg	com	al	ml	st	pb	spark	sql	hadoop	py
1	0	1	1	1	1	1	1	1	1	1
1	0	1	1	1	0	0	0	1	0	1
0	1	0	0	0	0	0	0	0	0	1
0	0	0	0	0	0	0	0	0	0	0
0	0	0	0	0	0	0	0	0	0	0
0	0	0	0	0	0	0	0	0	0	0

kable(tail(Skillsdf))

	design	ml	st	sql	py	r
483	0	0	0	0	0	0
484	0	0	0	0	0	0
485	0	0	0	0	0	0
486	0	0	0	0	0	0
487	1	1	1	1	1	1
488	0	0	0	0	0	0

diff<-nrow(Indeeddf)-nrow(Skillsdf)
if(diff>0){
Skillsdf<-replaceMissingRow(diff,Skillsdf)
}

Analysis

Plot the sum of every skills column to find the most demanded skills

SkillsSum<-colSums(Skillsdf, na.rm = TRUE)
SkillsSum<-SkillsSum[order(-SkillsSum)]
SkillsSum

##     py    sql    com design     ml     st hadoop      r    eng     al 
##    133    131    119    105     68     67     60     60     59     50 
##  spark   strg   idea     pb  shiny 
##     48     37     31     10      0

barplot(SkillsSum, main="Skills Distribution", horiz=TRUE,space= 2.5, col ="lightblue", ylab = "Skills")

Store data

Combine the information and the skills dataframe

# bind dataframes
data_science<-cbind(Indeeddf,Skillsdf)

# name columns
colnames(data_science) <- c("Title","Company","Salary","Description","Location","Link","Design","Idea","Strategy",
                            "Engineering","Communication","Algorithm","Machine_Learning","Statistic","Probability","Spark","Shiny","Sql","Hadoop","Python","R")


kable(head(data_science),format = "html")

Title	Company	Salary	Description	Location	Link	Design	Strategy	Communication	Algorithm	Machine_Learning	Statistic	Probability	Spark	Sql	Hadoop	Python
Analytics & Data Science Intern	BettermentLLC	30,000	Pursuing a Masters in Data Science or similar. We are looking for a highly capable and adaptable student to work closely with our Director of Analytics & Data…	New York, NY 10013 (Tribeca area)	https://www.indeed.com/rc/clk?jk=175187f69b6dffe8&fccid=8cf32c3f09e899f7	1	0	1	1	1	1	1	1	1	1	1
Data Scientist	MassMutualFinancialGroup	109,000	In this role, you will perform data-driven research, problem solving, and algorithm development through the systematic application of mathematics, statistics…	New York, NY	https://www.indeed.com/rc/clk?jk=81f97ec28cc30ed1&fccid=acf786d7ca32d66d	1	0	1	1	1	0	0	0	1	0	1
Data Scientist Data Science	CorporateTechnology	109,000	JOB DESCRIPTION DATA SCIENTIST. Python, C, or C++ SQL, PostgreSQL Experience with data gathering, data wrangling, cleaning, transforming and development of…	New York, NY	https://www.indeed.com/rc/clk?jk=2c63b1fa0b8cdfe5&fccid=02eb5ab96a4e158f	0	1	0	0	0	0	0	0	0	0	1
High Frequency Trading	TwoSigmaInvestments,LLC.	106,000	Applying tick-level data analysis and real-world trading experimentation to define strategy decision-making….	New York, NY	https://www.indeed.com/rc/clk?jk=9cd4de9cbbb8956d&fccid=567a0cc944322669	0	0	0	0	0	0	0	0	0	0	0
Data Scientist	SapientGlobalMarkets	109,000	The Senior Associate Data Science role is to not only be. And objectives into data driven solutions. At Sapient Global Markets, we are quite literally on the….	New York, NY	https://www.indeed.com/rc/clk?jk=35b1f290ec162233&fccid=42acc3216fdd8871	0	0	0	0	0	0	0	0	0	0	0
Data Science	Koko	113,000	Lead ALL aspects of data science at Koko. As the owner of Data Science for Koko, you’ll be in a position to profoundly shape our product and the future of our…	New York, NY 10032 (Washington Heights area)	https://www.indeed.com/rc/clk?jk=0808a96dd97470f4&fccid=a6950248401dbeeb	0	0	0	0	0	0	0	0	0	0	0

kable(tail(data_science),format = "html")

	Title	Company	Salary	Description	Location	Link
495	Data Scientist Client Insights & Analytics	MorganStanley	190,000	Morgan Stanley is looking for an exceptional data scientist with deep experience to work with a team of data integrators, data scientists and application…	New York, NY 10032 (Washington Heights area)	https://www.indeed.com/rc/clk?jk=902c2bf4a0198c35&fccid=0c39fb2c91742dcf
496	SENIOR DATA ANALYST	MountSinaiHealthSystem	119,000	Standardization, data enrichment operations, data validations, data security. May access data in the Data Warehouse, as….	New York, NY 10029 (Yorkville area)	https://www.indeed.com/rc/clk?jk=35ed103a21348d79&fccid=c007936ceb766fe5
497	Senior Business Analyst	CONSUMERAFFAIRS	124,000	The operation of data processing hardware or consoles. A master’s degree in computer science from an accredited college and three years of progressively more…	Manhattan, NY	https://www.indeed.com/rc/clk?jk=67535c9f2abe90c4&fccid=cd1b7cbd1a92d087
498	Business Resiliency	JPMorganChase	139,000	The Resiliency Data Management System (RDMS). Owner of the RDMS, GENT platforms and associated data management structures….	Jersey City, NJ	https://www.indeed.com/rc/clk?jk=42d89f37d503fd12&fccid=c46d0116f6e69eae
499	2017 Summer Intern	Healthfirst	40,000	Aspiring data scientist working towards an advanced degree in computer science, mathematics or economics, biostatistics or other quantitative discipline Fluency…	New York, NY	https://www.indeed.com/rc/clk?jk=4bc0866a99c0aafb&fccid=33b2d0072564c18e
500	Database DeveloperAnalyst	TAXI&LIMOUSINECOMMISSION	115,000	Appropriately parsing incoming raw data files. Assisting users with accessing complex data structures. Optimally loading said data based on access requirements….	New York, NY	https://www.indeed.com/rc/clk?jk=a595199b2c991474&fccid=72d17c8e1b4a2d6f

# clean data from unvalid utf8 error
data_science$Company<- iconv(data_science$Company, "latin1", "UTF-8")
data_science$Description<- iconv(data_science$Description, "latin1", "UTF-8")
data_science$Description<- iconv(data_science$Description, "latin1", "UTF-8")
data_science$Title<- iconv(data_science$Title, "latin1", "UTF-8")

Write the data frame to a MySQL database schema

drv = dbDriver("MySQL")
con <- dbConnect(drv, user='root', password = myPassword, dbname='dscience',host="localhost",client.flag=CLIENT_MULTI_STATEMENTS)

dbWriteTable(con,"data_science",data_science, overwrite = TRUE)

## [1] TRUE

Conclusion

The data for data science jobs were scraped form indeed web pages using rvest and hlmt. Data was separated, cleaned, stored into data frame and then load them into MySQL, the relational database.
The plot shows that Python, SQL, design, and communication are the most needed skills in data sciense job.