suppressPackageStartupMessages(library('XML'))
suppressPackageStartupMessages(library('rvest'))
suppressPackageStartupMessages(library('tidyr'))
suppressPackageStartupMessages(library('ggplot2'))
# Check if the package is installed. If not, install the package
if(!require('XML')) {
  install.packages('XML')
  library(XML)
#Loading XML package
}
# Check if the package is installed. If not, install the package
if(!require('rvest')) {
  install.packages('rvest')
  library(rvest)
#Loading rvest package
}
# Check if the package is installed. If not, install the package
if(!require('stringr')) {
  install.packages('stringr')
  library(stringr)
#Loading stringr package
}
# Checking if packages needed are installed. If not, install the package
if(!require('tidyr')) {
  install.packages('tidyr')
  library(tidyr)
#Loading tidyr package
}
# Checking if packages needed are installed. If not, install the package
if(!require('ggplot2')) {
  install.packages('ggplot2')
  library(ggplot2)
#Loading ggplot2 package
}
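
The same install-and-load check is repeated for every package above; a more compact sketch of the same idea, looping over the package names, could look like this:

# Install (if missing) and load every package used in this analysis
pkgs <- c("XML", "rvest", "stringr", "tidyr", "ggplot2")
for (p in pkgs) {
  if (!require(p, character.only = TRUE)) {
    install.packages(p)
    library(p, character.only = TRUE)
  }
}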

Data Science Skills

W. Edwards Deming said, “In God we trust, all others must bring data.” Please use data to answer the question “Which skills are the most valued data science skills?” Consider your work an exploration; there is not necessarily a “right answer”.

A short introduction extracted from Wikipedia

# Parse the Wikipedia article on data science and pull the body text
data.science <- read_html("https://en.wikipedia.org/wiki/Data_science")
data.science.html <- htmlTreeParse(data.science, useInternalNodes = TRUE)
data.science.text <- unlist(xpathApply(data.science.html, '//body', xmlValue))
# Extract the sentence about the "data scientist" occupation and strip citation markers
gsub("\\[[0-9].+\\]", "", str_trim(str_extract(data.science.text, ".Data scientist. has become.+")))
## [1] "\"Data scientist\" has become a popular occupation with Harvard Business Review dubbing it \"The Sexiest Job of the 21st Century\""

Data science process flowchart from “Doing Data Science”, Cathy O’Neil and Rachel Schutt, 2013

# Extract the source URL of the process-flowchart image shown on the page
data.image <- data.science %>% 
  html_nodes(".thumbimage") %>%
  html_attr("src")
data.image
## [1] "//upload.wikimedia.org/wikipedia/commons/thumb/b/ba/Data_visualization_process_v1.png/350px-Data_visualization_process_v1.png"

Web Scraping

Health Care Industry

Methodology used:

  1. Access a web page from R using the package ‘rvest’
  2. Tell R where to look for the page by providing a URL
  3. Manipulate the data into a usable format within R (a generic sketch of this workflow is shown after this list)
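
Each posting below follows this same pattern; a minimal sketch of a reusable helper, with hypothetical selector arguments (the real CSS selectors differ for every site), could look like this:

# Hypothetical helper illustrating the scraping pattern used for each posting:
# read the page, pull the title and the skills with CSS selectors, split on commas
get.skills <- function(url, title.selector, skills.selector) {
  page   <- read_html(url)
  title  <- page %>% html_nodes(title.selector) %>% html_text()
  skills <- page %>% html_nodes(skills.selector) %>% html_text()
  data.frame(title = title,
             skill = str_trim(unlist(strsplit(skills, split = ",", fixed = TRUE))),
             stringsAsFactors = FALSE)
}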

Job 1

job1 <- read_html("https://cds.nyu.edu/junior-data-scientist-software-working-group/")
#Job title and qualifications (skills) extracted
job1.title <- job1 %>% html_nodes(".page-title") %>% html_text()

#Getting the skills 
job1.skills <- job1 %>% html_nodes("ul:nth-child(9) p") %>% html_text()
df1 <- data.frame(job1.title, job1.skills)
as.character(df1$job1.skills)
## [1] "Proficient in at least one of Python, R, Julia, Java, C++."                                                                           
## [2] "Familiarity with version control via Git or Mercurial"                                                                                
## [3] "Basic knowledge in (one or more areas including) data management, visualization, statistics, and machine learning is highly desirable"
# Split the skill strings on commas; also pull the words from the job title
y <- unlist(strsplit(job1.skills, split = ",", fixed = TRUE))
x <- str_extract_all(df1$job1.title, "[[:alpha:]]+")

# Store the skills as a factor so the data frame can be split by skill
df1$job1.skills <- factor(df1$job1.skills)

# Split the data frame, one list element per requirement
ndf <- split(df1, df1$job1.skills)
y
##  [1] "Proficient in at least one of Python"                            
##  [2] " R"                                                              
##  [3] " Julia"                                                          
##  [4] " Java"                                                           
##  [5] " C++."                                                           
##  [6] "Familiarity with version control via Git or Mercurial"           
##  [7] "Basic knowledge in (one or more areas including) data management"
##  [8] " visualization"                                                  
##  [9] " statistics"                                                     
## [10] " and machine learning is highly desirable"

Job 2

job2 <-read_html("https://job-openings.monster.com/Data-Scientist-OQPS-Albany-NY-US-PCG-Staffing-Solutions-Organization-LL/31/95cf1092-4b5f-4ffe-909f-dd5c1d30d6c7")
job2.title <- job2 %>% html_nodes("#JobViewHeader .title") %>% html_text()



#Getting the skills 
job2.skills<- job2 %>% html_nodes("ol:nth-child(6) li") %>% html_text()
df2 <- data.frame(job2.title, job2.skills)
as.character(df2$job2.skills)
y <-unlist(strsplit(job2.skills, split = ",", fixed = TRUE))

y
##  [1] "Professional experience in public health"                                       
##  [2] " health services research"                                                      
##  [3] " and/or public policy."                                                         
##  [4] "Experience with SAS business analytics software"                                
##  [5] " Python"                                                                        
##  [6] " R and/or SQL."                                                                 
##  [7] "Experience with data collection and analysis working with very large data sets."
##  [8] "Experience analyzing data from non-relational database models including graph"  
##  [9] " document"                                                                      
## [10] " column and/or key-values databases."                                           
## [11] "Experience with data manipulation (extracting"                                  
## [12] " merging and linking) of large data sets."                                      
## [13] "The ability to think creatively and strategically"                              
## [14] " with strong attention to detail and problem-solving skills."

Job 3

job3 <-read_html("http://www.respondhr.com/58118630")
job3.title <- job3 %>% html_nodes("#jobtitletext") %>% html_text()


#Getting the skills 
job3.skills <- job3 %>% html_nodes("ul:nth-child(7) li") %>% html_text()
df3 <- data.frame(job3.title, job3.skills)
as.character(df3$job3.skills)
## [1] "A minimum of 5 years of professional experience as well as an M.S. or Ph.D. Degree in relevant discipline: Statistics, Applied Mathematics,  Operations Research/Optimization, Computer Science, Computational/Theoretical Physics, Data Science/visualization, Machine Learning, Electrical/Computer Engineering or Health Sciences (e.g. Bioengineering /Bioinformatics) "
## [2] "Strong mathematical background with strong knowledge in at least one of the following fields: statistics, data mining, machine learning, statistics, operations research, econometrics, natural language processing, and/or information retrieval"                                                                                                                          
## [3] "Deep experience in extracting, cleaning, preparing and modeling data"                                                                                                                                                                                                                                                                                                       
## [4] "Experience with command-line scripting, data structures, and algorithms; ability to work in a Linux environment"                                                                                                                                                                                                                                                            
## [5] "Proficient in the big data ecosystem such  Hadoop, Spark, and  Storm and programming languages (e.g. Python, Ruby, Java, Scala)"
y <-unlist(strsplit(job3.skills, split = ",", fixed = TRUE))

y
##  [1] "A minimum of 5 years of professional experience as well as an M.S. or Ph.D. Degree in relevant discipline: Statistics"
##  [2] " Applied Mathematics"                                                                                                 
##  [3] "  Operations Research/Optimization"                                                                                   
##  [4] " Computer Science"                                                                                                    
##  [5] " Computational/Theoretical Physics"                                                                                   
##  [6] " Data Science/visualization"                                                                                          
##  [7] " Machine Learning"                                                                                                    
##  [8] " Electrical/Computer Engineering or Health Sciences (e.g. Bioengineering /Bioinformatics) "                           
##  [9] "Strong mathematical background with strong knowledge in at least one of the following fields: statistics"             
## [10] " data mining"                                                                                                         
## [11] " machine learning"                                                                                                    
## [12] " statistics"                                                                                                          
## [13] " operations research"                                                                                                 
## [14] " econometrics"                                                                                                        
## [15] " natural language processing"                                                                                         
## [16] " and/or information retrieval"                                                                                        
## [17] "Deep experience in extracting"                                                                                        
## [18] " cleaning"                                                                                                            
## [19] " preparing and modeling data"                                                                                         
## [20] "Experience with command-line scripting"                                                                               
## [21] " data structures"                                                                                                     
## [22] " and algorithms; ability to work in a Linux environment"                                                              
## [23] "Proficient in the big data ecosystem such  Hadoop"                                                                    
## [24] " Spark"                                                                                                               
## [25] " and  Storm and programming languages (e.g. Python"                                                                   
## [26] " Ruby"                                                                                                                
## [27] " Java"                                                                                                                
## [28] " Scala)"

Job 4

job4 <-read_html("https://jobs.smartrecruiters.com/4Catalyzer/743999661313585-data-scientist")
job4.title <- job4 %>% html_nodes(".job-title") %>% html_text()


#Getting the skills 
job4.skills <- job4 %>% html_nodes("ul:nth-child(2) p") %>% html_text()
df4 <- data.frame(job4.title,job4.skills)
as.character(df4$job4.skills)
## [1] "BA/BS degree in a quantitative discipline (e.g., statistics, bioinformatics, computer science, mathematics, physics)."
## [2] "Experience with statistical software (e.g., R, Julia, M, pandas) and database languages (e.g., SQL)."                 
## [3] "Exposure to, or a strong desire to learn about, deep learning."                                                       
## [4] "Large data analysis and visualization experience."
y <-unlist(strsplit(job4.skills, split = ",", fixed = TRUE))

y
##  [1] "BA/BS degree in a quantitative discipline (e.g."  
##  [2] " statistics"                                      
##  [3] " bioinformatics"                                  
##  [4] " computer science"                                
##  [5] " mathematics"                                     
##  [6] " physics)."                                       
##  [7] "Experience with statistical software (e.g."       
##  [8] " R"                                               
##  [9] " Julia"                                           
## [10] " M"                                               
## [11] " pandas) and database languages (e.g."            
## [12] " SQL)."                                           
## [13] "Exposure to"                                      
## [14] " or a strong desire to learn about"               
## [15] " deep learning."                                  
## [16] "Large data analysis and visualization experience."

Job 5

job5 <-read_html("http://jobs.nyulangone.org/job/7733505/")
job5.title <- job5 %>% html_nodes("#gtm-jobdetail-title") %>% html_text()


#Getting the skills 
job5.skills <- job5 %>% html_nodes("p:nth-child(5)") %>% html_text()
df5 <- data.frame(job5.title,job5.skills)
as.character(df5$job5.skills)
## [1] "Minimum Qualifications:To qualify you must have a Bachelors Degree or a minimum of three years in a physician office, hospital or managed care environment. Proficient in the use of Excel and Word. Knowledge of claims processing systems and guidelines. Ability to analyze problems, draw relevant conclusions, and devise appropriate courses of action. Excellent written communication and oral presentation skills. Exceptional relationship and organizational skills."
# The qualifications are one paragraph: split it on periods, then on commas
y <- unlist(strsplit(job5.skills, split = ".", fixed = TRUE))
y <- unlist(strsplit(y, split = ",", fixed = TRUE))

y
## [1] "Minimum Qualifications:To qualify you must have a Bachelors Degree or a minimum of three years in a physician office"
## [2] " hospital or managed care environment"                                                                               
## [3] " Proficient in the use of Excel and Word"                                                                            
## [4] " Knowledge of claims processing systems and guidelines"                                                              
## [5] " Ability to analyze problems"                                                                                        
## [6] " draw relevant conclusions"                                                                                          
## [7] " and devise appropriate courses of action"                                                                           
## [8] " Excellent written communication and oral presentation skills"                                                       
## [9] " Exceptional relationship and organizational skills"
# Read the manually cleaned skills data compiled from the postings above
# (the CSV is assumed to contain the Skill.Type, Req, Industry and Skill.Set columns used below)
job.data <- read.csv("https://raw.githubusercontent.com/doradu8030/Data607/master/Clean%20Data.csv", stringsAsFactors = FALSE)
# Compare how often hard and soft skills are required
ggplot(data = job.data, aes(x = Skill.Type, y = Req)) +
  geom_bar(colour = "black", width = .8, stat = "identity", fill = "blue") +
  guides(fill = FALSE) +
  xlab("Skills") + ylab("count") +
  ggtitle("Comparison Between Hard and Soft Skills")
# Skill sets required by industry
ggplot(data = job.data, aes(x = Industry, y = Skill.Set)) + geom_bar(stat = "identity", fill = "blue") + theme_bw(base_size = 16) + theme(axis.text = element_text(angle = 0, hjust = 1)) + coord_flip()

Difficulties.

  1. One of the most challenging parts of scraping data was learning how to use CSS selectors. It seems easy in the tutorials, but I found this piece challenging and spent too much time trying to select the correct tag.
  2. I scraped an HTML page that was no longer available a couple of days later, and it was time-consuming to repeat the whole process with a new page.
  3. Sometimes the data are not downloadable, or there are no tags for each piece of information, so the data cannot be scraped. In my case I had to restrict myself to one source, “Monster”, and select the job posts individually, because each website has a different format; I found Monster the easiest, especially the way its HTML pages are tagged.

Conclusion.

I found scraping data to be a very powerful tool, but it is not easy to scrape data from different websites because of the terms of service in place and the differences in how each HTML page is formatted. However, scraping data from the web can provide very useful information to various organizations and industries, especially businesses. For our project, scraping the data by industry and seeing my peers' analyses allowed me to conclude that in most industries soft skills are NOT the most valued skills for data scientist jobs; on the contrary, hard (technical) skills are the most valued data science skills.