suppressPackageStartupMessages(library('XML'))
suppressPackageStartupMessages(library('rvest'))

## Warning: package 'rvest' was built under R version 3.4.2

## Warning: package 'xml2' was built under R version 3.4.2

suppressPackageStartupMessages(library('tidyr'))

## Warning: package 'tidyr' was built under R version 3.4.2

# Make sure the packages used for scraping and string handling are available,
# then attach them. requireNamespace() only tests whether a package can be
# loaded, so a missing package is installed first; library() then attaches it
# and stops with an error if that still fails (require() would only return
# FALSE and let the script carry on without the package).
for (pkg in c("XML", "rvest", "stringr")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  library(pkg, character.only = TRUE)
}

## Loading required package: stringr

# Install tidyr if it is not available, then attach it. library() performs the
# attach so that a failed load stops the script instead of returning FALSE.
if (!requireNamespace("tidyr", quietly = TRUE)) {
  install.packages("tidyr")
}
library(tidyr)

Data Science Skills

W. Edwards Deming said, “In God we trust; all others must bring data.” Please use data to answer the question “Which skills are the most valued data science skills?” Consider your work an exploration; there is not necessarily a “right answer”.

#### A short introduction extracted from Wikipedia (the data still needs to be tidied)

# Download the Wikipedia article on data science.
data.science <- read_html("https://en.wikipedia.org/wiki/Data_science")
# Re-parse the page with the XML package. The argument name is written out in
# full (useInternalNodes) instead of relying on partial argument matching.
data.science.html <- htmlTreeParse(data.science, useInternalNodes = TRUE)
# Text content of the body element(s), flattened to a character vector.
data.science.text <- unlist(xpathApply(data.science.html, "//body", xmlValue))
# Extract the text starting at '"Data scientist" has become', trim surrounding
# whitespace, and drop bracketed numeric citation markers.
# NOTE(review): "\\[[0-9].+\\]" is greedy, so it removes everything from the
# first "[<digit>" up to the last "]" in the extracted text, not just the
# individual markers -- confirm that this truncation is intended.
gsub(
  "\\[[0-9].+\\]",
  "",
  str_trim(str_extract(data.science.text, ".Data scientist. has become.+"))
)

## [1] "\"Data scientist\" has become a popular occupation with Harvard Business Review dubbing it \"The Sexiest Job of the 21st Century\""

Data science process flowchart from “Doing Data Science”, Cathy O’Neil and Rachel Schutt, 2013

# Read the `src` attribute of every node on the Wikipedia page that carries
# the "thumbimage" class, then display the resulting vector.
data.image <- html_attr(html_nodes(data.science, ".thumbimage"), "src")
data.image

## [1] "//upload.wikimedia.org/wikipedia/commons/thumb/b/ba/Data_visualization_process_v1.png/350px-Data_visualization_process_v1.png"

Web Scraping

Health Care Industry

Job 1

# Download the NYU CDS "junior data scientist" posting.
job1 <- read_html("https://cds.nyu.edu/junior-data-scientist-software-working-group/")

# Job title: text of the node(s) with class "page-title".
job1.title <- html_text(html_nodes(job1, ".page-title"))

# Qualifications (skills): text of the paragraphs selected by
# "ul:nth-child(9) p".
job1.skills <- html_text(html_nodes(job1, "ul:nth-child(9) p"))

# Combine title and skills in one data frame and show the skills as plain
# character strings.
df1 <- data.frame(job1.title, job1.skills)
as.character(df1[["job1.skills"]])

## [1] "Proficient in at least one of Python, R, Julia, Java, C++."                                                                           
## [2] "Familiarity with version control via Git or Mercurial"                                                                                
## [3] "Basic knowledge in (one or more areas including) data management, visualization, statistics, and machine learning is highly desirable"

# Break every skills sentence into its comma-separated fragments.
y <- unlist(strsplit(job1.skills, split = ",", fixed = TRUE))
# Alphabetic runs (words) found in the job title.
x <- str_extract_all(df1$job1.title, "[:alpha:]+")

# Store the skills column as a factor. Written as two explicit assignments
# rather than a chained `dfjob1 <- df1$job1.skills <- ...`, so it is clear that
# dfjob1 holds the same factor as the column.
df1$job1.skills <- factor(df1$job1.skills)
dfjob1 <- df1$job1.skills


# Split the data frame into one piece per distinct skill
ndf <- split(df1, df1$job1.skills)

#df1$job1.title <- str_replace(df1$job1.title, ".\[a-z]\.", " ")
#job1.title<- str_extract_all(df1$job1.title, "\\w+[[:alpha:]].?\\w+")
y

##  [1] "Proficient in at least one of Python"                            
##  [2] " R"                                                              
##  [3] " Julia"                                                          
##  [4] " Java"                                                           
##  [5] " C++."                                                           
##  [6] "Familiarity with version control via Git or Mercurial"           
##  [7] "Basic knowledge in (one or more areas including) data management"
##  [8] " visualization"                                                  
##  [9] " statistics"                                                     
## [10] " and machine learning is highly desirable"

Job 2

### Job 2
# Download the Data Scientist posting hosted on job-openings.monster.com.
job2 <- read_html("https://job-openings.monster.com/Data-Scientist-OQPS-Albany-NY-US-PCG-Staffing-Solutions-Organization-LL/31/95cf1092-4b5f-4ffe-909f-dd5c1d30d6c7")
# Job title: text of the ".title" node(s) inside the "#JobViewHeader" element.
job2.title <- job2 %>% html_nodes("#JobViewHeader .title") %>% html_text()

# Getting the skills: one entry per list item matched by "ol:nth-child(6) li".
job2.skills <- job2 %>% html_nodes("ol:nth-child(6) li") %>% html_text()
df2 <- data.frame(job2.title, job2.skills)
# Show the skills from this job's own data frame. Reading the column from df1
# (the Job 1 data frame) yields character(0) because df1 has no job2.skills
# column.
as.character(df2$job2.skills)

## character(0)

# Cut each Job 2 skill sentence at every literal comma and flatten the pieces
# into a single character vector, then display it.
y <- job2.skills %>%
  strsplit(split = ",", fixed = TRUE) %>%
  unlist()

y

##  [1] "Professional experience in public health"                                       
##  [2] " health services research"                                                      
##  [3] " and/or public policy."                                                         
##  [4] "Experience with SAS business analytics software"                                
##  [5] " Python"                                                                        
##  [6] " R and/or SQL."                                                                 
##  [7] "Experience with data collection and analysis working with very large data sets."
##  [8] "Experience analyzing data from non-relational database models including graph"  
##  [9] " document"                                                                      
## [10] " column and/or key-values databases."                                           
## [11] "Experience with data manipulation (extracting"                                  
## [12] " merging and linking) of large data sets."                                      
## [13] "The ability to think creatively and strategically"                              
## [14] " with strong attention to detail and problem-solving skills."

Job 3

# Download the posting hosted at respondhr.com.
job3 <- read_html("http://www.respondhr.com/58118630")
# Job title: text of the element with id "jobtitletext".
job3.title <- html_text(html_nodes(job3, "#jobtitletext"))

# Skills: text of each list item matched by "ul:nth-child(7) li".
job3.skills <- html_text(html_nodes(job3, "ul:nth-child(7) li"))

# Combine title and skills in one data frame and show the skills as plain
# character strings.
df3 <- data.frame(job3.title, job3.skills)
as.character(df3[["job3.skills"]])

## [1] "A minimum of 5 years of professional experience as well as an M.S. or Ph.D. Degree in relevant discipline: Statistics, Applied Mathematics,  Operations Research/Optimization, Computer Science, Computational/Theoretical Physics, Data Science/visualization, Machine Learning, Electrical/Computer Engineering or Health Sciences (e.g. Bioengineering /Bioinformatics) "
## [2] "Strong mathematical background with strong knowledge in at least one of the following fields: statistics, data mining, machine learning, statistics, operations research, econometrics, natural language processing, and/or information retrieval"                                                                                                                          
## [3] "Deep experience in extracting, cleaning, preparing and modeling data"                                                                                                                                                                                                                                                                                                       
## [4] "Experience with command-line scripting, data structures, and algorithms; ability to work in a Linux environment"                                                                                                                                                                                                                                                            
## [5] "Proficient in the big data ecosystem such  Hadoop, Spark, and  Storm and programming languages (e.g. Python, Ruby, Java, Scala)"

# Cut each Job 3 skill sentence at every literal comma and flatten the pieces
# into a single character vector, then display it.
y <- job3.skills %>%
  strsplit(split = ",", fixed = TRUE) %>%
  unlist()

y

##  [1] "A minimum of 5 years of professional experience as well as an M.S. or Ph.D. Degree in relevant discipline: Statistics"
##  [2] " Applied Mathematics"                                                                                                 
##  [3] "  Operations Research/Optimization"                                                                                   
##  [4] " Computer Science"                                                                                                    
##  [5] " Computational/Theoretical Physics"                                                                                   
##  [6] " Data Science/visualization"                                                                                          
##  [7] " Machine Learning"                                                                                                    
##  [8] " Electrical/Computer Engineering or Health Sciences (e.g. Bioengineering /Bioinformatics) "                           
##  [9] "Strong mathematical background with strong knowledge in at least one of the following fields: statistics"             
## [10] " data mining"                                                                                                         
## [11] " machine learning"                                                                                                    
## [12] " statistics"                                                                                                          
## [13] " operations research"                                                                                                 
## [14] " econometrics"                                                                                                        
## [15] " natural language processing"                                                                                         
## [16] " and/or information retrieval"                                                                                        
## [17] "Deep experience in extracting"                                                                                        
## [18] " cleaning"                                                                                                            
## [19] " preparing and modeling data"                                                                                         
## [20] "Experience with command-line scripting"                                                                               
## [21] " data structures"                                                                                                     
## [22] " and algorithms; ability to work in a Linux environment"                                                              
## [23] "Proficient in the big data ecosystem such  Hadoop"                                                                    
## [24] " Spark"                                                                                                               
## [25] " and  Storm and programming languages (e.g. Python"                                                                   
## [26] " Ruby"                                                                                                                
## [27] " Java"                                                                                                                
## [28] " Scala)"

Job 4

# Download the 4Catalyzer data scientist posting from SmartRecruiters.
job4 <- read_html("https://jobs.smartrecruiters.com/4Catalyzer/743999661313585-data-scientist")
# Job title: text of the node(s) with class "job-title".
job4.title <- html_text(html_nodes(job4, ".job-title"))

# Skills: text of the paragraphs matched by "ul:nth-child(2) p".
job4.skills <- html_text(html_nodes(job4, "ul:nth-child(2) p"))

# Combine title and skills in one data frame and show the skills as plain
# character strings.
df4 <- data.frame(job4.title, job4.skills)
as.character(df4[["job4.skills"]])

## [1] "BA/BS degree in a quantitative discipline (e.g., statistics, bioinformatics, computer science, mathematics, physics)."
## [2] "Experience with statistical software (e.g., R, Julia, M, pandas) and database languages (e.g., SQL)."                 
## [3] "Exposure to, or a strong desire to learn about, deep learning."                                                       
## [4] "Large data analysis and visualization experience."

# Cut each Job 4 skill sentence at every literal comma and flatten the pieces
# into a single character vector, then display it.
y <- job4.skills %>%
  strsplit(split = ",", fixed = TRUE) %>%
  unlist()

y

##  [1] "BA/BS degree in a quantitative discipline (e.g."  
##  [2] " statistics"                                      
##  [3] " bioinformatics"                                  
##  [4] " computer science"                                
##  [5] " mathematics"                                     
##  [6] " physics)."                                       
##  [7] "Experience with statistical software (e.g."       
##  [8] " R"                                               
##  [9] " Julia"                                           
## [10] " M"                                               
## [11] " pandas) and database languages (e.g."            
## [12] " SQL)."                                           
## [13] "Exposure to"                                      
## [14] " or a strong desire to learn about"               
## [15] " deep learning."                                  
## [16] "Large data analysis and visualization experience."

Job 5

# Download the posting from the NYU Langone jobs site.
job5 <- read_html("http://jobs.nyulangone.org/job/7733505/")
# Job title: text of the element with id "gtm-jobdetail-title".
job5.title <- html_text(html_nodes(job5, "#gtm-jobdetail-title"))

# Skills: text of the paragraph(s) matched by "p:nth-child(5)".
job5.skills <- html_text(html_nodes(job5, "p:nth-child(5)"))

# Combine title and skills in one data frame and show the skills as plain
# character strings.
df5 <- data.frame(job5.title, job5.skills)
as.character(df5[["job5.skills"]])

## [1] "Minimum Qualifications:To qualify you must have a Bachelors Degree or a minimum of three years in a physician office, hospital or managed care environment. Proficient in the use of Excel and Word. Knowledge of claims processing systems and guidelines. Ability to analyze problems, draw relevant conclusions, and devise appropriate courses of action. Excellent written communication and oral presentation skills. Exceptional relationship and organizational skills."

# The Job 5 qualifications arrive as one paragraph: cut it at every literal
# period first, then cut the resulting sentences at every literal comma, and
# display the flattened fragments.
y <- job5.skills %>%
  strsplit(split = ".", fixed = TRUE) %>%
  unlist() %>%
  strsplit(split = ",", fixed = TRUE) %>%
  unlist()

y

## [1] "Minimum Qualifications:To qualify you must have a Bachelors Degree or a minimum of three years in a physician office"
## [2] " hospital or managed care environment"                                                                               
## [3] " Proficient in the use of Excel and Word"                                                                            
## [4] " Knowledge of claims processing systems and guidelines"                                                              
## [5] " Ability to analyze problems"                                                                                        
## [6] " draw relevant conclusions"                                                                                          
## [7] " and devise appropriate courses of action"                                                                           
## [8] " Excellent written communication and oral presentation skills"                                                       
## [9] " Exceptional relationship and organizational skills"

Proj3

Durley Torres-Marin

October 17, 2017