# 607 Project 3

library("tidyverse")  
library("rvest")    
library("stringi")   
library("xml2")
library("kableExtra")

# Indeed search-results URL: query "data scientist", location Silicon Valley, CA
url <- "https://www.indeed.com/jobs?q=%22data%20scientist%22&l=Silicon%20Valley%2C%20CA"
# Download and parse the search-results page into an HTML document
page <- url %>%
  read_html()

# Extract job titles: the `title` attribute of every <a> element tagged
# data-tn-element="jobTitle" (NA where an anchor has no `title` attribute).
# The XPath begins with `//`, so it already searches the whole document; it is
# applied to `page` directly rather than to every <div>, which only repeated
# the same document-wide search once per <div>.
job_title <- page %>%
  html_nodes(xpath = '//a[@data-tn-element = "jobTitle"]') %>%
  html_attr("title")

# Extract company names: whitespace-trimmed text of every element whose class
# attribute is exactly "company". The XPath begins with `//`, so it is applied
# to `page` directly instead of being re-run from each <span>.
company_name <- page %>%
  html_nodes(xpath = '//*[@class="company"]') %>%
  html_text() %>%
  stri_trim_both()
# The original statement ended in `-> company.name`, binding the same vector to
# a second name. The alias is kept explicit here in case other code reads it.
# NOTE(review): drop this line once `company.name` is confirmed unused.
company.name <- company_name

# Extract job posting links: the `href` attribute of every element tagged
# data-tn-element="jobTitle" (NA where the attribute is absent).
# The XPath begins with `//`, so it is applied to `page` directly instead of
# being re-run from each <div>.
links <- page %>%
  html_nodes(xpath = '//*[@data-tn-element="jobTitle"]') %>%
  html_attr("href")

# Extract job descriptions: fetch each posting in `links` and pull the text of
# its description container.
# The result is a character vector with exactly one entry per link:
#   * NA when the link is missing, the page cannot be read (a warning is
#     raised), or no description node is found;
#   * several matching nodes are joined with newlines into a single string.
# Local names are used for the per-posting URL and document so the
# search-results `url` and `page` objects are not overwritten.
job_description <- vapply(
  links,
  function(link) {
    if (is.na(link)) {
      return(NA_character_)
    }

    # Resolve the href against the site root: a leading "/" no longer yields
    # a double slash, and an already-absolute href is passed through as is.
    job_url <- url_absolute(link, "https://indeed.com/")

    job_page <- tryCatch(
      read_html(job_url),
      error = function(e) {
        warning(
          "Could not read ", job_url, ": ", conditionMessage(e),
          call. = FALSE
        )
        NULL
      }
    )
    if (is.null(job_page)) {
      return(NA_character_)
    }

    # Absolute XPath, so query the document directly (no <span> pre-selection).
    description <- job_page %>%
      html_nodes(
        xpath = '//*[@class="jobsearch-JobComponent-description icl-u-xs-mt--md"]'
      ) %>%
      html_text() %>%
      stri_trim_both()

    if (length(description) == 0L) {
      return(NA_character_)
    }
    paste(description, collapse = "\n")
  },
  character(1),
  USE.NAMES = FALSE
)
  
# Create dataframe
# The three columns are scraped independently, and data.frame() silently
# recycles a shorter column when its length divides the longest one, which
# would pair titles with the wrong company or description. Fail loudly instead.
column_lengths <- c(
  job_title = length(job_title),
  company_name = length(company_name),
  job_description = length(job_description)
)
if (length(unique(column_lengths)) != 1L) {
  stop(
    "Scraped columns differ in length: ",
    paste(names(column_lengths), column_lengths, sep = " = ", collapse = ", "),
    call. = FALSE
  )
}

# Keep scraped text as character (not factor) on every R version so cells in
# the editable table can be changed to arbitrary new values.
df <- data.frame(
  job_title, company_name, job_description,
  stringsAsFactors = FALSE
)

DT::datatable(df, editable = TRUE)

# 607 Project 3

# Katherine Evers

# 3/18/2019