library("tidyverse")
library("rvest")
library("stringi")
library("xml2")
library("kableExtra")
# Indeed search results for the quoted phrase "data scientist",
# location "Silicon Valley, CA" (both URL-encoded in the query string)
url <- "https://www.indeed.com/jobs?q=%22data%20scientist%22&l=Silicon%20Valley%2C%20CA"
# Download and parse the search results page
page <- url %>%
  read_html()
# Extract job titles
# The XPath begins with `//`, so it is evaluated against the whole document.
# A preceding html_nodes("div") step would not narrow the search; it would only
# repeat the same document-wide query once per <div>, so the query runs
# directly on `page`.
job_title <- page %>%
  html_nodes(xpath = '//a[@data-tn-element = "jobTitle"]') %>%
  html_attr("title")
# Extract company names
# The XPath is absolute (`//`), so it is run once on `page` rather than once
# per <span>. Surrounding whitespace is stripped from each name.
company_name <- page %>%
  html_nodes(xpath = '//*[@class="company"]') %>%
  html_text() %>%
  stri_trim_both()
# The original statement also bound the same value to the dotted name
# `company.name`; keep that alias in case other code still refers to it.
company.name <- company_name
# Extract job posting links
# Reads the href of every element tagged data-tn-element="jobTitle".
# The XPath is absolute (`//`), so it is evaluated directly on `page`; going
# through html_nodes("div") first would only repeat the query per <div>.
links <- page %>%
  html_nodes(xpath = '//*[@data-tn-element="jobTitle"]') %>%
  html_attr("href")
# Extract job descriptions

# Fetch one job posting and return its description as a single string.
#
# link: href of a posting as scraped from the results page (may be NA).
# Returns a length-1 character vector. NA_character_ is returned when the
# link is missing, the page cannot be read (a warning is raised), or no
# description node is found. Multiple matching nodes are joined with "\n".
fetch_job_description <- function(link) {
  if (is.na(link)) {
    return(NA_character_)
  }
  # url_absolute() resolves the href against the site root, avoiding the
  # "https://indeed.com//..." double slash that plain pasting produces for
  # hrefs that already start with "/".
  posting_url <- url_absolute(link, "https://indeed.com/")
  posting_page <- tryCatch(
    read_html(posting_url),
    error = function(e) {
      # One unreachable posting should not abort the whole scrape.
      warning(
        "Could not read ", posting_url, ": ", conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )
  if (is.null(posting_page)) {
    return(NA_character_)
  }
  description <- posting_page %>%
    html_nodes(xpath = '//*[@class="jobsearch-JobComponent-description icl-u-xs-mt--md"]') %>%
    html_text() %>%
    stri_trim_both()
  if (length(description) == 0) {
    return(NA_character_)
  }
  paste(description, collapse = "\n")
}

# One description per link, always a character vector of length(links).
# Local names inside the helper keep the search-page `url` and `page`
# objects from being overwritten by the last posting visited.
job_description <- vapply(
  links,
  fetch_job_description,
  character(1),
  USE.NAMES = FALSE
)
# Create dataframe
# Titles, companies and descriptions are scraped by independent selectors, so
# their lengths can disagree (e.g. a posting without a company node). Check
# up front and fail with a message that says which vector is off.
scraped_lengths <- c(
  job_title = length(job_title),
  company_name = length(company_name),
  job_description = length(job_description)
)
if (length(unique(scraped_lengths)) > 1) {
  stop(
    "Scraped vectors differ in length: ",
    paste0(names(scraped_lengths), " = ", scraped_lengths, collapse = ", "),
    call. = FALSE
  )
}
df <- data.frame(
  job_title = job_title,
  company_name = company_name,
  job_description = job_description,
  stringsAsFactors = FALSE
)
# Show the result as an interactive table with editable cells
DT::datatable(df, editable = TRUE)