# Web Scraping
library("tidyverse")
library("rvest")
library("stringi")
library("xml2")
library("kableExtra")
#Landing page: Indeed search results for full time data scientist positions
url <- "https://www.indeed.com/jobs?q=data+scientist&jt=fulltime"
page <- read_html(url)

#Collect the href attribute of every rel="nofollow" element reached through
#the page's <li> nodes (per the original note, the links on the left side
#of the page)
location <- html_attr(
  html_nodes(html_nodes(page, "li"), xpath = '//*[@rel="nofollow"]'),
  "href"
)

#Keep the hrefs at positions 8 through 12
#NOTE(review): presumably these are the top 5 location filter links; the
#positions depend on the live page layout - confirm they are still correct
location2 <- location[8:12]
#Offsets passed as the "start" query parameter of the follow-up result pages
pageStart <- 10 # 2nd page results
pageEnd <- 90 # 10th page results
pageResults <- seq(from = pageStart, to = pageEnd, by = 10)

#Build the vector of search page result urls to scrape
baseUrl <- "https://www.indeed.com"

#Drop missing hrefs: location2 contains NA when the landing page yielded fewer
#links than the fixed positions expect (or a link had no href), and pasting
#an NA would produce an invalid url such as "https://www.indeed.comNA"
locationPaths <- location2[!is.na(location2)]
if (length(locationPaths) < length(location2)) {
  warning(
    length(location2) - length(locationPaths),
    " location link(s) were missing and have been skipped",
    call. = FALSE
  )
}

#For each location: the first results page followed by its "&start="
#continuation pages, in that order
url <- lapply(locationPaths, function(path) {
  #Filter results by location
  firstPage <- paste0(baseUrl, path)
  #Go to next pages
  c(firstPage, paste0(firstPage, "&start=", pageResults))
})

#Flatten to a plain character vector (character(0) when nothing was found)
#and drop duplicates
url <- unique(as.character(unlist(url, use.names = FALSE)))
#Create dataframe of job titles, company names, locations, and summaries
#from the search page result urls

#Helper: trimmed text of the nodes matching class == className on a page
#NOTE(review): the XPath begins with "//", which looks like it is evaluated
#against the whole document rather than relative to the `tag` nodes; the tag
#step is kept to preserve the original selection - confirm before removing
extractClassText <- function(page, tag, className) {
  page %>%
    html_nodes(tag) %>%
    html_nodes(xpath = paste0('//*[@class="', className, '"]')) %>%
    html_text() %>%
    stri_trim_both()
}

#Helper: scrape one search results page into a dataframe with the columns
#jobTitle, companyName, jobLocation and jobSummary
#Returns NULL (after a warning) when the page cannot be fetched or when the
#four fields do not have the same number of entries, so the page is skipped
#instead of aborting the whole run or vanishing silently
scrapeResultsPage <- function(pageUrl) {
  #NOTE(review): html_session() is deprecated in rvest >= 1.0.0 in favour of
  #session(); kept so the script still runs on the rvest version it targets
  page <- tryCatch(
    html_session(pageUrl),
    error = function(e) {
      warning("Skipping ", pageUrl, ": ", conditionMessage(e), call. = FALSE)
      NULL
    }
  )
  if (is.null(page)) {
    return(NULL)
  }

  #Extract job titles (stored in the title attribute of the matched nodes)
  jobTitle <- page %>%
    html_nodes("div") %>%
    html_nodes(xpath = '//*[@data-tn-element="jobTitle"]') %>%
    html_attr("title")
  #Extract company names, job locations and job summaries
  companyName <- extractClassText(page, "span", "company")
  jobLocation <- extractClassText(page, "span", "location")
  jobSummary <- extractClassText(page, "span", "summary")

  #The columns can only be combined when every field has the same length
  fieldLengths <- c(
    jobTitle = length(jobTitle),
    companyName = length(companyName),
    jobLocation = length(jobLocation),
    jobSummary = length(jobSummary)
  )
  if (length(unique(fieldLengths)) != 1) {
    warning(
      "Skipping ", pageUrl, ": field counts differ (",
      paste0(names(fieldLengths), "=", fieldLengths, collapse = ", "), ")",
      call. = FALSE
    )
    return(NULL)
  }

  #stringsAsFactors = FALSE keeps the columns as character on every R version
  data.frame(
    jobTitle, companyName, jobLocation, jobSummary,
    stringsAsFactors = FALSE
  )
}

#Visit each url and stack the per-page dataframes once at the end
#(bind_rows ignores the NULL entries left by skipped pages)
fullDf <- as.data.frame(
  dplyr::bind_rows(lapply(as.vector(url), scrapeResultsPage))
)
#Show the scraped postings as an interactive table with editable cells
DT::datatable(data = fullDf, editable = TRUE)