607 Project 3

Web Scraping

library("tidyverse")  
library("rvest")    
library("stringi")   
library("xml2")
library("kableExtra")

# Fetch the Indeed search results for full-time data scientist positions
url <- "https://www.indeed.com/jobs?q=data+scientist&jt=fulltime"
page <- read_html(url)

# Collect the href of every rel="nofollow" element on the page. The XPath
# starts with `//`, so it is evaluated against the whole document and no
# preliminary tag filter is needed.
location <- page %>%
  html_nodes(xpath = '//*[@rel="nofollow"]') %>%
  html_attr("href")

# Positions within `location` of the top 5 location filter links. These are
# fixed indexes, so they depend on the layout of the page at scrape time.
locationIdx <- 8:12
if (length(location) < max(locationIdx)) {
  warning(
    "Expected at least ", max(locationIdx), " rel=\"nofollow\" links but found ",
    length(location), "; some location urls will be NA.",
    call. = FALSE
  )
}

# Extract top 5 location urls based on indexes
location2 <- location[locationIdx]

# Values used for the `start` query parameter of the follow-up result pages,
# stepping by 10 from the 2nd page of results to the 10th.
pageStart <- 10  # 2nd page of results
pageEnd <- 90    # 10th page of results
pageResults <- seq(pageStart, pageEnd, by = 10)

# Build the character vector of search result page urls: for each location
# filter, the first results page followed by one url per `start` offset in
# `pageResults`.
baseUrl <- "https://www.indeed.com"

# Skip locations that could not be scraped (NA) so that no bogus
# "https://www.indeed.comNA" urls are generated.
locationPaths <- location2[!is.na(location2)]

url <- unlist(lapply(locationPaths, function(locationPath) {
  # Filter results by location
  firstPage <- paste0(baseUrl, locationPath)
  # First page, then each following page of results
  c(firstPage, paste0(firstPage, "&start=", pageResults))
}))

# as.character() keeps `url` a (possibly empty) character vector even when
# there are no locations; unique() guards against repeated location links.
url <- unique(as.character(url))

# Scrape job titles, company names, locations, and summaries from every search
# results page in `url` and combine them into the data frame `fullDf`.

# Return the trimmed text of every element in `page` that matches `xpath`.
# The XPath expressions used below start with `//`, so they are evaluated
# against the whole document and need no preliminary tag filter.
extractText <- function(page, xpath) {
  page %>%
    html_nodes(xpath = xpath) %>%
    html_text() %>%
    stri_trim_both()
}

# Scrape a single search results page.
# Returns a data frame with columns jobTitle, companyName, jobLocation and
# jobSummary, or NULL (with a warning) when the page cannot be fetched or the
# extracted fields do not have matching lengths.
scrapeResultsPage <- function(pageUrl) {
  # A failed request should not throw away the pages already scraped
  page <- tryCatch(
    html_session(pageUrl),
    error = function(e) {
      warning("Skipping ", pageUrl, ": ", conditionMessage(e), call. = FALSE)
      NULL
    }
  )
  if (is.null(page)) {
    return(NULL)
  }

  fields <- list(
    # The job title is read from the element's `title` attribute
    jobTitle = page %>%
      html_nodes(xpath = '//*[@data-tn-element="jobTitle"]') %>%
      html_attr("title"),
    companyName = extractText(page, '//*[@class="company"]'),
    jobLocation = extractText(page, '//*[@class="location"]'),
    jobSummary = extractText(page, '//*[@class="summary"]')
  )

  # Rows can only be assembled when every field was found the same number of
  # times; otherwise the values cannot be lined up per posting.
  fieldLengths <- lengths(fields)
  if (length(unique(fieldLengths)) != 1) {
    warning(
      "Skipping ", pageUrl, ": field counts differ (",
      paste(names(fieldLengths), fieldLengths, sep = "=", collapse = ", "), ")",
      call. = FALSE
    )
    return(NULL)
  }

  as.data.frame(fields, stringsAsFactors = FALSE)
}

# Visit each url, keep the pages that produced a table, and bind them once
# instead of growing the data frame inside a loop.
pageTables <- lapply(as.vector(url), scrapeResultsPage)
pageTables <- Filter(Negate(is.null), pageTables)

fullDf <- if (length(pageTables) > 0) {
  do.call(rbind, pageTables)
} else {
  data.frame()
}

# Render the scraped postings as an interactive table with editable cells
DT::datatable(data = fullDf, editable = TRUE)

607 Project 3

Katherine Evers

3/21/2019

Web Scraping