Load Libraries

#install.packages("rvest") #install rvest package if it is not already installed
library(rvest)
library(stringr)
library(tidyverse)

Create all variables needed to search 10 pages of www.simplyhired.com

# Build the search URL ----
# The results URL is kept in two halves so that a number can be spliced in
# right after the trailing `pn=` query parameter of the first half.
site.pt1 <- "https://www.simplyhired.com/search?q=data+scientist&l=Brooklyn%2C+NY&pn="
site.pt2 <- "&job=FKql1P5_hBC1iuZNXkobdYwS3L8hYW8qlp10RE0p-U_OOFU210Cs4g"
pages <- 10

# Note: the value of `pages` itself is spliced into the URL here, so this
# single request uses `pn=10`.
url <- str_c(site.pt1, pages, site.pt2)
# Alternative fixed URL, kept for reference:
#url <- "https://www.simplyhired.com/search?q=data+scientist&l=Brooklyn%2C+NY&job=-q6yR-atece9p8LQvm2yP8xIX3VcYfRC9wsdPgSS0nWHIG3f2EZOxA"

# Download and parse the search results page
listings <- read_html(url)

# Each field below is the text of the nodes matching a CSS selector, looked up
# among the descendants of the page's <div> elements.

# Posting titles
job_title <- html_text(
  html_nodes(html_nodes(listings, "div"), ".jobposting-title")
)

# Company names
hiring_company <- html_text(
  html_nodes(html_nodes(listings, "div"), ".jobposting-company")
)

# Nodes carrying itemprop="address"
location <- html_text(
  html_nodes(html_nodes(listings, "div"), "[itemprop=\"address\"]")
)

# Text of the `.SerpJob-metaInfo` nodes, stored as the salary field
salary <- html_text(
  html_nodes(html_nodes(listings, "div"), ".SerpJob-metaInfo")
)

# `href` attribute of every link found inside a posting title
job_url <- as.character(
  html_attr(
    html_nodes(
      html_nodes(html_nodes(listings, "div"), ".jobposting-title"),
      "a"
    ),
    "href"
  )
)

# Prefix the site root so each href becomes a full URL that can be requested
# by the FOR loop further down
job_url_joined <- str_c("https://www.simplyhired.com", job_url)

Loop through 10 pages to get 100+ postings

# Scrape every results page and the description behind each posting ----
#
# Reads `site.pt1`, `site.pt2` and `pages` (defined earlier in the script) and
# produces `df`, a tibble with one row per posting and the columns job_title,
# hiring_company, location, salary, job_url_joined and job_description.

# Text of every node matching `css` among the descendants of the <div>
# elements of `page`. Returns character(0) when nothing matches.
extract_div_text <- function(page, css) {
  page %>%
    html_nodes("div") %>%
    html_nodes(css) %>%
    html_text()
}

job_description <- character()

# One tibble per results page; they are combined once after the loop instead
# of growing `df` with add_row() on every iteration.
page_results <- vector("list", pages)

for (i in seq_len(pages)) {
  # create URL for each page
  url <- str_c(site.pt1, i, site.pt2)

  # read URL data
  listings <- read_html(url)

  # pull necessary data from each page
  job_title <- extract_div_text(listings, ".jobposting-title")
  hiring_company <- extract_div_text(listings, ".jobposting-company")
  location <- extract_div_text(listings, '[itemprop="address"]')
  salary <- extract_div_text(listings, ".SerpJob-metaInfo")

  job_url <- listings %>%
    html_nodes("div") %>%
    html_nodes(".jobposting-title") %>%
    html_nodes("a") %>%
    html_attr("href") %>%
    as.character()

  job_url_joined <- str_c("https://www.simplyhired.com", job_url)

  # Start from a fresh vector sized to THIS page. Reusing the previous page's
  # vector would keep stale descriptions (and a mismatched length) whenever a
  # page lists fewer postings than the one before it.
  job_description <- rep(NA_character_, length(job_url_joined))

  # second loop to get job description for each job posted on each page;
  # seq_along() makes this a no-op when the page has no postings
  for (j in seq_along(job_url_joined)) {
    desc <- read_html(job_url_joined[j])
    desc_text <- extract_div_text(desc, ".ViewJob-description")

    # A posting without a description node keeps NA rather than aborting the
    # run; if several nodes match, the first one is used.
    if (length(desc_text) > 0) {
      job_description[j] <- desc_text[[1]]
    }
  }

  # tibble() still raises an error if the scraped fields of a page differ in
  # length (e.g. a posting without a salary node), as it did before.
  page_results[[i]] <- tibble(
    job_title, hiring_company, location, salary,
    job_url_joined, job_description
  )
}

df <- bind_rows(page_results)

# Preview the first six rows of the scraped postings
head(df, n = 6)
## # A tibble: 6 x 6
##   job_title  hiring_company  location salary job_url_joined job_description
##   <chr>      <chr>           <chr>    <chr>  <chr>          <chr>          
## 1 Data Scie~ Spotify         New Yor~ Estim~ https://www.s~ "Marketplace i~
## 2 Data Engi~ Noom Inc.       New Yor~ Estim~ https://www.s~ At Noom, we us~
## 3 Senior Da~ HVH Precision ~ New Yor~ Estim~ https://www.s~ Job Descriptio~
## 4 Ecologica~ PS&S            Mineola~ Estim~ https://www.s~ "Overview\nPS&~
## 5 Senior Da~ Strategic Fina~ New Yor~ Estim~ https://www.s~ "Overview\nDo ~
## 6 Sr. Direc~ HeartShare Hum~ Brookly~ Estim~ https://www.s~ "Position Summ~