Create all variables needed to search 10 pages of www.simplyhired.com
# Assemble the search URL: fixed query prefix + page number + fixed suffix.
site.pt1 <- "https://www.simplyhired.com/search?q=data+scientist&l=Brooklyn%2C+NY&pn="
site.pt2 <- "&job=FKql1P5_hBC1iuZNXkobdYwS3L8hYW8qlp10RE0p-U_OOFU210Cs4g"
pages <- 10
# `pages` is used as the page number here, so this URL carries pn=10.
url <- str_c(site.pt1, pages, site.pt2)
#url <- "https://www.simplyhired.com/search?q=data+scientist&l=Brooklyn%2C+NY&job=-q6yR-atece9p8LQvm2yP8xIX3VcYfRC9wsdPgSS0nWHIG3f2EZOxA"

# Download and parse the search-results page.
listings <- read_html(url)

# Each field is looked up among the descendants of the page's <div> nodes.
job_title <- html_text(
  html_nodes(html_nodes(listings, "div"), ".jobposting-title")
)
hiring_company <- html_text(
  html_nodes(html_nodes(listings, "div"), ".jobposting-company")
)
location <- html_text(
  html_nodes(html_nodes(listings, "div"), '[itemprop="address"]')
)
salary <- html_text(
  html_nodes(html_nodes(listings, "div"), '.SerpJob-metaInfo')
)

# The posting link is the href of the <a> inside each title node.
job_url <- as.character(
  html_attr(
    html_nodes(
      html_nodes(html_nodes(listings, "div"), ".jobposting-title"),
      "a"
    ),
    "href"
  )
)

# Prefix the site's base URL so each link can be fetched directly in the
# later for loop.
job_url_joined <- str_c("https://www.simplyhired.com", job_url)
Loop through 10 pages to get 100+ postings
# Scrape every results page and, for each listing on it, the full job
# description from the posting's own page. Reads `site.pt1`, `site.pt2` and
# `pages` defined earlier in the script and accumulates one row per posting
# in `df`.

# Text of every node matching `css` found beneath a <div> of `doc`.
div_text <- function(doc, css) {
  doc %>%
    html_nodes("div") %>%
    html_nodes(css) %>%
    html_text()
}

# seq_len() iterates zero times when `pages` is 0 (1:pages would yield 1, 0).
for (i in seq_len(pages)) {
  # create URL for each page
  url <- str_c(site.pt1, i, site.pt2)
  # read URL data
  listings <- read_html(url)

  # pull necessary data from each page
  job_title <- div_text(listings, ".jobposting-title")
  hiring_company <- div_text(listings, ".jobposting-company")
  location <- div_text(listings, '[itemprop="address"]')
  salary <- div_text(listings, ".SerpJob-metaInfo")
  job_url <- listings %>%
    html_nodes("div") %>%
    html_nodes(".jobposting-title") %>%
    html_nodes("a") %>%
    html_attr("href") %>%
    as.character()
  job_url_joined <- str_c("https://www.simplyhired.com", job_url)

  # Start each page with a fresh vector sized to this page's listings, so
  # descriptions from a previous (longer) page cannot leak into this one and
  # the column length always matches `job_url_joined`.
  job_description <- rep(NA_character_, length(job_url_joined))

  # second loop to get job description for each job posted on each page;
  # seq_along() skips the loop entirely when the page has no listings.
  for (j in seq_along(job_url_joined)) {
    desc <- read_html(job_url_joined[j])
    desc_text <- div_text(desc, ".ViewJob-description")
    # A posting may have no description node (leave NA) or several (keep the
    # first); assigning a length-0 or length->1 vector to a single slot would
    # error or warn.
    if (length(desc_text) > 0) {
      job_description[j] <- desc_text[1]
    }
  }

  # NOTE(review): tibble()/add_row() need all six vectors to have the same
  # length; this assumes every listing exposes exactly one node per selector.
  if (i == 1) {
    df <- tibble(
      job_title, hiring_company, location, salary,
      job_url_joined, job_description
    )
  } else {
    df <- add_row(
      df,
      job_title, hiring_company, location, salary,
      job_url_joined, job_description
    )
  }
}
head(df)
## # A tibble: 6 x 6
## job_title hiring_company location salary job_url_joined job_description
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Data Scie~ Spotify New Yor~ Estim~ https://www.s~ "Marketplace i~
## 2 Data Engi~ Noom Inc. New Yor~ Estim~ https://www.s~ At Noom, we us~
## 3 Senior Da~ HVH Precision ~ New Yor~ Estim~ https://www.s~ Job Descriptio~
## 4 Ecologica~ PS&S Mineola~ Estim~ https://www.s~ "Overview\nPS&~
## 5 Senior Da~ Strategic Fina~ New Yor~ Estim~ https://www.s~ "Overview\nDo ~
## 6 Sr. Direc~ HeartShare Hum~ Brookly~ Estim~ https://www.s~ "Position Summ~