library(RCurl)
library(XML)
library(tidyverse)
library(rvest)
library(stringr)
library(ggplot2)
Get a listing of the 16 HTML files containing the Data Scientist job posts [from Indeed.com]
# NOTE: set this to an existing directory in your environment. The saved HTML
# job posts are read from it and generated output files are written to it.
data_store_path <- "~/GitHub/Project3"

# File-name pattern for the saved job posts. It is anchored at both ends and
# the dot before "html" is escaped, so look-alike names such as
# "indeed_job_post_001.html.bak" or "indeed_job_post_001xhtml" are not listed.
job_post_pattern <- "^indeed_job_post_.*\\.html$"

# File names (not full paths) of every matching job post in the data directory.
jobURLs <- list.files(data_store_path, pattern = job_post_pattern)
head(jobURLs, 3)
## [1] "indeed_job_post_001.html" "indeed_job_post_002.html"
## [3] "indeed_job_post_003.html"
Visit each job posting HTML file and scrape the job title and description for analysis
# Pre-allocate one slot per job post for the scraped summary and title.
job_sum_text <- vector(mode = "character", length = length(jobURLs))
job_title <- vector(mode = "character", length = length(jobURLs))

# seq_along() gives an empty sequence when no files were listed, whereas
# 1:length(jobURLs) would iterate over c(1, 0) and try to read a missing file.
for (i in seq_along(jobURLs)) {
  # Parse the i-th saved HTML page.
  htmFile <- file.path(data_store_path, jobURLs[i])
  h <- read_html(htmFile)

  # Job summary: the node with CSS id "job_summary".
  # html_node() (singular) always yields exactly one result: the first match,
  # or a missing node whose text is NA. The scalar assignment below therefore
  # cannot fail with "replacement has length zero" or silently drop extra
  # matches with a warning, as it could with html_nodes().
  jobSum <- html_node(h, "#job_summary")
  job_sum_text[i] <- html_text(jobSum)

  # Job title: the HTML <b> node with CSS class "jobtitle".
  jobTitleNode <- html_node(h, "b.jobtitle")
  job_title[i] <- html_text(jobTitleNode)
}
Create a data frame holding the results of the scraping (job title, job summary, etc.) and save it to a file
# Assemble the scraped fields into a data frame with one row per job post.
# - rep() sizes the constant source column to the number of posts, so the
#   call also works when no posts were scraped (a bare "INDEED" scalar makes
#   data.frame() fail with "differing number of rows: 1, 0").
# - stringsAsFactors = FALSE keeps the free-text title and summary columns as
#   character vectors on every R version (before R 4.0 they became factors).
job_df <- data.frame(
  job_post_source = rep("INDEED", length(job_title)),
  job_post_title = job_title,
  job_post_summary = job_sum_text,
  stringsAsFactors = FALSE
)
glimpse(job_df)
## Observations: 16
## Variables: 3
## $ job_post_source <chr> "INDEED", "INDEED", "INDEED", "INDEED", "INDE...
## $ job_post_title <chr> "Data Scientist - Fixed Income Real-Time Pric...
## $ job_post_summary <chr> "Job Requisition Number:64749The Fixed Income...

# Persist the data frame next to the input files. ascii = TRUE writes a
# text (ASCII) representation instead of the default binary format.
save(job_df, file = file.path(data_store_path, "jobs_df.RData"), ascii = TRUE)
To load the data frame object [named job_df] back into the environment, call:
# Restore the saved object into the current environment. load() re-creates
# it under the name it was saved with, i.e. job_df.
load(file.path(data_store_path, "jobs_df.RData"))

# Preview the first two rows. This previously referenced `jb_df`, a name that
# is never defined, so the call failed with "object 'jb_df' not found".
head(job_df, 2)

# Open the full data frame in the spreadsheet-style data viewer.
View(job_df)