Source file ⇒ lec35.Rmd
HTTP (HyperText Transfer Protocol) allows for communication between a client and a host via request/response messages. At the heart of web communication is the request message, which is sent via a Uniform Resource Locator (URL).
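We can watch this request/response exchange directly from R. The sketch below uses base R's curlGetHeaders() (available when R is built with libcurl support, and assuming the site is reachable) to send a request and return the response headers the server sends back:

# Send an HTTP request for the URL and return the response headers;
# the status line and headers are the server's half of the exchange.
curlGetHeaders("https://www.cybercoders.com/")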
R has very basic, limited support for numerous protocols (e.g. HTTPS, FTP, FTPS, etc.).
“RCurl” provides steroids for R to handle these protocols. Most important for us, it will let us fetch pages over HTTPS.
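As a quick check of which protocols the libcurl library behind RCurl actually supports, curlVersion() reports them (this is just a diagnostic sketch; the exact list depends on your installation):

library(RCurl)
# Protocols supported by the libcurl build that RCurl links against;
# the set varies from installation to installation.
curlVersion()$protocols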
There are two high-level functions in RCurl we are concerned with:
| Function | Description |
|---|---|
| getURLContent() | fetches the content of a URL |
| getForm() | submits a Web form via the GET method |
library(XML)
library(RCurl)
library(dplyr)    # provides %>%, select(), and filter() used below
URL <- "https://www.cybercoders.com/search/?searchterms=Data+Scientist&searchlocation=&newsearch=true&sorttype="
txt <- getURLContent(URL)   # fetch the page (RCurl handles the https request)
doc <- htmlParse(txt)       # parse the HTML text into a document tree
Or, equivalently, using getForm():
URL <- "https://www.cybercoders.com/search/?searchterms=Data+Scientist&searchlocation=&newsearch=true&sorttype="
getFormParams(URL)
## searchterms searchlocation newsearch sorttype
## "Data+Scientist" "" "true" ""
baseURL="https://www.cybercoders.com/search/"
# getForm() URL-encodes the parameter values for us, so we pass a plain space
# ("Data Scientist") rather than the already-encoded "Data+Scientist".
txt <- getForm(baseURL, searchterms = "Data Scientist", searchlocation = "",
               newsearch = "true", sorttype = "")
doc <- htmlParse(txt)
We will look at the salary offered for Data Science jobs listed at CyberCoders
Type “Data Scientist” into the Job Title / Keyword search box.
This will take you to https://www.cybercoders.com/search/?searchterms=Data+Scientist&searchlocation=&newsearch=true&sorttype=
Notice that this is a secure website (https). This requires us to use the function getURLContent from the RCurl package.
Explore some of the job listings
URL <- "https://www.cybercoders.com/data-scientist-job-258251"
txt <- getURLContent(URL)
doc <- htmlParse(txt) #doc is a parsed HTML document
info <- doc %>% xpathSApply( '//div[@class="wage"]', xmlValue)
info
## [1] " Full-time $90k - $150k"
Note: You can equivalently write: `info <- doc %>% getNodeSet('//div[@class="wage"]') %>% sapply(xmlValue)`
Find all the wages on the page https://www.cybercoders.com/search/?searchterms=Data+Scientist&searchlocation=&newsearch=true&sorttype=
URL <- "https://www.cybercoders.com/search/?searchterms=Data+Scientist&searchlocation=&newsearch=true&sorttype="
txt <- getURLContent(URL)
doc <- htmlParse(txt)
info <- doc %>% xpathSApply( '//div[@class="wage"]', xmlValue)
info
## [1] "Full-time $90k - $150k" "Full-time $120k - $175k"
## [3] "Compensation Unspecified" "Full-time $80k - $100k"
## [5] "Full-time $100k - $175k" "Full-time $80k - $120k"
## [7] "Full-time $90k - $120k" "Full-time $130k - $175k"
## [9] "Compensation Unspecified" "Full-time $95k - $120k"
## [11] "Full-time $130k - $160k" "Full-time $100k - $150k"
## [13] "Full-time $95k - $140k" "Full-time $80k - $160k"
## [15] "Compensation Unspecified" "Full-time $120k - $175k"
## [17] "Full-time $100k - $140k" "Compensation Unspecified"
## [19] "Full-time $100k - $130k" "Full-time $100k - $130k"
Let's write a function that, given a parsed HTML document, returns all the salaries on the page:
cy.readPost =
function(doc)
{
info <- doc %>% xpathSApply( '//div[@class="wage"]', xmlValue)
info
}
"https://www.cybercoders.com/search/?searchterms=Data+Scientist&searchlocation=&newsearch=true&sorttype=" %>%
getURLContent() %>%
htmlParse() %>%
cy.readPost()
## [1] "Full-time $90k - $150k" "Full-time $120k - $175k"
## [3] "Compensation Unspecified" "Full-time $80k - $100k"
## [5] "Full-time $100k - $175k" "Full-time $80k - $120k"
## [7] "Full-time $90k - $120k" "Full-time $130k - $175k"
## [9] "Compensation Unspecified" "Full-time $95k - $120k"
## [11] "Full-time $130k - $160k" "Full-time $100k - $150k"
## [13] "Full-time $95k - $140k" "Full-time $80k - $160k"
## [15] "Compensation Unspecified" "Full-time $120k - $175k"
## [17] "Full-time $100k - $140k" "Compensation Unspecified"
## [19] "Full-time $100k - $130k" "Full-time $100k - $130k"
Our goal is to scrape all of the Data Scientist wage information from this website and put it in a data frame with variables low and high for the low and high ends of the salary range.
Next we need to get the salaries from page 2, page 3, and so on.
Examine the source code of https://www.cybercoders.com/search/?searchterms=Data+Scientist&searchlocation=&newsearch=true&sorttype= and figure out how to get the salaries from page 2.
URL <- "https://www.cybercoders.com/search/?searchterms=Data+Scientist&searchlocation=&newsearch=true&sorttype="
txt <- getURLContent(URL)
doc <- htmlParse(txt)
link <- doc %>% getNodeSet( "//a[@rel='next']/@href")
baseURL <- "https://www.cybercoders.com/search/"
paste(baseURL,as.character(link[[1]]),sep="")
## [1] "https://www.cybercoders.com/search/./?page=2&searchterms=Data%20Scientist&searchlocation=&newsearch=true&sorttype="
cy.getNextPageLink =
function(doc)
{
baseURL = "https://www.cybercoders.com/search/"
link = getNodeSet(doc, "//a[@rel='next']/@href")
if(length(link) == 0) # if there is no link then length(link) will be zero
return(character())
paste(baseURL,as.character(link[[1]]),sep="")
}
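As an aside, the XML package also has a getRelativeURL() helper for resolving a relative link against a base URL. A sketch of the same step using it (assuming that helper is available in your version of XML):

# Resolve the relative "next page" link against the search base URL
# using XML::getRelativeURL() instead of paste().
link <- doc %>% getNodeSet("//a[@rel='next']/@href")
getRelativeURL(as.character(link[[1]]), "https://www.cybercoders.com/search/")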
Now we can write a function cyberCoders (see below) that will give us the salaries from every page of results.
cy.readPost =
function(doc)
{
info <- doc %>% xpathSApply( '//div[@class="wage"]', xmlValue)
info
}
cy.getNextPageLink =
function(doc)
{
baseURL = "https://www.cybercoders.com/search/"
link = getNodeSet(doc, "//a[@rel='next']/@href")
if(length(link) == 0) # if there is no link then length(link) will be zero
return(character())
paste(baseURL,as.character(link[[1]]),sep="")
}
cyberCoders =
function(query)
{
txt = getForm("https://www.cybercoders.com/search/",
searchterms = query, searchlocation = "",
newsearch = "true", sorttype = "")
doc = htmlParse(txt)
posts = c()
while(TRUE) {
posts = c(posts, cy.readPost(doc))
nextPage = cy.getNextPageLink(doc)
if(length(nextPage) == 0)
break
nextPage = getURLContent(nextPage)
doc = htmlParse(nextPage, asText = TRUE)
}
posts
}
dataSciPosts = cyberCoders("Data Scientist")
With your neighbor, trace through these functions and explain what is happening.
Finally, let's convert our vector of salaries into a data frame.
dataSciPosts = as.character(cyberCoders("Data Scientist"))
salaries <- as.data.frame(dataSciPosts) %>%
  # extractMatches() is a helper (not base R or dplyr) that pulls the regex
  # capture groups out into new columns, here named low and high
  extractMatches(pattern="([[:digit:]]+)k - \\$([[:digit:]]+)", dataSciPosts, low=1, high=2) %>%
  select(low, high) %>%
  filter(!is.na(low)) %>%
  filter(high != 0)
head(salaries)
## low high
## 1 90 150
## 2 120 175
## 3 80 100
## 4 100 175
## 5 80 120
## 6 90 120
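If the extractMatches() helper is not available, roughly the same table can be built with base R's regmatches()/regexec(). This is a sketch under that assumption, reusing the same pattern and the dplyr filter step:

# Pull the two capture groups (low and high salary, in $k) out of each wage
# string; rows with no match become NA and are dropped afterwards.
m <- regmatches(dataSciPosts,
                regexec("([[:digit:]]+)k - \\$([[:digit:]]+)", dataSciPosts))
salaries2 <- do.call(rbind, lapply(m, function(x)
    if (length(x) == 3) as.numeric(x[2:3]) else c(NA, NA)))
salaries2 <- as.data.frame(salaries2)
names(salaries2) <- c("low", "high")
salaries2 <- filter(salaries2, !is.na(low), high != 0)
head(salaries2)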