1. Setup - Load required libraries
library(rvest)
library(plyr)
library(ggplot2)
library(lubridate)
##
## Attaching package: 'lubridate'
##
## The following object is masked from 'package:plyr':
##
## here
2. Initialize the URL
url <- "http://www.r-bloggers.com/search/web%20scraping"
3. Write functions to scrape the required data from a given page
#This function extracts the author id from an author URL; a sample URL is http://www.r-bloggers.com/author/suman
#help for grep - http://www.inside-r.org/r-doc/base/grep
GetAuthor <- function(x) {
  #match the protocol, host, optional port, the /author/ path, and the trailing author id
  m <- regexec("^(([^:]+)://)?([^:/]+)(:([0-9]+))?(/author/)?(.*)?(/)", x)
  parts <- do.call(rbind,
                   lapply(regmatches(x, m), `[`, c(3L, 4L, 6L, 7L, 8L)))
  colnames(parts) <- c("protocol", "host", "port", "path", "author")
  #if the author id could not be isolated (the captured piece is just '/author'), return NA
  if (parts[5] == '/author') {
    return(NA)
  } else {
    return(parts[5])
  }
}
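As a quick sanity check (a hypothetical call, not part of the original run), the author id should come back as the last path segment of an author href, which ends with a trailing slash:
#sample author URL from the comment above, with the trailing slash the scraped hrefs carry
GetAuthor("http://www.r-bloggers.com/author/suman/")   #should return "suman"
#without the trailing slash the regex cannot isolate the id and the function returns NA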
#Get the title, date and author id into a data frame for a given page.
GetPageData <- function(page_number) {
  #prepare the full url; for example, for page 1 it is "http://www.r-bloggers.com/search/web%20scraping/page/1"
  full_url <- paste(url, "/page/", page_number, sep='')
  #scrape the post titles from the given page
  title <- full_url %>%
    html() %>%
    html_nodes("#leftcontent h2 a") %>%
    html_text()
  #scrape the published dates
  dates <- full_url %>%
    html() %>%
    html_nodes("#leftcontent .meta .date") %>%
    html_text()
  #convert the date strings into Date objects
  date <- as.Date(strptime(dates, "%B %d, %Y"))
  #authors <- full_url %>%
  #  html() %>%
  #  html_nodes("#leftcontent .meta a") %>%
  #  html_text()
  #NOTE: Some of the author names are obfuscated email ids, so grab the href (instead of the commented-out code above) and parse the author id from the href URL.
  authors.href <- full_url %>%
    html() %>%
    html_nodes("#leftcontent .meta a[rel=author]") %>%
    html_attr("href")
  authors <- lapply(authors.href, GetAuthor)
  author <- unlist(lapply(authors, tail, 1))
  #bind the columns into a data frame
  blog.posts <- cbind.data.frame(title, date, author, stringsAsFactors = FALSE)
  #apply factors
  blog.posts$date <- as.factor(blog.posts$date)
  blog.posts$author <- as.factor(blog.posts$author)
  #pause between page reads so the requests are not treated as a denial-of-service attack
  Sys.sleep(1)
  return(blog.posts)
}
4. Get page data for the first page
#Get the data frame for page 1 - title, date, author
page1.df <- GetPageData(1)
head(page1.df)
## title
## 1 rvest: easy web scraping with R
## 2 Migrating Table-oriented Web Scraping Code to rvest w/XPath & CSS Selector Examples
## 3 Web Scraping: working with APIs
## 4 Web Scraping: Scaling up Digital Data Collection
## 5 Web Scraping part2: Digging deeper
## 6 A Little Web Scraping Exercise with XML-Package
## date author
## 1 2014-11-24 hadleywickham
## 2 2014-09-17 bob-rudis-hrbrmstr
## 3 2014-03-12 rolf-fredheim
## 4 2014-03-05 rolf-fredheim
## 5 2014-02-25 rolf-fredheim
## 6 2012-04-05 kay-cichini
5. Get the data from all the search result pages
#For each of the 17 result pages, call GetPageData and capture the data - title, date, author
data <- ldply(1:17, GetPageData)
data$title <- gsub("[^[:alnum:]///' ]", " ", data$title)
str(data)
## 'data.frame': 165 obs. of 3 variables:
## $ title : chr "rvest easy web scraping with R" "Migrating Table oriented Web Scraping Code to rvest w/XPath CSS Selector Examples" "Web Scraping working with APIs" "Web Scraping Scaling up Digital Data Collection" ...
## $ date : Factor w/ 149 levels "2011-11-10","2011-11-11",..: 10 9 8 7 6 5 4 3 2 1 ...
## $ author: Factor w/ 96 levels "axiomofchoice",..: 3 2 5 5 5 4 6 1 6 6 ...
head(data)
## title
## 1 rvest easy web scraping with R
## 2 Migrating Table oriented Web Scraping Code to rvest w/XPath CSS Selector Examples
## 3 Web Scraping working with APIs
## 4 Web Scraping Scaling up Digital Data Collection
## 5 Web Scraping part2 Digging deeper
## 6 A Little Web Scraping Exercise with XML Package
## date author
## 1 2014-11-24 hadleywickham
## 2 2014-09-17 bob-rudis-hrbrmstr
## 3 2014-03-12 rolf-fredheim
## 4 2014-03-05 rolf-fredheim
## 5 2014-02-25 rolf-fredheim
## 6 2012-04-05 kay-cichini
6. Some of the packages used in web scraping
rvest - Wrappers around the XML and httr packages to make it easy to download and then manipulate both HTML and XML.
XML - This package provides many approaches for both reading and creating XML (and HTML) documents (including DTDs), both local and accessible via HTTP or FTP. It also offers access to an XPath “interpreter”.
RCurl - This package allows us to download files from web servers, post forms, use HTTPS, use persistent connections, upload files, handle binary content, follow redirects, use password authentication, etc.
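For comparison, the titles that GetPageData pulls with rvest could also be scraped with XML and RCurl directly. The following is only a rough sketch, not part of the original analysis; it assumes the page structure matches the CSS selector used earlier (the XPath below mirrors "#leftcontent h2 a").
library(RCurl)
library(XML)
full_url <- paste(url, "/page/", 1, sep = '')
page <- getURL(full_url)   #RCurl: download the raw HTML
doc <- htmlParse(page)     #XML: parse the HTML into a document tree
titles <- xpathSApply(doc, "//*[@id='leftcontent']//h2/a", xmlValue)
head(titles)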
Let's figure out which of the blog entries on the pages above mention these packages.
#Check whether the code blocks (pre tags) of the given post URL mention the given package.
IsPackageFound <- function(urlpath, package) {
  entry <- urlpath %>%
    html() %>%
    html_nodes("#leftcontent div.entry pre") %>%
    html_text()
  x <- grep(pattern = package, entry, value = TRUE)
  if (length(x) == 0) { found <- 0 } else { found <- 1 }
  found
}
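As a quick spot check (a hypothetical call that hits the live site, so the result depends on the current page content), the helper can be pointed at one of the entry URLs scraped below:
IsPackageFound("http://www.r-bloggers.com/rvest-easy-web-scraping-with-r/", "rvest")
#returns 1 if any pre block in that post mentions "rvest", otherwise 0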
GetPackageData <- function(page_number) {
  #prepare the full url; for example, for page 1 it is "http://www.r-bloggers.com/search/web%20scraping/page/1"
  full_url <- paste(url, "/page/", page_number, sep='')
  #get the entry URLs (the "read more" link hrefs) for the given page
  entry.url <- full_url %>%
    html() %>%
    html_nodes("div.entry p a.more-link") %>%
    html_attr("href")
  #apply IsPackageFound for each package to the entry.url vector
  rvest.used <- lapply(entry.url, IsPackageFound, "rvest")
  rvest <- unlist(lapply(rvest.used, tail, 1))
  xml.used <- lapply(entry.url, IsPackageFound, "XML")
  xml <- unlist(lapply(xml.used, tail, 1))
  rcurl.used <- lapply(entry.url, IsPackageFound, "RCurl")
  rcurl <- unlist(lapply(rcurl.used, tail, 1))
  #bind the columns into a data frame
  package.data.df <- cbind.data.frame(entry.url, rvest, xml, rcurl, stringsAsFactors = FALSE)
  #pause between page reads so the requests are not treated as a denial-of-service attack
  Sys.sleep(1)
  return(package.data.df)
}
#Load the package data for each blog entry on the 17 pages.
data.pkg <- ldply(1:17, GetPackageData)
head(data.pkg)
## entry.url
## 1 http://www.r-bloggers.com/rvest-easy-web-scraping-with-r/
## 2 http://www.r-bloggers.com/migrating-table-oriented-web-scraping-code-to-rvest-wxpath-css-selector-examples/
## 3 http://www.r-bloggers.com/web-scraping-working-with-apis/
## 4 http://www.r-bloggers.com/web-scraping-scaling-up-digital-data-collection/
## 5 http://www.r-bloggers.com/web-scraping-part2-digging-deeper/
## 6 http://www.r-bloggers.com/a-little-web-scraping-exercise-with-xml-package/
## rvest xml rcurl
## 1 1 0 0
## 2 1 1 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 1 1
7. Combine both data frames [column bind]
full.df <- cbind(data, data.pkg)
colnames(full.df)
## [1] "title" "date" "author" "entry.url" "rvest" "xml"
## [7] "rcurl"
head(full.df)
## title
## 1 rvest easy web scraping with R
## 2 Migrating Table oriented Web Scraping Code to rvest w/XPath CSS Selector Examples
## 3 Web Scraping working with APIs
## 4 Web Scraping Scaling up Digital Data Collection
## 5 Web Scraping part2 Digging deeper
## 6 A Little Web Scraping Exercise with XML Package
## date author
## 1 2014-11-24 hadleywickham
## 2 2014-09-17 bob-rudis-hrbrmstr
## 3 2014-03-12 rolf-fredheim
## 4 2014-03-05 rolf-fredheim
## 5 2014-02-25 rolf-fredheim
## 6 2012-04-05 kay-cichini
## entry.url
## 1 http://www.r-bloggers.com/rvest-easy-web-scraping-with-r/
## 2 http://www.r-bloggers.com/migrating-table-oriented-web-scraping-code-to-rvest-wxpath-css-selector-examples/
## 3 http://www.r-bloggers.com/web-scraping-working-with-apis/
## 4 http://www.r-bloggers.com/web-scraping-scaling-up-digital-data-collection/
## 5 http://www.r-bloggers.com/web-scraping-part2-digging-deeper/
## 6 http://www.r-bloggers.com/a-little-web-scraping-exercise-with-xml-package/
## rvest xml rcurl
## 1 1 0 0
## 2 1 1 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 1 1
8. Some Data Analysis & Visualization
#Let's find the top 3 authors by overall number of posts
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:lubridate':
##
## intersect, setdiff, union
##
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
(group_by(full.df, author) %>%
summarize(count=n()) %>%
arrange(desc(count)) %>%
top_n(n=3) )
## Selecting by count
## Source: local data frame [3 x 2]
##
## author count
## 1 tony-breyal 10
## 2 bryan 9
## 3 rolf-fredheim 7
#Number of posts in each year
qplot(year(date), data = full.df, main = "Histogram of Number of Posts", xlab= "Year", ylab="Count") + scale_y_continuous(breaks = seq(0,50,by = 2))
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
#Posts mentioning each package
# O represents rvest.
# + represents XML.
# X represents RCurl.
ggplot(full.df) +
  geom_point(data = subset(full.df, rvest == 1), aes(year(date), title), shape = 1, size = 10) +
  geom_point(data = subset(full.df, xml == 1), aes(year(date), title), shape = 3, size = 6) +
  geom_point(data = subset(full.df, rcurl == 1), aes(year(date), title), shape = 4, size = 4) +
  ggtitle("Package [ O = rvest, + = XML, X = rcurl ] mentioned in blog entries")
9. r-bloggers does not appear to provide an API to search its blog posts. Here are a few ideas for such an API.
Below are some sample RESTful resource ideas that would provide JSON and/or XML responses; a sketch of a client call for one of them follows the list.
1. Provide blog entries for a date range: https://www.r-bloggers.com/api/v1/entries?fromdate={yyyy-mm-dd}&to={yyyy-mm-dd}
2. Provide the top n blog posts of the week/month/year, e.g.: https://www.r-bloggers.com/api/v1/entries/top?count=n&month={monthNumber}
3. Provide blog entries of a given author, by year (/month): https://www.r-bloggers.com/api/v1/entries/author/{authorid}/year/{year}
4. Search blog entries based on a search text: https://www.r-bloggers.com/api/v1/entries/search?text={searchtext}
5. Search current R jobs: https://www.r-bloggers.com/api/v1/jobs/
6. Provide popular searches: https://www.r-bloggers.com/api/v1/popularsearches/
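To make idea 4 concrete, a client call might look like the sketch below. The endpoint and the shape of its JSON response are hypothetical; only the httr and jsonlite calls themselves are real.
library(httr)
library(jsonlite)
#hypothetical search endpoint from idea 4 above
resp <- GET("https://www.r-bloggers.com/api/v1/entries/search",
            query = list(text = "web scraping"))
if (status_code(resp) == 200) {
  #parse the (assumed) JSON body into a data frame of entries
  entries <- fromJSON(content(resp, as = "text"))
  head(entries)
}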