I am using the rvest library to scrape “http://www.r-bloggers.com/search/web%20scraping” and collect the date, author, and title of each post. As an additional task, I navigate through all the result pages and collect the same date, author, and title information from each of them.

Include the necessary packages.

#install.packages("rvest")
library(rvest)
library(stringr)
library(dplyr)

The function rbloggerscraper gets the date, title, and author information. It takes url and pageid as parameters. It comes in handy when I navigate through all the pages in the pagination list.

# Scrape one search-result page into a data frame.
#
# Args:
#   url:    address of the result page to download.
#   pageid: value stored in the `pageid` column of every returned row.
#
# Returns:
#   A data frame with columns pageid, blogdate, author and title, with rows
#   built from each div under "#leftcontent" whose id starts with "post-".
#   A page with no such divs yields a zero-row data frame with those columns.
rbloggerscraper <- function(url, pageid)
{
  empty <- data.frame(pageid=numeric(0), blogdate=character(), author=character(), title=character())
  # NOTE(review): html() is kept as written; newer rvest releases appear to
  # provide read_html() for this -- confirm against the installed version.
  rbloggerpage <- html(url)
  leftcontent <- rbloggerpage %>% html_nodes("#leftcontent")
  # The leading "." keeps the XPath relative to #leftcontent; a bare "//"
  # would restart the search from the document root.
  posts <- leftcontent %>% html_nodes(xpath=".//div[starts-with(@id,'post-')]")
  # seq_along() gives zero iterations for zero posts, whereas
  # 1:length(posts) would iterate over c(1, 0).
  rows <- lapply(seq_along(posts), function(i) {
    post <- posts[i]
    blogdate <- html_nodes(post, ".meta .date") %>% html_text()
    author <- html_nodes(post, ".meta a") %>% html_text()
    title <- html_nodes(post, "h2 a") %>% html_text()
    data.frame(pageid, blogdate, author, title)
  })
  # Bind once at the end (starting from the empty template) instead of
  # growing the data frame on every iteration.
  do.call(rbind, c(list(empty), rows))
}

Get the data for the first page.

# Start the accumulator as an empty frame with the four result columns,
# then scrape page 1 (the bare search URL) and append its rows.
df.result <- data.frame(
  pageid = numeric(0),
  blogdate = character(),
  author = character(),
  title = character()
)
df.r <- rbloggerscraper(
  url = "http://www.r-bloggers.com/search/web%20scraping",
  pageid = 1
)
df.result <- rbind(df.result, df.r)

Get the pagination information and extract the last page number.

# Determine the number of the last result page from the pagination links.
#
# Args:
#   url: address of a page containing a ".pagination" element.
#
# Returns:
#   The largest purely numeric label among the links inside ".pagination",
#   as a number. Returns 1 when no numeric link label is found (for example
#   when the page has no pagination element at all).
getLastPage<-function(url)
{
  pagination <- html(url) %>% html_nodes(".pagination")
  labels <- html_nodes(pagination, "a") %>% html_text()
  # Keep only labels that are whole numbers; links such as "next" arrows are
  # skipped. Taking the maximum avoids relying on the last-page link being
  # at a fixed position in the link list.
  is_page_number <- grepl("^[[:space:]]*[0-9]+[[:space:]]*$", labels)
  if (!any(is_page_number)) {
    return(1)
  }
  max(as.numeric(labels[is_page_number]))
}

# Last page number of the search results; read later when looping over pages.
lastpage <- getLastPage(url = "http://www.r-bloggers.com/search/web%20scraping")

Loop through the pages from 2 to the last page.

# Scrape result pages 2..last and stack their rows into one data frame.
#
# Args:
#   partURL: URL prefix; the page number and a trailing "/" are appended.
#   last:    number of the final page to scrape. Defaults to the global
#            `lastpage`, which is what this function read before the
#            parameter existed.
#
# Returns:
#   A data frame with columns pageid, blogdate, author and title holding the
#   rows of every scraped page, or a zero-row frame when `last` is below 2.
#
# Side effects: prints a progress line per page and pauses 2 seconds after
# each request.
scrapeRestOfPages <- function(partURL, last = lastpage)
{
  empty <- data.frame(pageid=numeric(0), blogdate=character(),author=character(),title=character())
  if (length(last) != 1 || is.na(last)) {
    stop("last page number must be a single non-missing value", call. = FALSE)
  }
  # 2:last would count down (2, 1) when there is only one page, so bail out.
  if (last < 2) {
    return(empty)
  }
  pages <- 2:last
  # Collect per-page frames in a preallocated list and bind once at the end
  # rather than growing a data frame inside the loop.
  frames <- vector("list", length(pages))
  for (k in seq_along(pages))
  {
    i <- pages[k]
    url <- str_c(partURL, i, "/")
    print(sprintf("Page number %i", i))
    frames[[k]] <- rbloggerscraper(url, i)
    Sys.sleep(2)
  }
  do.call(rbind, c(list(empty), frames))
}


# Scrape the remaining pages (progress is printed per page), then append
# their rows to the page-1 rows already held in df.result.
df1 <- scrapeRestOfPages(
  partURL = "http://www.r-bloggers.com/search/web%20scraping/page/"
)
## [1] "Page number 2"
## [1] "Page number 3"
## [1] "Page number 4"
## [1] "Page number 5"
## [1] "Page number 6"
## [1] "Page number 7"
## [1] "Page number 8"
## [1] "Page number 9"
## [1] "Page number 10"
## [1] "Page number 11"
## [1] "Page number 12"
## [1] "Page number 13"
## [1] "Page number 14"
## [1] "Page number 15"
## [1] "Page number 16"
## [1] "Page number 17"
df.result <- rbind(df.result, df1)

View the result data frame

# Number of rows collected across all pages; the recorded output is on the
# next line.
nrow(df.result)
## [1] 165
# Wrap the result in tbl_df() for printing; the recorded output follows.
tbl_df(df.result)
## Source: local data frame [165 x 4]
## 
##    pageid           blogdate                author
## 1       1  November 24, 2014         hadleywickham
## 2       1 September 17, 2014 Bob Rudis (@hrbrmstr)
## 3       1     March 12, 2014         Rolf Fredheim
## 4       1      March 5, 2014         Rolf Fredheim
## 5       1  February 25, 2014         Rolf Fredheim
## 6       1      April 5, 2012           Kay Cichini
## 7       1    January 6, 2012           Tony Breyal
## 8       1  December 27, 2011         axiomOfChoice
## 9       1  November 11, 2011           Tony Breyal
## 10      1  November 10, 2011           Tony Breyal
## ..    ...                ...                   ...
## Variables not shown: title (fctr)