#install.packages("rvest")
library(rvest)
library(stringr)
library(dplyr)
# Seed the "web scraping" results: start from an empty frame with the four
# expected columns, then append the rows scraped from the first search page.
df.result <- data.frame(
  pageid = numeric(0),
  blogdate = character(0),
  author = character(0),
  title = character(0)
)
df.r <- "http://www.r-bloggers.com/search/web%20scraping" %>% rbloggerscraper(1)
df.result <- df.result %>% rbind(df.r)
# Find the number of the last result page of a paginated search.
#
# Args:
#   url: URL of a search-results page containing a ".pagination" element.
#
# Returns:
#   The largest page number among the links inside ".pagination", as a numeric
#   scalar; 1 when no numbered page links are found (i.e. the results fit on a
#   single page).
getLastPage<-function(url)
{
  # read_html() replaces rvest's deprecated html().
  pagination <- read_html(url) %>% html_nodes(".pagination")
  link_text <- html_nodes(pagination, "a") %>% html_text() %>% str_trim()
  # Keep only purely numeric labels. This skips non-numeric links (e.g. a
  # "next" link) and does not rely on the last page sitting at a fixed
  # position among the links, which failed when fewer than five links existed.
  page_numbers <- as.numeric(link_text[grepl("^[0-9]+$", link_text)])
  if (length(page_numbers) == 0) {
    return(1)
  }
  max(page_numbers)
}
# Number of result pages for the "web scraping" search; stored in the global
# `lastpage`, which scrapeRestOfPages() uses as its upper page bound.
lastpage <- "http://www.r-bloggers.com/search/web%20scraping" %>% getLastPage()
# Scrape result pages 2..last_page of a search and stack the rows.
#
# Args:
#   partURL:   URL prefix (e.g. ".../page/"); the page number and a trailing
#              "/" are appended to build each page's URL.
#   last_page: Number of the final page to fetch. Defaults to the global
#              `lastpage`, which is the value this function always read before
#              the argument existed, so existing one-argument calls still work.
#
# Returns:
#   A data frame holding the rows rbloggerscraper() returned for every page
#   fetched, in page order; an empty four-column frame when last_page < 2.
scrapeRestOfPages <- function(partURL, last_page = lastpage)
{
  stopifnot(is.numeric(last_page), length(last_page) == 1, !is.na(last_page))
  empty <- data.frame(pageid=numeric(0), blogdate=character(),author=character(),title=character())
  # `2:last_page` counts DOWN (2, 1) when there is only one page, so bail out
  # before building the sequence if there is nothing beyond page 1.
  if (last_page < 2) {
    return(empty)
  }
  pages <- 2:last_page
  # Collect per-page frames in a preallocated list and bind once at the end
  # instead of re-copying a growing data frame on every iteration.
  page_frames <- vector("list", length(pages))
  for (k in seq_along(pages)) {
    i <- pages[k]
    url <- str_c(partURL, i, "/")
    print(sprintf("Page number %i", i))
    page_frames[[k]] <- rbloggerscraper(url, i)
    Sys.sleep(2)  # pause between requests to avoid hammering the site
  }
  do.call(rbind, c(list(empty), page_frames))
}
# Fetch every remaining "web scraping" result page (page 2 onwards).
df1 <- "http://www.r-bloggers.com/search/web%20scraping/page/" %>% scrapeRestOfPages()
## [1] "Page number 2"
## [1] "Page number 3"
## [1] "Page number 4"
## [1] "Page number 5"
## [1] "Page number 6"
## [1] "Page number 7"
## [1] "Page number 8"
## [1] "Page number 9"
## [1] "Page number 10"
## [1] "Page number 11"
## [1] "Page number 12"
## [1] "Page number 13"
## [1] "Page number 14"
## [1] "Page number 15"
## [1] "Page number 16"
## [1] "Page number 17"
# Append the remaining pages to the first-page rows, then show the total row
# count and a compact preview of the combined frame.
df.result <- df.result %>% rbind(df1)
df.result %>% nrow()
## [1] 165
df.result %>% tbl_df()
## Source: local data frame [165 x 4]
##
## pageid blogdate author
## 1 1 November 24, 2014 hadleywickham
## 2 1 September 17, 2014 Bob Rudis (@hrbrmstr)
## 3 1 March 12, 2014 Rolf Fredheim
## 4 1 March 5, 2014 Rolf Fredheim
## 5 1 February 25, 2014 Rolf Fredheim
## 6 1 April 5, 2012 Kay Cichini
## 7 1 January 6, 2012 Tony Breyal
## 8 1 December 27, 2011 axiomOfChoice
## 9 1 November 11, 2011 Tony Breyal
## 10 1 November 10, 2011 Tony Breyal
## .. ... ... ...
## Variables not shown: title (fctr)
# Repeat the whole pipeline for the "twitter" search: reset the result frame,
# scrape page 1, look up the last page number, then scrape pages 2 onwards.
df.result <- data.frame(
  pageid = numeric(0),
  blogdate = character(0),
  author = character(0),
  title = character(0)
)
df.r <- "http://www.r-bloggers.com/search/twitter" %>% rbloggerscraper(1)
df.result <- df.result %>% rbind(df.r)
lastpage <- "http://www.r-bloggers.com/search/twitter" %>% getLastPage()
df1 <- "http://www.r-bloggers.com/search/twitter/page/" %>% scrapeRestOfPages()
## [1] "Page number 2"
## [1] "Page number 3"
## [1] "Page number 4"
## [1] "Page number 5"
## [1] "Page number 6"
## [1] "Page number 7"
## [1] "Page number 8"
## [1] "Page number 9"
## [1] "Page number 10"
## [1] "Page number 11"
## [1] "Page number 12"
## [1] "Page number 13"
## [1] "Page number 14"
## [1] "Page number 15"
## [1] "Page number 16"
## [1] "Page number 17"
## [1] "Page number 18"
## [1] "Page number 19"
## [1] "Page number 20"
## [1] "Page number 21"
## [1] "Page number 22"
## [1] "Page number 23"
## [1] "Page number 24"
## [1] "Page number 25"
## [1] "Page number 26"
## [1] "Page number 27"
## [1] "Page number 28"
## [1] "Page number 29"
## [1] "Page number 30"
## [1] "Page number 31"
## [1] "Page number 32"
## [1] "Page number 33"
## [1] "Page number 34"
## [1] "Page number 35"
## [1] "Page number 36"
## [1] "Page number 37"
## [1] "Page number 38"
## [1] "Page number 39"
## [1] "Page number 40"
## [1] "Page number 41"
## [1] "Page number 42"
## [1] "Page number 43"
## [1] "Page number 44"
## [1] "Page number 45"
## [1] "Page number 46"
## [1] "Page number 47"
## [1] "Page number 48"
## [1] "Page number 49"
## [1] "Page number 50"
## [1] "Page number 51"
## [1] "Page number 52"
## [1] "Page number 53"
## [1] "Page number 54"
## [1] "Page number 55"
## [1] "Page number 56"
## [1] "Page number 57"
## [1] "Page number 58"
## [1] "Page number 59"
## [1] "Page number 60"
## [1] "Page number 61"
## [1] "Page number 62"
## [1] "Page number 63"
## [1] "Page number 64"
## [1] "Page number 65"
## [1] "Page number 66"
## [1] "Page number 67"
## [1] "Page number 68"
## [1] "Page number 69"
## [1] "Page number 70"
## [1] "Page number 71"
## [1] "Page number 72"
## [1] "Page number 73"
## [1] "Page number 74"
## [1] "Page number 75"
## [1] "Page number 76"
## [1] "Page number 77"
## [1] "Page number 78"
## [1] "Page number 79"
## [1] "Page number 80"
## [1] "Page number 81"
## [1] "Page number 82"
## [1] "Page number 83"
## [1] "Page number 84"
## [1] "Page number 85"
## [1] "Page number 86"
## [1] "Page number 87"
## [1] "Page number 88"
## [1] "Page number 89"
## [1] "Page number 90"
## [1] "Page number 91"
## [1] "Page number 92"
## [1] "Page number 93"
## [1] "Page number 94"
## [1] "Page number 95"
## [1] "Page number 96"
## [1] "Page number 97"
## [1] "Page number 98"
## [1] "Page number 99"
## [1] "Page number 100"
## [1] "Page number 101"
## [1] "Page number 102"
## [1] "Page number 103"
## [1] "Page number 104"
## [1] "Page number 105"
## [1] "Page number 106"
## [1] "Page number 107"
## [1] "Page number 108"
## [1] "Page number 109"
## [1] "Page number 110"
## [1] "Page number 111"
## [1] "Page number 112"
## [1] "Page number 113"
## [1] "Page number 114"
## [1] "Page number 115"
## [1] "Page number 116"
## [1] "Page number 117"
## [1] "Page number 118"
## [1] "Page number 119"
## [1] "Page number 120"
## [1] "Page number 121"
## [1] "Page number 122"
## [1] "Page number 123"
## [1] "Page number 124"
## [1] "Page number 125"
## [1] "Page number 126"
## [1] "Page number 127"
## [1] "Page number 128"
## [1] "Page number 129"
## [1] "Page number 130"
## [1] "Page number 131"
## [1] "Page number 132"
## [1] "Page number 133"
## [1] "Page number 134"
## [1] "Page number 135"
## [1] "Page number 136"
## [1] "Page number 137"
## [1] "Page number 138"
## [1] "Page number 139"
## [1] "Page number 140"
## [1] "Page number 141"
## [1] "Page number 142"
## [1] "Page number 143"
## [1] "Page number 144"
## [1] "Page number 145"
## [1] "Page number 146"
## [1] "Page number 147"
## [1] "Page number 148"
## [1] "Page number 149"
## [1] "Page number 150"
## [1] "Page number 151"
## [1] "Page number 152"
## [1] "Page number 153"
## [1] "Page number 154"
## [1] "Page number 155"
## [1] "Page number 156"
## [1] "Page number 157"
## [1] "Page number 158"
## [1] "Page number 159"
## [1] "Page number 160"
## [1] "Page number 161"
## [1] "Page number 162"
## [1] "Page number 163"
## [1] "Page number 164"
## [1] "Page number 165"
## [1] "Page number 166"
## [1] "Page number 167"
## [1] "Page number 168"
## [1] "Page number 169"
## [1] "Page number 170"
## [1] "Page number 171"
## [1] "Page number 172"
## [1] "Page number 173"
## [1] "Page number 174"
## [1] "Page number 175"
## [1] "Page number 176"
## [1] "Page number 177"
## [1] "Page number 178"
## [1] "Page number 179"
## [1] "Page number 180"
## [1] "Page number 181"
## [1] "Page number 182"
## [1] "Page number 183"
## [1] "Page number 184"
## [1] "Page number 185"
## [1] "Page number 186"
## [1] "Page number 187"
## [1] "Page number 188"
## [1] "Page number 189"
## [1] "Page number 190"
## [1] "Page number 191"
## [1] "Page number 192"
## [1] "Page number 193"
## [1] "Page number 194"
## [1] "Page number 195"
## [1] "Page number 196"
## [1] "Page number 197"
## [1] "Page number 198"
## [1] "Page number 199"
## [1] "Page number 200"
## [1] "Page number 201"
## [1] "Page number 202"
## [1] "Page number 203"
## [1] "Page number 204"
## [1] "Page number 205"
## [1] "Page number 206"
## [1] "Page number 207"
## [1] "Page number 208"
## [1] "Page number 209"
## [1] "Page number 210"
## [1] "Page number 211"
## [1] "Page number 212"
## [1] "Page number 213"
## [1] "Page number 214"
## [1] "Page number 215"
## [1] "Page number 216"
## [1] "Page number 217"
## [1] "Page number 218"
## [1] "Page number 219"
## [1] "Page number 220"
## [1] "Page number 221"
## [1] "Page number 222"
## [1] "Page number 223"
## [1] "Page number 224"
## [1] "Page number 225"
## [1] "Page number 226"
## [1] "Page number 227"
## [1] "Page number 228"
## [1] "Page number 229"
## [1] "Page number 230"
## [1] "Page number 231"
## [1] "Page number 232"
# Append the remaining "twitter" pages to the first-page rows, then show the
# total row count and a compact preview of the combined frame.
df.result <- df.result %>% rbind(df1)
df.result %>% nrow()
## [1] 2311
df.result %>% tbl_df()
## Source: local data frame [2,311 x 4]
##
## pageid blogdate
## 1 1 February 28, 2015
## 2 1 January 19, 2015
## 3 1 January 7, 2015
## 4 1 November 24, 2014
## 5 1 October 20, 2014
## 6 1 September 22, 2014
## 7 1 September 22, 2014
## 8 1 August 10, 2014
## 9 1 June 30, 2014
## 10 1 June 10, 2014
## .. ... ...
## Variables not shown: author (fctr), title (fctr)