About

The rvest library provides easy-to-use web scraping for R, in line with the tidyverse metaframework. However, it lacks support for multithreaded scraping, which compares unfavorably with Python alternatives such as Scrapy. Here I give a very simple example of how to scrape in parallel using the furrr package.

Init

options(digits = 3)
library(pacman)
p_load(tidyverse, rvest, httr, tictoc, furrr)

Enable parallel

#use a multisession plan with 10 workers
#multisession workers are background R sessions (separate processes), not threads
#downloading is mostly waiting on the network, so 10 workers are useful
#even on a machine with fewer cores
plan(multisession, workers = 10)

Wikipedia scraping

List to scrape

#some people to scrape
#600 members of French resistance, listed over three category pages
#non-ASCII characters in the pagefrom keys are percent-encoded (e.g. è -> %C3%A8)
#so the URLs are plain ASCII regardless of the session locale
French_resistance_fighters = c("https://en.wikipedia.org/w/index.php?title=Category:French_Resistance_members",
                              "https://en.wikipedia.org/w/index.php?title=Category:French_Resistance_members&pagefrom=Droulers%2C+Eugene%0AEug%C3%A8ne+Droulers",
                              "https://en.wikipedia.org/w/index.php?title=Category:French_Resistance_members&pagefrom=Lurcat%2C+Jean%0AJean+Lur%C3%A7at#mw-pages"
                              ) %>% 
  #for each category page, pull the href of every link in the #mw-pages list
  map(function(page_url) {
    page_url %>% 
      read_html() %>% 
      html_nodes("#mw-pages li a") %>% 
      html_attr("href")
  }) %>% 
  #flatten the per-page character vectors into one vector of relative paths
  unlist(use.names = FALSE)

#show the first 20 scraped links
#the first entry is the general list page rather than a person; it is kept anyway
head(French_resistance_fighters, n = 20)
##  [1] "/wiki/List_of_people_involved_with_the_French_Resistance"
##  [2] "/wiki/Pierre_Abraham"                                    
##  [3] "/wiki/H%C3%A9l%C3%A8ne_Deschamps_Adams"                  
##  [4] "/wiki/Francine_Agazarian"                                
##  [5] "/wiki/Julienne_Aisner"                                   
##  [6] "/wiki/Virginia_d%27Albert-Lake"                          
##  [7] "/wiki/Berty_Albrecht"                                    
##  [8] "/wiki/Alex_Virot"                                        
##  [9] "/wiki/Celestino_Alfonso"                                 
## [10] "/wiki/%C3%89milien_Amaury"                               
## [11] "/wiki/Dimitri_Amilakhvari"                               
## [12] "/wiki/Henri_Amouroux"                                    
## [13] "/wiki/Val%C3%A9rie_Andr%C3%A9"                           
## [14] "/wiki/Raoul_Angl%C3%A8s"                                 
## [15] "/wiki/Raymond_Anne"                                      
## [16] "/wiki/Anthony_of_Sourozh"                                
## [17] "/wiki/Louis_Armand"                                      
## [18] "/wiki/Arpiar_Aslanian"                                   
## [19] "/wiki/Jacques_Arthuys"                                   
## [20] "/wiki/Louise_Aslanian"
#base url, prepended to the relative /wiki/... paths scraped above
#the scheme is included so each request goes straight to https instead of
#relying on curl guessing the protocol for a bare host name and a redirect
wiki_base = "https://en.wikipedia.org"

Single threaded

#start the stopwatch
tic()

#fetch every page one after another with a plain purrr map
#each element of the result is the httr response for base url + relative path
results_single = map(
  French_resistance_fighters,
  ~ GET(url = paste0(wiki_base, .x))
)

#report the elapsed time
toc()
## 43.765 sec elapsed
#sanity check: inspect the first response
results_single[[1]]
## Response [https://en.wikipedia.org/wiki/List_of_people_involved_with_the_French_Resistance]
##   Date: 2019-11-11 00:56
##   Status: 200
##   Content-Type: text/html; charset=UTF-8
##   Size: 44.6 kB
## <!DOCTYPE html>
## <html class="client-nojs" lang="en" dir="ltr">
## <head>
## <meta charset="UTF-8"/>
## <title>List of people involved with the French Resistance - Wikipedia</t...
## <script>document.documentElement.className="client-js";RLCONF={"wgBreakF...
## "wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgRelevant...
## "ready","user.tokens":"loading","mediawiki.legacy.shared":"ready","media...
## <script>(RLQ=window.RLQ||[]).push(function(){mw.loader.implement("user.t...
## });});</script>
## ...

Multi-threaded

#start the stopwatch
tic()

#same download as before, but spread over the workers of the current plan
#each element of the result is the httr response for base url + relative path
results_multi = future_map(
  French_resistance_fighters,
  function(page_path) GET(url = paste0(wiki_base, page_path))
)

#report the elapsed time
toc()
## 4.536 sec elapsed
#sanity check: inspect the first response
results_multi[[1]]
## Response [https://en.wikipedia.org/wiki/List_of_people_involved_with_the_French_Resistance]
##   Date: 2019-11-11 00:56
##   Status: 200
##   Content-Type: text/html; charset=UTF-8
##   Size: 44.6 kB
## <!DOCTYPE html>
## <html class="client-nojs" lang="en" dir="ltr">
## <head>
## <meta charset="UTF-8"/>
## <title>List of people involved with the French Resistance - Wikipedia</t...
## <script>document.documentElement.className="client-js";RLCONF={"wgBreakF...
## "wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgRelevant...
## "ready","user.tokens":"loading","mediawiki.legacy.shared":"ready","media...
## <script>(RLQ=window.RLQ||[]).push(function(){mw.loader.implement("user.t...
## });});</script>
## ...
#are the two result lists exactly equal?
identical(results_single, results_multi)
## [1] FALSE
#they are not (the timestamp differs), so compare some summaries instead
#memory footprint of each result list
object.size(results_single)
## 49679040 bytes
object.size(results_multi)
## 49688824 bytes
#number of responses
length(results_single)
## [1] 600
length(results_multi)
## [1] 600
#tabulate the HTTP status codes of all responses
results_single %>% map_dbl("status_code") %>% table()
## .
## 200 
## 600
results_multi %>% map_dbl("status_code") %>% table()
## .
## 200 
## 600