The rvest library provides easy-to-use web scraping for R in line with the tidyverse metaframework. However, it lacks support for multithreaded scraping, which compares unfavorably with Python alternatives such as Scrapy. Here I give a very simple example of how to scrape in parallel using the furrr package.
# Print numbers with 3 significant digits to keep the output compact.
options(digits = 3)
# pacman's p_load() attaches the listed packages (installing any that are
# missing).
library(pacman)
p_load(tidyverse, rvest, httr, tictoc, furrr)
# Run futures in 10 background R sessions. These are separate R processes,
# not threads, and the count may exceed the number of CPU cores on the
# machine.
plan(multisession, workers = 10)
# People to scrape: three listing pages of the Wikipedia category
# "French Resistance members" (about 600 people in total).
# Non-ASCII characters in the query strings are percent-encoded so the URLs
# are plain ASCII regardless of the encoding this script is read in.
French_resistance_fighters <- c(
  "https://en.wikipedia.org/w/index.php?title=Category:French_Resistance_members",
  "https://en.wikipedia.org/w/index.php?title=Category:French_Resistance_members&pagefrom=Droulers%2C+Eugene%0AEug%C3%A8ne+Droulers",
  "https://en.wikipedia.org/w/index.php?title=Category:French_Resistance_members&pagefrom=Lurcat%2C+Jean%0AJean+Lur%C3%A7at#mw-pages"
) %>%
  # For each listing page, collect the href of every link inside the
  # "#mw-pages" list.
  map(function(page_url) {
    page_url %>%
      read_html() %>%
      html_nodes("#mw-pages li a") %>%
      html_attr("href")
  }) %>%
  # Flatten the per-page results into one character vector of relative
  # paths such as "/wiki/Pierre_Abraham".
  unlist()
# Show the first 20 scraped links.
# The first entry is a list page rather than a person; it is left in here.
head(French_resistance_fighters, n = 20)
## [1] "/wiki/List_of_people_involved_with_the_French_Resistance"
## [2] "/wiki/Pierre_Abraham"
## [3] "/wiki/H%C3%A9l%C3%A8ne_Deschamps_Adams"
## [4] "/wiki/Francine_Agazarian"
## [5] "/wiki/Julienne_Aisner"
## [6] "/wiki/Virginia_d%27Albert-Lake"
## [7] "/wiki/Berty_Albrecht"
## [8] "/wiki/Alex_Virot"
## [9] "/wiki/Celestino_Alfonso"
## [10] "/wiki/%C3%89milien_Amaury"
## [11] "/wiki/Dimitri_Amilakhvari"
## [12] "/wiki/Henri_Amouroux"
## [13] "/wiki/Val%C3%A9rie_Andr%C3%A9"
## [14] "/wiki/Raoul_Angl%C3%A8s"
## [15] "/wiki/Raymond_Anne"
## [16] "/wiki/Anthony_of_Sourozh"
## [17] "/wiki/Louis_Armand"
## [18] "/wiki/Arpiar_Aslanian"
## [19] "/wiki/Jacques_Arthuys"
## [20] "/wiki/Louise_Aslanian"
# Base URL that the relative "/wiki/..." paths are appended to.
# The scheme is given explicitly so every request goes straight to HTTPS
# instead of relying on the client to guess a scheme and follow a redirect.
wiki_base <- "https://en.wikipedia.org"
# Sequential baseline: request every page one at a time and time the run.
tic()
results_single <- map(
  French_resistance_fighters,
  function(page_path) GET(url = paste0(wiki_base, page_path))
)
toc()
## 43.765 sec elapsed
# Sanity check: inspect the first response.
results_single[[1]]
## Response [https://en.wikipedia.org/wiki/List_of_people_involved_with_the_French_Resistance]
## Date: 2019-11-11 00:56
## Status: 200
## Content-Type: text/html; charset=UTF-8
## Size: 44.6 kB
## <!DOCTYPE html>
## <html class="client-nojs" lang="en" dir="ltr">
## <head>
## <meta charset="UTF-8"/>
## <title>List of people involved with the French Resistance - Wikipedia</t...
## <script>document.documentElement.className="client-js";RLCONF={"wgBreakF...
## "wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgRelevant...
## "ready","user.tokens":"loading","mediawiki.legacy.shared":"ready","media...
## <script>(RLQ=window.RLQ||[]).push(function(){mw.loader.implement("user.t...
## });});</script>
## ...
# Parallel run: the same requests, dispatched through furrr's future_map()
# under the active future plan, timed the same way as the sequential run.
tic()
results_multi <- future_map(
  French_resistance_fighters,
  function(page_path) GET(url = paste0(wiki_base, page_path))
)
toc()
## 4.536 sec elapsed
# Sanity check: inspect the first response.
results_multi[[1]]
## Response [https://en.wikipedia.org/wiki/List_of_people_involved_with_the_French_Resistance]
## Date: 2019-11-11 00:56
## Status: 200
## Content-Type: text/html; charset=UTF-8
## Size: 44.6 kB
## <!DOCTYPE html>
## <html class="client-nojs" lang="en" dir="ltr">
## <head>
## <meta charset="UTF-8"/>
## <title>List of people involved with the French Resistance - Wikipedia</t...
## <script>document.documentElement.className="client-js";RLCONF={"wgBreakF...
## "wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgRelevant...
## "ready","user.tokens":"loading","mediawiki.legacy.shared":"ready","media...
## <script>(RLQ=window.RLQ||[]).push(function(){mw.loader.implement("user.t...
## });});</script>
## ...
# Are the two result lists exactly equal?
identical(results_single, results_multi)
## [1] FALSE
# Not identical (the timestamps differ), so compare the overall object
# sizes as a rough proxy for whether the contents match.
object.size(results_single)
## 49679040 bytes
object.size(results_multi)
## 49688824 bytes
# Number of responses in each list.
length(results_single)
## [1] 600
length(results_multi)
## [1] 600
# Tabulate the HTTP status codes of each run.
results_single %>% map_dbl(function(resp) resp$status_code) %>% table()
## .
## 200
## 600
results_multi %>% map_dbl(function(resp) resp$status_code) %>% table()
## .
## 200
## 600