RE: https://stackoverflow.com/questions/47670819/r-webscraping-setting-keywords/47674493?noredirect=1#comment82315287_47674493
library(countrycode)
library(httr)
library(rvest)
## Loading required package: xml2
library(stringi)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1.9000 ✔ purrr 0.2.4
## ✔ tibble 1.3.4 ✔ dplyr 0.7.4
## ✔ tidyr 0.7.2 ✔ stringr 1.2.0
## ✔ readr 1.1.1 ✔ forcats 0.2.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ readr::guess_encoding() masks rvest::guess_encoding()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::pluck() masks rvest::pluck()
#' Fetch the FAO country-profile news items for one country
#'
#' Sends a GET request to the FAO "allnews" endpoint with `iso3` as the
#' query parameter and parses the JSON body of the response.
#'
#' @param iso3 Country code sent as the `iso3` query parameter
#'   (the caller passes ISO 3166-1 alpha-3 codes such as "AFG").
#' @return A tibble built from the parsed JSON with an extra `iso3` column
#'   holding the requested code. Returns `NULL` when the server answers with
#'   an HTTP error status (a warning is raised first) or when the parsed
#'   payload is empty.
fao_get_news <- function(iso3) {
  res <- GET(
    url = "http://www.fao.org/countryprofiles/common/allnews/en/",
    query = list(iso3 = iso3)
  )

  # Warn (rather than stop) on an HTTP error and signal failure with NULL.
  warn_for_status(res)
  if (http_error(res)) {
    return(NULL)
  }

  out <- content(res, as = "text", encoding = "UTF-8")
  out <- jsonlite::fromJSON(out)

  # An empty payload would otherwise become a phantom one-row result that
  # holds only `iso3` (empty list) or make the assignment below fail
  # (zero-row data frame), so treat it as "nothing to return".
  if (NROW(out) == 0L) {
    return(NULL)
  }

  out$iso3 <- iso3

  as_tibble(out)
}
# Progress ticker sized to the number of country codes being crawled
# (won't show as not running interactively).
pb <- progress_estimated(length(countrycode_data$iso3c[seq_len(20)]))

# Fetch the news for the first 20 ISO3 codes, one request at a time, and
# row-bind the per-country results into a single data frame.
xdf <- map_df(countrycode_data$iso3c[seq_len(20)], function(iso3) {
  pb$tick()$print()
  # no crawl delay specified in site robots.txt so this is the ethical default
  Sys.sleep(5)
  fao_get_news(iso3)
})
# Keywords to count; they are matched against the lower-cased body text.
keywords <- c("drought", "food security")

# One alternation pattern with a single capture group,
# i.e. "(drought|food security)".
keyword_regex <- sprintf("(%s)", paste0(keywords, collapse = "|"))

# For every article: extract all keyword hits, tally them per keyword and
# append the tallies to `xdf` as one column per keyword.
xdf_with_keyword_counts <- xdf$bodytext %>%
  tolower() %>%
  stri_match_all_regex(keyword_regex) %>%
  # Column 2 of each match matrix is the capture group (NA when no match).
  map(function(match_matrix) match_matrix[, 2]) %>%
  map_df(function(hits) {
    # useNA = "always" guarantees an NA bucket, so "NONE" exists in every row.
    tally <- table(hits, useNA = "always")
    # Turn keywords into column-friendly names; the NA bucket becomes "NONE".
    labels <- stri_replace_all_regex(names(tally), "[ -]", "_")
    labels[is.na(labels)] <- "NONE"
    as.list(set_names(as.numeric(tally), labels))
  }) %>%
  bind_cols(xdf, .) %>%
  # The no-match bucket is only a helper column; drop it from the result.
  select(-NONE)

glimpse(xdf_with_keyword_counts)
## Observations: 743
## Variables: 12
## $ uid <chr> "1069933", "1069560", "1045264", "1044139", "103...
## $ table <chr> "news", "news", "news", "news", "news", "news", ...
## $ title <chr> "FAO Calls for Stronger Collaboration on Transbo...
## $ date <chr> "1511823600", "1511737200", "1508191200", "15081...
## $ bodytext <chr> "28 November 2017- Chief Veterinary Officers and...
## $ date_format <chr> "28/11/2017", "27/11/2017", "17/10/2017", "16/10...
## $ image <chr> "http://www.fao.org/fileadmin/user_upload/rne/im...
## $ pid <chr> "50840", "16275", "70992", "16275", "2330", "409...
## $ detail_pid <chr> "/neareast/news/view/en/c/1069933/", "/asiapacif...
## $ iso3 <chr> "AFG", "AFG", "AFG", "AFG", "AFG", "AFG", "AFG",...
## $ food_security <dbl> NA, NA, 2, 1, NA, 1, NA, NA, NA, 1, NA, NA, NA, ...
## $ drought <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
Session info:
# Print the R session details and the versions of the loaded packages.
devtools::session_info()
## Session info -------------------------------------------------------------
## setting value
## version R version 3.4.3 Patched (2017-12-05 r73849)
## system x86_64, darwin15.6.0
## ui X11
## language (EN)
## collate en_US.UTF-8
## tz America/New_York
## date 2017-12-06
## Packages -----------------------------------------------------------------
## package * version date source
## assertthat 0.2.0 2017-04-11 CRAN (R 3.4.0)
## backports 1.1.1 2017-09-25 CRAN (R 3.4.2)
## base * 3.4.3 2017-12-06 local
## bindr 0.1 2016-11-13 CRAN (R 3.4.0)
## bindrcpp 0.2 2017-06-17 cran (@0.2)
## broom 0.4.3 2017-11-20 CRAN (R 3.4.2)
## cellranger 1.1.0 2016-07-27 CRAN (R 3.4.0)
## cli 1.0.0 2017-11-05 CRAN (R 3.4.2)
## codetools 0.2-15 2016-10-05 CRAN (R 3.4.3)
## colorspace 1.3-2 2016-12-14 CRAN (R 3.4.0)
## compiler 3.4.3 2017-12-06 local
## countrycode * 0.19 2017-02-06 CRAN (R 3.4.0)
## crayon 1.3.4 2017-09-16 CRAN (R 3.4.1)
## curl 3.0 2017-10-06 CRAN (R 3.4.2)
## datasets * 3.4.3 2017-12-06 local
## devtools 1.13.4 2017-11-09 CRAN (R 3.4.2)
## digest 0.6.12 2017-01-27 CRAN (R 3.4.0)
## dplyr * 0.7.4 2017-09-28 CRAN (R 3.4.2)
## evaluate 0.10.1 2017-06-24 CRAN (R 3.4.1)
## forcats * 0.2.0 2017-01-23 CRAN (R 3.4.0)
## foreign 0.8-69 2017-06-22 CRAN (R 3.4.3)
## ggplot2 * 2.2.1.9000 2017-11-20 Github (tidyverse/ggplot2@582acfe)
## glue 1.2.0 2017-10-29 CRAN (R 3.4.2)
## graphics * 3.4.3 2017-12-06 local
## grDevices * 3.4.3 2017-12-06 local
## grid 3.4.3 2017-12-06 local
## gtable 0.2.0 2016-02-26 CRAN (R 3.4.0)
## haven 1.1.0 2017-07-09 cran (@1.1.0)
## hms 0.4.0 2017-11-23 CRAN (R 3.4.3)
## htmltools 0.3.6 2017-04-28 cran (@0.3.6)
## httr * 1.3.1 2017-11-14 Github (hadley/httr@6b2dadc)
## jsonlite 1.5 2017-06-01 CRAN (R 3.4.0)
## knitr 1.17.20 2017-12-04 Github (yihui/knitr@73387d6)
## lattice 0.20-35 2017-03-25 CRAN (R 3.4.3)
## lazyeval 0.2.1 2017-10-29 cran (@0.2.1)
## lubridate 1.7.1 2017-11-03 CRAN (R 3.4.2)
## magrittr 1.5 2014-11-22 CRAN (R 3.4.0)
## memoise 1.1.0 2017-04-21 cran (@1.1.0)
## methods * 3.4.3 2017-12-06 local
## mnormt 1.5-5 2016-10-15 CRAN (R 3.4.0)
## modelr 0.1.1 2017-07-24 CRAN (R 3.4.1)
## munsell 0.4.3 2016-02-13 CRAN (R 3.4.0)
## nlme 3.1-131 2017-02-06 CRAN (R 3.4.3)
## parallel 3.4.3 2017-12-06 local
## pkgconfig 2.0.1 2017-03-21 CRAN (R 3.4.0)
## plyr 1.8.4 2016-06-08 CRAN (R 3.4.0)
## psych 1.7.8 2017-09-09 CRAN (R 3.4.2)
## purrr * 0.2.4 2017-10-18 CRAN (R 3.4.2)
## R6 2.2.2 2017-06-17 cran (@2.2.2)
## Rcpp 0.12.14 2017-11-23 CRAN (R 3.4.3)
## readr * 1.1.1 2017-05-16 CRAN (R 3.4.0)
## readxl 1.0.0 2017-04-18 CRAN (R 3.4.0)
## reshape2 1.4.2 2016-10-22 CRAN (R 3.4.0)
## rlang 0.1.4 2017-11-05 CRAN (R 3.4.2)
## rmarkdown 1.8 2017-11-17 CRAN (R 3.4.2)
## rprojroot 1.2 2017-01-16 CRAN (R 3.4.0)
## rstudioapi 0.7 2017-09-07 CRAN (R 3.4.1)
## rvest * 0.3.2 2016-06-17 CRAN (R 3.4.0)
## scales 0.5.0.9000 2017-11-20 Github (hadley/scales@d767915)
## stats * 3.4.3 2017-12-06 local
## stringi * 1.1.6 2017-11-17 CRAN (R 3.4.2)
## stringr * 1.2.0 2017-02-18 CRAN (R 3.4.0)
## tibble * 1.3.4 2017-08-22 cran (@1.3.4)
## tidyr * 0.7.2 2017-10-16 CRAN (R 3.4.2)
## tidyverse * 1.2.1 2017-11-14 CRAN (R 3.4.2)
## tools 3.4.3 2017-12-06 local
## utils * 3.4.3 2017-12-06 local
## withr 2.1.0.9000 2017-11-20 Github (jimhester/withr@daf5a8c)
## xml2 * 1.1.9000 2017-12-01 Github (hadley/xml2@8bb2348)
## yaml 2.1.15 2017-12-01 cran (@2.1.15)