# pip install advertools
# OR:
# pip3 install advertools

library(reticulate)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.0.4     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
adv <- import("advertools")

Convert an XML sitemap to a data.frame:

# URL of the sitemap to convert (a .xml.gz file).
nyt_recipes <- "https://www.nytimes.com/sitemaps/new/recipe-collects.xml.gz"
# Convert the sitemap to a data frame with advertools, the Python module
# bound to `adv` through reticulate.
nyt_sitemap <- adv$sitemap_to_df(nyt_recipes)
# Auto-print the result for inspection.
nyt_sitemap

Analyze URLs

The url_to_df function splits a list of URLs into their components, with each component in its own column.

Here the final column, dir_2, gets converted to a list column because it contains missing values, although it’s expected to be a character (chr) vector just like column dir_1:

# Split every sitemap URL (the `loc` column) into its components.
url_df <- adv$url_to_df(nyt_sitemap$loc) 

# Auto-print the resulting data frame.
url_df
# Show the first entries of dir_2 to see what type the column came back as.
url_df$dir_2 %>% 
  head()
## [[1]]
## [1] "jerk seasoning paste"
## 
## [[2]]
## [1] "roasted white sesame seed"
## 
## [[3]]
## [1] "1640510-sam-siftons-suggestions"
## 
## [[4]]
## [1] "31042692-our-10-most-popular-recipes-right-now"
## 
## [[5]]
## [1] "30062982-sheet-pan"
## 
## [[6]]
## [1] "11249289-weekly-plan-1-29"

Using flatten_chr solves the issue, but the user wouldn’t necessarily know that this is an issue in the first place, and they would also have to guess the column’s type. (Plain flatten, for example, doesn’t work.)

# Convert the dir_2 list column to a character vector, one element at a time.
#
# This is used instead of purrr::flatten_chr() for three reasons:
#   * flatten_chr() is deprecated as of purrr 1.0.0;
#   * flattening drops NULL / zero-length entries, so the result can be shorter
#     than the column and the assignment back into the data frame then fails;
#   * entries that are not strings (the missing-value markers) should become
#     NA_character_ rather than being coerced or raising an error.
#
# vapply() with FUN.VALUE = character(1) guarantees exactly one string (or NA)
# per row, so the column length is always preserved.
# NOTE(review): assumes missing values arrive from reticulate as NULL or as a
# non-character value such as NaN -- confirm against the installed versions.
url_df$dir_2 <- vapply(
  url_df$dir_2,
  function(x) {
    if (is.character(x) && length(x) == 1L) x else NA_character_
  },
  FUN.VALUE = character(1)
)

# The column is now a plain character vector.
url_df$dir_2 %>%
  head()
## [1] "jerk seasoning paste"
## [2] "roasted white sesame seed"
## [3] "1640510-sam-siftons-suggestions"
## [4] "31042692-our-10-most-popular-recipes-right-now"
## [5] "30062982-sheet-pan"
## [6] "11249289-weekly-plan-1-29"
url_df

Is there a better solution for this?