This one is going to be quick: we will use the xsitemap package, which crawls XML sitemaps and returns the URLs they contain.
library(xsitemap)
library(urltools)
library(XML)
library(httr)
upload <- xsitemapGet("https://www.rforseo.com/sitemap.xml")
## Reaching for XML sitemap... https://www.rforseo.com/sitemap.xml
## regular sitemap detected - 39 web page url(s) found
## ......................................
head(upload)
## loc lastmod
## 1 https://www.rforseo.com/ 2022-07-29
## 2 https://www.rforseo.com/classic-r-operations 2021-05-08
## 3 https://www.rforseo.com/intro 2022-02-17
## 4 https://www.rforseo.com/r-intro 2022-08-18
## 5 https://www.rforseo.com/rpivottable 2022-08-18
## 6 https://www.rforseo.com/analysis/count-words-n-grams-shingles 2021-04-06
We use the parallel package so we can run several inspection requests at the same time, one per CPU core.
Warning: for the URL Inspection API, the quota is enforced per Search Console website property (i.e. per set of calls querying the same site).
It could be useful to verify some extra properties based on URL directories, so the calls are spread across several quotas.
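For instance, here is a minimal sketch of that idea, assuming we had also verified a URL-prefix property for the /analysis/ directory (the directory property below is hypothetical):
# hypothetical helper: route each URL to its own Search Console property
# so that inspection calls draw on separate per-property quotas
pick_property <- function(url) {
  if (startsWith(url, "https://www.rforseo.com/analysis/")) {
    "https://www.rforseo.com/analysis/"   # hypothetical URL-prefix property
  } else {
    "sc-domain:rforseo.com"               # the domain property used below
  }
}
# each call would then use, e.g.:
# inspection(url, siteUrl = pick_property(url), languageCode = NULL)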
library(searchConsoleR)
library(lubridate)
library(parallel)
scr_auth()
res <- mclapply(1:nrow(upload), function(i) {
  cat(".")
  url <- upload[i, "loc"]
  # query the URL Inspection API for this URL
  result <- inspection(url, siteUrl = "sc-domain:rforseo.com", languageCode = NULL)
  # collapse the fields we care about into one "§"-separated string
  text <- paste0(url, "§",
                 result[["indexStatusResult"]][["verdict"]], "§",
                 result[["indexStatusResult"]][["coverageState"]], "§",
                 result[["indexStatusResult"]][["robotsTxtState"]], "§",
                 result[["indexStatusResult"]][["indexingState"]], "§",
                 now())
  text
}, mc.cores = detectCores()) ## split this job across all available cores
res <- data.frame(unlist(res))
library(stringr)
res[,c("url", "verdict", "coverageState", "robotsTxtState", "indexingState", "date")] <- str_split_fixed(res$unlist.r., '§', 6)
res$unlist.r. <- NULL
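Before uploading, an optional quick check can tally the verdicts returned by the API and list the URLs that are not indexed yet:
# optional: quick overview of the verdicts (e.g. PASS / NEUTRAL / FAIL)
table(res$verdict)
# URLs whose verdict is not "PASS", i.e. not (yet) indexed
res$url[res$verdict != "PASS"]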
# Load the packages
library(googleCloudStorageR)
library(bigQueryR)
## set the default bucket (here the GCP project id is used as the bucket name)
gcs_global_bucket("mindful-path-205008")
gcs_auth()
## custom upload function to ignore quotes and column headers
f <- function(input, output) {
  write.table(input, sep = ",", col.names = FALSE, row.names = FALSE,
              quote = FALSE, file = output, qmethod = "double")
}
## upload files to Google Cloud Storage
gcs_upload(res, name = "res.csv", object_function = f, bucket = "gsc_backup")
Now the data is ready to be read back and loaded whenever it's needed.
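For instance, a minimal sketch to pull the snapshot back from the bucket, assuming the same bucket and object name as above:
# download the CSV back from Google Cloud Storage
gcs_get_object("res.csv", bucket = "gsc_backup",
               saveToDisk = "res.csv", overwrite = TRUE)
# re-read it, re-applying the column names dropped at upload time
res_back <- read.csv("res.csv", header = FALSE,
                     col.names = c("url", "verdict", "coverageState",
                                   "robotsTxtState", "indexingState", "date"))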