Back up Google URL Inspection data into BigQuery using R

Step 1: Getting the URLs

This one is going to be quick: we will use the xsitemap package, which crawls XML sitemaps.

library(xsitemap)
library(urltools)
library(XML)
library(httr)
# fetch every URL listed in the XML sitemap
upload <- xsitemapGet("https://www.rforseo.com/sitemap.xml")
## Reaching for XML sitemap... https://www.rforseo.com/sitemap.xml
## regular sitemap detected -  39  web page url(s) found
## ......................................
head(upload)
##                                                             loc    lastmod
## 1                                      https://www.rforseo.com/ 2022-07-29
## 2                  https://www.rforseo.com/classic-r-operations 2021-05-08
## 3                                 https://www.rforseo.com/intro 2022-02-17
## 4                               https://www.rforseo.com/r-intro 2022-08-18
## 5                           https://www.rforseo.com/rpivottable 2022-08-18
## 6 https://www.rforseo.com/analysis/count-words-n-grams-shingles 2021-04-06

Step 2: Launching the URL Inspection API in parallel

We use the parallel package so we can run several requests at the same time, one per CPU core.

Warning: for the URL Inspection API, the quota is enforced per Search Console website property, so all calls querying the same site count against the same quota.

It could be useful to create some extra properties based on URL directories to spread the load, as sketched below.
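For instance, here is a quick sketch (using the urltools package already loaded above) that counts URLs per top-level directory, to get an idea of which directory-based properties would be worth registering:

library(urltools)

## first path segment of each URL, e.g. "analysis" for /analysis/count-words-n-grams-shingles
dirs <- sub("/.*$", "", path(upload$loc))
dirs[is.na(dirs) | dirs == ""] <- "(root)"   ## the homepage has no path
table(dirs)                                  ## number of URLs per directory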

library(searchConsoleR)
library(lubridate)
library(parallel)
scr_auth()   ## authenticate with the Search Console API


res <- mclapply(1:nrow(upload), function(i) {
  cat(".")           ## simple progress indicator
  url <- upload[i, "loc"]
  result <- inspection(url, siteUrl = "sc-domain:rforseo.com", languageCode = NULL)

  ## concatenate the fields we want to keep, using "§" as a separator
  ## since it is unlikely to appear inside the values
  text <- paste0(url, "§",
                 result[["indexStatusResult"]][["verdict"]], "§",
                 result[["indexStatusResult"]][["coverageState"]], "§",
                 result[["indexStatusResult"]][["robotsTxtState"]], "§",
                 result[["indexStatusResult"]][["indexingState"]], "§",
                 now())
  text
  
  
   }, mc.cores = detectCores())      ## split the job across all available cores

## flatten the list of strings into a one-column data frame
res <- data.frame(unlist(res))

library(stringr)

## split each string back into its six columns
res[, c("url", "verdict", "coverageState", "robotsTxtState", "indexingState", "date")] <- str_split_fixed(res$unlist.res., '§', 6)
res$unlist.res. <- NULL
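
Optionally, the date column (a plain character string after the split) can be turned back into a proper timestamp before the upload. A small sketch using lubridate, which is already loaded:

## parse the timestamp strings produced by now(); truncated = 2 tolerates
## strings where seconds or minutes were dropped during character conversion
res$date <- ymd_hms(res$date, truncated = 2)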

Step 3: Saving the data frame inside a Google Cloud Storage bucket

# Load the packages
library(googleCloudStorageR)
library(bigQueryR)


## set the default Google Cloud Storage bucket
gcs_global_bucket("mindful-path-205008")

gcs_auth()   ## authenticate with Google Cloud Storage

## custom upload function that writes the CSV without quotes or column headers
f <- function(input, output) {
  write.table(input, sep = ",", col.names = FALSE, row.names = FALSE, 
              quote = FALSE, file = output, qmethod = "double")}

## upload the file to Google Cloud Storage
gcs_upload(res, name = "res.csv", object_function = f, bucket = "gsc_backup")

Now the data is ready to be read and loaded whenever it's needed, for example into BigQuery, as sketched below.
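
As a final step, here is a minimal sketch of pushing the data frame into BigQuery with bigQueryR (loaded above); the project, dataset and table names are placeholders to adapt to your own setup:

library(bigQueryR)   ## already loaded above

bqr_auth()

## placeholder project / dataset names
bqr_global_project("my-project-id")
bqr_global_dataset("gsc_backup")

## push the inspection results data frame into a BigQuery table
bqr_upload_data(tableId = "url_inspection", upload_data = res)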