1) Configuration
# Packages used throughout (parallel is base R and is called via parallel:: below)
library(glue)
library(httr)
library(jsonlite)
library(purrr)
library(dplyr)
library(stringr)
library(tibble)
library(urltools)
library(lubridate)
library(readr)
library(DT)
DRIVE_FILE_ID <- "134U6xLIZUZ9sA1BW-X9TLZtUlEYCvQwz"
INPUT_JSON    <- glue("https://drive.google.com/uc?export=download&id={DRIVE_FILE_ID}")
OUTPUT_CSV    <- "flattened_infringing_urls.csv"
N_WORKERS     <- 8  # adjust for your machine
TIMEOUT_S     <- 3  # per-request timeout (seconds) for the DNS look-ups
2) Load JSON (local file or Google Drive link)
google_to_direct <- function(url) {
  # pull the file ID out of a ".../file/d/<id>/..." share link
  m <- stringr::str_match(url, "/d/([^/]+)/")[, 2]
  ifelse(!is.na(m),
         glue::glue("https://drive.google.com/uc?export=download&id={m}"),
         url)
}
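A quick check that the converter behaves as expected, using the same file ID as above written as a share link:
google_to_direct("https://drive.google.com/file/d/134U6xLIZUZ9sA1BW-X9TLZtUlEYCvQwz/view")
## https://drive.google.com/uc?export=download&id=134U6xLIZUZ9sA1BW-X9TLZtUlEYCvQwz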
load_json <- function(src) {
  if (grepl("^https?://", src)) {
    src <- google_to_direct(src)
    r <- httr::GET(src, httr::user_agent("Mozilla/5.0"), httr::timeout(30))
    # -- handle Drive confirm page: a real download carries a
    #    Content-Disposition header; the HTML warning page does not ----------
    if (is.null(httr::headers(r)[["content-disposition"]])) {
      ck   <- httr::cookies(r)   # data frame with name/value columns
      warn <- ck$name[grepl("^download_warning", ck$name)]
      if (length(warn))
        r <- httr::GET(src,
                       query = list(confirm = ck$value[ck$name == warn[1]]),
                       httr::timeout(30))
    }
    httr::stop_for_status(r)
    return(jsonlite::fromJSON(rawToChar(httr::content(r, "raw")),
                              simplifyVector = FALSE))  # <- key line
  }
  jsonlite::fromJSON(src, simplifyVector = FALSE)       # <- key line
}
raw_json <- load_json(INPUT_JSON)
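Before flattening, a quick structural sanity check is worthwhile; the flattener below assumes a top-level notices list:
str(raw_json, max.level = 1)   # top-level keys
length(raw_json$notices)       # how many notices we are about to flatten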
3) Flatten the nested notices
flatten_notices <- function(raw) {
  purrr::imap_dfr(raw$notices, function(notice, i) {
    # notice-level fields, repeated on every URL row
    base <- list(
      notice_id = notice$id %||% i,
      title     = notice$title,
      sender    = notice$sender_name,
      principal = notice$principal_name,
      recipient = notice$recipient_name,
      date_sent = notice$date_sent
    )
    # one row per infringing URL within each work
    purrr::map_dfr(notice$works, function(work) {
      tibble(
        !!!base,
        description    = work$description,
        infringing_url = purrr::map_chr(work$infringing_urls, "url"),
        domain         = str_to_lower(urltools::domain(infringing_url))
      )
    })
  })
}
df <- flatten_notices(raw_json)
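A quick look confirms the grain of the result: one row per infringing URL, with the notice-level fields repeated down the rows:
dplyr::glimpse(df)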
4) Parallel DNS look-ups (add ip_address)
## ---- dns, message=FALSE -----------------------------------------------------
# 1️⃣ domain extractor that tolerates naked URLs like "//example.com/..."
domain_from_url <- function(u) {
  u <- ifelse(grepl("^//", u), paste0("http:", u), u)            # protocol-relative
  u <- ifelse(!grepl("^https?://", u), paste0("http://", u), u)  # no scheme at all
  urltools::domain(u)
}
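Two illustrative (made-up) inputs showing both fallbacks:
domain_from_url("//example.com/a/b")     # protocol-relative -> "example.com"
domain_from_url("example.com/file.mp3")  # schemeless        -> "example.com"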
# 2️⃣ Robust resolver via Google's DNS-over-HTTPS endpoint
resolve_ip <- function(dom) {
  if (is.na(dom) || dom == "") return(NA_character_)
  url <- sprintf("https://dns.google/resolve?name=%s&type=A", dom)
  ans <- tryCatch({
    r <- httr::GET(url, httr::timeout(TIMEOUT_S))
    httr::stop_for_status(r)
    jsonlite::fromJSON(rawToChar(httr::content(r, "raw")))
  }, error = function(e) NULL)
  if (is.null(ans) || is.null(ans$Answer)) return(NA_character_)
  # keep only A-records (type 1)
  a_recs <- ans$Answer[ans$Answer$type == 1, , drop = FALSE]
  if (nrow(a_recs) == 0) return(NA_character_)
  a_recs$data[1]  # first IPv4 string
}
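A single-domain smoke test before going parallel (example.com is illustrative; the address returned can vary):
resolve_ip("example.com")   # one IPv4 string, or NA if unresolved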
enrich_with_ip <- function(df) {
  # (re-extract domain using the safer helper, in case the old column is off)
  df$domain <- domain_from_url(df$infringing_url)
  unique_domains <- unique(df$domain)
  cat(glue::glue("🔸 Resolving {length(unique_domains)} unique domains ..."), "\n")
  ip_vec <- if (.Platform$OS.type == "unix") {
    unlist(parallel::mclapply(unique_domains, resolve_ip, mc.cores = N_WORKERS))
  } else {
    cl <- parallel::makeCluster(min(N_WORKERS, parallel::detectCores()))
    on.exit(parallel::stopCluster(cl), add = TRUE)
    # PSOCK workers start with empty globals: ship the constant resolve_ip() reads
    parallel::clusterExport(cl, "TIMEOUT_S")
    unlist(parallel::parSapply(cl, unique_domains, resolve_ip))
  }
  ip_map <- setNames(ip_vec, unique_domains)
  df$ip_address <- ip_map[df$domain]
  df
}
# Re-run the enrichment with the improved functions
df <- enrich_with_ip(df)
## 🔸 Resolving 3010 unique domains ...
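Before leaning on ip_address downstream, it is worth checking how much of the table actually resolved:
mean(!is.na(df$ip_address))   # share of URL rows with a resolved IPv4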
5) Clean helper columns
tidy_principal <- function(x) {
  x <- dplyr::coalesce(x, "Unknown")
  x <- ifelse(x == "", "Unknown", x)
  x |>
    stringr::str_to_lower() |>
    stringr::str_remove_all("[,.'`]") |>
    stringr::str_remove("\\s+inc$") |>
    stringr::str_squish() |>
    stringr::str_to_title()
}
get_root_domain <- function(d) {
  d <- dplyr::coalesce(d, "unknown")
  d <- ifelse(d == "", "unknown", d)
  d <- stringr::str_remove(stringr::str_to_lower(d), "^www\\d*\\.")
  # naive "last two labels" heuristic: multi-part public suffixes
  # (e.g. co.uk) are not handled and collapse to the suffix itself
  parts <- stringr::str_split(d, "\\.")
  vapply(parts, function(p) paste(tail(p, 2), collapse = "."), FUN.VALUE = character(1))
}
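Two illustrative calls showing the www stripping and the heuristic's limit:
get_root_domain("www2.Example.com")      # "example.com"
get_root_domain("files.example.co.uk")   # "co.uk" (see caveat above)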
# Add the new columns (note new names)
df <- df %>%
  mutate(
    principal_clean   = tidy_principal(principal),
    root_domain_clean = get_root_domain(domain),
    month = lubridate::floor_date(lubridate::ymd_hms(date_sent, quiet = TRUE), "month")
  )
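A quick aggregate over the cleaned columns is a useful sanity check before writing out:
df %>%
  count(principal_clean, sort = TRUE) %>%
  head(10)   # most frequent principals by number of URL rows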
6) Write the enriched CSV
write_csv(df, OUTPUT_CSV)
glue("✅ CSV written to: {normalizePath(OUTPUT_CSV)}")
## ✅ CSV written to: C:\Users\pssha\OneDrive\Desktop\GTAGRA\Assignment_1\R\flattened_infringing_urls.csv
7) Quick interactive preview
datatable(
  df,
  options  = list(pageLength = 10, scrollX = TRUE),
  rownames = FALSE,
  caption  = htmltools::tags$caption(
    style = 'caption-side: top; text-align: left;',
    'Table 1: Flattened & enriched infringing URLs'
  )
)