1) Configuration

DRIVE_FILE_ID <- "134U6xLIZUZ9sA1BW-X9TLZtUlEYCvQwz"
INPUT_JSON    <- glue("https://drive.google.com/uc?export=download&id={DRIVE_FILE_ID}")
OUTPUT_CSV    <- "flattened_infringing_urls.csv"
N_WORKERS     <- 8   # adjust for your machine
TIMEOUT_S     <- 3
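
2) Load the raw JSON

The load step is a minimal sketch: it assumes the notices are read straight from INPUT_JSON and kept as nested lists (simplifyVector = FALSE), which is the shape flatten_notices() in the next step expects.

raw_json <- jsonlite::fromJSON(INPUT_JSON, simplifyVector = FALSE)   # nested lists, not data frames
length(raw_json$notices)                                             # number of notices downloaded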

3) Flatten the nested notices

flatten_notices <- function(raw) {
  # One row per infringing URL: notice-level fields are repeated across
  # every work and every URL nested under that notice.
  purrr::imap_dfr(raw$notices, function(notice, i) {
    base <- list(
      notice_id = notice$id %||% i,   # fall back to the list position if id is missing
      title     = notice$title,
      sender    = notice$sender_name,
      principal = notice$principal_name,
      recipient = notice$recipient_name,
      date_sent = notice$date_sent
    )
    purrr::imap_dfr(notice$works, function(work, j) {
      tibble(
        !!!base,                                                   # splice in the notice-level fields
        description    = work$description,
        infringing_url = purrr::map_chr(work$infringing_urls, "url"),
        domain         = str_to_lower(domain(infringing_url))      # urltools::domain()
      )
    })
  })
}

df <- flatten_notices(raw_json)
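
As a quick sanity check, the flattened table should contain one row per infringing URL:

nrow(df)                          # total infringing URLs
dplyr::n_distinct(df$notice_id)   # distinct notices represented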

4) Parallel DNS look‑ups (add ip_address)

## ---- dns, message=FALSE -----------------------------------------------------
# 1️⃣  Domain extractor that tolerates scheme-less URLs such as "//example.com/..."
domain_from_url <- function(u) {
  u <- ifelse(grepl("^//", u), paste0("http:", u), u)       # add scheme if missing
  u <- ifelse(!grepl("^https?://", u), paste0("http://", u), u)
  urltools::domain(u)
}
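
A couple of illustrative calls (example.com is just a placeholder):

domain_from_url("//example.com/path")     # "example.com"  (protocol-relative URL)
domain_from_url("example.com/path?q=1")   # "example.com"  (no scheme at all)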

# 2️⃣  Robust resolver using the Google DNS-over-HTTPS JSON API
resolve_ip <- function(dom) {
  if (is.na(dom) || dom == "") return(NA_character_)
  
  # Google DNS‑over‑HTTPS endpoint
  url <- sprintf("https://dns.google/resolve?name=%s&type=A", dom)
  
  ans <- tryCatch(jsonlite::fromJSON(url), error = function(e) NULL)
  if (is.null(ans) || is.null(ans$Answer)) return(NA_character_)
  
  # keep only A‑records (type 1)
  a_recs <- ans$Answer[ans$Answer$type == 1, , drop = FALSE]
  if (nrow(a_recs) == 0) return(NA_character_)
  
  a_recs$data[1]        # first IPv4 string
}
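
For example (the resolved addresses naturally vary over time and by resolver):

resolve_ip("dns.google")   # e.g. "8.8.8.8"
resolve_ip("")             # NA_character_ (guard clause)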


enrich_with_ip <- function(df) {
  # (re‑extract domain using the safer helper, in case the old column is off)
  df$domain <- domain_from_url(df$infringing_url)

  unique_domains <- unique(df$domain)
  cat(glue::glue("🔸 Resolving {length(unique_domains)} unique domains ...\n"))

  ip_vec <- if (.Platform$OS.type == "unix") {
    # Linux/macOS: forked workers
    unlist(parallel::mclapply(unique_domains, resolve_ip, mc.cores = N_WORKERS))
  } else {
    # Windows: no forking available, so fall back to a PSOCK cluster
    cl <- parallel::makeCluster(min(N_WORKERS, parallel::detectCores()))
    on.exit(parallel::stopCluster(cl), add = TRUE)
    unlist(parallel::parSapply(cl, unique_domains, resolve_ip))
  }

  ip_map <- setNames(ip_vec, unique_domains)
  df$ip_address <- ip_map[df$domain]
  df
}

# Re‑run the enrichment with the improved functions
df <- enrich_with_ip(df)
## 🔸 Resolving 3010 unique domains ...
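
A rough quality check on the enrichment (exact figures will differ between runs):

mean(!is.na(df$ip_address))                               # share of rows with a resolved IP
dplyr::n_distinct(df$ip_address[!is.na(df$ip_address)])   # distinct IPs observed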

5) Clean helper columns

tidy_principal <- function(x) {
  # Normalise principal names: fill blanks, drop punctuation and a trailing "Inc", then Title Case
  x <- dplyr::coalesce(x, "Unknown")
  x <- ifelse(x == "", "Unknown", x)
  x |>
    stringr::str_to_lower() |>
    stringr::str_remove_all("[,.'`]") |>
    stringr::str_replace_all("\\s+inc$", "") |>
    stringr::str_squish() |>
    stringr::str_to_title()
}

get_root_domain <- function(d) {
  d <- dplyr::coalesce(d, "unknown")
  d <- ifelse(d == "", "unknown", d)
  d <- stringr::str_remove(stringr::str_to_lower(d), "^www\\d*\\.")   # strip a leading www / www2 ...
  parts <- stringr::str_split(d, "\\.")
  # keep the last two labels; a simplification that ignores multi-part TLDs such as .co.uk
  vapply(parts, function(p) paste(tail(p, 2), collapse = "."), FUN.VALUE = character(1))
}
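
To illustrate the two helpers on hypothetical inputs:

tidy_principal(c("NETFLIX, Inc.", NA, ""))   # "Netflix" "Unknown" "Unknown"
get_root_domain(c("www2.Chomikuj.pl", NA))   # "chomikuj.pl" "unknown"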

# Add the cleaned / derived columns (principal_clean, root_domain_clean, month)
df <- df %>%
  mutate(
    principal_clean = tidy_principal(principal),
    root_domain_clean = get_root_domain(domain),
    month = lubridate::floor_date(lubridate::ymd_hms(date_sent, quiet = TRUE), "month")
  )

6) Write the enriched CSV

write_csv(df, OUTPUT_CSV)
glue("✅ CSV written to: {normalizePath(OUTPUT_CSV)}")
## ✅ CSV written to: C:\Users\pssha\OneDrive\Desktop\GTAGRA\Assignment_1\R\flattened_infringing_urls.csv

7) Quick interactive preview

datatable(
  df,
  options = list(pageLength = 10, scrollX = TRUE),
  rownames = FALSE,
  caption  = htmltools::tags$caption(
    style = 'caption-side: top; text-align: left;',
    'Table 1: Flattened & enriched infringing URLs'
  )
)

8) Summary insights

# a) Top Principals
df %>% count(principal_clean, sort = TRUE) %>% head(5)
## # A tibble: 1 × 2
##   principal_clean     n
##   <chr>           <int>
## 1 Netflix         66169
# b) Top Infringing Domains
df %>% count(root_domain_clean, sort = TRUE) %>% head(5)
## # A tibble: 5 × 2
##   root_domain_clean     n
##   <chr>             <int>
## 1 chomikuj.pl       23605
## 2 unblockit.onl      6870
## 3 rapidgator.net     1760
## 4 torlock.cc          932
## 5 ul.to               851
# c) IPs hosting many distinct infringing domains
df %>%
  filter(!is.na(ip_address)) %>%
  group_by(ip_address) %>%
  summarise(unique_domains = n_distinct(root_domain_clean), .groups = "drop") %>%
  arrange(desc(unique_domains)) %>%
  head(10)
## # A tibble: 10 × 2
##    ip_address      unique_domains
##    <chr>                    <int>
##  1 199.59.243.228             108
##  2 172.233.219.78              48
##  3 172.237.146.38              42
##  4 172.237.146.8               41
##  5 69.16.230.165               41
##  6 103.224.182.253             39
##  7 172.233.219.49              39
##  8 172.237.146.25              32
##  9 172.233.219.123             30
## 10 104.21.96.1                 24