Journal Article Scraper

Importing Required Libraries

library(httr)       # HTTP requests
library(rvest)      # HTML parsing
library(xml2)       # read_html()
library(stringr)    # regex helpers
library(dplyr)      # data wrangling
library(readr)      # write_csv()

Functions

# ---- Google‑Drive utilities ---------------------------------------------------
# Extract the file ID from a Google Drive sharing link.
#
# Two link shapes are recognised, tried in this order:
#   1) ".../d/<ID>/..."  e.g. ".../file/d/<ID>/view"
#   2) "...?id=<ID>"     e.g. ".../open?id=<ID>" or ".../uc?export=download&id=<ID>"
#
# Args:
#   url: a single URL string.
# Returns:
#   The file ID as a string, or "" when no ID can be found (also when `url`
#   is not a single, non-missing string).
google_drive_id <- function(url) {
  # Guard the scalar `if` tests below against NA / vector / non-string input.
  if (!is.character(url) || length(url) != 1L || is.na(url)) {
    return("")
  }

  # 1) ".../d/<ID>/view"
  path_hit <- regmatches(url, regexec("/d/([0-9A-Za-z_-]{10,})", url))[[1]]
  if (length(path_hit) == 2L) {
    return(path_hit[2])
  }

  # 2) "...?id=<ID>"
  # An explicit length check replaces `%||%`, which base R only provides from
  # version 4.4 and which none of the attached packages export.
  query_hit <- regmatches(url, regexec("[?&]id=([^&#]+)", url))[[1]]
  if (length(query_hit) == 2L) {
    return(utils::URLdecode(query_hit[2]))
  }

  ""
}

# Download the file behind a Google Drive share link and return its body as
# UTF-8 text.
#
# Stops with an error when no file ID can be extracted from `url`, or when the
# HTTP response carries an error status.
fetch_html_from_gdrive <- function(url) {
  drive_id <- google_drive_id(url)
  if (drive_id == "") {
    stop("❌ Couldn't extract file‑ID from the provided link.")
  }

  # Direct-download endpoint for the extracted file ID.
  download_link <- paste0(
    "https://drive.google.com/uc?export=download&id=",
    drive_id
  )
  response <- httr::GET(download_link, httr::user_agent("Mozilla/5.0"))
  httr::stop_for_status(response)

  httr::content(response, as = "text", encoding = "UTF-8")
}

# ---- Text extraction & cleaning helpers ---------------------------------------
# Search a node's text for a "First published [online] <Month D, YYYY>" phrase
# (case-insensitive) and return just the date part, or "" when the phrase is
# not present.
find_first_publish_date <- function(node) {
  date_phrase <- regex(
    "First\\s+published(?:\\s+online)?\\s+([A-Za-z]+\\s+\\d{1,2},?\\s+\\d{4})",
    ignore_case = TRUE
  )
  node_text <- html_text(node, trim = TRUE)
  # Column 2 of the match matrix holds the captured date group.
  captured <- str_match(node_text, date_phrase)[, 2]
  ifelse(is.na(captured), "", captured)
}

# Strip UI residue from scraped abstract text.
#
# Removes, in order:
#   1) a leading run of toggle labels ("Show/Hide/Preview/Full abstract"),
#      even when they are fused to each other or to the following word, e.g.
#      "Preview abstractHide abstract AbstractConceptual ..." -- html_text()
#      joins adjacent elements without a separator, so word boundaries cannot
#      be relied on at the start of the text;
#   2) any remaining stand-alone toggle labels elsewhere in the text;
#   3) a leading "Abstract" / "Abstract:" heading;
# then collapses runs of whitespace.
#
# Args:
#   raw: character vector of raw abstract text.
# Returns:
#   Character vector of cleaned text, same length as `raw`.
clean_abstract <- function(raw) {
  toggle <- "(?:Show|Hide|Preview|Full)\\s*abstract"
  leading_toggles <- regex(
    paste0("^\\s*(?:", toggle, "\\s*)+"),
    ignore_case = TRUE
  )
  inline_toggle <- regex(paste0("\\b", toggle, "\\b"), ignore_case = TRUE)

  raw %>%
    str_remove(leading_toggles) %>%
    str_remove_all(inline_toggle) %>%
    str_remove(regex("^\\s*Abstract\\s*:?", ignore_case = TRUE)) %>%
    str_squish()
}

# Normalise a link to "https://doi.org/<DOI>" when it contains a DOI
# ("10.<4-9 digits>/<suffix>"); links without a DOI are returned unchanged.
canonical_doi <- function(href) {
  doi_regex <- "10\\.\\d{4,9}/[^\\s/#?]+"

  if (!str_detect(href, doi_regex)) {
    return(href)
  }

  paste0("https://doi.org/", str_extract(href, doi_regex))
}

Core Scraping Logic

# Pull title, authors, date, DOI and abstract out of one article container.
#
# Each field is looked up through an ordered list of CSS selectors, from most
# specific to most generic; the first selector that yields non-empty text
# wins. Fields that cannot be found are returned as "".
#
# Args:
#   container: a single HTML node holding one article entry.
# Returns:
#   A named list with string elements `title`, `authors`, `date`, `doi`,
#   `abstract`.
extract_article_data <- function(container) {
  # Text of the first selector whose match is non-empty after `clean()`,
  # or "" when no selector qualifies. Requiring non-empty text for every field
  # keeps an empty wrapper element from hiding a later, populated selector.
  first_text <- function(selectors, clean = identity) {
    for (sel in selectors) {
      el <- html_element(container, sel)
      if (is.na(el)) {
        next
      }
      txt <- clean(html_text(el, trim = TRUE))
      if (!is.na(txt) && txt != "") {
        return(txt)
      }
    }
    ""
  }

  # -- Title --------------------------------------------------------------------
  title_selectors <- c(
    "h3.item-title", "h4.item-title", "h5.item-title", "div.art_title",
    "div.hlFld-Title", "a.ref.nowrap", ".tocHeading",
    "h3", "h4", "h5", "h2", "[class*='title']"
  )
  title <- first_text(
    title_selectors,
    # Turn non-breaking spaces into plain spaces before squishing.
    function(txt) str_squish(str_replace_all(txt, "\u00a0", " "))
  )

  # -- Authors ------------------------------------------------------------------
  author_selectors <- c(
    "div.contrib", "div.contributors", "div.author", "div.authors",
    "span.hlFld-ContribAuthor", "div.art_authors",
    "[class*='contrib']", "[class*='author']"
  )
  authors <- first_text(author_selectors, str_squish)

  # -- Date ---------------------------------------------------------------------
  date_selectors <- c(
    "div.pub-date", "div.published-date", "span.pub-date",
    "div.date", "[class*='date']", "[class*='publish']"
  )
  date <- first_text(date_selectors)
  if (date == "") {
    # No dedicated date element: fall back to a "First published ..." phrase
    # anywhere in the container text.
    date <- find_first_publish_date(container)
  }

  # -- DOI ----------------------------------------------------------------------
  doi <- ""
  doi_el <- html_element(
    container,
    xpath = ".//a[contains(@href,'doi.org') or contains(@href,'/doi/')]"
  )
  if (!is.na(doi_el)) {
    doi <- canonical_doi(html_attr(doi_el, "href"))
  }

  # -- Abstract -----------------------------------------------------------------
  abs_selectors <- c(
    "div.abstract", "div.abstractSection", "div.hlFld-Abstract",
    "p.abstract", "[class*='abstract']"
  )
  abstract <- first_text(abs_selectors, clean_abstract)

  list(
    title = title,
    authors = authors,
    date = date,
    doi = doi,
    abstract = abstract
  )
}

# Locate article containers in a parsed page and turn them into a data frame.
#
# Container selectors are tried in order and the first one that matches any
# element is used. If none match, a broad fallback picks elements whose own
# text mentions "doi", "author", "abstract", "volume" or "issue"
# (case-insensitive), capped at the first 20 hits.
#
# Args:
#   soup: a parsed HTML document.
# Returns:
#   One row per container that produced a non-empty title, with columns
#   title, authors, date, doi, abstract; an empty data frame when no
#   container produced a title.
extract_articles_from_soup <- function(soup) {
  container_selectors <- c(
    "div.issue-item", "div.issue-item-container", "div.article-list-item",
    "article.item", "div.hlFld-Fulltext", "div.tocHeading",
    "div.art_title", "div[class*='issue-item']", "div[class*='article']",
    "li.item", "div.item"
  )
  max_fallback <- 20L

  containers <- NULL
  for (s in container_selectors) {
    found <- html_elements(soup, s)
    if (length(found) > 0) {
      containers <- found
      break
    }
  }

  # Fallback -- broad search
  if (length(containers) == 0) {
    keywords <- c("doi", "author", "abstract", "volume", "issue")
    # XPath 1.0 has no lower-case(); translate() maps A-Z onto a-z instead.
    lowered_text <- sprintf(
      "translate(text(),'%s','%s')",
      paste(LETTERS, collapse = ""),
      paste(letters, collapse = "")
    )
    conditions <- sprintf("contains(%s,'%s')", lowered_text, keywords)
    fallback_xpath <- paste0("//*[", paste(conditions, collapse = " or "), "]")

    containers <- html_elements(soup, xpath = fallback_xpath)
    # Only subset when there are more hits than the cap: indexing past the end
    # would yield invalid entries that break extract_article_data().
    if (length(containers) > max_fallback) {
      containers <- containers[seq_len(max_fallback)]
    }
  }

  purrr::map(containers, extract_article_data) |>
    purrr::keep(\(article) article$title != "") |>
    bind_rows()
}

# Write the scraped articles to a CSV file and report how many rows were saved.
#
# Args:
#   df:   data frame of articles.
#   file: output path; defaults to "journal_articles.csv".
save_to_csv <- function(df, file = "journal_articles.csv") {
  readr::write_csv(df, file)
  status <- glue::glue("✔ Saved {nrow(df)} articles to {file}")
  message(status)
}

Main Pipeline

# Google Drive share link of the file that is downloaded and parsed below.
gdrive_url <- "https://drive.google.com/file/d/1At1Y8CbwlInSQC5fbMvyExklEEKvctli/view?usp=sharing"

# Console banner: the script title between two 60-character "=" rules.
cat(strrep("=",60), "\nGOOGLE‑DRIVE JOURNAL ARTICLE SCRAPER (R)\n", strrep("=",60), "\n")

## ============================================================ 
## GOOGLE‑DRIVE JOURNAL ARTICLE SCRAPER (R)
##  ============================================================

# Download the page, parse it, and extract one row per article.
html_content <- fetch_html_from_gdrive(gdrive_url)
soup         <- read_html(html_content)
articles_df  <- extract_articles_from_soup(soup)

# Abort early when nothing was extracted, so the preview and CSV steps below
# never run on an empty result.
if (nrow(articles_df) == 0) {
  stop("❌ No articles found — did the HTML structure change?")
}

# Preview in console: heading followed by a 60-character "-" rule.
cat("\nArticle summary:\n", strrep("-",60), "\n")

## 
## Article summary:
##  ------------------------------------------------------------

# Print one summary per article. pwalk() passes each row's columns to the
# function by name, so the argument names must match the columns of
# articles_df. The abstract is truncated to 150 characters for display only.
purrr::pwalk(
  articles_df,
  function(title, authors, date, doi, abstract) {
    cat(glue::glue("{title}\n  Authors: {authors}\n  Date: {date}\n  DOI: {doi}\n  Abstract: {str_trunc(abstract, 150)}\n\n"))
  }
)

## Conceptual Research: Multidisciplinary Insights for Marketing
## Authors: Irina V. Kozlenkova Caleb Warren Suresh Kotha Reihane Boghrati Robert W. Palmatier
## Date: November 16, 2024
## DOI: https://doi.org/10.1177/00222429241302814
## Abstract: Preview abstractHide abstract AbstractConceptual research is fundamental to advancing theory and, thus, science. Conceptual articles launch new res...
## Fixing Onlies Versus Advancing Multiples: Number of Children and Parents’ Preferences for Educational Products
## Authors: Phyllis Xue Wang Ce Liang Qiyuan Wang
## Date: December 11, 2024
## DOI: https://doi.org/10.1177/00222429241306009
## Abstract: Preview abstractHide abstract AbstractDue to a continuous decline in fertility rates in recent decades, the number of one-child families has been i...
## Retailer Differentiation in Social Media: An Investigation of Firm-Generated Content on Twitter
## Authors: Mikhail Lysyakov P.K. Kannan Siva Viswanathan Kunpeng Zhang
## Date: February 11, 2025
## DOI: https://doi.org/10.1177/00222429241298654
## Abstract: Preview abstractHide abstract AbstractSocial media platforms have been used by firms for a variety of purposes: building firms’ brand image, increa...
## Cardio with Mr. Treadmill: How Anthropomorphizing the Means of Goal Pursuit Increases Motivation
## Authors: Lili Wang Maferima Touré-Tillery
## Date: November 22, 2024
## DOI: https://doi.org/10.1177/00222429241303387
## Abstract: Preview abstractHide abstract AbstractThis article examines the motivational consequences of anthropomorphizing the means of goal pursuit. Eight st...
## The Impact of App Crashes on Consumer Engagement
## Authors: Savannah Wei Shi Seoungwoo Lee Kirthi Kalyanam Michel Wedel
## Date: November 22, 2024
## DOI: https://doi.org/10.1177/00222429241304322
## Abstract: Preview abstractHide abstract AbstractThe authors develop and test a theoretical framework to examine the impact of app crashes on app engagement. ...
## Beyond the Pair: Media Archetypes and Complex Channel Synergies in Advertising
## Authors: J. Jason Bell Felipe Thomaz Andrew T. Stephen
## Date: February 26, 2025
## DOI: https://doi.org/10.1177/00222429241302808
## Abstract: Preview abstractHide abstract AbstractPrior research on advertising media mixes has mostly focused on single channels (e.g., television), pairwise ...
## Color Me Effective: The Impact of Color Saturation on Perceptions of Potency and Product Efficacy
## Authors: Lauren I. Labrecque Stefanie Sohn Barbara Seegebarth Christy Ashley
## Date: January 31, 2025
## DOI: https://doi.org/10.1177/00222429241296392
## Abstract: Preview abstractHide abstract AbstractConsumers use observable cues, like color, to help them evaluate products. This research establishes that con...
## Racial Inequity in Donation-Based Crowdfunding Platforms: The Role of Facial Emotional Expressiveness
## Authors: Elham Yazdani Anindita Chakravarty Jeffrey Inman
## Date: February 24, 2025
## DOI: https://doi.org/10.1177/00222429241300320
## Abstract: Preview abstractHide abstract AbstractDonation-based crowdfunding platforms often claim to pursue equitable outcomes for all beneficiaries, yet man...

# Persist the results to the default output file, "journal_articles.csv".
save_to_csv(articles_df)

## ✔ Saved 8 articles to journal_articles.csv

## View the downloaded CSV
library(DT)          # install.packages("DT") if you don’t have it

## Warning: package 'DT' was built under R version 4.4.3

# If you've just run the pipeline, articles_df is already in memory.
# Otherwise, load it back from disk:
# articles_df <- readr::read_csv("journal_articles.csv")

# Render the articles as an interactive table: 10 rows per page, horizontal
# scrolling enabled, no row-name column, caption placed top-left.
# NOTE(review): the caption text reads 'Table.1:.Scraped journal articles' --
# the dots look like stand-ins for spaces ("Table 1: Scraped ..."); confirm the
# intended wording before changing the string.
DT::datatable(
  articles_df,
  options = list(pageLength = 10, scrollX = TRUE),
  rownames = FALSE,
  caption = htmltools::tags$caption(
    style = 'caption-side: top; text-align: left;',
    'Table.1:.Scraped journal articles'
  )
)

Journal Article Scraper

Pullipudi Siva Mani Subrahmanya Hari Vamsi

2025-07-15

Importing Required Libraries

Functions

Core Scraping Logic

Main Pipeline