# Importing Required Libraries ----
library(httr) # HTTP requests
library(rvest) # HTML parsing
library(xml2) # read_html()
library(stringr) # regex helpers
library(dplyr) # data wrangling
library(readr) # write_csv()
# Functions ----
# ---- Google‑Drive utilities ---------------------------------------------------
# Extract the file ID from a Google Drive share link.
#
# Two link shapes are recognised, in this order:
#   1) ".../d/<ID>/view"  (ID taken from the path)
#   2) "...?id=<ID>"      (ID taken from the `id` query parameter)
#
# url: a single character string.
# Returns the ID as a length-one character string, or "" when `url` is
#   missing, not a single string, or contains no recognisable ID.
google_drive_id <- function(url) {
  if (length(url) != 1L || is.na(url)) {
    return("")
  }
  url <- as.character(url)

  # 1) ".../d/<ID>/view"
  path_hit <- regmatches(url, regexec("/d/([0-9A-Za-z_-]{10,})", url))[[1]]
  if (length(path_hit) == 2L) {
    return(path_hit[2])
  }

  # 2) "...?id=<ID>" -- the value ends at the next parameter or fragment.
  query_hit <- regmatches(url, regexec("[?&]id=([^&#]+)", url))[[1]]
  if (length(query_hit) == 2L) {
    return(utils::URLdecode(query_hit[2]))
  }

  # Explicit fallback instead of `%||%`, which base R only provides from 4.4.
  ""
}
# Download a file from Google Drive and return its body as text.
#
# url: a Google Drive share link understood by google_drive_id().
# Returns the response body decoded as UTF-8 text.
# Stops when no file ID can be extracted or the HTTP status signals failure.
fetch_html_from_gdrive <- function(url) {
  drive_id <- google_drive_id(url)
  if (drive_id == "") {
    stop("❌ Couldn't extract file‑ID from the provided link.")
  }

  # Request the direct-download endpoint rather than the viewer page.
  download_url <- paste0(
    "https://drive.google.com/uc?export=download&id=",
    drive_id
  )
  response <- httr::GET(download_url, httr::user_agent("Mozilla/5.0"))
  httr::stop_for_status(response)

  httr::content(response, as = "text", encoding = "UTF-8")
}
# ---- Text cleaners ------------------------------------------------------------
# Find a "First published [online] <Month> <day>, <year>" date in a node's text.
#
# node: an HTML node whose text is searched (case-insensitively).
# Returns the captured "<Month> <day>, <year>" text, or "" when absent.
find_first_publish_date <- function(node) {
  date_regex <- regex(
    "First\\s+published(?:\\s+online)?\\s+([A-Za-z]+\\s+\\d{1,2},?\\s+\\d{4})",
    ignore_case = TRUE
  )
  captured <- node |>
    html_text(trim = TRUE) |>
    str_match(date_regex)

  # Column 2 holds the first capture group, i.e. the date itself.
  found <- captured[, 2]
  ifelse(is.na(found), "", found)
}
# Strip UI labels from scraped abstract text.
#
# Scraped abstracts often begin with toggle labels glued together without
# separators, e.g. "Preview abstractHide abstract AbstractThe authors ...".
# Word-boundary anchors cannot match inside "abstractHide", so the leading
# run of toggles is removed without them before the "Abstract" heading is
# stripped.
#
# raw: character vector of raw abstract text.
# Returns `raw` without toggle labels ("Show/Hide/Preview/Full abstract"),
#   without a leading "Abstract" / "Abstract:" heading, and with whitespace
#   squished.
clean_abstract <- function(raw) {
  toggle <- "(?:Show|Hide|Preview|Full)\\s*abstract"

  # One or more toggle labels at the very start, possibly run together.
  leading_toggles <- regex(paste0("^(?:\\s*", toggle, ")+"), ignore_case = TRUE)
  # Stand-alone toggle labels elsewhere in the text (e.g. a trailing one).
  stray_toggles <- regex(paste0("\\b", toggle, "\\b"), ignore_case = TRUE)
  # The section heading itself; no boundary, as it may be glued to the text.
  heading <- regex("^\\s*Abstract\\s*:?", ignore_case = TRUE)

  raw |>
    str_remove(leading_toggles) |>
    str_remove_all(stray_toggles) |>
    str_remove(heading) |>
    str_squish()
}
# Normalise a link that contains a DOI to its https://doi.org/ form.
#
# href: a single link target (character string).
# Returns "https://doi.org/<doi>" when a DOI is found in `href`, `href`
#   unchanged when none is found, and "" when `href` is missing.
canonical_doi <- function(href) {
  # A missing href would otherwise make the `if` condition below NA and error.
  if (length(href) != 1L || is.na(href)) {
    return("")
  }

  # The suffix stops at whitespace, "/", "#" or "?" so trailing path
  # segments, fragments and query strings are not captured.
  doi <- str_extract(href, "10\\.\\d{4,9}/[^\\s/#?]+")
  if (is.na(doi)) {
    href
  } else {
    paste0("https://doi.org/", doi)
  }
}
# Core Scraping Logic ----
# Return the first non-empty text found by trying `selectors` in order.
#
# container: node to search within.
# selectors: CSS selectors, highest priority first.
# transform: function applied to the trimmed element text before the
#   emptiness check (for example a cleaner).
# Returns a length-one character string; "" when no selector yields text.
first_nonempty_text <- function(container, selectors, transform = identity) {
  for (sel in selectors) {
    el <- html_element(container, sel)
    if (is.na(el)) {
      next
    }
    txt <- transform(html_text(el, trim = TRUE))
    # An element that exists but is empty must not shadow later selectors.
    if (!is.na(txt) && txt != "") {
      return(txt)
    }
  }
  ""
}

# Extract the metadata of one article from its container node.
#
# container: an HTML node wrapping a single article entry.
# Returns a named list of length-one character strings: `title`, `authors`,
#   `date`, `doi`, `abstract`. A field that cannot be found is "".
extract_article_data <- function(container) {
  title_selectors <- c(
    "h3.item-title", "h4.item-title", "h5.item-title", "div.art_title",
    "div.hlFld-Title", "a.ref.nowrap", ".tocHeading",
    "h3", "h4", "h5", "h2", "[class*='title']"
  )
  author_selectors <- c(
    "div.contrib", "div.contributors", "div.author", "div.authors",
    "span.hlFld-ContribAuthor", "div.art_authors",
    "[class*='contrib']", "[class*='author']"
  )
  date_selectors <- c(
    "div.pub-date", "div.published-date", "span.pub-date",
    "div.date", "[class*='date']", "[class*='publish']"
  )
  abs_selectors <- c(
    "div.abstract", "div.abstractSection", "div.hlFld-Abstract",
    "p.abstract", "[class*='abstract']"
  )

  # Titles may contain non-breaking spaces; turn them into plain spaces.
  normalise_title <- function(txt) {
    txt |>
      str_replace_all("\u00a0", " ") |>
      str_squish()
  }

  # -- Date: dedicated element first, then a free-text search of the container.
  date <- first_nonempty_text(container, date_selectors)
  if (date == "") {
    date <- find_first_publish_date(container)
  }

  # -- DOI: first link pointing at doi.org or containing a "/doi/" path.
  doi <- ""
  doi_el <- html_element(
    container,
    xpath = ".//a[contains(@href,'doi.org') or contains(@href,'/doi/')]"
  )
  if (!is.na(doi_el)) {
    doi <- canonical_doi(html_attr(doi_el, "href"))
  }

  list(
    title = first_nonempty_text(container, title_selectors, normalise_title),
    authors = first_nonempty_text(container, author_selectors, str_squish),
    date = date,
    doi = doi,
    abstract = first_nonempty_text(container, abs_selectors, clean_abstract)
  )
}
# Locate article containers in a parsed page and extract one row per article.
#
# soup: a parsed HTML document.
# max_fallback: maximum number of nodes kept when the keyword-based fallback
#   search is used.
# Returns the extracted articles row-bound into a data frame; entries
#   without a title are dropped.
extract_articles_from_soup <- function(soup, max_fallback = 20L) {
  container_selectors <- c(
    "div.issue-item", "div.issue-item-container", "div.article-list-item",
    "article.item", "div.hlFld-Fulltext", "div.tocHeading",
    "div.art_title", "div[class*='issue-item']", "div[class*='article']",
    "li.item", "div.item"
  )

  # Use the first selector that matches anything at all.
  containers <- NULL
  for (sel in container_selectors) {
    hits <- html_elements(soup, sel)
    if (length(hits) > 0) {
      containers <- hits
      break
    }
  }

  # Fallback -- broad search: any element whose text mentions one of the
  # keywords below, compared in lower case.
  if (length(containers) == 0) {
    lower_text <- paste0(
      "translate(text(),",
      "'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz')"
    )
    keywords <- c("doi", "author", "abstract", "volume", "issue")
    predicate <- paste0(
      "contains(", lower_text, ",'", keywords, "')",
      collapse = " or "
    )
    hits <- html_elements(soup, xpath = paste0("//*[", predicate, "]"))
    # Cap the result without indexing past the end of a shorter node set.
    containers <- hits[seq_len(min(length(hits), max_fallback))]
  }

  containers |>
    purrr::map(extract_article_data) |>
    purrr::keep(\(article) article$title != "") |>
    bind_rows()
}
# Write the scraped articles to a CSV file and report how many rows were saved.
#
# df: data frame of articles.
# file: output path; defaults to "journal_articles.csv".
save_to_csv <- function(df, file = "journal_articles.csv") {
  readr::write_csv(df, file)
  status <- glue::glue("✔ Saved {nrow(df)} articles to {file}")
  message(status)
}
# Main Pipeline ----
# Share link of the Google Drive file whose HTML is downloaded and scraped below.
gdrive_url <- "https://drive.google.com/file/d/1At1Y8CbwlInSQC5fbMvyExklEEKvctli/view?usp=sharing"
# Console banner: the script title between two rules of 60 "=" characters.
cat(strrep("=",60), "\nGOOGLE‑DRIVE JOURNAL ARTICLE SCRAPER (R)\n", strrep("=",60), "\n")
## ============================================================
## GOOGLE‑DRIVE JOURNAL ARTICLE SCRAPER (R)
## ============================================================
# Download the file, parse it as HTML, and extract one row per article.
html_content <- fetch_html_from_gdrive(gdrive_url)
soup <- read_html(html_content)
articles_df <- extract_articles_from_soup(soup)
# Abort early: the preview, CSV export and table below all read articles_df.
if (nrow(articles_df) == 0) {
stop("❌ No articles found — did the HTML structure change?")
}
# Preview in console: header line followed by a rule of 60 "-" characters.
cat("\nArticle summary:\n", strrep("-",60), "\n")
##
## Article summary:
## ------------------------------------------------------------
# Print one article's fields to the console; the abstract is truncated to
# 150 characters.
print_article_summary <- function(title, authors, date, doi, abstract) {
  entry <- glue::glue("{title}\n Authors: {authors}\n Date: {date}\n DOI: {doi}\n Abstract: {str_trunc(abstract, 150)}\n\n")
  cat(entry)
}

# Walk the rows of articles_df, passing each row's columns by name.
purrr::pwalk(articles_df, print_article_summary)
## Conceptual Research: Multidisciplinary Insights for Marketing
## Authors: Irina V. Kozlenkova Caleb Warren Suresh Kotha Reihane Boghrati Robert W. Palmatier
## Date: November 16, 2024
## DOI: https://doi.org/10.1177/00222429241302814
## Abstract: Preview abstractHide abstract AbstractConceptual research is fundamental to advancing theory and, thus, science. Conceptual articles launch new res...
## Fixing Onlies Versus Advancing Multiples: Number of Children and Parents’ Preferences for Educational Products
## Authors: Phyllis Xue Wang Ce Liang Qiyuan Wang
## Date: December 11, 2024
## DOI: https://doi.org/10.1177/00222429241306009
## Abstract: Preview abstractHide abstract AbstractDue to a continuous decline in fertility rates in recent decades, the number of one-child families has been i...
## Retailer Differentiation in Social Media: An Investigation of Firm-Generated Content on Twitter
## Authors: Mikhail Lysyakov P.K. Kannan Siva Viswanathan Kunpeng Zhang
## Date: February 11, 2025
## DOI: https://doi.org/10.1177/00222429241298654
## Abstract: Preview abstractHide abstract AbstractSocial media platforms have been used by firms for a variety of purposes: building firms’ brand image, increa...
## Cardio with Mr. Treadmill: How Anthropomorphizing the Means of Goal Pursuit Increases Motivation
## Authors: Lili Wang Maferima Touré-Tillery
## Date: November 22, 2024
## DOI: https://doi.org/10.1177/00222429241303387
## Abstract: Preview abstractHide abstract AbstractThis article examines the motivational consequences of anthropomorphizing the means of goal pursuit. Eight st...
## The Impact of App Crashes on Consumer Engagement
## Authors: Savannah Wei Shi Seoungwoo Lee Kirthi Kalyanam Michel Wedel
## Date: November 22, 2024
## DOI: https://doi.org/10.1177/00222429241304322
## Abstract: Preview abstractHide abstract AbstractThe authors develop and test a theoretical framework to examine the impact of app crashes on app engagement. ...
## Beyond the Pair: Media Archetypes and Complex Channel Synergies in Advertising
## Authors: J. Jason Bell Felipe Thomaz Andrew T. Stephen
## Date: February 26, 2025
## DOI: https://doi.org/10.1177/00222429241302808
## Abstract: Preview abstractHide abstract AbstractPrior research on advertising media mixes has mostly focused on single channels (e.g., television), pairwise ...
## Color Me Effective: The Impact of Color Saturation on Perceptions of Potency and Product Efficacy
## Authors: Lauren I. Labrecque Stefanie Sohn Barbara Seegebarth Christy Ashley
## Date: January 31, 2025
## DOI: https://doi.org/10.1177/00222429241296392
## Abstract: Preview abstractHide abstract AbstractConsumers use observable cues, like color, to help them evaluate products. This research establishes that con...
## Racial Inequity in Donation-Based Crowdfunding Platforms: The Role of Facial Emotional Expressiveness
## Authors: Elham Yazdani Anindita Chakravarty Jeffrey Inman
## Date: February 24, 2025
## DOI: https://doi.org/10.1177/00222429241300320
## Abstract: Preview abstractHide abstract AbstractDonation-based crowdfunding platforms often claim to pursue equitable outcomes for all beneficiaries, yet man...
# Persist the scraped table using save_to_csv()'s default file name.
save_to_csv(articles_df)
## ✔ Saved 8 articles to journal_articles.csv
## View the downloaded CSV
library(DT) # install.packages("DT") if you don’t have it
## Warning: package 'DT' was built under R version 4.4.3
# If you’ve just run the pipeline, articles_df is already in memory.
# Otherwise, load it back from disk:
# articles_df <- readr::read_csv("journal_articles.csv")
# Render the scraped articles as an interactive table: 10 rows per page,
# horizontal scrolling enabled, row names hidden, caption above the table.
DT::datatable(
  articles_df,
  options = list(pageLength = 10, scrollX = TRUE),
  rownames = FALSE,
  caption = htmltools::tags$caption(
    style = "caption-side: top; text-align: left;",
    "Table 1: Scraped journal articles"
  )
)