title: "Semantic Scholar API Data Fetch"
output: html_document
date: "2024-06-09"
editor_options:
  markdown:
    wrap: 72

Setup

Loading Libraries

library(httr)
library(retry)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)

Fetching API Key

Fetch the API key from environment variables. You need to get your own Semantic Scholar API key; a sketch for storing it follows the check below.

api_key <- Sys.getenv("S2_API_KEY")
if (api_key == "") {
  stop("API key not found in environment variables.")
}
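
If you haven't stored the key yet, one common approach (a sketch, not the only option) is to keep it in your ~/.Renviron file so every session can see it. The value below is a placeholder:

# In ~/.Renviron (placeholder value, replace with your own key):
# S2_API_KEY=your-key-here

# Reload the file in the current session without restarting R
readRenviron("~/.Renviron")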

Define Retry Configuration

Define the retry configuration

retry_config <- list(
  max_attempts = 5,                 # passed to RETRY() as `times`
  retry_times = c(1, 2, 4, 8, 16),  # not passed to RETRY(); kept for reference
  max_total_wait_time = Inf,        # not passed to RETRY()
  terminate_on = 0L,                # not passed to RETRY()
  pause_min = 1,                    # passed to RETRY() as `pause_base`
  pause_cap = 1                     # passed to RETRY() as `pause_cap` (caps each wait at 1 second)
)

Function to Make an API Request

Function to make an API request

make_request <- function(api_key, query, fields, limit, publication_types, s2FieldsOfStudy, year, min_citation_count, token) {
  response <- RETRY(
    "GET",
    "https://api.semanticscholar.org/graph/v1/paper/search/bulk",
    add_headers(`x-api-key` = api_key),
    query = list(
      query = query,
      fields = fields,
      limit = limit,
      publicationTypes = publication_types,
      s2FieldsOfStudy = s2FieldsOfStudy,
      year = year,
      minCitationCount = min_citation_count,
      token = token
    ),
    times = retry_config$max_attempts,
    pause_base = retry_config$pause_min,
    pause_cap = retry_config$pause_cap
  )
  stop_for_status(response)
  return(response)
}

Function to Process Response

Function to process API response data and store in a list.

There may be a way to optimize this further; a sketch of one alternative follows the function.

process_data <- function(data) {
  papers <- data$data
  paper_list <- vector("list", length(papers))
  
  for (i in seq_along(papers)) {
    paper <- papers[[i]]
    new_row <- data.frame(
      Title = if (!is.null(paper$title)) paper$title else NA,
      Year = if (!is.null(paper$year)) paper$year else NA,
      Number_of_Authors = if (!is.null(paper$authors)) length(paper$authors) else NA,
      References = if (!is.null(paper$referenceCount)) paper$referenceCount else NA,
      Citations = if (!is.null(paper$citationCount)) paper$citationCount else NA,
      Publication_Venue_Name = if (!is.null(paper$publicationVenue$name)) paper$publicationVenue$name else NA,
      stringsAsFactors = FALSE
    )
    paper_list[[i]] <- new_row
  }
  
  return(paper_list)
}
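
As a sketch of one possible alternative (the same logic written with lapply(), a bit more compact and idiomatic, though not necessarily faster; it assumes the same parsed-list structure returned by content(response, "parsed")):

process_data_lapply <- function(data) {
  # Build one single-row data frame per paper, exactly as process_data() does
  lapply(data$data, function(paper) {
    data.frame(
      Title = if (!is.null(paper$title)) paper$title else NA,
      Year = if (!is.null(paper$year)) paper$year else NA,
      Number_of_Authors = if (!is.null(paper$authors)) length(paper$authors) else NA,
      References = if (!is.null(paper$referenceCount)) paper$referenceCount else NA,
      Citations = if (!is.null(paper$citationCount)) paper$citationCount else NA,
      Publication_Venue_Name = if (!is.null(paper$publicationVenue$name)) paper$publicationVenue$name else NA,
      stringsAsFactors = FALSE
    )
  })
}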

Defining Query Terms

query <- "Artificial Intelligence | AI | Neural Network* | Deep Learning | Reinforcement Learning | Machine Learning | Computer Vision | Natural Language Processing | Robotics"
fields <- "title,year,authors,referenceCount,citationCount,publicationVenue"
limit <- 1000  # This is the limit for Bulk Search
publication_types <- "JournalArticle,ConferencePaper"
s2FieldsOfStudy <- "Computer Science,Mathematics,Artificial Intelligence,Machine Learning"
year <- "1980-2023"
min_citation_count <- 0

Fetching First Round of Data

all_papers_list <- list()
initial_response <- make_request(api_key, query, fields, limit, publication_types, s2FieldsOfStudy, year, min_citation_count, token = NULL)
initial_data <- content(initial_response, "parsed")
all_papers_list <- c(all_papers_list, process_data(initial_data))

print(initial_data$total)
## [1] 327662

Running a Loop to Get All the Data

There’s a lot of data here: about 320k papers, so this will take a while (about 5 minutes).

# Initialize variables
cycle_count <- 1
token <- initial_data$token

while (!is.null(token)) {
  next_response <- make_request(api_key, query, fields, limit, publication_types, s2FieldsOfStudy, year, min_citation_count, token)
  next_data <- content(next_response, "parsed")
  all_papers_list <- c(all_papers_list, process_data(next_data))
  token <- next_data$token
  cycle_count <- cycle_count + 1  # count the pages fetched so far
}
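
The loop counts pages in cycle_count but doesn't actually report time remaining. A minimal sketch of how that could be added (report_progress() is a hypothetical helper; it assumes roughly initial_data$total / limit pages and a constant time per page):

# Hypothetical helper: call report_progress(cycle_count, total_pages, start_time)
# at the end of each pass through the while loop above
report_progress <- function(pages_done, total_pages, start_time) {
  elapsed <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
  remaining <- elapsed / pages_done * (total_pages - pages_done)
  message(sprintf("Fetched page %d of ~%d; ~%.0f seconds remaining",
                  pages_done, total_pages, remaining))
}

total_pages <- ceiling(initial_data$total / limit)  # set before entering the loop
start_time <- Sys.time()                            # set before entering the loop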

Combining into a single dataset

all_papers <- bind_rows(all_papers_list)
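
Since the fetch takes several minutes, it can be worth caching the combined data frame so later runs can skip the API calls. A minimal sketch (the file name all_papers.rds is an arbitrary choice):

# Cache the combined data so re-knitting doesn't refetch ~320k records
saveRDS(all_papers, "all_papers.rds")
# A later session can reload it with:
# all_papers <- readRDS("all_papers.rds")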

Looking for Journals

# Group by Publication_Venue_Name and summarize the count of papers,
# overall and before 2000, then sort by the pre-2000 count
journal_summary <- all_papers %>%
  group_by(Publication_Venue_Name) %>%
  summarize(Number_of_Papers = n(), .groups = 'drop') %>%
  left_join(
    all_papers %>%
      filter(Year < 2000) %>%
      group_by(Publication_Venue_Name) %>%
      summarize(Number_of_Papers_Before_2000 = n(), .groups = 'drop'),
    by = c("Publication_Venue_Name")
  ) %>%
  replace_na(list(Number_of_Papers_Before_2000 = 0)) %>%
  arrange(desc(Number_of_Papers_Before_2000))

print(journal_summary)
## # A tibble: 3,891 × 3
##    Publication_Venue_Name                Number_of_Papers Number_of_Papers_Bef…¹
##    <chr>                                            <int>                  <int>
##  1 <NA>                                              5266                    796
##  2 Robotica (Cambridge. Print)                        176                     53
##  3 IEEE/RJS International Conference on…              613                     48
##  4 International Symposium on Experimen…               41                     15
##  5 AAAI Conference on Artificial Intell…               88                     13
##  6 IEEE Transactions on Systems, Man an…               12                     12
##  7 International Joint Conference on Ar…               44                     10
##  8 Robot Soccer World Cup                              77                     10
##  9 Autonome Mobile Systeme                             14                      9
## 10 IEEE International Conference on Rob…              804                      9
## # ℹ 3,881 more rows
## # ℹ abbreviated name: ¹​Number_of_Papers_Before_2000

Here’s the list of journals I went for

target_journals = c("International Joint Conference on Artificial Intelligence",
                    "Robotica (Cambridge. Print)",
                    "International Conference on Industrial, Engineering and Other Applications of Applied Intelligent Systems",
                    "IEEE Transactions on Systems, Man and Cybernetics",
                    "AAAI Conference on Artificial Intelligence",
                    "International Conference on Scientific Computing",
                    "Applied Artificial Intelligence",
                    "Computer",
                    "Artificial Intelligence Review",
                    "Artificial Intelligence",
                    "Annals of Mathematics and Artificial Intelligence",
                    "International Journal of Intelligent Systems"
)
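
Because the later steps filter on exact venue-name matches, a quick sanity check (just a sketch) is to confirm that each target name appears verbatim in the summary above:

# Any names returned here don't match a Publication_Venue_Name exactly;
# character(0) means every target journal was found
setdiff(target_journals, journal_summary$Publication_Venue_Name)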

How does this list differ from Google Scholar’s Top 20 AI Journals?

# Top 20 AI journals from Google Scholar 
target_journals2 <- c(
  "Neural Information Processing Systems",
  "International Conference on Learning Representations",
  "International Conference on Machine Learning",
  "AAAI Conference on Artificial Intelligence",
  "Expert systems with applications",
  "IEEE Transactions on Neural Networks and Learning Systems",
  "IEEE Transactions on Systems, Man and Cybernetics",
  "Neurocomputing",
  "International Joint Conference on Artificial Intelligence",
  "Applied Soft Computing",
  "Knowledge-Based Systems",
  "Neural Computing and Applications",
  "IEEE transactions on fuzzy systems",
  "Journal of machine learning research",
  "Artificial Intelligence Review",
  "International Conference on Artificial Intelligence and Statistics",
  "Neural Networks",
  "Engineering Applications of Artificial Intelligence",
  "Applied Intelligence",
  "Conference on Robot Learning"
)

excluded_journals <- setdiff(target_journals2, target_journals)

excluded_papers <- all_papers %>%
  filter(Publication_Venue_Name %in% excluded_journals) %>%
  filter(Year > 1983)

excluded_summary <- excluded_papers %>%
  group_by(Publication_Venue_Name) %>%
  summarize(Number_of_Papers = n(), .groups = 'drop') %>%
  left_join(
    all_papers %>%
      filter(Year < 2000) %>%
      group_by(Publication_Venue_Name) %>%
      summarize(Number_of_Papers_Before_2000 = n(), .groups = 'drop'),
    by = c("Publication_Venue_Name")
  ) %>%
  replace_na(list(Number_of_Papers_Before_2000 = 0)) %>%
  arrange(desc(Number_of_Papers_Before_2000))


head(excluded_summary)
## # A tibble: 6 × 3
##   Publication_Venue_Name                 Number_of_Papers Number_of_Papers_Bef…¹
##   <chr>                                             <int>                  <int>
## 1 Neural Information Processing Systems                40                      4
## 2 Neural Networks                                       7                      2
## 3 Applied Soft Computing                                9                      0
## 4 Conference on Robot Learning                         55                      0
## 5 Expert systems with applications                     21                      0
## 6 IEEE Transactions on Neural Networks …               24                      0
## # ℹ abbreviated name: ¹​Number_of_Papers_Before_2000

Several of the top journals don’t have many publications before the year 2000.

Combining AI Journal Publications and arXiv Posts

# Combine the data for technical journals and arXiv papers
technical_papers <- all_papers %>%
  filter(Publication_Venue_Name %in% target_journals) 

# Average stats by year for technical journals
avg_stats_by_year_tech <- technical_papers %>%
  group_by(Year) %>%
  summarise(
    Average_References = mean(References, na.rm = TRUE),
    Average_Coauthors = mean(Number_of_Authors, na.rm = TRUE),
    Total_Papers = n()
  ) %>%
  mutate(Source = "Technical Journals")
# arXiv papers with more than 15 citations
arXiv_papers <- all_papers %>%
  filter(Publication_Venue_Name == "arXiv.org") %>%
  filter(Citations > 15)

# Average stats by year for arXiv papers
avg_stats_by_year_arxiv <- arXiv_papers %>%
  group_by(Year) %>%
  summarise(
    Average_References = mean(References, na.rm = TRUE),
    Average_Coauthors = mean(Number_of_Authors, na.rm = TRUE),
    Total_Papers = n()
  ) %>%
  mutate(Source = "arXiv papers (>15 citations)")

# Combine the two datasets
combined_stats_by_year1 <- bind_rows(avg_stats_by_year_tech, avg_stats_by_year_arxiv)

Now we’re ready to plot the data for total papers per year

# Plot for Total Papers by Year
plot_total_papers <- ggplot(combined_stats_by_year1, aes(x = Year, color = Source)) +
  geom_line(aes(y = Total_Papers)) +
  geom_vline(xintercept = 2013, linetype = "dashed", color = "black", alpha = 0.5) +  # Add vertical line at Year 2013
  geom_vline(xintercept = 1983, linetype = "dashed", color = "black", alpha = 0.5) +  # Add vertical line at Year 1983
  labs(
    title = "Data Used",
    x = "Year",
    y = "Number of Papers",
    color = "Source"
  ) +
  theme_minimal()

print(plot_total_papers)

We see that there are not many observations in 1980, 1981, and 1982 for the AI journals. There are also not many observations before 2012 for prominent arXiv posts.

avg_stats_by_year_tech <- avg_stats_by_year_tech %>%
  filter(Year > 1983)

sum(avg_stats_by_year_tech$Total_Papers)
## [1] 366
avg_stats_by_year_arxiv_later <- avg_stats_by_year_arxiv %>%
  filter(Year > 2012)

sum(avg_stats_by_year_arxiv_later$Total_Papers)
## [1] 118

We still have quite a lot of data: 366 technical journal papers (after 1983) and 118 highly cited arXiv papers (after 2012).

Plotting Average Co-Authors and Reference Counts

# Combine the two datasets
combined_stats_by_year <- bind_rows(avg_stats_by_year_tech, avg_stats_by_year_arxiv_later)

# Plot for Average References by Year
plot_avg_references <- ggplot(combined_stats_by_year, aes(x = Year, color = Source)) +
  geom_line(aes(y = Average_References)) + 
  geom_smooth(aes(y = Average_References, group = Source), se = FALSE, color = "black", method = "lm", linewidth = 0.3) +  # Black linear trend line per source
  labs(
    title = "Average Number of References by Year for AI Papers",
    x = "Year",
    y = "Average Number of References",
    color = "Source"
  ) +
  theme_minimal()
# Print the plot
print(plot_avg_references)
## `geom_smooth()` using formula = 'y ~ x'

# Plot for Average Co-authors by Year
plot_avg_coauthors <- ggplot(combined_stats_by_year, aes(x = Year, color = Source)) +
  geom_line(aes(y = Average_Coauthors)) +
  geom_smooth(aes(y = Average_Coauthors, group = Source), se = FALSE, color = "black", method = "lm", linewidth = 0.3) +  # Black linear trend line per source
  labs(
    title = "Average Number of Co-authors by Year for AI Papers",
    x = "Year",
    y = "Average Number of Co-authors",
    color = "Source"
  ) +
  theme_minimal()

# Display the plot
print(plot_avg_coauthors)
## `geom_smooth()` using formula = 'y ~ x'