title: "Semantic Scholar API Data Fetch"
output: html_document
date: "2024-06-09"
editor_options:
  markdown:
    wrap: 72
Loading Libraries
library(httr)
library(retry) # note: the retries below use httr::RETRY(), so this package is not strictly needed
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
Fetch the API key from environment variables. You need to get your own API key.
api_key <- Sys.getenv("S2_API_KEY")
if (api_key == "") {
  stop("API key not found in environment variables.")
}
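One way to provide the key (a suggestion, not part of the original setup) is to store it outside the script, either in ~/.Renviron or for the current session only:
# Sketch: put a line like the following in ~/.Renviron and restart R
#   S2_API_KEY=your-key-here
# or set it just for this session:
# Sys.setenv(S2_API_KEY = "your-key-here")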
Define the retry configuration
retry_config <- list(
  max_attempts = 5,                # passed to RETRY() as `times`
  retry_times = c(1, 2, 4, 8, 16), # intended backoff schedule; not used by RETRY() below
  max_total_wait_time = Inf,       # not used below
  terminate_on = 0L,               # not used below
  pause_min = 1,                   # passed to RETRY() as `pause_base`
  pause_cap = 1                    # passed to RETRY() as `pause_cap`
)
Function to make an API request
make_request <- function(api_key, query, fields, limit, publication_types, s2FieldsOfStudy, year, min_citation_count, token) {
  response <- RETRY(
    "GET",
    "https://api.semanticscholar.org/graph/v1/paper/search/bulk",
    add_headers(`x-api-key` = api_key),
    query = list(
      query = query,
      fields = fields,
      limit = limit,
      publicationTypes = publication_types,
      s2FieldsOfStudy = s2FieldsOfStudy,
      year = year,
      minCitationCount = min_citation_count,
      token = token
    ),
    times = retry_config$max_attempts,
    pause_base = retry_config$pause_min,
    pause_cap = retry_config$pause_cap
  )
  stop_for_status(response)
  return(response)
}
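For orientation, here is how the parsed response is used below (field names are inferred from this script, not quoted from the API documentation):
# Sketch of the parsed response structure as consumed in this script:
# parsed <- content(response, "parsed")
# parsed$total # total number of matching papers
# parsed$data  # list of paper records for the current page
# parsed$token # continuation token for the next page; NULL on the last page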
Function to process the API response and store each paper as a one-row data frame in a list.
There is probably a more efficient way to do this.
process_data <- function(data) {
  papers <- data$data
  paper_list <- vector("list", length(papers))
  for (i in seq_along(papers)) {
    paper <- papers[[i]]
    new_row <- data.frame(
      Title = if (!is.null(paper$title)) paper$title else NA,
      Year = if (!is.null(paper$year)) paper$year else NA,
      Number_of_Authors = if (!is.null(paper$authors)) length(paper$authors) else NA,
      References = if (!is.null(paper$referenceCount)) paper$referenceCount else NA,
      Citations = if (!is.null(paper$citationCount)) paper$citationCount else NA,
      Publication_Venue_Name = if (!is.null(paper$publicationVenue$name)) paper$publicationVenue$name else NA,
      stringsAsFactors = FALSE
    )
    paper_list[[i]] <- new_row
  }
  return(paper_list)
}
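On the optimization point: a more concise (though not necessarily faster) alternative, sketched here assuming the purrr package is available (it is not loaded above), would be to build the rows with map():
# Hypothetical alternative using purrr (a sketch; not used below)
process_data_map <- function(data) {
  purrr::map(data$data, function(paper) {
    data.frame(
      Title = if (!is.null(paper$title)) paper$title else NA,
      Year = if (!is.null(paper$year)) paper$year else NA,
      Number_of_Authors = if (!is.null(paper$authors)) length(paper$authors) else NA,
      References = if (!is.null(paper$referenceCount)) paper$referenceCount else NA,
      Citations = if (!is.null(paper$citationCount)) paper$citationCount else NA,
      Publication_Venue_Name = if (!is.null(paper$publicationVenue$name)) paper$publicationVenue$name else NA,
      stringsAsFactors = FALSE
    )
  })
}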
query <- "Artificial Intelligence | AI | Neural Network* | Deep Learning | Reinforcement Learning | Machine Learning | Computer Vision | Natural Language Processing | Robotics"
fields <- "title,year,authors,referenceCount,citationCount,publicationVenue"
limit <- 1000 # maximum number of results per page for the bulk search endpoint
publication_types <- "JournalArticle,ConferencePaper"
s2FieldsOfStudy <- "Computer Science,Mathematics,Artificial Intelligence,Machine Learning"
year <- "1980-2023"
min_citation_count <- 0
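Before kicking off the full fetch, it can help to confirm the parameters are accepted with one small request. This is an optional sketch (not part of the original run); the narrower year range is just an arbitrary choice for the test.
# Optional sanity check (sketch): a single request with a narrower year range
# test_response <- make_request(api_key, query, fields, limit,
#                               publication_types, s2FieldsOfStudy,
#                               "2022-2023", min_citation_count, token = NULL)
# content(test_response, "parsed")$total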
Fetching First Round of Data
all_papers_list <- list()
initial_response <- make_request(api_key, query, fields, limit, publication_types, s2FieldsOfStudy, year, min_citation_count, token = NULL)
initial_data <- content(initial_response, "parsed")
all_papers_list <- c(all_papers_list, process_data(initial_data))
print(initial_data$total)
## [1] 327662
There’s a lot of data here: about 328,000 papers. Fetching it all will take a while (about 5 minutes).
# Initialize variables
cycle_count <- 1
token <- initial_data$token
while (!is.null(token)) {
  next_response <- make_request(api_key, query, fields, limit, publication_types, s2FieldsOfStudy, year, min_citation_count, token)
  next_data <- content(next_response, "parsed")
  all_papers_list <- c(all_papers_list, process_data(next_data))
  token <- next_data$token
  cycle_count <- cycle_count + 1 # count the pages fetched
}
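Since the full fetch runs for several minutes, it may be worth saving partial progress inside the loop. This is an optional sketch using base R's saveRDS(); the file name is illustrative and the original run did not do this.
# Optional (sketch): checkpoint partial results inside the while loop, e.g.
# if (cycle_count %% 50 == 0) {
#   saveRDS(all_papers_list, "all_papers_checkpoint.rds") # illustrative file name
# }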
Combining into a single dataset
all_papers <- bind_rows(all_papers_list)
# Group by Publication_Venue_Name and summarize the count of papers
journal_summary <- all_papers %>%
  group_by(Publication_Venue_Name) %>%
  summarize(Number_of_Papers = n(), .groups = 'drop') %>%
  left_join(
    all_papers %>%
      filter(Year < 2000) %>%
      group_by(Publication_Venue_Name) %>%
      summarize(Number_of_Papers_Before_2000 = n(), .groups = 'drop'),
    by = c("Publication_Venue_Name")
  ) %>%
  replace_na(list(Number_of_Papers_Before_2000 = 0)) %>%
  arrange(desc(Number_of_Papers_Before_2000))
print(journal_summary)
## # A tibble: 3,891 × 3
## Publication_Venue_Name Number_of_Papers Number_of_Papers_Bef…¹
## <chr> <int> <int>
## 1 <NA> 5266 796
## 2 Robotica (Cambridge. Print) 176 53
## 3 IEEE/RJS International Conference on… 613 48
## 4 International Symposium on Experimen… 41 15
## 5 AAAI Conference on Artificial Intell… 88 13
## 6 IEEE Transactions on Systems, Man an… 12 12
## 7 International Joint Conference on Ar… 44 10
## 8 Robot Soccer World Cup 77 10
## 9 Autonome Mobile Systeme 14 9
## 10 IEEE International Conference on Rob… 804 9
## # ℹ 3,881 more rows
## # ℹ abbreviated name: ¹Number_of_Papers_Before_2000
Here’s the list of journals I went for
target_journals <- c(
  "International Joint Conference on Artificial Intelligence",
  "Robotica (Cambridge. Print)",
  "International Conference on Industrial, Engineering and Other Applications of Applied Intelligent Systems",
  "IEEE Transactions on Systems, Man and Cybernetics",
  "AAAI Conference on Artificial Intelligence",
  "International Conference on Scientific Computing",
  "Applied Artificial Intelligence",
  "Computer",
  "Artificial Intelligence Review",
  "Artificial Intelligence",
  "Annals of Mathematics and Artificial Intelligence",
  "International Journal of Intelligent Systems"
)
How does this list differ from Google Scholar’s Top 20 AI Journals?
# Top 20 AI journals from Google Scholar
target_journals2 <- c(
  "Neural Information Processing Systems",
  "International Conference on Learning Representations",
  "International Conference on Machine Learning",
  "AAAI Conference on Artificial Intelligence",
  "Expert systems with applications",
  "IEEE Transactions on Neural Networks and Learning Systems",
  "IEEE Transactions on Systems, Man and Cybernetics",
  "Neurocomputing",
  "International Joint Conference on Artificial Intelligence",
  "Applied Soft Computing",
  "Knowledge-Based Systems",
  "Neural Computing and Applications",
  "IEEE transactions on fuzzy systems",
  "Journal of machine learning research",
  "Artificial Intelligence Review",
  "International Conference on Artificial Intelligence and Statistics",
  "Neural Networks",
  "Engineering Applications of Artificial Intelligence",
  "Applied Intelligence",
  "Conference on Robot Learning"
)
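Before taking the difference, a quick look at the overlap answers part of the question; this one-liner is a sketch (not run in the original) and, with the two vectors as defined above, would return the four venues that appear in both lists.
# Venues present in both lists (sketch)
# intersect(target_journals, target_journals2)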
excluded_journals <- setdiff(target_journals2, target_journals)
excluded_papers <- all_papers %>%
  filter(Publication_Venue_Name %in% excluded_journals) %>%
  filter(Year > 1983)
excluded_summary <- excluded_papers %>%
  group_by(Publication_Venue_Name) %>%
  summarize(Number_of_Papers = n(), .groups = 'drop') %>%
  left_join(
    all_papers %>%
      filter(Year < 2000) %>%
      group_by(Publication_Venue_Name) %>%
      summarize(Number_of_Papers_Before_2000 = n(), .groups = 'drop'),
    by = c("Publication_Venue_Name")
  ) %>%
  replace_na(list(Number_of_Papers_Before_2000 = 0)) %>%
  arrange(desc(Number_of_Papers_Before_2000))
head(excluded_summary)
## # A tibble: 6 × 3
## Publication_Venue_Name Number_of_Papers Number_of_Papers_Bef…¹
## <chr> <int> <int>
## 1 Neural Information Processing Systems 40 4
## 2 Neural Networks 7 2
## 3 Applied Soft Computing 9 0
## 4 Conference on Robot Learning 55 0
## 5 Expert systems with applications 21 0
## 6 IEEE Transactions on Neural Networks … 24 0
## # ℹ abbreviated name: ¹Number_of_Papers_Before_2000
Several of these top journals don’t have many publications before the year 2000.
# Combine the data for technical journals and arXiv papers
technical_papers <- all_papers %>%
  filter(Publication_Venue_Name %in% target_journals)
# Average stats by year for technical journals
avg_stats_by_year_tech <- technical_papers %>%
  group_by(Year) %>%
  summarise(
    Average_References = mean(References, na.rm = TRUE),
    Average_Coauthors = mean(Number_of_Authors, na.rm = TRUE),
    Total_Papers = n()
  ) %>%
  mutate(Source = "Technical Journals")
arXiv_papers <- all_papers %>%
  filter(Publication_Venue_Name == "arXiv.org") %>%
  filter(Citations > 15)
# Average stats by year for arXiv papers
avg_stats_by_year_arxiv <- arXiv_papers %>%
  group_by(Year) %>%
  summarise(
    Average_References = mean(References, na.rm = TRUE),
    Average_Coauthors = mean(Number_of_Authors, na.rm = TRUE),
    Total_Papers = n()
  ) %>%
  mutate(Source = "arXiv papers (>15 citations)")
# Combine the two datasets
combined_stats_by_year1 <- bind_rows(avg_stats_by_year_tech, avg_stats_by_year_arxiv)
Now we’re ready to plot the data for total papers per year
# Plot for Total Papers by Year
plot_total_papers <- ggplot(combined_stats_by_year1, aes(x = Year, color = Source)) +
  geom_line(aes(y = Total_Papers)) +
  geom_vline(xintercept = 2013, linetype = "dashed", color = "black", alpha = 0.5) + # vertical line at 2013
  geom_vline(xintercept = 1983, linetype = "dashed", color = "black", alpha = 0.5) + # vertical line at 1983
  labs(
    title = "Data Used",
    x = "Year",
    y = "Number of Papers",
    color = "Source"
  ) +
  theme_minimal()
print(plot_total_papers)
We see that there aren’t many observations in 1980, 1981, and 1982 for the AI journals. There also aren’t many observations before 2012 for prominent arXiv posts.
avg_stats_by_year_tech <- avg_stats_by_year_tech %>%
filter(Year > 1983)
sum(avg_stats_by_year_tech$Total_Papers)
## [1] 366
avg_stats_by_year_arxiv_later <- avg_stats_by_year_arxiv %>%
filter(Year > 2012)
sum(avg_stats_by_year_arxiv_later$Total_Papers)
## [1] 118
We still have quite a lot of data to work with: 366 papers from the technical journals and 118 arXiv papers with more than 15 citations.
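If the trimmed series are used together downstream, they can be recombined the same way as before; a small sketch mirroring the earlier bind_rows() step (the original section stops here).
# Sketch: recombine the filtered yearly summaries for downstream use
# combined_stats_filtered <- bind_rows(avg_stats_by_year_tech, avg_stats_by_year_arxiv_later)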