Introduction

The getMetadata function works well with one PubMed record. However, there are cases where we need to parse multiple records.

This tutorial shows how we can extract the metadata from multiple PubMed records retrieved with the easyPubMed package.

Parsing PubMed records

Preparation

If you have an Entrez API key, you can save it in the variable ENTREZ_KEY.

# Optional Entrez API key. It is passed to the download call below;
# leave it as NULL if you do not have one.
ENTREZ_KEY <- NULL

# Packages used in this tutorial (attach order kept as-is, because it
# determines which package masks which).
library(easyPubMed)
library(xml2)
library(dplyr)
library(XML)

Get PubMed records

Next, we will download approximately 1500 PubMed records.

# PubMed search string: journal articles indexed under the MeSH term
# "breast neoplasms, male" with publication dates from 2010 to 2022.
query <- '"breast neoplasms, male"[MeSH Terms] AND "journal article"[pt] AND ("2010"[dp]:"2022"[dp])'

# Download the matching records in XML format, 400 records per batch.
# The batch files are written to the working directory with the prefix
# "maleCancer_". The call is left unassigned so that the names of the
# created files are printed.
batch_pubmed_download(
  query,
  dest_file_prefix = "maleCancer_",
  format = "xml",
  api_key = ENTREZ_KEY,
  batch_size = 400
)
## [1] "PubMed data batch 1 / 4 downloaded..."
## [1] "PubMed data batch 2 / 4 downloaded..."
## [1] "PubMed data batch 3 / 4 downloaded..."
## [1] "PubMed data batch 4 / 4 downloaded..."
## [1] "maleCancer_01.txt" "maleCancer_02.txt" "maleCancer_03.txt"
## [4] "maleCancer_04.txt"

In this case, the records are saved in .txt files, so we need to read them into the R environment. The readXMLPubmed function in the readXML.R script reads one file containing PubMed records.

# Copy the helper scripts into your working directory and source them.
# If the scripts live somewhere else, give the full path to each file.
invisible(lapply(
  c(
    "getMetadata.R",
    "readXML.R",
    "getAuthors.R",
    "makeDfFromXmlTxt.R",
    "nodeList2Df.R"
  ),
  source
))

Parse metadata

Since we are working with multiple records, we use the lapply function to apply getMetadata to each record.

# Locate the downloaded batch files. `pattern` is a regular expression, not
# a shell glob: the original 'maleCancer*' matched any file name containing
# "maleCance". The expression is therefore anchored and the dot escaped so
# that only files such as "maleCancer_01.txt" are picked up.
batch_files <- list.files(pattern = "^maleCancer_.*\\.txt$")
if (length(batch_files) == 0L) {
  stop(
    "No 'maleCancer_*.txt' files found in the working directory.",
    call. = FALSE
  )
}

# Read every batch file with readXMLPubmed() and concatenate the per-file
# results into a single collection of records.
records <- Reduce(c, lapply(batch_files, readXMLPubmed))

# Parse each record with read_xml().
records <- lapply(records, read_xml)

# Extract the requested metadata fields from every parsed record.
metadata <- lapply(
  records,
  getMetadata,
  metadata.list = c("meshDescrip", "meshQualifier", "journal", "year")
)

# The intermediate objects are no longer needed; release the memory.
rm(batch_files, records)
gc()

Convert metadata list into a dataframe

Now, the metadata object is a list that contains the metadata for each record obtained.

# Show the structure of the metadata extracted for the first record.
metadata[[1]] |> str()
## List of 4
##  $ meshDescrip  :'data.frame':   7 obs. of  5 variables:
##   ..$ text        : chr [1:7] "Humans" "Male" "Breast Neoplasms, Male" "Cardiovascular Diseases" ...
##   ..$ UI          : chr [1:7] "D006801" "D008297" "D018567" "D002318" ...
##   ..$ MajorTopicYN: chr [1:7] "N" "N" "Y" "Y" ...
##   ..$ Type        : chr [1:7] "D006801" "D008297" "D018567" "D002318" ...
##   ..$ id          : chr [1:7] "36661672" "36661672" "36661672" "36661672" ...
##  $ meshQualifier:'data.frame':   5 obs. of  4 variables:
##   ..$ text        : chr [1:5] "ethnology" "therapy" "epidemiology" "ethnology" ...
##   ..$ UI          : chr [1:5] "Q000208" "Q000628" "Q000453" "Q000208" ...
##   ..$ MajorTopicYN: chr [1:5] "N" "N" "N" "N" ...
##   ..$ id          : chr [1:5] "36661672" "36661672" "36661672" "36661672" ...
##  $ journal      :'data.frame':   1 obs. of  2 variables:
##   ..$ Title: chr "Current oncology (Toronto, Ont.)"
##   ..$ id   : chr "36661672"
##  $ year         :'data.frame':   1 obs. of  2 variables:
##   ..$ Year: chr "2022"
##   ..$ id  : chr "36661672"

I prefer to use a dataframe, so we can work with the tidyverse packages. We can use the rbindlist function to combine the data into a single dataframe.

# Take the `year` data frame of every record and stack them into a single
# table; `fill = TRUE` tolerates records whose data frames lack a column.
years <- data.table::rbindlist(
  lapply(metadata, \(record) record[["year"]]),
  fill = TRUE
)

# Preview the first rows as a formatted table.
years |>
  head() |>
  knitr::kable()
Year id
2022 36661672
2022 36633028
2023 36564243
2023 36494460
2022 36456130
2022 36440188

Metadata Analysis

Papers per year

We can now start analyzing the metadata. For example, we can create a bar chart showing the number of papers published per year.

library(ggplot2)

# Bar chart with the number of papers per publication year.
papersYearsPlot <-
  years |>
  as_tibble() |>
  # Year is stored as character; convert it before counting so the years
  # are grouped and plotted as numbers.
  mutate(Year = as.integer(Year)) |>
  # Records without a usable year cannot be placed on the axis.
  filter(!is.na(Year)) |>
  count(Year) |>
  ggplot() +
  geom_col(aes(x = Year, y = n), fill = "#4271AE") +
  labs(
    x = "year",
    y = "N of articles",
    title = "Papers of male cancer"
  ) +
  theme(
    plot.title = element_text(face = "bold"),
    # `size` matches the ggplot2 version in the session info (3.3.6);
    # from ggplot2 3.4.0 onwards the argument is called `linewidth`.
    panel.border = element_rect(color = "black", fill = NA, size = 0.8),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_rect(fill = "transparent", colour = NA),
    plot.background = element_rect(fill = "transparent", colour = NA),
    axis.text.x = element_text(angle = -90, vjust = 0.5)
  )

papersYearsPlot

Journals

Which journals publish the most articles related to the query?

library(tidyr)

# Stack the per-record `journal` data frames into one table.
journals <- lapply(metadata, `[[`, "journal") |>
  data.table::rbindlist(fill = TRUE)

# The 20 most frequent journal titles, most frequent first. Records
# without a journal title are dropped before counting.
journalsTbl <-
  journals |>
  select(Title) |>
  drop_na() |>
  count(Title) |>
  slice_max(n, n = 20) |>
  arrange(desc(n))

# datatable() is namespace-qualified because no library(DT) call appears
# in this tutorial's code; this also matches the data.table:: and knitr::
# calls used elsewhere.
DT::datatable(journalsTbl)

Analyzing MeSH terms

What are the most frequent MeSH terms in the records?

library(forcats)

# Stack the per-record MeSH descriptor and qualifier data frames.
# The qualifiers must come from the 'meshQualifier' element; reading
# 'meshDescrip' twice would count every descriptor a second time and
# label it as a qualifier.
meshD <- lapply(metadata, `[[`, "meshDescrip") |>
  data.table::rbindlist(fill = TRUE)
meshQ <- lapply(metadata, `[[`, "meshQualifier") |>
  data.table::rbindlist(fill = TRUE)

# Keep only the record id and the term text, and tag each table with
# the kind of MeSH term it holds.
meshD <-
  meshD |>
  select(id, text) |>
  mutate(type = "descriptor")

meshQ <-
  meshQ |>
  select(id, text) |>
  mutate(type = "qualifiers")

# One long table with descriptors and qualifiers together.
mesh <-
  data.table::rbindlist(list(meshD, meshQ))

# Horizontal bar chart of the 25 most frequent terms, coloured by type.
meshFreqPlot <-
  mesh |>
  drop_na() |>
  count(text, type) |>
  slice_max(order_by = n, n = 25) |>
  # Order the bars by frequency instead of alphabetically.
  mutate(meshes = fct_reorder(text, n)) |>
  ggplot() +
  geom_col(aes(x = n, y = meshes, fill = type)) +
  # The x axis shows the counts; the terms themselves are on the y axis.
  labs(
    x = "Frequency",
    y = "MeSH terms",
    title = "Most frequent MeSH terms"
  ) +
  theme(
    plot.title = element_text(face = "bold"),
    # `size` matches the ggplot2 version in the session info (3.3.6);
    # from ggplot2 3.4.0 onwards the argument is called `linewidth`.
    panel.border = element_rect(color = "black", fill = NA, size = 0.8),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_rect(fill = "transparent", colour = NA),
    plot.background = element_rect(fill = "transparent", colour = NA),
    axis.text.x = element_text(angle = -90, vjust = 0.5)
  )

meshFreqPlot

As specified in the "Example: Parsing PubMed Records Using Custom Functions" vignette, you can currently parse the following metadata by passing their names in the metadata.list argument of the getMetadata function: abstract, meshDescrip, meshQualifier, keywordTerms, pubType, journal, country, language, title, year and authors.

## R version 4.2.1 (2022-06-23 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 22621)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=Spanish_Mexico.utf8  LC_CTYPE=Spanish_Mexico.utf8   
## [3] LC_MONETARY=Spanish_Mexico.utf8 LC_NUMERIC=C                   
## [5] LC_TIME=Spanish_Mexico.utf8    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] forcats_0.5.1   tidyr_1.2.0     ggplot2_3.3.6   XML_3.99-0.10  
## [5] dplyr_1.0.9     xml2_1.3.3      easyPubMed_2.22 DT_0.27        
## 
## loaded via a namespace (and not attached):
##  [1] highr_0.10        bslib_0.4.2       compiler_4.2.1    pillar_1.7.0     
##  [5] jquerylib_0.1.4   tools_4.2.1       digest_0.6.29     gtable_0.3.0     
##  [9] jsonlite_1.8.4    evaluate_0.20     lifecycle_1.0.3   tibble_3.1.7     
## [13] pkgconfig_2.0.3   rlang_1.0.6       cli_3.3.0         DBI_1.1.3        
## [17] rstudioapi_0.13   crosstalk_1.2.0   yaml_2.3.7        xfun_0.36        
## [21] fastmap_1.1.0     withr_2.5.0       knitr_1.42        generics_0.1.3   
## [25] vctrs_0.5.2       htmlwidgets_1.5.4 sass_0.4.5        grid_4.2.1       
## [29] tidyselect_1.1.2  data.table_1.14.2 glue_1.6.2        R6_2.5.1         
## [33] fansi_1.0.3       rmarkdown_2.20.1  farver_2.1.1      purrr_0.3.4      
## [37] magrittr_2.0.3    scales_1.2.0      htmltools_0.5.4   ellipsis_0.3.2   
## [41] assertthat_0.2.1  colorspace_2.0-3  labeling_0.4.2    utf8_1.2.2       
## [45] munsell_0.5.0     cachem_1.0.6      crayon_1.5.1