Extract PubMed data for Obesity:

library(RISmed)
library(dplyr)

# set topic of interest (PubMed query string passed to EUtilsSummary)
topic <- "obesity[MeSH]"

# set number of publications to pull in each iteration (page size);
# defined first so the iteration count below is derived from it
retmax <- 100

# create publication count variable for 2000-2012; retmax = 1 because only
# the count slot of the summary is read here
pub_count <- EUtilsSummary(
  topic,
  retmax = 1, mindate = 2000, maxdate = 2012
)@count

# determine number of iterations needed for loop
iter <- ceiling(pub_count / retmax)
#iter <- 10 # test

# Page through the search results and build three tables:
#   base_f     - PMID, Article_Title and YearPub for every publication
#   tbl_Auth_f - the Author() rows of every publication, tagged with its PMID
#   tbl_MeSH_f - the Mesh() rows of Type "Descriptor", tagged with the PMID
# Each page is stored in a preallocated list and the pages are bound once at
# the end, instead of re-copying the growing tables on every iteration.
# NOTE(review): NCBI's ESearch documentation says PubMed only serves the
# first 10,000 records of a query - confirm paging by retstart beyond that.

# add the PMID to each publication's table and stack the results;
# `prepare` is applied to each table before it is tagged
tag_and_stack <- function(tables, ids, prepare = identity) {
  tagged <- lapply(seq_along(ids), function(k) {
    tbl <- tables[[k]]
    # entries that are not data frames (presumably NA when a record has no
    # authors / MeSH terms - confirm against the RISmed docs) are skipped
    if (!is.data.frame(tbl)) {
      return(NULL)
    }
    tbl <- prepare(tbl)
    # rep() keeps the assignment valid when `prepare` leaves zero rows
    tbl$PMID <- rep(ids[k], nrow(tbl))
    tbl
  })
  bind_rows(tagged)
}

# keep only Descriptor rows and convert the first two columns to character
keep_descriptors <- function(tbl) {
  tbl <- filter(tbl, Type == "Descriptor")
  tbl[1:2] <- lapply(tbl[1:2], as.character)
  tbl
}

# one slot per page of results
page_base <- vector("list", iter)
page_auth <- vector("list", iter)
page_mesh <- vector("list", iter)

# seq_len() runs zero times when the query matched nothing
for (i in seq_len(iter)) {
  # set restart value (offset of the first record on this page)
  retstart <- (i - 1) * retmax

  # set query
  query <- EUtilsSummary(
    topic,
    retstart = retstart, retmax = retmax,
    mindate = 2000, maxdate = 2012
  )

  # fetch query results
  fetch <- EUtilsGet(query)

  # create base data frame for this page
  pmids <- PMID(fetch)
  page_base[[i]] <- data.frame(
    PMID = pmids,
    Article_Title = ArticleTitle(fetch),
    YearPub = YearPubmed(fetch)
  )

  # per-publication author and MeSH tables
  authors <- Author(fetch)
  mesh <- Mesh(fetch)

  page_auth[[i]] <- tag_and_stack(authors, pmids)
  page_mesh[[i]] <- tag_and_stack(mesh, pmids, prepare = keep_descriptors)
}

# combine all pages into the final data frames
base_f <- bind_rows(page_base)
tbl_Auth_f <- bind_rows(page_auth)
tbl_MeSH_f <- bind_rows(page_mesh)

# clean up
rm(page_base, page_auth, page_mesh, tag_and_stack, keep_descriptors,
   authors, fetch, i, iter, mesh,
   pmids, pub_count, query, retmax, retstart, topic)

# write output for later use: each final table goes to its own .rds file
# in the current working directory (objects and paths are paired by position)
invisible(Map(
  saveRDS,
  list(base_f, tbl_Auth_f, tbl_MeSH_f),
  c("./base_f.rds", "./tbl_Auth_f.rds", "./tbl_MeSH_f.rds")
))

Aggregate data

# count rows per MeSH heading, drop the "Obesity" heading itself and
# sort the remaining headings from most to least frequent
mesh_grp <- tbl_MeSH_f %>%
  group_by(Heading) %>%
  tally() %>%
  ungroup() %>%
  filter(Heading != "Obesity") %>%
  arrange(desc(n))

# remove the full MeSH table from the workspace
rm(tbl_MeSH_f)

# preview the most frequent headings
head(mesh_grp)
## Source: local data frame [6 x 2]
## 
##           Heading     n
##             (chr) (int)
## 1          Humans 76603
## 2          Female 50433
## 3            Male 46174
## 4           Adult 31195
## 5     Middle Aged 27358
## 6 Body Mass Index 24548

Plot top 25 MeSH headings

# horizontal bar chart of the first 25 rows of mesh_grp, with each bar
# labelled by its count
library(ggplot2)
mesh_grp %>%
  slice(1:25) %>%
  ggplot(aes(x = Heading, y = n, label = n)) +
  geom_bar(stat = "identity", width = 0.25, colour = "light grey") +
  geom_text(colour = "black") +
  coord_flip() +
  labs(title = "Top 25 MeSH Headings for Obesity search")

In the chart, Hypertension and Diabetes stand out as two diseases associated with Obesity.

Read in UMLS semantic file

# read in UMLS semantic types: a pipe-delimited file without a header row,
# so the columns get the default names V1, V2, ...; every column is kept
# as character. The quote/dec/fill/comment.char values spell out the
# defaults that read.csv() would apply.
umls <- read.table(
  file = "http://semanticnetwork.nlm.nih.gov/download/SemGroups.txt",
  header = FALSE,
  sep = "|",
  quote = "\"",
  dec = ".",
  fill = TRUE,
  comment.char = "",
  colClasses = "character"
)

# preview the first rows
head(umls)
##     V1                     V2   V3                                  V4
## 1 ACTI Activities & Behaviors T052                            Activity
## 2 ACTI Activities & Behaviors T053                            Behavior
## 3 ACTI Activities & Behaviors T056      Daily or Recreational Activity
## 4 ACTI Activities & Behaviors T051                               Event
## 5 ACTI Activities & Behaviors T064 Governmental or Regulatory Activity
## 6 ACTI Activities & Behaviors T055                 Individual Behavior