Extract PubMed data for obesity:
library(RISmed)
library(dplyr)
# ---- Query configuration ----
# MeSH-restricted PubMed search term
topic <- "obesity[MeSH]"
# date window sent with every E-utilities request
min_year <- 2000
max_year <- 2012
# number of publications to pull in each request (page size)
retmax <- 100

# ---- Determine how many pages to download ----
# a one-record summary is enough to read the total hit count
pub_count <- EUtilsSummary(topic, retmax = 1,
                           mindate = min_year, maxdate = max_year)
pub_count <- pub_count@count
# fail early with a clear message rather than looping over 1:0
if (!isTRUE(pub_count > 0)) {
  stop("No PubMed records found for topic: ", topic, call. = FALSE)
}
# number of page requests needed to cover every hit
iter <- ceiling(pub_count / retmax)
#iter <- 10 # test

# ---- Download page by page ----
# one slot per page; everything is bound together once after the loop so the
# accumulated data frames are not re-copied on every iteration
base_pages <- vector("list", iter)
auth_pages <- vector("list", iter)
mesh_pages <- vector("list", iter)

for (i in seq_len(iter)) {
  # offset of the first record on this page (0 for the first page)
  retstart <- (i - 1) * retmax
  # set query
  query <- EUtilsSummary(topic, retstart = retstart, retmax = retmax,
                         mindate = min_year, maxdate = max_year)
  # fetch query results
  fetch <- EUtilsGet(query)
  # per-publication vectors
  pmids <- PMID(fetch)
  article_titles <- ArticleTitle(fetch)
  year_published <- YearPubmed(fetch)
  # base data frame: one row per publication; keep text columns as character
  # regardless of the R version's stringsAsFactors default
  base_pages[[i]] <- data.frame(PMID = pmids,
                                Article_Title = article_titles,
                                YearPub = year_published,
                                stringsAsFactors = FALSE)
  # per-publication lists, indexed in the same order as pmids
  authors <- Author(fetch)
  mesh <- Mesh(fetch)

  # authors table: one data frame per publication, tagged with its PMID
  auth_rows <- lapply(seq_along(pmids), function(j) {
    tbl_auth_inc <- authors[[j]]
    # NOTE(review): RISmed may return a non-data-frame placeholder (e.g. NA)
    # for a record without authors -- confirm; such records are skipped
    if (!is.data.frame(tbl_auth_inc)) {
      return(NULL)
    }
    # rep() keeps the assignment valid even for a zero-row data frame
    tbl_auth_inc$PMID <- rep(pmids[j], nrow(tbl_auth_inc))
    tbl_auth_inc
  })
  auth_pages[[i]] <- bind_rows(Filter(is.data.frame, auth_rows))

  # MeSH table: Descriptor entries only, one data frame per publication
  mesh_rows <- lapply(seq_along(pmids), function(j) {
    tbl_MeSH_inc <- mesh[[j]]
    # NOTE(review): same placeholder assumption as for authors -- confirm
    if (!is.data.frame(tbl_MeSH_inc)) {
      return(NULL)
    }
    # filter to just Descriptor types
    tbl_MeSH_inc <- filter(tbl_MeSH_inc, Type == "Descriptor")
    # convert the first two columns to character
    tbl_MeSH_inc[1:2] <- lapply(tbl_MeSH_inc[1:2], as.character)
    # add PMID (rep() tolerates a publication left with zero descriptors)
    tbl_MeSH_inc$PMID <- rep(pmids[j], nrow(tbl_MeSH_inc))
    tbl_MeSH_inc
  })
  mesh_pages[[i]] <- bind_rows(Filter(is.data.frame, mesh_rows))
}

# ---- Combine all pages into the final data frames ----
base_f <- bind_rows(base_pages)
tbl_Auth_f <- bind_rows(auth_pages)
tbl_MeSH_f <- bind_rows(mesh_pages)

# clean up: keep only base_f, tbl_Auth_f and tbl_MeSH_f in the workspace
rm(base_pages, auth_pages, mesh_pages, auth_rows, mesh_rows,
   article_titles, authors, fetch, i, iter, mesh,
   pmids, pub_count, query, retmax, retstart, topic,
   year_published, min_year, max_year)
# write output for later use
saveRDS(base_f, "./base_f.rds")
saveRDS(tbl_Auth_f, "./tbl_Auth_f.rds")
saveRDS(tbl_MeSH_f, "./tbl_MeSH_f.rds")
Aggregate data
# Count how many times each MeSH heading occurs, drop the search term itself
# ("Obesity"), and order the headings from most to least frequent.
mesh_grp <- tbl_MeSH_f %>%
  count(Heading) %>%
  filter(Heading != "Obesity") %>%
  arrange(desc(n))
# the per-publication MeSH table is no longer needed once aggregated
rm(tbl_MeSH_f)
# preview the most frequent headings
head(mesh_grp)
## Source: local data frame [6 x 2]
##
## Heading n
## (chr) (int)
## 1 Humans 76603
## 2 Female 50433
## 3 Male 46174
## 4 Adult 31195
## 5 Middle Aged 27358
## 6 Body Mass Index 24548
Plot top 25 MeSH headings
# Horizontal bar chart of the first 25 rows of mesh_grp, with each bar
# labelled by its count.
library(ggplot2)
mesh_grp %>%
  slice(1:25) %>%
  ggplot(aes(x = Heading, y = n, label = n)) +
  geom_bar(stat = "identity", width = 0.25, colour = "light grey") +
  geom_text(colour = "black") +
  ggtitle("Top 25 MeSH Headings for Obesity search") +
  coord_flip()
The plot shows Hypertension and Diabetes among the top headings, i.e. two diseases frequently associated with Obesity in these records.
Read in UMLS semantic file
# Load the UMLS semantic groups file: pipe-delimited, no header row, every
# column read as character. The remaining arguments spell out the defaults
# that read.csv() would otherwise supply to read.table().
umls <- read.table(
  file = "http://semanticnetwork.nlm.nih.gov/download/SemGroups.txt",
  header = FALSE,
  sep = "|",
  quote = "\"",
  dec = ".",
  fill = TRUE,
  comment.char = "",
  colClasses = "character"
)
# preview the first rows
head(umls)
## V1 V2 V3 V4
## 1 ACTI Activities & Behaviors T052 Activity
## 2 ACTI Activities & Behaviors T053 Behavior
## 3 ACTI Activities & Behaviors T056 Daily or Recreational Activity
## 4 ACTI Activities & Behaviors T051 Event
## 5 ACTI Activities & Behaviors T064 Governmental or Regulatory Activity
## 6 ACTI Activities & Behaviors T055 Individual Behavior