Scrape Pubtypes
htm <- xml2::read_html("https://www.nlm.nih.gov/mesh/pubtypes.html")
ptyps <- data.frame(Name = htm %>% rvest::html_nodes("dt") %>% rvest::html_text(),
Desc = htm %>% rvest::html_nodes("dd") %>% rvest::html_text(), stringsAsFactors = F)
tags$p("Types used in Query")
Types used in Query
ptyps[c(3, 5, 33:38, 43, 47, 66, 100, 106, 124, 134, 136:141, 145, 160), ]
Create Query and Search Pubmed
# https://ropensci.org/tutorials/rentrez_tutorial/
library(rentrez)
entrez_db_searchable("pubmed")
## Searchable fields for database 'pubmed'
## ALL All terms from all searchable fields
## UID Unique number assigned to publication
## FILT Limits the records
## TITL Words in title of publication
## WORD Free text associated with publication
## MESH Medical Subject Headings assigned to publication
## MAJR MeSH terms of major importance to publication
## AUTH Author(s) of publication
## JOUR Journal abbreviation of publication
## AFFL Author's institutional affiliation and address
## ECNO EC number for enzyme or CAS registry number
## SUBS CAS chemical name or MEDLINE Substance Name
## PDAT Date of publication
## EDAT Date publication first accessible through Entrez
## VOL Volume number of publication
## PAGE Page number(s) of publication
## PTYP Type of publication (e.g., review)
## LANG Language of publication
## ISS Issue number of publication
## SUBH Additional specificity for MeSH term
## SI Cross-reference from publication to other databases
## MHDA Date publication was indexed with MeSH terms
## TIAB Free text associated with Abstract/Title
## OTRM Other terms associated with publication
## INVR Investigator
## COLN Corporate Author of publication
## CNTY Country of publication
## PAPX MeSH pharmacological action pre-explosions
## GRNT NIH Grant Numbers
## MDAT Date of last modification
## CDAT Date of completion
## PID Publisher ID
## FAUT First Author of publication
## FULL Full Author Name(s) of publication
## FINV Full name of investigator
## TT Words in transliterated title of publication
## LAUT Last Author of publication
## PPDT Date of print publication
## EPDT Date of Electronic publication
## LID ELocation ID
## CRDT Date publication first accessible through Entrez
## BOOK ID of the book that contains the document
## ED Section's Editor
## ISBN ISBN
## PUBN Publisher's name
## AUCL Author Cluster ID
## EID Extended PMID
## DSO Additional text from the summary
## AUID Author Identifier
## PS Personal Name as Subject
## COIS Conflict of Interest Statements
p_types <- paste(ptyps[c(3, 5, 33:38, 43, 47, 66, 100, 106, 124, 134, 136:141, 145,
160), 1], "[PTYP]", sep = "") %>% paste(collapse = " OR ")
educ <- paste(c("education", "elementary", "student", "grade", "grader", "middle School",
"high school", "undergraduate", "university", "graduate", "pedagogy"), "[TIAB]",
sep = "") %>% paste(collapse = " OR ")
tags$p("Query:")
Query:
(query <- paste0("mindfulness[TIAB] OR meditation[TIAB] OR meditation[MESH] AND ",
"(", educ, ")", " AND ", "2008:2018[EDAT]", " AND ", "(", p_types, ")"))
## [1] "mindfulness[TIAB] OR meditation[TIAB] OR meditation[MESH] AND (education[TIAB] OR elementary[TIAB] OR student[TIAB] OR grade[TIAB] OR grader[TIAB] OR middle School[TIAB] OR high school[TIAB] OR undergraduate[TIAB] OR university[TIAB] OR graduate[TIAB] OR pedagogy[TIAB]) AND 2008:2018[EDAT] AND (Academic Dissertations[PTYP] OR Adaptive Clinical Trial[PTYP] OR Clinical Study[PTYP] OR Clinical Trial, Phase I[PTYP] OR Clinical Trial, Phase II[PTYP] OR Clinical Trial, Phase III[PTYP] OR Clinical Trial, Phase IV[PTYP] OR Clinical Trial[PTYP] OR Comparative Study[PTYP] OR Controlled Clinical Trial[PTYP] OR Evaluation Studies[PTYP] OR Meta-Analysis[PTYP] OR Observational Study[PTYP] OR Pragmatic Clinical Trial[PTYP] OR Randomized Controlled Trial[PTYP] OR Research Support, N.I.H., Extramural[PTYP] OR Research Support, N.I.H., Intramural[PTYP] OR Research Support, Non-U.S. Gov't[PTYP] OR Research Support, U.S. Government[PTYP] OR Research Support, U.S. Gov't, Non-P.H.S.[PTYP] OR Research Support, U.S. Gov't, P.H.S.[PTYP] OR Review[PTYP] OR Validation Studies[PTYP])"
(ids <- entrez_search(db = "pubmed", term = query, sort = "relevance", retmax = 1000,
use_history = T))
## Entrez search result with 516 hits (object contains 516 IDs and a web_history object)
## Search term (as translated): mindfulness[TIAB] OR meditation[TIAB] OR "meditati ...
summary_records <- list()
for (seq_start in seq(1, round(length(ids$ids), -1), 50)) {
recs <- entrez_summary(db = "pubmed", web_history = ids$web_history, retmax = 50,
retstart = seq_start)
summary_records <- append(recs, summary_records, after = length(summary_records))
}
Extract Titles and remove stopwords
tags$h3("All Titles")
lapply(summary_records, FUN = extract_from_esummary, c("title")) %>% do.call("rbind",
.) %>% as.data.frame %>% DT::datatable()
pub_titles <- lapply(summary_records, FUN = extract_from_esummary, c("title")) %>%
lapply(function(.) strsplit(., "\\s{1,}"))
pub_tm <- textmineR::CreateDtm(pub_titles)
tags$h3("Term Frequency:")
textmineR::TermDocFreq(pub_tm) %>% arrange(desc(term_freq)) %>% DT::datatable()
pub_dtm <- pub_tm %>% textmineR::Dtm2Docs()
Find titles with Cognitive and Students
tags$p("It looks like cognitive and students appear frequently, these are the studies with cognitive in the title:")
It looks like cognitive and students appear frequently, these are the studies with cognitive in the title:
(pub_tchar <- lapply(summary_records, FUN = extract_from_esummary, c("title"))) %>%
.[str_detect(., "cognitive")] %>% do.call("rbind", .) %>% as.data.frame %>% DT::datatable()
And Students
tags$p("And with students in the title:")
And with students in the title:
pub_tchar %>% .[str_detect(., "students")] %>% do.call("rbind", .) %>% as.data.frame %>%
DT::datatable()
tags$p("with students & cognitive in the title:")
with students & cognitive in the title:
pub_tchar %>% .[str_detect(., "students&cognitive")] %>% do.call("rbind", .) %>%
as.data.frame %>% DT::datatable()
Using arules to determine associated subjects
library(arules)
pub_items <- as(lapply(pub_dtm, function(.) strsplit(., "\\s{1,}") %>% unlist) %>%
lapply(function(.) {
.[-c(1, length(.))]
}), "transactions")
pub_apriori <- apriori(pub_items, parameter = list(support = 0.01))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.01 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 5
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[1443 item(s), 515 transaction(s)] done [0.01s].
## sorting and recoding items ... [172 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 7 done [0.00s].
## writing ... [776 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
(pub_apriori_df <- DATAFRAME(pub_apriori) %>% as.data.frame() %>% arrange(desc(support)))