CAEP 7712 - Final Project

Scrape Pubtypes

htm <- xml2::read_html("https://www.nlm.nih.gov/mesh/pubtypes.html")
ptyps <- data.frame(Name = htm %>% rvest::html_nodes("dt") %>% rvest::html_text(), 
    Desc = htm %>% rvest::html_nodes("dd") %>% rvest::html_text(), stringsAsFactors = F)
tags$p("Types used in Query")

Types used in Query

ptyps[c(3, 5, 33:38, 43, 47, 66, 100, 106, 124, 134, 136:141, 145, 160), ]

Create Query and Search Pubmed

# https://ropensci.org/tutorials/rentrez_tutorial/
library(rentrez)
entrez_db_searchable("pubmed")

## Searchable fields for database 'pubmed'
##   ALL     All terms from all searchable fields 
##   UID     Unique number assigned to publication 
##   FILT    Limits the records 
##   TITL    Words in title of publication 
##   WORD    Free text associated with publication 
##   MESH    Medical Subject Headings assigned to publication 
##   MAJR    MeSH terms of major importance to publication 
##   AUTH    Author(s) of publication 
##   JOUR    Journal abbreviation of publication 
##   AFFL    Author's institutional affiliation and address 
##   ECNO    EC number for enzyme or CAS registry number 
##   SUBS    CAS chemical name or MEDLINE Substance Name 
##   PDAT    Date of publication 
##   EDAT    Date publication first accessible through Entrez 
##   VOL     Volume number of publication 
##   PAGE    Page number(s) of publication 
##   PTYP    Type of publication (e.g., review) 
##   LANG    Language of publication 
##   ISS     Issue number of publication 
##   SUBH    Additional specificity for MeSH term 
##   SI      Cross-reference from publication to other databases 
##   MHDA    Date publication was indexed with MeSH terms 
##   TIAB    Free text associated with Abstract/Title 
##   OTRM    Other terms associated with publication 
##   INVR    Investigator 
##   COLN    Corporate Author of publication 
##   CNTY    Country of publication 
##   PAPX    MeSH pharmacological action pre-explosions 
##   GRNT    NIH Grant Numbers 
##   MDAT    Date of last modification 
##   CDAT    Date of completion 
##   PID     Publisher ID 
##   FAUT    First Author of publication 
##   FULL    Full Author Name(s) of publication 
##   FINV    Full name of investigator 
##   TT      Words in transliterated title of publication 
##   LAUT    Last Author of publication 
##   PPDT    Date of print publication 
##   EPDT    Date of Electronic publication 
##   LID     ELocation ID 
##   CRDT    Date publication first accessible through Entrez 
##   BOOK    ID of the book that contains the document 
##   ED      Section's Editor 
##   ISBN    ISBN 
##   PUBN    Publisher's name 
##   AUCL    Author Cluster ID 
##   EID     Extended PMID 
##   DSO     Additional text from the summary 
##   AUID    Author Identifier 
##   PS      Personal Name as Subject 
##   COIS    Conflict of Interest Statements

p_types <- paste(ptyps[c(3, 5, 33:38, 43, 47, 66, 100, 106, 124, 134, 136:141, 145, 
    160), 1], "[PTYP]", sep = "") %>% paste(collapse = " OR ")
educ <- paste(c("education", "elementary", "student", "grade", "grader", "middle School", 
    "high school", "undergraduate", "university", "graduate", "pedagogy"), "[TIAB]", 
    sep = "") %>% paste(collapse = " OR ")
tags$p("Query:")

Query:

(query <- paste0("mindfulness[TIAB] OR meditation[TIAB] OR meditation[MESH] AND ", 
    "(", educ, ")", " AND ", "2008:2018[EDAT]", " AND ", "(", p_types, ")"))

## [1] "mindfulness[TIAB] OR meditation[TIAB] OR meditation[MESH] AND (education[TIAB] OR elementary[TIAB] OR student[TIAB] OR grade[TIAB] OR grader[TIAB] OR middle School[TIAB] OR high school[TIAB] OR undergraduate[TIAB] OR university[TIAB] OR graduate[TIAB] OR pedagogy[TIAB]) AND 2008:2018[EDAT] AND (Academic Dissertations[PTYP] OR Adaptive Clinical Trial[PTYP] OR Clinical Study[PTYP] OR Clinical Trial, Phase I[PTYP] OR Clinical Trial, Phase II[PTYP] OR Clinical Trial, Phase III[PTYP] OR Clinical Trial, Phase IV[PTYP] OR Clinical Trial[PTYP] OR Comparative Study[PTYP] OR Controlled Clinical Trial[PTYP] OR Evaluation Studies[PTYP] OR Meta-Analysis[PTYP] OR Observational Study[PTYP] OR Pragmatic Clinical Trial[PTYP] OR Randomized Controlled Trial[PTYP] OR Research Support, N.I.H., Extramural[PTYP] OR Research Support, N.I.H., Intramural[PTYP] OR Research Support, Non-U.S. Gov't[PTYP] OR Research Support, U.S. Government[PTYP] OR Research Support, U.S. Gov't, Non-P.H.S.[PTYP] OR Research Support, U.S. Gov't, P.H.S.[PTYP] OR Review[PTYP] OR Validation Studies[PTYP])"

(ids <- entrez_search(db = "pubmed", term = query, sort = "relevance", retmax = 1000, 
    use_history = T))

## Entrez search result with 516 hits (object contains 516 IDs and a web_history object)
##  Search term (as translated):  mindfulness[TIAB] OR meditation[TIAB] OR "meditati ...

summary_records <- list()
for (seq_start in seq(1, round(length(ids$ids), -1), 50)) {
    recs <- entrez_summary(db = "pubmed", web_history = ids$web_history, retmax = 50, 
        retstart = seq_start)
    summary_records <- append(recs, summary_records, after = length(summary_records))
}

Extract Titles and remove stopwords

tags$h3("All Titles")

All Titles

lapply(summary_records, FUN = extract_from_esummary, c("title")) %>% do.call("rbind", 
    .) %>% as.data.frame %>% DT::datatable()

pub_titles <- lapply(summary_records, FUN = extract_from_esummary, c("title")) %>% 
    lapply(function(.) strsplit(., "\\s{1,}"))
pub_tm <- textmineR::CreateDtm(pub_titles)
tags$h3("Term Frequency:")

Term Frequency:

textmineR::TermDocFreq(pub_tm) %>% arrange(desc(term_freq)) %>% DT::datatable()

pub_dtm <- pub_tm %>% textmineR::Dtm2Docs()

Find titles with Cognitive and Students

tags$p("It looks like cognitive and students appear frequently, these are the studies with cognitive in the title:")

It looks like cognitive and students appear frequently, these are the studies with cognitive in the title:

(pub_tchar <- lapply(summary_records, FUN = extract_from_esummary, c("title"))) %>% 
    .[str_detect(., "cognitive")] %>% do.call("rbind", .) %>% as.data.frame %>% DT::datatable()

And Students

tags$p("And with students in the title:")

And with students in the title:

pub_tchar %>% .[str_detect(., "students")] %>% do.call("rbind", .) %>% as.data.frame %>% 
    DT::datatable()

tags$p("with students & cognitive in the title:")

with students & cognitive in the title:

pub_tchar %>% .[str_detect(., "students&cognitive")] %>% do.call("rbind", .) %>% 
    as.data.frame %>% DT::datatable()

Using arules to determine associated subjects

library(arules)
pub_items <- as(lapply(pub_dtm, function(.) strsplit(., "\\s{1,}") %>% unlist) %>% 
    lapply(function(.) {
        .[-c(1, length(.))]
    }), "transactions")
pub_apriori <- apriori(pub_items, parameter = list(support = 0.01))

## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.8    0.1    1 none FALSE            TRUE       5    0.01      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 5 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[1443 item(s), 515 transaction(s)] done [0.01s].
## sorting and recoding items ... [172 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 7 done [0.00s].
## writing ... [776 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].

(pub_apriori_df <- DATAFRAME(pub_apriori) %>% as.data.frame() %>% arrange(desc(support)))

CAEP 7712 - Final Project

Stephen Synchronicity

2018-11-18

All Titles

Term Frequency: