.. and I had nothing todo on sunday morning.
library(tidyverse)
library(europepmc)
library(tidylog)
Now literature databases (usually) don’t support regex searches. But R does. So let’s fetch the relevant data
df_lit <- europepmc::epmc_search("CRISPR", limit = 50000)
#> 42289 records found, returning 42289
Inspection with skimr looks good:
skimr::skim(df_lit)
#> Skim summary statistics
#> n obs: 42289
#> n variables: 30
#>
#> ── Variable type:character ──────────────────────────────────────────────────────────────────────────────────
#> variable missing complete n min max empty n_unique
#> authorString 438 41851 42289 5 26671 0 40270
#> bookid 42284 5 42289 9 9 0 5
#> doi 415 41874 42289 12 55 0 41794
#> firstIndexDate 2 42287 42289 10 10 0 2394
#> firstPublicationDate 2 42287 42289 10 10 0 2928
#> hasBook 2 42287 42289 1 1 0 2
#> hasDbCrossReferences 2 42287 42289 1 1 0 2
#> hasLabsLinks 2 42287 42289 1 1 0 2
#> hasPDF 2 42287 42289 1 1 0 2
#> hasReferences 2 42287 42289 1 1 0 2
#> hasSuppl 2 42287 42289 1 1 0 2
#> hasTextMinedTerms 2 42287 42289 1 1 0 2
#> hasTMAccessionNumbers 2 42287 42289 1 1 0 2
#> id 2 42287 42289 5 12 0 42283
#> inEPMC 2 42287 42289 1 1 0 2
#> inPMC 2 42287 42289 1 1 0 2
#> isOpenAccess 2 42287 42289 1 1 0 2
#> issue 11235 31054 42289 1 46 0 794
#> journalIssn 1525 40764 42289 9 33 0 2545
#> journalTitle 1512 40777 42289 3 57 0 2538
#> journalVolume 2309 39980 42289 1 12 0 928
#> pageInfo 6758 35531 42289 0 39 2 25641
#> pmcid 9014 33275 42289 9 10 0 33271
#> pmid 2141 40148 42289 8 8 0 40144
#> pubType 2 42287 42289 4 223 0 712
#> pubYear 31 42258 42289 4 4 0 19
#> source 2 42287 42289 3 3 0 7
#> title 5 42284 42289 5 373 0 42060
#>
#> ── Variable type:integer ────────────────────────────────────────────────────────────────────────────────────
#> variable missing complete n mean sd p0 p25 p50 p75 p100
#> citedByCount 2 42287 42289 12.03 64.83 0 0 2 9 6425
#> versionNumber 42240 49 42289 1.8 0.82 1 1 2 2 4
#> hist
#> ▇▁▁▁▁▁▁▁
#> ▇▁▇▁▁▂▁▁
First step is to extract acronyms: I use a regex that capture two things:
Then I removed all the extracted strings that were just crispr, crispr cas, or crispr cpf.
pattern <- "([CRISPR]{4,}[-\\/]{1}[[:alnum:]]{3,})|([C]\\w*[PR]+\\w*[PR]+\\w*)"
df_acronyms <- df_lit %>%
mutate(acronym = str_extract_all(title, pattern)) %>%
unnest(acronym) %>%
mutate(
clean_acro = acronym %>%
str_replace_all("[[:punct:]]+", " ") %>%
str_remove_all("[[:digit:]]") %>%
tolower() %>%
str_squish()
) %>%
filter(!str_detect(clean_acro,
"^crispr[ais]?( ?((d?cas[a-dn]?)|(cpf)))?$")
) %>%
filter(nchar(clean_acro) > 3)
#> mutate: new variable 'acronym' with 384 unique values and <1% NA
#> mutate: new variable 'clean_acro' with 262 unique values and <1% NA
#> filter: removed 6,944 rows (90%), 759 rows remaining
#> filter: removed 7 rows (1%), 752 rows remaining
Removing all the correct english, e.g. “CRISPR-mediated”, “CORRECTION”
library(hunspell)
check_words <- function(string){
tokens <- str_split(string, " ")
out <- map_lgl(tokens,
~any(hunspell_check(.x)))
return(out)
}
df_pre <- df_acronyms %>%
filter(!check_words(clean_acro)) %>%
mutate(
url = ifelse(!is.na(doi), glue::glue("https://doi.org/{doi}"),
ifelse(!is.na(pmid), glue::glue("https://www.ncbi.nlm.nih.gov/pubmed/?term={pmid}"),
NA_character_)
),
title = glue::glue(
"<a href=\"{url}\" target=\"_blank\">{title}</a>"
)
)
#> filter: removed 531 rows (71%), 221 rows remaining
#> mutate: changed 221 values (100%) of 'title' (0 new NA)
#> new variable 'url' with 215 unique values and 0% NA
Upon inspection, I decided to split the output in two tables,
crisprdf_crispr <- df_pre %>%
filter(str_detect(clean_acro, "crispr")) %>%
select(acronym, year = pubYear, title, author = authorString, journal = journalTitle, url)
#> filter: removed 91 rows (41%), 130 rows remaining
#> select: dropped 30 variables (id, source, pmid, doi, authorString, …)
df_other <- anti_join(df_pre,
df_crispr) %>%
select(acronym, year = pubYear, title, author = authorString, journal = journalTitle, url)
#> Joining, by = c("title", "acronym", "url")
#> anti_join: added no columns
#> > rows only in x 91
#> > rows only in y ( 0)
#> > matched rows (130)
#> > =====
#> > rows total 91
#> select: dropped 30 variables (id, source, pmid, doi, authorString, …)
I c&ped this great piece from here: Extending DT child rows example to generate folding DT tables
library(DT)
# http://www.reigo.eu/2018/04/extending-dt-child-row-example/
# datatable2 - datatable with child rows
datatable2 <- function(x, vars = NULL, opts = NULL, ...) {
names_x <- names(x)
if (is.null(vars)) stop("'vars' must be specified!")
pos <- match(vars, names_x)
if (any(map_chr(x[, pos], typeof) == "list"))
stop("list columns are not supported in datatable2()")
pos <- pos[pos <= ncol(x)] + 1
rownames(x) <- NULL
if (nrow(x) > 0) x <- cbind(' ' = '⊕', x)
# options
opts <- c(
opts,
list(
columnDefs = list(
list(visible = FALSE, targets = c(0, pos)),
list(orderable = FALSE, className = 'details-control', targets = 1),
list(className = 'dt-left', targets = 1:3),
list(className = 'dt-right', targets = 4:ncol(x))
)
))
datatable(
x,
...,
escape = FALSE,
options = opts,
callback = JS(.callback2(x = x, pos = c(0, pos)))
)
}
.callback2 <- function(x, pos = NULL) {
part1 <- "table.column(1).nodes().to$().css({cursor: 'pointer'});"
part2 <- .child_row_table2(x, pos = pos)
part3 <-
"
table.on('click', 'td.details-control', function() {
var td = $(this), row = table.row(td.closest('tr'));
if (row.child.isShown()) {
row.child.hide();
td.html('⊕');
} else {
row.child(format(row.data())).show();
td.html('⊖');
}
});"
paste(part1, part2, part3)
}
.child_row_table2 <- function(x, pos = NULL) {
names_x <- paste0(names(x), ":")
text <- "
var format = function(d) {
text = '<div><table >' +
"
for (i in seq_along(pos)) {
text <- paste(text, glue::glue(
"'<tr>' +
'<td>' + '{names_x[pos[i]]}' + '</td>' +
'<td>' + d[{pos[i]}] + '</td>' +
'</tr>' + " ))
}
paste0(text,
"'</table></div>'
return text;};"
)
}