Someone needs to make a list of method-acronyms based on CRISPR.

.. and I had nothing todo on sunday morning.

Literature data from EPMC

library(tidyverse)
library(europepmc)
library(tidylog)

Now literature databases (usually) don’t support regex searches. But R does. So let’s fetch the relevant data

df_lit <- europepmc::epmc_search("CRISPR", limit = 50000)
#> 42289 records found, returning 42289

Inspection with skimr looks good:

skimr::skim(df_lit)
#> Skim summary statistics
#>  n obs: 42289 
#>  n variables: 30 
#> 
#> ── Variable type:character ──────────────────────────────────────────────────────────────────────────────────
#>               variable missing complete     n min   max empty n_unique
#>           authorString     438    41851 42289   5 26671     0    40270
#>                 bookid   42284        5 42289   9     9     0        5
#>                    doi     415    41874 42289  12    55     0    41794
#>         firstIndexDate       2    42287 42289  10    10     0     2394
#>   firstPublicationDate       2    42287 42289  10    10     0     2928
#>                hasBook       2    42287 42289   1     1     0        2
#>   hasDbCrossReferences       2    42287 42289   1     1     0        2
#>           hasLabsLinks       2    42287 42289   1     1     0        2
#>                 hasPDF       2    42287 42289   1     1     0        2
#>          hasReferences       2    42287 42289   1     1     0        2
#>               hasSuppl       2    42287 42289   1     1     0        2
#>      hasTextMinedTerms       2    42287 42289   1     1     0        2
#>  hasTMAccessionNumbers       2    42287 42289   1     1     0        2
#>                     id       2    42287 42289   5    12     0    42283
#>                 inEPMC       2    42287 42289   1     1     0        2
#>                  inPMC       2    42287 42289   1     1     0        2
#>           isOpenAccess       2    42287 42289   1     1     0        2
#>                  issue   11235    31054 42289   1    46     0      794
#>            journalIssn    1525    40764 42289   9    33     0     2545
#>           journalTitle    1512    40777 42289   3    57     0     2538
#>          journalVolume    2309    39980 42289   1    12     0      928
#>               pageInfo    6758    35531 42289   0    39     2    25641
#>                  pmcid    9014    33275 42289   9    10     0    33271
#>                   pmid    2141    40148 42289   8     8     0    40144
#>                pubType       2    42287 42289   4   223     0      712
#>                pubYear      31    42258 42289   4     4     0       19
#>                 source       2    42287 42289   3     3     0        7
#>                  title       5    42284 42289   5   373     0    42060
#> 
#> ── Variable type:integer ────────────────────────────────────────────────────────────────────────────────────
#>       variable missing complete     n  mean    sd p0 p25 p50 p75 p100
#>   citedByCount       2    42287 42289 12.03 64.83  0   0   2   9 6425
#>  versionNumber   42240       49 42289  1.8   0.82  1   1   2   2    4
#>      hist
#>  ▇▁▁▁▁▁▁▁
#>  ▇▁▇▁▁▂▁▁

Data wrangling

Clean up acronyms

First step is to extract acronyms: I use a regex that capture two things:

  • CRISPR (tpyos allowed), followed by letters and/or numbers, connected with a hyphon/minus.
  • A series of alphabetic characters, capital c in the beginning, at least two more occurences of capital p/r in the word.

Then I removed all the extracted strings that were just crispr, crispr cas, or crispr cpf.

pattern <- "([CRISPR]{4,}[-\\/]{1}[[:alnum:]]{3,})|([C]\\w*[PR]+\\w*[PR]+\\w*)"
df_acronyms <- df_lit %>% 
  mutate(acronym = str_extract_all(title, pattern)) %>% 
  unnest(acronym) %>% 
  mutate(
         clean_acro = acronym %>% 
           str_replace_all("[[:punct:]]+", " ") %>% 
           str_remove_all("[[:digit:]]") %>% 
           tolower() %>% 
           str_squish()
         ) %>% 
  filter(!str_detect(clean_acro,
                     "^crispr[ais]?( ?((d?cas[a-dn]?)|(cpf)))?$")
         ) %>% 
  filter(nchar(clean_acro) > 3)
#> mutate: new variable 'acronym' with 384 unique values and <1% NA
#> mutate: new variable 'clean_acro' with 262 unique values and <1% NA
#> filter: removed 6,944 rows (90%), 759 rows remaining
#> filter: removed 7 rows (1%), 752 rows remaining

Using hunspell for more cleaning

Removing all the correct english, e.g. “CRISPR-mediated”, “CORRECTION”

library(hunspell)
check_words <- function(string){
  tokens <- str_split(string, " ")
  out <- map_lgl(tokens,
                 ~any(hunspell_check(.x)))
  return(out)
}

Splitting the data in CRISPR and non-CRISPR

Upon inspection, I decided to split the output in two tables,

  • one with more conservative acronyms, still containing the literal crispr
  • one with the more loose acronyms, probably the more interesting stuff.
df_crispr <- df_pre %>% 
  filter(str_detect(clean_acro, "crispr")) %>% 
  select(acronym, year = pubYear, title, author = authorString, journal = journalTitle, url)
#> filter: removed 91 rows (41%), 130 rows remaining
#> select: dropped 30 variables (id, source, pmid, doi, authorString, …)
df_other <- anti_join(df_pre,
                      df_crispr) %>% 
  select(acronym, year = pubYear, title, author = authorString, journal = journalTitle, url)
#> Joining, by = c("title", "acronym", "url")
#> anti_join: added no columns
#>            > rows only in x    91
#>            > rows only in y  (  0)
#>            > matched rows    (130)
#>            >                 =====
#>            > rows total        91
#> select: dropped 30 variables (id, source, pmid, doi, authorString, …)

Extending DT

I c&ped this great piece from here: Extending DT child rows example to generate folding DT tables

library(DT)
# http://www.reigo.eu/2018/04/extending-dt-child-row-example/
# datatable2 - datatable with child rows
datatable2 <- function(x, vars = NULL, opts = NULL, ...) {
  
  names_x <- names(x)
  if (is.null(vars)) stop("'vars' must be specified!")
  pos <- match(vars, names_x)
  if (any(map_chr(x[, pos], typeof) == "list"))
    stop("list columns are not supported in datatable2()")
  
  pos <- pos[pos <= ncol(x)] + 1
  rownames(x) <- NULL
  if (nrow(x) > 0) x <- cbind(' ' = '&oplus;', x)

  # options
  opts <- c(
    opts, 
    list(
      columnDefs = list(
        list(visible = FALSE, targets = c(0, pos)),
        list(orderable = FALSE, className = 'details-control', targets = 1),
        list(className = 'dt-left', targets = 1:3),
        list(className = 'dt-right', targets = 4:ncol(x))
      )
  ))
  
  datatable(
    x, 
    ...,
    escape = FALSE,
    options = opts,
    callback = JS(.callback2(x = x, pos = c(0, pos)))
  )
}

.callback2 <- function(x, pos = NULL) {
  
  part1 <- "table.column(1).nodes().to$().css({cursor: 'pointer'});"
  
  part2 <- .child_row_table2(x, pos = pos)
  
  part3 <- 
  "
   table.on('click', 'td.details-control', function() {
    var td = $(this), row = table.row(td.closest('tr'));
    if (row.child.isShown()) {
      row.child.hide();
      td.html('&oplus;');
    } else {
      row.child(format(row.data())).show();
      td.html('&ominus;');
    }
  });"
  
  paste(part1, part2, part3)
} 

.child_row_table2 <- function(x, pos = NULL) {
  
  names_x <- paste0(names(x), ":")
  text <- "
  var format = function(d) {
    text = '<div><table >' + 
  "

  for (i in seq_along(pos)) {
    text <- paste(text, glue::glue(
        "'<tr>' +
          '<td>' + '{names_x[pos[i]]}' + '</td>' +
          '<td>' + d[{pos[i]}] + '</td>' +
        '</tr>' + " ))
  }

  paste0(text,
    "'</table></div>'
      return text;};"
  )
}

Results

CRISPR…

Other acronyms