Building a thematic map using textmineR

library(tidyverse)

## ── Attaching packages ──────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──

## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0

## ── Conflicts ─────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(textmineR)

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

## 
## Attaching package: 'textmineR'

## The following object is masked from 'package:Matrix':
## 
##     update

## The following object is masked from 'package:stats':
## 
##     update

library(bib2df)
library(tidylog)

## 
## Attaching package: 'tidylog'

## The following objects are masked from 'package:dplyr':
## 
##     add_count, add_tally, anti_join, count, distinct, distinct_all,
##     distinct_at, distinct_if, filter, filter_all, filter_at, filter_if,
##     full_join, group_by, group_by_all, group_by_at, group_by_if,
##     inner_join, left_join, mutate, mutate_all, mutate_at, mutate_if,
##     relocate, rename, rename_all, rename_at, rename_if, rename_with,
##     right_join, sample_frac, sample_n, select, select_all, select_at,
##     select_if, semi_join, slice, slice_head, slice_max, slice_min,
##     slice_sample, slice_tail, summarise, summarise_all, summarise_at,
##     summarise_if, summarize, summarize_all, summarize_at, summarize_if,
##     tally, top_frac, top_n, transmute, transmute_all, transmute_at,
##     transmute_if, ungroup

## The following objects are masked from 'package:tidyr':
## 
##     drop_na, fill, gather, pivot_longer, pivot_wider, replace_na,
##     spread, uncount

## The following object is masked from 'package:stats':
## 
##     filter

df <- europepmc::epmc_search("biofilm", "raw", limit = 3000)

## 115351 records found, returning 3000

df <- df %>% map(
  ~discard(.x, ~is.list(.x))
        ) %>% 
  map_df(bind_rows)

Alternative

# df <- bib2df("data.bib")

Generating dtm and fitting the model

dtm_better <- CreateDtm(
  df$abstractText,
  df$id,
  ngram_window = c(1,3),
  stopword_vec = c(stopwords::stopwords("en"), # stopwords from tm
                   stopwords::stopwords(source = "smart"),
                   c("î", "â", "ml")), # this is the default value
  lower = TRUE, # lowercase - this is the default value
  remove_punctuation = TRUE, # punctuation - this is the default
  remove_numbers = TRUE, # numbers - this is the default
  verbose = FALSE, # Turn off status bar for this demo
  cpus = 2
)
model <- FitLdaModel(dtm = dtm_better, 
                     k = 10,
                     iterations = 200, # I usually recommend at least 500 iterations or more
                     burnin = 180,
                     alpha = 0.1,
                     beta = 0.05,
                     optimize_alpha = TRUE,
                     calc_likelihood = TRUE,
                     calc_coherence = TRUE,
                     calc_r2 = TRUE,
                     cpus = 2)

Extracting metrics

model$top_terms <- GetTopTerms(phi = model$phi, M = 5)
model$prevalence <- colSums(model$theta) / sum(model$theta) * 100
model$labels <- LabelTopics(assignments = model$theta > 0.05, 
                            dtm = dtm_better,
                            M = 2)
model$summary <- data.frame(topic = rownames(model$phi),
                            label = model$labels,
                            coherence = round(model$coherence, 3),
                            prevalence = round(model$prevalence,3),
                            top_terms = apply(model$top_terms, 2, function(x){
                              paste(x, collapse = ", ")
                            }),
                            stringsAsFactors = FALSE)

Wrangling the results

df_sum <- model$summary[ order(model$summary$prevalence, decreasing = TRUE) , ][ 1:10 , ]
df_sum <- df_sum %>% 
  unite(label,starts_with("label."), sep = " / ")
df_clean <- df_sum %>%  
  mutate(term = str_split(top_terms, ", ")) %>% 
  unnest(term) %>% 
  distinct() %>% 
  select(-topic, -top_terms)

## mutate: new variable 'term' (list) with 10 unique values and 0% NA

## distinct: no rows removed

## select: dropped 2 variables (topic, top_terms)

topics <- unique(df_clean$label) #finding unique topic names
df_edges <- df_clean %>% 
  select(label, term)

## select: dropped 2 variables (coherence, prevalence)

Converting to graph using tidygraph

library(tidygraph)

## 
## Attaching package: 'tidygraph'

## The following objects are masked from 'package:tidylog':
## 
##     anti_join, distinct, filter, full_join, group_by, inner_join,
##     left_join, mutate, mutate_all, mutate_at, rename, right_join,
##     sample_frac, sample_n, select, semi_join, slice, top_n, transmute,
##     ungroup

## The following object is masked from 'package:stats':
## 
##     filter

g_tidy <- as_tbl_graph(df_edges) %>% 
  activate(nodes) %>% 
  mutate(category = case_when(
    name %in% topics ~ "topic",
    TRUE ~ "terms"
  )
  )

Using ggraph to plot the result.

library(ggraph)
gg_x <- ggraph(g_tidy, layout = 'fr') + 
  geom_edge_diagonal(show.legend = FALSE) + 
  geom_node_label(aes(label = name, fill = category), repel = TRUE) +
  scale_fill_brewer(type = "qual")+
  theme_graph() + 
  ggtitle("Thematic Map for OA-Papers on Biofilm in 2019")
gg_x