Building a thematic map using textmineR

knitr::opts_chunk$set(
  message = FALSE,
  warning = FALSE
)

library(tidyverse)
library(textmineR)
library(bib2df)
library(tidylog)

df <- europepmc::epmc_search("biofilm", "raw", limit = 3000)

df <- df %>% map(
  ~discard(.x, ~is.list(.x))
        ) %>% 
  map_df(bind_rows)

Alternative

# df <- bib2df("data.bib")

Generating dtm and fitting the model

dtm_better <- CreateDtm(
  df$abstractText,
  df$id,
  ngram_window = c(1,3),
  stopword_vec = c(stopwords::stopwords("en"), # stopwords from tm
                   stopwords::stopwords(source = "smart"),
                   c("î", "â", "ml")), # this is the default value
  lower = TRUE, # lowercase - this is the default value
  remove_punctuation = TRUE, # punctuation - this is the default
  remove_numbers = TRUE, # numbers - this is the default
  verbose = FALSE, # Turn off status bar for this demo
  cpus = 2
)
model <- FitLdaModel(dtm = dtm_better, 
                     k = 10,
                     iterations = 200, # I usually recommend at least 500 iterations or more
                     burnin = 180,
                     alpha = 0.1,
                     beta = 0.05,
                     optimize_alpha = TRUE,
                     calc_likelihood = TRUE,
                     calc_coherence = TRUE,
                     calc_r2 = TRUE,
                     cpus = 2)

Extracting metrics

model$top_terms <- GetTopTerms(phi = model$phi, M = 5)
model$prevalence <- colSums(model$theta) / sum(model$theta) * 100
model$labels <- LabelTopics(assignments = model$theta > 0.05, 
                            dtm = dtm_better,
                            M = 2)
model$summary <- data.frame(topic = rownames(model$phi),
                            label = model$labels,
                            coherence = round(model$coherence, 3),
                            prevalence = round(model$prevalence,3),
                            top_terms = apply(model$top_terms, 2, function(x){
                              paste(x, collapse = ", ")
                            }),
                            stringsAsFactors = FALSE)

Wrangling the results

df_sum <- model$summary[ order(model$summary$prevalence, decreasing = TRUE) , ][ 1:10 , ]
df_sum <- df_sum %>% 
  unite(label,starts_with("label."), sep = " / ")
df_clean <- df_sum %>%  
  mutate(term = str_split(top_terms, ", ")) %>% 
  unnest(term) %>% 
  distinct() %>% 
  select(-topic, -top_terms)
topics <- unique(df_clean$label) #finding unique topic names
df_edges <- df_clean %>% 
  select(label, term)

Converting to graph using tidygraph

library(tidygraph)
g_tidy <- as_tbl_graph(df_edges) %>% 
  activate(nodes) %>% 
  mutate(category = case_when(
    name %in% topics ~ "topic",
    TRUE ~ "terms"
  )
  )

Using ggraph to plot the result.

library(ggraph)
gg_x <- ggraph(g_tidy, layout = 'fr') + 
  geom_edge_diagonal(show.legend = FALSE) + 
  geom_node_label(aes(label = name, fill = category), repel = TRUE) +
  scale_fill_brewer(type = "qual")+
  theme_graph() + 
  ggtitle("Thematic Map for the latest 3000 OA-Papers on Biofilm")
gg_x