library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(textmineR)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Attaching package: 'textmineR'
## The following object is masked from 'package:Matrix':
##
## update
## The following object is masked from 'package:stats':
##
## update
library(bib2df)
library(tidylog)
##
## Attaching package: 'tidylog'
## The following objects are masked from 'package:dplyr':
##
## add_count, add_tally, anti_join, count, distinct, distinct_all,
## distinct_at, distinct_if, filter, filter_all, filter_at, filter_if,
## full_join, group_by, group_by_all, group_by_at, group_by_if,
## inner_join, left_join, mutate, mutate_all, mutate_at, mutate_if,
## relocate, rename, rename_all, rename_at, rename_if, rename_with,
## right_join, sample_frac, sample_n, select, select_all, select_at,
## select_if, semi_join, slice, slice_head, slice_max, slice_min,
## slice_sample, slice_tail, summarise, summarise_all, summarise_at,
## summarise_if, summarize, summarize_all, summarize_at, summarize_if,
## tally, top_frac, top_n, transmute, transmute_all, transmute_at,
## transmute_if, ungroup
## The following objects are masked from 'package:tidyr':
##
## drop_na, fill, gather, pivot_longer, pivot_wider, replace_na,
## spread, uncount
## The following object is masked from 'package:stats':
##
## filter
df <- europepmc::epmc_search("biofilm", "raw", limit = 3000)
## 115351 records found, returning 3000
df <- df %>% map(
~discard(.x, ~is.list(.x))
) %>%
map_df(bind_rows)
Alternative
# df <- bib2df("data.bib")
Generating dtm and fitting the model
dtm_better <- CreateDtm(
df$abstractText,
df$id,
ngram_window = c(1,3),
stopword_vec = c(stopwords::stopwords("en"), # stopwords from tm
stopwords::stopwords(source = "smart"),
c("î", "â", "ml")), # this is the default value
lower = TRUE, # lowercase - this is the default value
remove_punctuation = TRUE, # punctuation - this is the default
remove_numbers = TRUE, # numbers - this is the default
verbose = FALSE, # Turn off status bar for this demo
cpus = 2
)
model <- FitLdaModel(dtm = dtm_better,
k = 10,
iterations = 200, # I usually recommend at least 500 iterations or more
burnin = 180,
alpha = 0.1,
beta = 0.05,
optimize_alpha = TRUE,
calc_likelihood = TRUE,
calc_coherence = TRUE,
calc_r2 = TRUE,
cpus = 2)
Extracting metrics
model$top_terms <- GetTopTerms(phi = model$phi, M = 5)
model$prevalence <- colSums(model$theta) / sum(model$theta) * 100
model$labels <- LabelTopics(assignments = model$theta > 0.05,
dtm = dtm_better,
M = 2)
model$summary <- data.frame(topic = rownames(model$phi),
label = model$labels,
coherence = round(model$coherence, 3),
prevalence = round(model$prevalence,3),
top_terms = apply(model$top_terms, 2, function(x){
paste(x, collapse = ", ")
}),
stringsAsFactors = FALSE)
Wrangling the results
df_sum <- model$summary[ order(model$summary$prevalence, decreasing = TRUE) , ][ 1:10 , ]
df_sum <- df_sum %>%
unite(label,starts_with("label."), sep = " / ")
df_clean <- df_sum %>%
mutate(term = str_split(top_terms, ", ")) %>%
unnest(term) %>%
distinct() %>%
select(-topic, -top_terms)
## mutate: new variable 'term' (list) with 10 unique values and 0% NA
## distinct: no rows removed
## select: dropped 2 variables (topic, top_terms)
topics <- unique(df_clean$label) #finding unique topic names
df_edges <- df_clean %>%
select(label, term)
## select: dropped 2 variables (coherence, prevalence)
Converting to graph using tidygraph
library(tidygraph)
##
## Attaching package: 'tidygraph'
## The following objects are masked from 'package:tidylog':
##
## anti_join, distinct, filter, full_join, group_by, inner_join,
## left_join, mutate, mutate_all, mutate_at, rename, right_join,
## sample_frac, sample_n, select, semi_join, slice, top_n, transmute,
## ungroup
## The following object is masked from 'package:stats':
##
## filter
g_tidy <- as_tbl_graph(df_edges) %>%
activate(nodes) %>%
mutate(category = case_when(
name %in% topics ~ "topic",
TRUE ~ "terms"
)
)
Using ggraph to plot the result.
library(ggraph)
gg_x <- ggraph(g_tidy, layout = 'fr') +
geom_edge_diagonal(show.legend = FALSE) +
geom_node_label(aes(label = name, fill = category), repel = TRUE) +
scale_fill_brewer(type = "qual")+
theme_graph() +
ggtitle("Thematic Map for OA-Papers on Biofilm in 2019")
gg_x