add_two_number <- function(number1, number2){
answer <- sum(number1, number2)
return(answer)
}
add_two_number(3,4)
[1] 7
library(BertopicR)
library(dplyr)

# import docs to fit model to
set.seed(12) # ensure the same sample is taken each time
data <- SegmentR::sprinklr_export %>%
filter(!stringr::str_detect(Message, "A word of advice\\?")) %>%
select(created_time = CreatedTime, message = Message) %>%
tidyr::drop_na(message) %>%
LimpiaR::limpiar_duplicates(message) %>%
sample_n(2000)
data_clean <- data %>%
mutate(message_bert = message) %>%
ParseR::clean_text(text_var = message_bert,
tolower = FALSE, # don't lower case
hashtags = FALSE, # remove hashtags
mentions = TRUE, # keep mentions
punctuation = TRUE, # keep punctuation
emojis = FALSE, # remove emojis
digits = TRUE) # keep digits
data_clean %>% head(5)
# A tibble: 5 × 3
created_time message message_bert
<dttm> <chr> <chr>
1 2018-10-17 03:20:20 "why shouldn't he call himself by his usual … "why should…
2 2018-10-29 21:35:57 "Just wait, @BetoORourke will demand everyon… "Just wait,…
3 2018-10-15 14:28:43 "Celebrating Hispanic Heritage month dancing… "Celebratin…
4 2018-10-29 05:07:11 "A few months ago I was contacted by @stmoti… "A few mont…
5 2018-10-20 20:02:36 "Today I Celebrate \nSimon Bolivar \nCelebra… "Today I Ce…
embedder <- bt_make_embedder_st(model = "all-mpnet-base-v2")
embeddings <- bt_do_embedding(embedder = embedder,
documents = data_clean$message_bert,
accelerator = "mps")
# look at embeddings
embeddings[1:10]
[1] 0.0396 0.0048 -0.1196 -0.0262 -0.0681 -0.0724 -0.0272 -0.0240 -0.0424
[10] -0.0407
dim(embeddings)
[1] 2000  768
length(data_clean$message_bert)
[1] 2000
reducer <- bt_make_reducer_umap(n_components = 10L, metric = "cosine")
reduced_embeddings <- bt_do_reducing(reducer, embeddings)
UMAP(angular_rp_forest=True, low_memory=False, metric='cosine', min_dist=0.0, n_components=10, random_state=42, verbose=True)
Thu Nov 9 17:22:39 2023 Construct fuzzy simplicial set
Thu Nov 9 17:22:43 2023 Finding Nearest Neighbors
Thu Nov 9 17:22:44 2023 Finished Nearest Neighbor Search
Thu Nov 9 17:22:46 2023 Construct embedding
Thu Nov 9 17:22:51 2023 Finished embedding
Now we can look at how reducing the dimensions of the embeddings has changed where the documents lie in space and reduced the number of dimensions representing each document.
embeddings[1:10]
[1] 0.0396 0.0048 -0.1196 -0.0262 -0.0681 -0.0724 -0.0272 -0.0240 -0.0424
[10] -0.0407
reduced_embeddings[1:10]
[1] 8.4 8.7 6.6 7.1 6.7 7.0 7.7 6.6 6.5 7.4
dim(reduced_embeddings)
[1] 2000   10
dim(embeddings)
[1] 2000  768
We are going to define the clusterer but let the model building process perform the clustering rather than performing clustering externally as we did with creating and reducing the embeddings.
clusterer <- bt_make_clusterer_hdbscan(min_cluster_size = 10L, cluster_selection_method = "eom")

We have already completed document embedding and reduction, so we pass empty models to bt_compile_model to skip these steps.
model <- bt_compile_model(embedding_model = bt_empty_embedder(), # skip embedding step
reduction_model = bt_empty_reducer(), # skip reducing step
clustering_model = clusterer) # clustering method to use
bt_fit_model(model = model, documents = data_clean$message_bert, embeddings = reduced_embeddings)
model$get_topic_info() %>% select(-Representative_Docs) %>% DT::datatable(options = list(pageLength = 5))

We are going to reduce the embeddings to 2d so that we can visualise the clusters (topics). It is important not to reduce the reduced_embeddings here: we have already lost some information by reducing the original embeddings to 10 dimensions, so we reduce the original 768-dimensional embeddings to 2d directly.
# reduce embeddings to 2d
reducer2d <- bt_make_reducer_umap(n_components = 2L, metric = "cosine")
embeddings_2d <- bt_do_reducing(reducer2d, embeddings)
UMAP(angular_rp_forest=True, low_memory=False, metric='cosine', min_dist=0.0, random_state=42, verbose=True)
Thu Nov 9 17:22:54 2023 Construct fuzzy simplicial set
Thu Nov 9 17:22:58 2023 Finding Nearest Neighbors
Thu Nov 9 17:22:58 2023 Finished Nearest Neighbor Search
Thu Nov 9 17:22:58 2023 Construct embedding
Thu Nov 9 17:23:02 2023 Finished embedding
model$visualize_documents(docs = data_clean$message_bert, reduced_embeddings = embeddings_2d,
hide_annotations = TRUE)$show()

We can also build alternative representations of each topic, for example using maximal marginal relevance (MMR) to diversify the topic keywords, or an OpenAI model to generate a topic description.
representation_mmr <- bt_representation_mmr(fitted_model = model,
embedding_model = embedder,
diversity = 0.5)
representation_openai <- bt_representation_openai(fitted_model = model,
documents = data_clean$message_bert,
openai_model = "gpt-3.5-turbo",
api_key = Sys.getenv("OPENAI_API_KEY"),
chat = TRUE,
nr_repr_docs = 30L)
topic_table <- model$get_topic_info() %>%
select(-Representative_Docs) %>%
mutate(representation_mmr = representation_mmr,
representation_openai = representation_openai
)
topic_table %>% DT::datatable(options = list(pageLength = 5))

Now let's investigate the topics in a little more depth by looking at bigrams.
First we need to match each document to the topic it has been assigned to.
document_topic_df <- data.frame(message = data_clean$message_bert,
topic = model$topics_)
document_topic_df %>% head(5)
message
1 why shouldn't he call himself by his usual name? Jimmy Carter did because that's what everybody called him.
2 Just wait, @BetoORourke will demand everyone not even acknowledge all negative facts about him, and demand he be named winner due to his "identifying as the winner" or some other idiotic BS
3 Celebrating Hispanic Heritage month dancing with mariachi this morning! Perfect way to start the week
4 A few months ago I was contacted by @stmotivationmag to come to Salt Lake City Utah for @drummer909 show. It was lit. He did his thang. Rehearsal and soundcheck.
5 Today I Celebrate \nSimon Bolivar \nCelebrating Hispanic Heritage Month\nGoogle Simon Bolivar you will be impressed
topic
1 2
2 2
3 -1
4 -1
5 1
Now we can use ParseR to create a bigram in the same way as we always have. Note that I have not created a cleaned data column to use here, which is very bad practice when creating bigrams.
document_topic_df %>%
filter(topic == 1) %>%
ParseR::count_ngram(text_var = message, min_freq = 5) %>%
purrr::pluck("viz") %>%
ParseR::viz_ngram()

This would be a really useful point to use LandscapeR to investigate your topics.
Maybe there are some topics that are particularly alike and could be merged together?
We can see how the topics were formed and where they branch off from one another which might help us identify similar topics.
model$visualize_hierarchy()$show()

Let's look specifically at topics 17 and 23, which both appear to be about Hispanic celebration.
document_topic_df %>%
filter(topic == 17) %>%
DT::datatable(options = list(pageLength = 5))

document_topic_df %>%
filter(topic == 23) %>%
DT::datatable(options = list(pageLength = 5))

Maybe it would make sense to merge these two topics.
bt_merge_topics(fitted_model = model,
documents = data_clean$message_bert,
topics_to_merge = list(17L, 23L))
model$get_topic_info() %>%
select(-Representative_Docs) %>%
DT::datatable(options = list(pageLength = 5))

Another catch with using HDBSCAN as our clustering algorithm is that, along with numerous small topics, we also get a large number of "outlier" documents: documents that don't fit neatly into the defined topics. Is this a good thing? Is it a bad thing? That depends on what you want to achieve.
Maybe you want to segment your data very neatly into topics, in which case you can try to distribute the outlier documents into existing topics.
outliers <- bt_outliers_ctfidf(fitted_model = model,
documents = data_clean$message_bert,
topics = model$topics_,
threshold = 0.01)
outliers %>%
filter(current_topics == -1, new_topics != -1) %>%
DT::datatable(options = list(pageLength = 5))

We can see here that 782/806 outliers have been redistributed. Do we agree with how they've been redistributed? It's worth comparing the messages to the topics they've been distributed to.
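One quick way to do that, as a minimal sketch: join each redistributed outlier to the name of its new topic and eyeball a random sample. This assumes the data frame returned by bt_outliers_ctfidf carries the document text alongside the current_topics and new_topics columns used above.

# sketch: pair each redistributed outlier with its new topic's name
topic_names <- model$get_topic_info() %>%
  select(Topic, Name)

outliers %>%
  filter(current_topics == -1, new_topics != -1) %>%
  left_join(topic_names, by = c("new_topics" = "Topic")) %>%
  sample_n(10) # inspect a random handful of reassignments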
If we're happy with this, we can update our table:
document_topic_df_updated <- document_topic_df %>%
mutate(updated_topics = outliers$new_topics)
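As a final sanity check, we can look at how big each topic is once the outliers have been folded in; a short sketch:

# count documents per topic after redistributing the outliers
document_topic_df_updated %>%
  count(updated_topics, sort = TRUE) %>%
  head(10)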