add_two_number <- function(number1, number2){
answer <- sum(number1, number2)
return(answer)
}
add_two_number(3,4)
[1] 7
library(BertopicR)
library(dplyr)

# import docs to fit model to
set.seed(12) # ensure the same sample is taken each time
data <- SegmentR::sprinklr_export %>%
filter(!stringr::str_detect(Message, "A word of advice\\?")) %>%
select(created_time = CreatedTime, message = Message) %>%
tidyr::drop_na(message) %>%
LimpiaR::limpiar_duplicates(message) %>%
sample_n(2000)
data_clean <- data %>%
mutate(message_bert = message) %>%
ParseR::clean_text(text_var = message_bert,
tolower = FALSE, # don't lower case
hashtags = FALSE, # remove hashtags
mentions = TRUE, # keep mentions
punctuation = TRUE, # keep punctuation
emojis = FALSE, # remove emojis
digits = TRUE) # keep digits
data_clean %>% head(5)
# A tibble: 5 × 3
created_time message message_bert
<dttm> <chr> <chr>
1 2018-10-17 03:20:20 "why shouldn't he call himself by his usual … "why should…
2 2018-10-29 21:35:57 "Just wait, @BetoORourke will demand everyon… "Just wait,…
3 2018-10-15 14:28:43 "Celebrating Hispanic Heritage month dancing… "Celebratin…
4 2018-10-29 05:07:11 "A few months ago I was contacted by @stmoti… "A few mont…
5 2018-10-20 20:02:36 "Today I Celebrate \nSimon Bolivar \nCelebra… "Today I Ce…
embedder <- bt_make_embedder_st(model = "all-mpnet-base-v2")
embeddings <- bt_do_embedding(embedder = embedder,
documents = data_clean$message_bert,
accelerator = "mps")
# look at embeddings
embeddings[1:10]
[1] 0.0396 0.0048 -0.1196 -0.0262 -0.0681 -0.0724 -0.0272 -0.0240 -0.0424
[10] -0.0407
dim(embeddings)
[1] 2000  768
length(data_clean$message_bert)
[1] 2000
reducer <- bt_make_reducer_umap(n_components = 10L, metric = "cosine")
reduced_embeddings <- bt_do_reducing(reducer, embeddings)
UMAP(angular_rp_forest=True, low_memory=False, metric='cosine', min_dist=0.0, n_components=10, random_state=42, verbose=True)
Thu Nov 9 17:22:39 2023 Construct fuzzy simplicial set
Thu Nov 9 17:22:43 2023 Finding Nearest Neighbors
Thu Nov 9 17:22:44 2023 Finished Nearest Neighbor Search
Thu Nov 9 17:22:46 2023 Construct embedding
Thu Nov 9 17:22:51 2023 Finished embedding
Now we can look at how reducing the dimensions of the embeddings has changed where the documents lie in space and reduced the number of dimensions representing each document.
embeddings[1:10]
[1] 0.0396 0.0048 -0.1196 -0.0262 -0.0681 -0.0724 -0.0272 -0.0240 -0.0424
[10] -0.0407
reduced_embeddings[1:10]
[1] 8.4 8.7 6.6 7.1 6.7 7.0 7.7 6.6 6.5 7.4
dim(reduced_embeddings)
[1] 2000   10
dim(embeddings)
[1] 2000  768
We are going to define the clusterer but let the model building process perform the clustering rather than performing clustering externally as we did with creating and reducing the embeddings.
clusterer <- bt_make_clusterer_hdbscan(min_cluster_size = 10L, cluster_selection_method = "eom")

We have already completed document embedding and reduction, so we pass empty models to bt_compile_model to skip these steps.
model <- bt_compile_model(embedding_model = bt_empty_embedder(), # skip embedding step
reduction_model = bt_empty_reducer(), # skip reducing step
clustering_model = clusterer) # clustering method to use
bt_fit_model(model = model, documents = data_clean$message_bert, embeddings = reduced_embeddings)
model$get_topic_info() %>% select(-Representative_Docs) %>% DT::datatable(options = list(pageLength = 5))

We are going to reduce the embeddings to 2d so that we can visualise the clusters (topics). It is important not to reduce the reduced_embeddings here: we have already lost some information by reducing the original embeddings to 10 dimensions, so we reduce the original 768-dimensional embeddings to 2d directly.
# reduce embeddings to 2d
reducer2d <- bt_make_reducer_umap(n_components = 2L, metric = "cosine")
embeddings_2d <- bt_do_reducing(reducer2d, embeddings)
UMAP(angular_rp_forest=True, low_memory=False, metric='cosine', min_dist=0.0, random_state=42, verbose=True)
Thu Nov 9 17:22:54 2023 Construct fuzzy simplicial set
Thu Nov 9 17:22:58 2023 Finding Nearest Neighbors
Thu Nov 9 17:22:58 2023 Finished Nearest Neighbor Search
Thu Nov 9 17:22:58 2023 Construct embedding
Thu Nov 9 17:23:02 2023 Finished embedding
model$visualize_documents(docs = data_clean$message_bert, reduced_embeddings = embeddings_2d,
hide_annotations = TRUE)$show()

We can also build alternative representations of each topic, for example using maximal marginal relevance (MMR) to diversify the topic keywords, or an OpenAI model to generate a topic description.
representation_mmr <- bt_representation_mmr(fitted_model = model,
embedding_model = embedder,
diversity = 0.5)
representation_openai <- bt_representation_openai(fitted_model = model,
documents = data_clean$message_bert,
openai_model = "gpt-3.5-turbo",
api_key = Sys.getenv("OPENAI_API_KEY"),
chat = TRUE,
nr_repr_docs = 30L)
topic_table <- model$get_topic_info() %>%
select(-Representative_Docs) %>%
mutate(representation_mmr = representation_mmr,
representation_openai = representation_openai
)
topic_table %>% DT::datatable(options = list(pageLength = 5))

Now let's investigate the topics in a little more depth by looking at bigrams.
First we need to match each document to the topic it has been assigned to.
document_topic_df <- data.frame(message = data_clean$message_bert,
topic = model$topics_)
document_topic_df %>% head(5)
message
1 why shouldn't he call himself by his usual name? Jimmy Carter did because that's what everybody called him.
2 Just wait, @BetoORourke will demand everyone not even acknowledge all negative facts about him, and demand he be named winner due to his "identifying as the winner" or some other idiotic BS
3 Celebrating Hispanic Heritage month dancing with mariachi this morning! Perfect way to start the week
4 A few months ago I was contacted by @stmotivationmag to come to Salt Lake City Utah for @drummer909 show. It was lit. He did his thang. Rehearsal and soundcheck.
5 Today I Celebrate \nSimon Bolivar \nCelebrating Hispanic Heritage Month\nGoogle Simon Bolivar you will be impressed
topic
1 2
2 2
3 -1
4 -1
5 1
Now we can use ParseR to create a bigram in the same way as we always have. Note that I have not created a cleaned data column to use here, which is very bad practice when creating bigrams.
document_topic_df %>%
filter(topic == 1) %>%
ParseR::count_ngram(text_var = message, min_freq = 5) %>%
purrr::pluck("viz") %>%
ParseR::viz_ngram()

This would be a really useful point to use LandscapeR to investigate your topics.
Maybe there are some topics that are particularly alike and could be merged together?
We can see how the topics were formed and where they branch off from one another which might help us identify similar topics.
model$visualize_hierarchy()$show()

Let's look specifically at topics 17 and 23, which both appear to be about Hispanic celebration.
document_topic_df %>%
filter(topic == 17) %>%
DT::datatable(options = list(pageLength = 5))

document_topic_df %>%
filter(topic == 23) %>%
DT::datatable(options = list(pageLength = 5))

Maybe it would make sense to merge these two topics.
bt_merge_topics(fitted_model = model,
documents = data_clean$message_bert,
topics_to_merge = list(17L, 23L))
model$get_topic_info() %>%
select(-Representative_Docs) %>%
DT::datatable(options = list(pageLength = 5))

Another catch with using HDBSCAN as our clustering algorithm is that, along with numerous small topics, we also get a large number of "outlier" documents: documents that don't fit neatly into the defined topics. Is this a good thing? Is it a bad thing? That depends on what you want to achieve.
Maybe you want to segment your data very neatly into topics, in which case you can try to distribute the outlier documents into existing topics.
outliers <- bt_outliers_ctfidf(fitted_model = model,
documents = data_clean$message_bert,
topics = model$topics_,
threshold = 0.01)
outliers %>%
filter(current_topics == -1, new_topics != -1) %>%
DT::datatable(options = list(pageLength = 5))

We can see here that 782/806 outliers have been redistributed. Do we agree with how they've been redistributed? It's worth comparing the messages to the topics they've been distributed to.
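One quick way to do that, as a minimal sketch: join each redistributed outlier to the name of its new topic and eyeball a random sample. This assumes the data frame returned by bt_outliers_ctfidf carries the document text alongside the current_topics and new_topics columns used above.

# sketch: pair each redistributed outlier with its new topic's name
topic_names <- model$get_topic_info() %>%
  select(Topic, Name)

outliers %>%
  filter(current_topics == -1, new_topics != -1) %>%
  left_join(topic_names, by = c("new_topics" = "Topic")) %>%
  sample_n(10) # inspect a random handful of reassignments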
If we're happy with this, we can update our table:
document_topic_df_updated <- document_topic_df %>%
mutate(updated_topics = outliers$new_topics)
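As a final sanity check, we can look at how big each topic is once the outliers have been folded in; a short sketch:

# count documents per topic after redistributing the outliers
document_topic_df_updated %>%
  count(updated_topics, sort = TRUE) %>%
  head(10)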