# Embed the documents with the all-MiniLM-L6-v2 sentence-transformer model
from sentence_transformers import SentenceTransformer

sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(docs, show_progress_bar=True)
##
Batches: 100%|##########| 109/109 [00:33<00:00, 3.29it/s]
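# Optional: cache the embeddings to disk so later runs can skip the encode step
# (a minimal sketch; the file name "embeddings.npy" is an arbitrary choice).
import numpy as np

np.save("embeddings.npy", embeddings)
# embeddings = np.load("embeddings.npy")  # reload in a later session

# The constructor below expects umap_model and vectorizer_model to be defined
# earlier in the script; if they are not, a typical (assumed, not the author's
# exact) configuration would be:
# from umap import UMAP
# from sklearn.feature_extraction.text import CountVectorizer
# umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric="cosine", random_state=42)
# vectorizer_model = CountVectorizer(stop_words="english")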
# Initiate and train BERTopic on the precomputed embeddings
topic_model = BERTopic(n_gram_range=(1, 1), umap_model=umap_model,
                       vectorizer_model=vectorizer_model, embedding_model=sentence_model)
topics, probs = topic_model.fit_transform(docs, embeddings=embeddings)
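# Quick sanity check on the fitted model (standard BERTopic accessors):
# get_topic_info() gives one row per topic; topic -1 collects outliers.
print(topic_model.get_topic_info().head(10))
print(topic_model.get_topic(0))  # top words and c-TF-IDF scores for topic 0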
topic_model.visualize_topics().show()
# Run the visualization with the original embeddings
topic_model.visualize_documents(docs, embeddings=embeddings, hide_document_hover=True).show()
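# Alternatively, precompute the 2-D reduction once and reuse it across calls
# (a sketch following the BERTopic docs; these UMAP parameters are illustrative):
from umap import UMAP

reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0,
                          metric="cosine").fit_transform(embeddings)
topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings,
                                hide_document_hover=True).show()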
hierarchical_topics = topic_model.hierarchical_topics(docs)
##
100%|##########| 51/51 [00:00<00:00, 104.58it/s]
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics).show()
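# The same hierarchy can also be printed as a plain-text tree:
tree = topic_model.get_topic_tree(hierarchical_topics)
print(tree)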
topic_model.visualize_barchart(top_n_topics=60, n_words=8).show()
# topic_model2 = BERTopic(verbose=True,vectorizer_model=vectorizer_model,embedding_model=sentence_model)
# Refit without the precomputed embeddings; the documents are re-encoded internally
topics, probs = topic_model.fit_transform(docs)
hierarchical_topics = topic_model.hierarchical_topics(docs)
##
100%|##########| 51/51 [00:00<00:00, 107.08it/s]
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics).show()
topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, embeddings=embeddings).show()
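# Persist the trained model so the analysis can be reloaded later (a sketch;
# "bertopic_model" is an arbitrary path, and newer BERTopic versions also
# accept a serialization= argument for safetensors/pytorch saving).
topic_model.save("bertopic_model")
# topic_model = BERTopic.load("bertopic_model")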
import re

# Prepare data (the original Trump-tweets example, kept for reference; the
# corpus.csv load below is what is actually used)
# trump = pd.read_csv('https://drive.google.com/uc?export=download&id=1xRKHaP-QwACMydlDnyFPEaFdtskJuBa6')
# trump.text = trump.apply(lambda row: re.sub(r"http\S+", "", row.text).lower(), 1)
# trump.text = trump.apply(lambda row: " ".join(filter(lambda x: x[0] != "@", row.text.split())), 1)
# trump.text = trump.apply(lambda row: " ".join(re.sub("[^a-zA-Z]+", " ", row.text).split()), 1)
# trump = trump.loc[(trump.isRetweet == "f") & (trump.text != ""), :]
# timestamps = trump.date.to_list()
# tweets = trump.text.to_list()
# Load the corpus; corpus.csv provides a "text" column and a "yearp" column
# holding the year of each document
df = pd.read_csv("corpus.csv")
corpus = df.text
docs = corpus.values.tolist()
timestamps = df.yearp.to_list()
# from bertopic import BERTopic
# Reuse the already-fitted model for the dynamic (topics-over-time) analysis
topic_model_t = topic_model
# topics_t, probs_t = topic_model5.fit_transform(docs)
topics, probs = topic_model_t.fit_transform(docs, embeddings)
topics_over_time = topic_model_t.topics_over_time(docs, timestamps, datetime_format=" %Y",
                                                  global_tuning=True, evolution_tuning=True)
topic_model_t.visualize_topics_over_time(topics_over_time[topics_over_time["Timestamp"] < 2023],
                                         top_n_topics=20, width=800, height=600).show()
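# topics_over_time is a plain pandas DataFrame (columns include Topic, Words,
# Frequency, Timestamp), so it can be inspected or filtered directly, e.g.:
print(topics_over_time.sort_values("Frequency", ascending=False).head(10))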