# Embedding model: "allenai-specter" is used here ("all-mpnet-base-v2" is the alternative
# referenced by the save/load lines below).
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
import pandas as pd

sentence_model = SentenceTransformer("allenai-specter")
embeddings = sentence_model.encode(docs, show_progress_bar=True)
##
Batches: 100%|##########| 109/109 [03:38<00:00, 2.00s/it]
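# umap_model and vectorizer_model are passed to BERTopic below; they were defined
# earlier in the script. A minimal sketch of typical definitions, in case this
# section is run standalone (all parameter values here are assumptions, not the originals):
from umap import UMAP
from sklearn.feature_extraction.text import CountVectorizer

umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0,
                  metric="cosine", random_state=42)
vectorizer_model = CountVectorizer(stop_words="english", min_df=2)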
# Train BERTopic
# Initialize BERTopic with the UMAP and vectorizer components defined above
topic_model = BERTopic(n_gram_range=(1, 3), umap_model=umap_model,
                       vectorizer_model=vectorizer_model, embedding_model=sentence_model)
# topic_model.save("all_mp_mod")
# topic_model = BERTopic.load("all_mp_mod")
# topic_model.save("SPECTER_mod")
# topic_model = BERTopic.load("SPECTER_mod")
# fit_transform without precomputed embeddings re-encodes docs with sentence_model
topics, probs = topic_model.fit_transform(docs)
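# Quick sanity check on the fit: get_topic_info() returns one row per topic with
# its size and top words.
print(topic_model.get_topic_info().head(10))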
topic_model.visualize_topics().show()
# Run the visualization with the original embeddings
topic_model.visualize_documents(docs, embeddings=embeddings, hide_document_hover=True).show()
hierarchical_topics = topic_model.hierarchical_topics(docs)
##
100%|##########| 45/45 [00:00<00:00, 208.93it/s]
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics).show()
topic_model.visualize_barchart(top_n_topics=60, n_words=8).show()
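# The visualize_* methods return Plotly figures, so a figure can be kept and written
# to HTML for sharing as well as shown; the file name here is an assumption:
fig = topic_model.visualize_barchart(top_n_topics=60, n_words=8)
fig.write_html("barchart.html")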
# topic_model2 = BERTopic(verbose=True, vectorizer_model=vectorizer_model, embedding_model=sentence_model)
# Refit, this time passing the precomputed embeddings so docs are not re-encoded
topics, probs = topic_model.fit_transform(docs, embeddings=embeddings)
#hierarchical_topics = topic_model.hierarchical_topics(docs)
from scipy.cluster import hierarchy as sch
# Hierarchical topics
linkage_function = lambda x: sch.linkage(x, 'single', optimal_ordering=True)
hierarchical_topics = topic_model.hierarchical_topics(docs, linkage_function=linkage_function)
##
100%|##########| 45/45 [00:00<00:00, 187.35it/s]
# Recompute with the default linkage, overwriting the single-linkage result above
hierarchical_topics = topic_model.hierarchical_topics(docs)
##
100%|##########| 45/45 [00:00<00:00, 214.12it/s]
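# The same hierarchy can also be rendered as text: get_topic_tree() returns an
# ASCII tree of the merged topics.
print(topic_model.get_topic_tree(hierarchical_topics))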
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics).show()
topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, embeddings=embeddings).show()
import re

# Prepare data
# Commented-out example preprocessing (Trump tweets, from the BERTopic dynamic
# topic modeling tutorial), kept for reference:
# trump = pd.read_csv('https://drive.google.com/uc?export=download&id=1xRKHaP-QwACMydlDnyFPEaFdtskJuBa6')
# trump.text = trump.apply(lambda row: re.sub(r"http\S+", "", row.text).lower(), 1)
# trump.text = trump.apply(lambda row: " ".join(filter(lambda x: x[0] != "@", row.text.split())), 1)
# trump.text = trump.apply(lambda row: " ".join(re.sub("[^a-zA-Z]+", " ", row.text).split()), 1)
# trump = trump.loc[(trump.isRetweet == "f") & (trump.text != ""), :]
# timestamps = trump.date.to_list()
# tweets = trump.text.to_list()
df = pd.read_csv("corpus.csv")
corpus = df.text
docs = corpus.values.tolist()
timestamps = df.yearp.to_list()
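# Dynamic topic modeling needs one timestamp per document; a cheap guard:
assert len(docs) == len(timestamps), "docs and timestamps must be the same length"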
topic_model_t = topic_model
# Refit on the reloaded corpus, reusing the precomputed embeddings
topics, probs = topic_model_t.fit_transform(docs, embeddings)
topics_over_time = topic_model_t.topics_over_time(docs, timestamps, datetime_format=" %Y",
                                                  global_tuning=True, evolution_tuning=True)
topic_model_t.visualize_topics_over_time(topics_over_time[topics_over_time["Timestamp"] < 2023],
                                         top_n_topics=20, width=800, height=600).show()
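# topics_over_time is a plain DataFrame (it is filtered on its "Timestamp" column
# above), so it can be persisted for later runs; the file name is an assumption:
topics_over_time.to_csv("topics_over_time.csv", index=False)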
topic_distr, _ = topic_model.approximate_distribution(docs)
topic_model.visualize_distribution(topic_distr[3]).show()
## Plotly Figure repr: '<b>Topic Probability Distribution</b>' horizontal bar chart,
## 800x600, for the document at index 3. Largest probabilities:
## Topic 1 students_science_educati... (0.239), Topic 6 data_science_ethical_eth... (0.128),
## Topic 36 data_big_em_natural_lan... (0.078), Topic 13 research_publications_s... (0.074),
## Topic 14 projects_teams_agile_pr... (0.062); 14 further topics below 0.05.
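# To read the same distribution numerically rather than from the chart, a sketch
# that lists the most probable topics for the document at index 3:
import numpy as np

for topic_id in np.argsort(topic_distr[3])[::-1][:5]:
    topic_id = int(topic_id)
    words = [word for word, _ in topic_model.get_topic(topic_id)]
    print(topic_id, round(float(topic_distr[3][topic_id]), 3), "_".join(words[:4]))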
# from nltk.corpus import wordnet
# topic_model.reduce_topics(docs, nr_topics=30)