BERTopic

# Alternative pipeline (kept commented out): SPECTER embeddings with
# GPU-accelerated HDBSCAN/UMAP from RAPIDS cuML.
# from transformers import AutoTokenizer, AutoModel
# tokenizer = AutoTokenizer.from_pretrained('allenai/specter')
# model = AutoModel.from_pretrained('allenai/specter')
# from bertopic import BERTopic
# from sentence_transformers import SentenceTransformer
# from cuml.cluster import HDBSCAN
# from cuml.manifold import UMAP
# import collections
# import pandas as pd
#
# language_model = SentenceTransformer('allenai-specter')
# cluster_model = HDBSCAN(prediction_data=True)
# umap_model = UMAP(n_components=10)
# topic_model = BERTopic(embedding_model=language_model,
#                        umap_model=umap_model,
#                        hdbscan_model=cluster_model,
#                        language="english",
#                        nr_topics="auto",
#                        min_topic_size=500,
#                        verbose=True)
# df = pd.read_csv("corpus.csv")
# corpus = df.text
# docs = corpus.values.tolist()
# topics, probs = topic_model.fit_transform(docs)
#
# counter = collections.Counter(topics)
# df = pd.DataFrame(dict(counter).items())
# df.columns = ['Topics', 'Size']
# file_path = 'topic_size.csv'
# df.to_csv(file_path)
import numpy as np
import pandas as pd
np.random.seed(0)
from sentence_transformers import SentenceTransformer
#conda install -c conda-forge sentence-transformers
from bertopic import BERTopic
# conda install -c conda-forge bertopic
from umap import UMAP
# conda install -c conda-forge umap-learn
# from bertopic.representation import MaximalMarginalRelevance
from sklearn.feature_extraction.text import CountVectorizer

# Prepare embeddings
# NOTE: make sure the path to corpus.csv below is correct, or move the file
# into the same directory as this .py script.
df = pd.read_csv("corpus.csv")
corpus = df.text
docs = corpus.values.tolist()
umap_model = UMAP(n_neighbors=10,
                  n_components=5,
                  min_dist=0.0,
                  metric='cosine',
                  random_state=100)

# Count vectorizer
vectorizer_model = CountVectorizer(stop_words="english")

# Notes: 1-grams; all-mpnet-base-v2 (an alternative embedding model; allenai-specter is used below)

sentence_model = SentenceTransformer("allenai-specter")
embeddings = sentence_model.encode(docs, show_progress_bar=True)
## Batches: 100%|##########| 109/109 [03:38<00:00,  2.00s/it]
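# Encoding the corpus takes a few minutes (109 batches above), so here is a
# minimal sketch of caching the embeddings to disk between runs; the file name
# embeddings_specter.npy is an assumption, not part of the original pipeline:
# import os
# if os.path.exists("embeddings_specter.npy"):
#     embeddings = np.load("embeddings_specter.npy")
# else:
#     embeddings = sentence_model.encode(docs, show_progress_bar=True)
#     np.save("embeddings_specter.npy", embeddings)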

# Train BERTopic
# Initiate BERTopic
topic_model = BERTopic(n_gram_range=(1, 3),
                       umap_model=umap_model,
                       vectorizer_model=vectorizer_model,
                       embedding_model=sentence_model)
# topic_model.save("all_mp_mod")
# topic_model = BERTopic.load("all_mp_mod")
# topic_model.save("SPECTER_mod")
# topic_model = BERTopic.load("SPECTER_mod")
topics, probs = topic_model.fit_transform(docs) 
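# A minimal sketch of inspecting topic sizes right after fitting:
# get_topic_info() returns one row per topic, and writing it out mirrors the
# commented-out Counter-based topic_size.csv export at the top of this script.
topic_info = topic_model.get_topic_info()
topic_info.to_csv("topic_size.csv", index=False)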
topic_model.visualize_topics().show()
# Run the visualization with the original embeddings
topic_model.visualize_documents(docs, embeddings=embeddings, hide_document_hover=True).show()
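# The .show() calls open interactive Plotly figures; a minimal sketch of saving
# them to standalone HTML instead (the output file names are assumptions):
# topic_model.visualize_topics().write_html("intertopic_distance_map.html")
# topic_model.visualize_documents(docs, embeddings=embeddings,
#                                 hide_document_hover=True).write_html("document_map.html")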
hierarchical_topics = topic_model.hierarchical_topics(docs)
## 100%|##########| 45/45 [00:00<00:00, 208.93it/s]
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics).show()
topic_model.visualize_barchart(top_n_topics=60, n_words=8).show()
# topic_model2 = BERTopic(verbose=True,vectorizer_model=vectorizer_model,embedding_model=sentence_model)
topics, probs = topic_model.fit_transform(docs, embeddings=embeddings)
#hierarchical_topics = topic_model.hierarchical_topics(docs)
from scipy.cluster import hierarchy as sch
# Hierarchical topics
linkage_function = lambda x: sch.linkage(x, 'single', optimal_ordering=True)
hierarchical_topics = topic_model.hierarchical_topics(docs, linkage_function=linkage_function)
## 100%|##########| 45/45 [00:00<00:00, 187.35it/s]
# Recompute the hierarchy with BERTopic's default linkage, overwriting the
# single-linkage result above
hierarchical_topics = topic_model.hierarchical_topics(docs)
## 100%|##########| 45/45 [00:00<00:00, 214.12it/s]
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics).show()
topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, embeddings=embeddings).show()
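# A plain-text view of the hierarchy can complement the interactive figures;
# a minimal sketch using BERTopic's get_topic_tree:
# tree = topic_model.get_topic_tree(hierarchical_topics)
# print(tree)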
import re


# Prepare data (the commented-out lines below are the preprocessing of the
# Trump tweet dataset from the BERTopic topics-over-time tutorial, kept only
# for reference; the actual corpus and timestamps are loaded afterwards)
# trump = pd.read_csv('https://drive.google.com/uc?export=download&id=1xRKHaP-QwACMydlDnyFPEaFdtskJuBa6')
# trump.text = trump.apply(lambda row: re.sub(r"http\S+", "", row.text).lower(), 1)
# trump.text = trump.apply(lambda row: " ".join(filter(lambda x:x[0]!="@", row.text.split())), 1)
# trump.text = trump.apply(lambda row: " ".join(re.sub("[^a-zA-Z]+", " ", row.text).split()), 1)
# trump = trump.loc[(trump.isRetweet == "f") & (trump.text != ""), :]
# timestamps = trump.date.to_list()
#tweets = trump.text.to_list()
df = pd.read_csv("corpus.csv")
corpus = df.text
docs = corpus.values.tolist()
timestamps = df.yearp.to_list()
#from bertopic import BERTopic

topic_model_t = topic_model
#topics_t, probs_t = topic_model5.fit_transform(docs)
topics, probs = topic_model_t.fit_transform(docs, embeddings)
topics_over_time = topic_model_t.topics_over_time(docs, timestamps, datetime_format=" %Y",
                                                  global_tuning=True, evolution_tuning=True)
topic_model_t.visualize_topics_over_time(topics_over_time[topics_over_time["Timestamp"] < 2023],
                                         top_n_topics=20, width=800, height=600).show()
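# A minimal sketch of persisting the dynamic-topic results for later analysis;
# topics_over_time is a pandas DataFrame, and the output file name is an assumption:
# topics_over_time.to_csv("topics_over_time.csv", index=False)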
topic_distr, _ = topic_model.approximate_distribution(docs)
topic_model.visualize_distribution(topic_distr[3]).show()
## Figure: "Topic Probability Distribution", a horizontal bar chart of topic
## probabilities for the document at index 3 (x axis: Probability; bars labelled
## with topics such as Topic 0 traffic_data_urban_model..., Topic 1
## students_science_educati..., Topic 6 data_science_ethical_eth...).
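# approximate_distribution can also return token-level distributions, which
# BERTopic can render per document; a minimal sketch (recomputes the
# distributions with calculate_tokens=True, here for the same document index 3):
# topic_distr, topic_token_distr = topic_model.approximate_distribution(docs, calculate_tokens=True)
# topic_model.visualize_approximate_distribution(docs[3], topic_token_distr[3])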
# from nltk.corpus import wordnet
# topic_model.reduce_topics(docs, nr_topics=30)