BERT

import numpy as np
import pandas as pd
np.random.seed(0)
from sentence_transformers import SentenceTransformer
#conda install -c conda-forge sentence-transformers
from bertopic import BERTopic
# conda install -c conda-forge bertopic
from umap import UMAP
# conda install -c conda-forge umap-learn
# from bertopic.representation import MaximalMarginalRelevance
from sklearn.feature_extraction.text import CountVectorizer

# Load the corpus and configure the UMAP and CountVectorizer components that
# are later handed to BERTopic.
# WARNING: make sure the path to corpus.csv is correct, or move the file into
# the directory that contains this .py file.
df = pd.read_csv("corpus.csv")
corpus = df["text"]
docs = corpus.tolist()

# UMAP configuration: 2 output components, cosine metric, and a fixed
# random_state so the seed is the same on every run.
umap_model = UMAP(
    n_neighbors=11,
    n_components=2,
    min_dist=0.0,
    metric="cosine",
    random_state=100,
)

# Count vectorizer with scikit-learn's built-in English stop-word list.
vectorizer_model = CountVectorizer(stop_words="english")

1-grams (unigrams)

all-MiniLM-L6-v2

# Load the "all-MiniLM-L6-v2" sentence encoder and embed every document once;
# the resulting `embeddings` are reused by the fitting and plotting calls below.
sentence_model = SentenceTransformer(
    "all-MiniLM-L6-v2"
)
embeddings = sentence_model.encode(
    docs,
    show_progress_bar=True,
)

# Train BERTopic
# Initiate BERTopic
## 
Batches:   0%|          | 0/109 [00:00<?, ?it/s]
Batches:   1%|          | 1/109 [00:01<02:22,  1.32s/it]
Batches:   2%|1         | 2/109 [00:01<01:19,  1.34it/s]
Batches:   3%|2         | 3/109 [00:02<00:59,  1.78it/s]
Batches:   4%|3         | 4/109 [00:02<00:49,  2.12it/s]
Batches:   5%|4         | 5/109 [00:02<00:44,  2.35it/s]
Batches:   6%|5         | 6/109 [00:03<00:40,  2.53it/s]
Batches:   6%|6         | 7/109 [00:03<00:38,  2.65it/s]
Batches:   7%|7         | 8/109 [00:03<00:36,  2.74it/s]
Batches:   8%|8         | 9/109 [00:04<00:35,  2.81it/s]
Batches:   9%|9         | 10/109 [00:04<00:34,  2.85it/s]
Batches:  10%|#         | 11/109 [00:04<00:34,  2.88it/s]
Batches:  11%|#1        | 12/109 [00:05<00:33,  2.90it/s]
Batches:  12%|#1        | 13/109 [00:05<00:32,  2.92it/s]
Batches:  13%|#2        | 14/109 [00:05<00:32,  2.93it/s]
Batches:  14%|#3        | 15/109 [00:06<00:31,  2.94it/s]
Batches:  15%|#4        | 16/109 [00:06<00:31,  2.95it/s]
Batches:  16%|#5        | 17/109 [00:06<00:31,  2.95it/s]
Batches:  17%|#6        | 18/109 [00:07<00:30,  2.95it/s]
Batches:  17%|#7        | 19/109 [00:07<00:30,  2.94it/s]
Batches:  18%|#8        | 20/109 [00:07<00:30,  2.94it/s]
Batches:  19%|#9        | 21/109 [00:08<00:29,  2.95it/s]
Batches:  20%|##        | 22/109 [00:08<00:29,  2.95it/s]
Batches:  21%|##1       | 23/109 [00:08<00:29,  2.95it/s]
Batches:  22%|##2       | 24/109 [00:09<00:28,  2.96it/s]
Batches:  23%|##2       | 25/109 [00:09<00:28,  2.96it/s]
Batches:  24%|##3       | 26/109 [00:09<00:28,  2.96it/s]
Batches:  25%|##4       | 27/109 [00:10<00:27,  2.96it/s]
Batches:  26%|##5       | 28/109 [00:10<00:27,  2.97it/s]
Batches:  27%|##6       | 29/109 [00:10<00:27,  2.96it/s]
Batches:  28%|##7       | 30/109 [00:11<00:26,  2.96it/s]
Batches:  28%|##8       | 31/109 [00:11<00:26,  2.96it/s]
Batches:  29%|##9       | 32/109 [00:11<00:25,  2.96it/s]
Batches:  30%|###       | 33/109 [00:12<00:25,  2.96it/s]
Batches:  31%|###1      | 34/109 [00:12<00:25,  2.97it/s]
Batches:  32%|###2      | 35/109 [00:12<00:24,  2.96it/s]
Batches:  33%|###3      | 36/109 [00:13<00:24,  2.97it/s]
Batches:  34%|###3      | 37/109 [00:13<00:24,  2.90it/s]
Batches:  35%|###4      | 38/109 [00:13<00:24,  2.89it/s]
Batches:  36%|###5      | 39/109 [00:14<00:24,  2.91it/s]
Batches:  37%|###6      | 40/109 [00:14<00:23,  2.92it/s]
Batches:  38%|###7      | 41/109 [00:14<00:23,  2.94it/s]
Batches:  39%|###8      | 42/109 [00:15<00:22,  2.95it/s]
Batches:  39%|###9      | 43/109 [00:15<00:22,  2.95it/s]
Batches:  40%|####      | 44/109 [00:15<00:22,  2.95it/s]
Batches:  41%|####1     | 45/109 [00:16<00:21,  2.96it/s]
Batches:  42%|####2     | 46/109 [00:16<00:21,  2.95it/s]
Batches:  43%|####3     | 47/109 [00:16<00:21,  2.95it/s]
Batches:  44%|####4     | 48/109 [00:17<00:20,  2.95it/s]
Batches:  45%|####4     | 49/109 [00:17<00:20,  2.94it/s]
Batches:  46%|####5     | 50/109 [00:17<00:20,  2.95it/s]
Batches:  47%|####6     | 51/109 [00:18<00:19,  2.95it/s]
Batches:  48%|####7     | 52/109 [00:18<00:19,  2.94it/s]
Batches:  49%|####8     | 53/109 [00:18<00:18,  2.95it/s]
Batches:  50%|####9     | 54/109 [00:19<00:18,  2.96it/s]
Batches:  50%|#####     | 55/109 [00:19<00:18,  2.96it/s]
Batches:  51%|#####1    | 56/109 [00:19<00:17,  2.96it/s]
Batches:  52%|#####2    | 57/109 [00:20<00:17,  2.96it/s]
Batches:  53%|#####3    | 58/109 [00:20<00:16,  3.00it/s]
Batches:  54%|#####4    | 59/109 [00:20<00:16,  2.99it/s]
Batches:  55%|#####5    | 60/109 [00:21<00:16,  3.03it/s]
Batches:  56%|#####5    | 61/109 [00:21<00:15,  3.00it/s]
Batches:  57%|#####6    | 62/109 [00:21<00:15,  3.05it/s]
Batches:  58%|#####7    | 63/109 [00:22<00:15,  3.02it/s]
Batches:  59%|#####8    | 64/109 [00:22<00:14,  3.00it/s]
Batches:  60%|#####9    | 65/109 [00:22<00:14,  3.03it/s]
Batches:  61%|######    | 66/109 [00:23<00:13,  3.09it/s]
Batches:  61%|######1   | 67/109 [00:23<00:13,  3.04it/s]
Batches:  62%|######2   | 68/109 [00:23<00:13,  3.14it/s]
Batches:  63%|######3   | 69/109 [00:24<00:12,  3.22it/s]
Batches:  64%|######4   | 70/109 [00:24<00:11,  3.25it/s]
Batches:  65%|######5   | 71/109 [00:24<00:12,  3.16it/s]
Batches:  66%|######6   | 72/109 [00:25<00:11,  3.24it/s]
Batches:  67%|######6   | 73/109 [00:25<00:10,  3.35it/s]
Batches:  68%|######7   | 74/109 [00:25<00:10,  3.42it/s]
Batches:  69%|######8   | 75/109 [00:25<00:09,  3.51it/s]
Batches:  70%|######9   | 76/109 [00:26<00:09,  3.49it/s]
Batches:  71%|#######   | 77/109 [00:26<00:09,  3.43it/s]
Batches:  72%|#######1  | 78/109 [00:26<00:08,  3.47it/s]
Batches:  72%|#######2  | 79/109 [00:27<00:08,  3.46it/s]
Batches:  73%|#######3  | 80/109 [00:27<00:08,  3.55it/s]
Batches:  74%|#######4  | 81/109 [00:27<00:07,  3.58it/s]
Batches:  75%|#######5  | 82/109 [00:27<00:07,  3.62it/s]
Batches:  76%|#######6  | 83/109 [00:28<00:07,  3.65it/s]
Batches:  77%|#######7  | 84/109 [00:28<00:06,  3.71it/s]
Batches:  78%|#######7  | 85/109 [00:28<00:06,  3.75it/s]
Batches:  79%|#######8  | 86/109 [00:28<00:06,  3.82it/s]
Batches:  80%|#######9  | 87/109 [00:29<00:05,  3.95it/s]
Batches:  81%|########  | 88/109 [00:29<00:05,  3.86it/s]
Batches:  82%|########1 | 89/109 [00:29<00:05,  3.97it/s]
Batches:  83%|########2 | 90/109 [00:29<00:04,  4.03it/s]
Batches:  83%|########3 | 91/109 [00:30<00:04,  4.13it/s]
Batches:  84%|########4 | 92/109 [00:30<00:03,  4.28it/s]
Batches:  85%|########5 | 93/109 [00:30<00:03,  4.16it/s]
Batches:  86%|########6 | 94/109 [00:30<00:03,  4.41it/s]
Batches:  87%|########7 | 95/109 [00:30<00:03,  4.60it/s]
Batches:  88%|########8 | 96/109 [00:31<00:02,  4.74it/s]
Batches:  89%|########8 | 97/109 [00:31<00:02,  4.77it/s]
Batches:  90%|########9 | 98/109 [00:31<00:02,  4.82it/s]
Batches:  91%|######### | 99/109 [00:31<00:02,  4.84it/s]
Batches:  92%|#########1| 100/109 [00:31<00:01,  5.06it/s]
Batches:  93%|#########2| 101/109 [00:32<00:01,  5.15it/s]
Batches:  94%|#########3| 102/109 [00:32<00:01,  5.33it/s]
Batches:  94%|#########4| 103/109 [00:32<00:01,  5.73it/s]
Batches:  95%|#########5| 104/109 [00:32<00:00,  6.06it/s]
Batches:  96%|#########6| 105/109 [00:32<00:00,  5.92it/s]
Batches:  97%|#########7| 106/109 [00:32<00:00,  6.33it/s]
Batches:  98%|#########8| 107/109 [00:33<00:00,  7.06it/s]
Batches: 100%|##########| 109/109 [00:33<00:00,  3.29it/s]
# Assemble the topic model from the UMAP reducer, the count vectorizer and the
# sentence encoder configured earlier in the script.
# NOTE(review): a custom vectorizer_model is supplied as well as n_gram_range;
# confirm in the BERTopic docs which of the two takes effect.
topic_model = BERTopic(
    n_gram_range=(1, 1),
    umap_model=umap_model,
    vectorizer_model=vectorizer_model,
    embedding_model=sentence_model,
)

# Fit on the corpus, supplying the precomputed document embeddings.
topics, probs = topic_model.fit_transform(docs, embeddings=embeddings)

# Topic overview plot.
topics_figure = topic_model.visualize_topics()
topics_figure.show()

# Document-level plot, run with the original embeddings; per-document hover
# text is disabled.
documents_figure = topic_model.visualize_documents(
    docs,
    embeddings=embeddings,
    hide_document_hover=True,
)
documents_figure.show()

# Topic hierarchy, consumed by the hierarchy visualizations further down.
hierarchical_topics = topic_model.hierarchical_topics(docs)
## 
  0%|          | 0/51 [00:00<?, ?it/s]
 18%|#7        | 9/51 [00:00<00:00, 89.56it/s]
 39%|###9      | 20/51 [00:00<00:00, 97.83it/s]
 63%|######2   | 32/51 [00:00<00:00, 107.47it/s]
 86%|########6 | 44/51 [00:00<00:00, 108.32it/s]
100%|##########| 51/51 [00:00<00:00, 104.58it/s]
# Plot the topic hierarchy computed just before this point.
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics).show()

# Bar charts of the top 8 words for up to 60 topics.
topic_model.visualize_barchart(top_n_topics=60, n_words=8).show()

# topic_model2 = BERTopic(verbose=True,vectorizer_model=vectorizer_model,embedding_model=sentence_model)

# Refit the model and rebuild the hierarchy. The precomputed `embeddings` are
# passed explicitly, as in the other fit_transform calls in this script:
# without them BERTopic re-encodes every document with the same
# `sentence_model`, repeating the expensive embedding step for no benefit.
topics, probs = topic_model.fit_transform(docs, embeddings=embeddings)
hierarchical_topics = topic_model.hierarchical_topics(docs)
## 
  0%|          | 0/51 [00:00<?, ?it/s]
 24%|##3       | 12/51 [00:00<00:00, 117.08it/s]
 47%|####7     | 24/51 [00:00<00:00, 105.51it/s]
 69%|######8   | 35/51 [00:00<00:00, 105.80it/s]
 92%|#########2| 47/51 [00:00<00:00, 107.50it/s]
100%|##########| 51/51 [00:00<00:00, 107.08it/s]
# Plot the refreshed topic hierarchy.
hierarchy_figure = topic_model.visualize_hierarchy(
    hierarchical_topics=hierarchical_topics,
)
hierarchy_figure.show()

# Plot the documents against the hierarchy, using the precomputed embeddings.
hierarchical_docs_figure = topic_model.visualize_hierarchical_documents(
    docs,
    hierarchical_topics,
    embeddings=embeddings,
)
hierarchical_docs_figure.show()
import re


# Prepare data
# trump = pd.read_csv('https://drive.google.com/uc?export=download&id=1xRKHaP-QwACMydlDnyFPEaFdtskJuBa6')
# trump.text = trump.apply(lambda row: re.sub(r"http\S+", "", row.text).lower(), 1)
# trump.text = trump.apply(lambda row: " ".join(filter(lambda x:x[0]!="@", row.text.split())), 1)
# trump.text = trump.apply(lambda row: " ".join(re.sub("[^a-zA-Z]+", " ", row.text).split()), 1)
# trump = trump.loc[(trump.isRetweet == "f") & (trump.text != ""), :]
# timestamps = trump.date.to_list()
#tweets = trump.text.to_list()
# Dynamic topic modelling: reload the corpus and take the publication-year
# column ("yearp") as the timestamp of each document.
df = pd.read_csv("corpus.csv")
corpus = df["text"]
docs = corpus.tolist()
timestamps = df["yearp"].tolist()
#from bertopic import BERTopic

# NOTE(review): this is an alias, not a copy, so the fit below refits the
# very same object that `topic_model` refers to.
topic_model_t = topic_model
#topics_t, probs_t = topic_model5.fit_transform(docs)
topics, probs = topic_model_t.fit_transform(docs, embeddings=embeddings)

# NOTE(review): the filter further down compares "Timestamp" with the integer
# 2023, which suggests `yearp` holds numeric years; datetime_format (including
# its leading space) may therefore have no effect here. Confirm.
topics_over_time = topic_model_t.topics_over_time(
    docs,
    timestamps,
    datetime_format=" %Y",
    global_tuning=True,
    evolution_tuning=True,
)

# Keep only rows with a timestamp before 2023, then plot the 20 top topics.
before_2023 = topics_over_time["Timestamp"] < 2023
over_time_figure = topic_model_t.visualize_topics_over_time(
    topics_over_time[before_2023],
    top_n_topics=20,
    width=800,
    height=600,
)
over_time_figure.show()