\[\\[0.2in]\]

Topic Modeling

\[\\[0.1in]\]

Housekeeping

Loading needed libraries

library(dplyr)
library(ggplot2)
library(wordcloud2)
library(tm)
library(topicmodels)
library(tidytext)
library(Hmisc)

\[\\[0.1in]\]

Read Reddit data from a CSV file

reddit <- read.csv("tut1b.reddit.csv", header = TRUE)

\[\\[0.001in]\]

\[\\[0.01in]\]

Unnesting tokens

reddit.tm <- tidytext::unnest_tokens(reddit, word, cleaner_text)

word.freq <- reddit.tm %>% 
  dplyr::group_by(author) %>%
  dplyr::count(word, sort = TRUE)

word.freq %>%
  dplyr::ungroup() %>% 
  dplyr::count(word, sort = T)  %>% 
  filter(!is.na(word)) %>%
  top_n(25)%>%
  ggplot(aes(x= reorder(word, n), y= n)) +
  geom_bar(stat ="identity", fill= "darkorange")+
  coord_flip()+
  theme_classic()+
  labs(title="Top words (Reddit corpus)",
       x = "Word",
       y = "Frequency")

TF-IDF

tfidf <- word.freq %>% 
  tidytext::bind_tf_idf(word, author, n)

psych::describe(tfidf[3:6])

##        vars     n mean   sd median trimmed  mad  min   max range  skew kurtosis
## n         1 41308 1.29 1.17   1.00    1.07 0.00 1.00 81.00 80.00 18.86   807.05
## tf        2 41308 0.02 0.05   0.01    0.01 0.01 0.00  1.00  1.00  8.54   115.98
## idf       3 41165 4.23 1.57   4.06    4.21 1.77 1.02  6.89  5.87  0.12    -0.91
## tf_idf    4 41165 0.09 0.23   0.03    0.05 0.03 0.00  6.89  6.89 10.63   195.29
##          se
## n      0.01
## tf     0.00
## idf    0.01
## tf_idf 0.00

Document-Term Matrix

# Create Document-Term Matrix

reddit.dtm <- reddit.tm %>%
  dplyr::count(author, word) %>%    
  tidytext::cast_dtm(author, word, n) 

dtm_matrix <- as.matrix(reddit.dtm)

knitr::kable(as.data.frame(as.matrix(reddit.dtm[1:50, 1:10])))

	buy	confusing	cover	dollar	eth	foolishly	gas	im	messing	seventeen
-DapperDuck-	1	1	1	1	1	1	1	1	1	1
-Monero	0	0	0	0	0	0	0	0	0	0
[deleted]	0	0	0	4	0	0	0	0	0	0
*MOM*	0	0	0	0	0	0	0	0	0	0
Commando	0	0	0	0	0	0	0	0	0	0
_DeanRiding	0	0	0	0	0	0	0	0	0	0
_dekappatated	0	0	0	0	0	0	0	0	0	0
jtru	1	0	0	0	0	0	0	0	0	0
_Stealthy	0	0	0	0	4	0	0	0	0	0
TheWolfOfWalmart	1	0	0	6	3	0	0	0	0	0
thewoodsiestoak	0	0	0	0	0	0	0	0	0	0
0ysterhead	0	0	0	0	0	0	0	0	0	0
101100101000100101	0	0	0	0	0	0	0	0	0	0
1078Garage	0	0	0	0	0	0	0	0	0	0
12161986	0	0	0	0	0	0	0	0	0	0
1nfinitus	0	0	0	0	0	0	0	0	0	0
2020redditlurker	0	0	0	0	0	0	0	0	0	0
20yroldentrepreneur	0	0	0	0	0	0	0	0	0	0
218-69	0	0	0	0	0	0	0	0	0	0
2LostFlamingos	0	0	0	0	0	0	0	0	0	0
3utt5lut	0	0	0	0	0	0	0	0	0	0
80UNC3EBACK	3	0	0	0	0	0	0	0	0	0
aastha55	1	0	0	2	3	0	0	0	0	1
abesWaves	0	0	0	20	5	0	0	1	0	0
Abject-Government-13	0	0	0	0	0	0	1	0	0	0
ablablababla	0	0	0	0	0	0	0	0	0	0
Abovemeis	0	0	0	0	0	0	0	0	0	0
Accomplished-Disk-68	0	0	0	0	0	0	0	1	0	0
accountnumber569704	0	0	0	0	0	0	0	0	0	0
AdInternational2534	0	0	0	0	0	0	0	0	0	0
adoxxvegas	0	0	0	0	0	0	0	0	0	0
Adventurous_Dingo351	0	0	0	0	0	0	0	0	0	0
AdvisorSecret5301	1	0	0	2	0	0	0	0	0	0
Aerocryptic	1	0	0	0	4	0	0	2	0	0
Affectionate_Hand_76	0	0	0	0	0	0	0	0	0	0
After_Sock_3550	0	0	0	0	0	0	0	0	0	0
aIIstarz	0	0	0	0	2	0	0	1	0	0
akagi_misha	1	0	0	0	0	0	0	0	0	0
aliensmadeus	0	0	0	0	0	0	0	0	0	0
allstater2007	0	0	0	0	0	0	0	0	0	0
AllThingsEvil	0	0	0	0	0	0	0	0	0	0
alltimecards	1	0	0	1	3	0	0	4	0	0
AllwellBeloved	0	0	0	0	0	0	0	0	0	0
Alpha3K	0	0	0	2	0	0	0	0	0	0
Altruistic_Box4462	3	0	0	0	1	0	2	0	0	0
Altruistic_Duck3485	0	0	0	0	1	0	0	0	0	0
AltTimeHigh	0	0	0	0	0	0	0	0	0	0
ambyent	0	0	0	0	0	0	0	0	0	0
andrew7231	0	0	0	0	0	0	0	0	0	0
Andynr	0	0	0	0	0	0	0	0	0	0

Latent Dirichlet Allocation model

set.seed(1234)
reddit.lda <- LDA(reddit.dtm, method = "Gibbs", k = 10, control = list(alpha = 0.1))

terms(reddit.lda, 25)

##       Topic 1       Topic 2   Topic 3    Topic 4                    
##  [1,] "wallet"      "im"      "bull"     "skeptical"                
##  [2,] "vault"       "dont"    "btc"      "reddit"                   
##  [3,] "metamask"    "deleted" "market"   "tongue"                   
##  [4,] "send"        "removed" "crypto"   "gt"                       
##  [5,] "transfer"    "ive"     "etf"      "sticking"                 
##  [6,] "access"      "ill"     "run"      "https"                    
##  [7,] "network"     "didnt"   "bitcoin"  "fu"                       
##  [8,] "reddit"      NA        "bear"     "questions"                
##  [9,] "seed"        "youre"   "halving"  "rcryptocurrency"          
## [10,] "token"       "whats"   "sec"      "moons"                    
## [11,] "type"        "wont"    "approved" "action"                   
## [12,] "address"     "isnt"    "real"     "bot"                      
## [13,] "rugged"      "doesnt"  "percent"  "automatically"            
## [14,] "sell"        "id"      "spot"     "econ"                     
## [15,] "supply"      "dollar"  "news"     "spez"                     
## [16,] "phrase"      "theyre"  "approval" "excuse"                   
## [17,] "nova"        "wasnt"   "cycle"    "contact"                  
## [18,] "amount"      "stupid"  "pump"     "concerns"                 
## [19,] "transaction" "buying"  "list"     "answer"                   
## [20,] "exchange"    "youll"   "trust"    "moderators"               
## [21,] "trade"       "sadly"   "amp"      "performed"                
## [22,] "donuts"      "wow"     "drop"     "subredditmessagecomposeto"
## [23,] "gas"         "cdc"     "months"   "question"                 
## [24,] "whats"       "couldnt" "start"    "message"                  
## [25,] "meme"        "dips"    "waiting"  "found"                    
##       Topic 5     Topic 6     Topic 7     Topic 8       Topic 9    Topic 10   
##  [1,] "reddit"    "people"    "skeptical" "dollar"      "moons"    "world"    
##  [2,] "moons"     "surprise"  "https"     "thousand"    "crypto"   "hopium"   
##  [3,] "people"    "god"       "moons"     "hundred"     "time"     "watching" 
##  [4,] "money"     "lot"       "thousand"  "btc"         "dollar"   "coming"   
##  [5,] "shit"      "changed"   "hundred"   "eth"         "people"   "bro"      
##  [6,] "bitcoin"   "awesome"   "ama"       "hundredths"  "btc"      "live"     
##  [7,] "community" "yep"       "october"   "coinbase"    "laughing" "rest"     
##  [8,] "crypto"    "remindme"  "burned"    "percent"     "buy"      "doge"     
##  [9,] "fuck"      "sense"     "dollar"    "matic"       "percent"  "deep"     
## [10,] "mods"      "stupid"    "banner"    "dot"         "sell"     "coffee"   
## [11,] "contract"  "black"     "continue"  "liquidity"   "price"    "common"   
## [12,] "users"     "tos"       "questions" "ago"         "moon"     "hahaha"   
## [13,] "literally" "invested"  "team"      "bought"      "loud"     "shit"     
## [14,] "smart"     "dex"       "fore"      "loss"        "day"      "horror"   
## [15,] "token"     "straight"  "reddit"    "sol"         "money"    "pretty"   
## [16,] "program"   "paul"      "november"  "usd"         "eth"      "investors"
## [17,] "control"   "questions" "month"     "asset"       "excuse"   "heart"    
## [18,] "hard"      "pretty"    "live"      "usdt"        "thousand" "whale"    
## [19,] "tokens"    "etf"       "daily"     "minus"       "bull"     "bit"      
## [20,] "lost"      "fud"       "mod"       "trade"       "sticking" "holder"   
## [21,] "projects"  "emmer"     "host"      "ada"         "market"   "spent"    
## [22,] "time"      "dash"      "network"   "thousandths" "hope"     "lambo"    
## [23,] "fucking"   "level"     "amas"      "earned"      "tongue"   "born"     
## [24,] "gold"      "closer"    "booked"    "started"     "yeah"     "night"    
## [25,] "lot"       "luck"      "time"      "token"       NA         "retire"

# Tidy the LDA Results
lda_topics <- tidytext::tidy(reddit.lda, matrix = "beta")

# Get the topics
top_terms_table <- lda_topics %>%
  group_by(topic) %>%
  slice_max(beta, n = 10) %>%
  ungroup() %>%
  arrange(topic, desc(beta)) %>%
  select(topic, term, beta)  # Select the columns you want to include

# Get the top terms
top_terms <- top_terms_table %>%
  dplyr::group_by(topic) %>%
  dplyr::slice_max(beta, n = 10) %>%
  dplyr::ungroup() %>%
  dplyr::arrange(topic, -beta)

# Get the posterior terms
redditldaresults <- posterior(reddit.lda)

theta <- redditldaresults$topics
doc_topic_matrix <- as.matrix(theta)
doc_topic_df <- as.data.frame(as.table(as.matrix(theta)))

# Top topics
knitr::kable(as.matrix(doc_topic_df[1:50, 1:3]))

Var1	Var2	Freq
-DapperDuck-	1	0.1952380952
-Monero	1	0.0090909091
[deleted]	1	0.0004716981
*MOM*	1	0.0500000000
Commando	1	0.1375000000
_DeanRiding	1	0.0100000000
_dekappatated	1	0.0034482759
jtru	1	0.0333333333
_Stealthy	1	0.0816091954
TheWolfOfWalmart	1	0.2182051282
thewoodsiestoak	1	0.3666666667
0ysterhead	1	0.2833333333
101100101000100101	1	0.4555555556
1078Garage	1	0.0107843137
12161986	1	0.0050000000
1nfinitus	1	0.0100000000
2020redditlurker	1	0.0125000000
20yroldentrepreneur	1	0.0500000000
218-69	1	0.1833333333
2LostFlamingos	1	0.0200000000
3utt5lut	1	0.4114754098
80UNC3EBACK	1	0.0020833333
aastha55	1	0.0012658228
abesWaves	1	0.0003095975
Abject-Government-13	1	0.0085937500
ablablababla	1	0.0100000000
Abovemeis	1	0.0500000000
Accomplished-Disk-68	1	0.1640000000
accountnumber569704	1	0.0047619048
AdInternational2534	1	0.5250000000
adoxxvegas	1	0.0083333333
Adventurous_Dingo351	1	0.0250000000
AdvisorSecret5301	1	0.0028571429
Aerocryptic	1	0.0057894737
Affectionate_Hand_76	1	0.0022222222
After_Sock_3550	1	0.0035714286
aIIstarz	1	0.0174603175
akagi_misha	1	0.0076923077
aliensmadeus	1	0.0034482759
allstater2007	1	0.0034482759
AllThingsEvil	1	0.3637681159
alltimecards	1	0.0006024096
AllwellBeloved	1	0.5500000000
Alpha3K	1	0.0062500000
Altruistic_Box4462	1	0.1794736842
Altruistic_Duck3485	1	0.2584158416
AltTimeHigh	1	0.0125000000
ambyent	1	0.0045454545
andrew7231	1	0.0083333333
Andynr	1	0.0013333333

Topic model results

top_terms <- top_terms_table %>%
  dplyr::group_by(topic) %>%
  dplyr::slice_max(beta, n = 10) %>%
  dplyr::ungroup() %>%
  dplyr::arrange(topic, -beta)

top_terms %>%
  dplyr::mutate(term = tidytext::reorder_within(term, beta, topic)) %>%
  ggplot(aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  tidytext::scale_y_reordered() +
  labs(title = "Top 10 tokens per topic",
       x = "Proportion",
       y = "Token") +
  theme_minimal()

# Topic Distribution Barplot
barplot(colSums(doc_topic_matrix), xlab = "Topics", ylab = "Number of Comments", main = "Topic Distribution")

# Topic Distribution heatmap
ggplot(doc_topic_df, aes(x = Var2, y = Var1, fill = Freq)) +
  geom_tile() +
  scale_fill_gradient(low = "white", high = "blue") +
  labs(title = "Document-Topic Matrix", x = "Topics", y = "") +  # Removed y-axis label
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1),
        axis.text.y = element_blank())  # Hide y-axis text

# Topic Correlations
rcorr(cor(theta))

##        1     2     3     4     5     6     7     8     9    10
## 1   1.00 -0.08 -0.08  0.00 -0.04 -0.09 -0.05 -0.09 -0.17 -0.07
## 2  -0.08  1.00 -0.02 -0.08 -0.12 -0.08 -0.04 -0.11 -0.13 -0.05
## 3  -0.08 -0.02  1.00 -0.04 -0.06 -0.07 -0.04 -0.05 -0.31 -0.04
## 4   0.00 -0.08 -0.04  1.00  0.05 -0.06  0.01  0.01 -0.28 -0.06
## 5  -0.04 -0.12 -0.06  0.05  1.00 -0.02  0.02 -0.09 -0.48 -0.09
## 6  -0.09 -0.08 -0.07 -0.06 -0.02  1.00  0.01 -0.03 -0.21  0.03
## 7  -0.05 -0.04 -0.04  0.01  0.02  0.01  1.00  0.02 -0.32 -0.02
## 8  -0.09 -0.11 -0.05  0.01 -0.09 -0.03  0.02  1.00 -0.24 -0.07
## 9  -0.17 -0.13 -0.31 -0.28 -0.48 -0.21 -0.32 -0.24  1.00 -0.19
## 10 -0.07 -0.05 -0.04 -0.06 -0.09  0.03 -0.02 -0.07 -0.19  1.00
## 
## n= 10 
## 
## 
## P
##    1      2      3      4      5      6      7      8      9      10    
## 1         0.8162 0.8225 0.9908 0.9035 0.8151 0.8893 0.7989 0.6455 0.8379
## 2  0.8162        0.9594 0.8353 0.7386 0.8224 0.9161 0.7581 0.7190 0.8854
## 3  0.8225 0.9594        0.9063 0.8651 0.8413 0.9178 0.8956 0.3911 0.9174
## 4  0.9908 0.8353 0.9063        0.8855 0.8718 0.9856 0.9705 0.4375 0.8695
## 5  0.9035 0.7386 0.8651 0.8855        0.9526 0.9508 0.8009 0.1601 0.7968
## 6  0.8151 0.8224 0.8413 0.8718 0.9526        0.9741 0.9259 0.5570 0.9375
## 7  0.8893 0.9161 0.9178 0.9856 0.9508 0.9741        0.9454 0.3705 0.9634
## 8  0.7989 0.7581 0.8956 0.9705 0.8009 0.9259 0.9454        0.4960 0.8549
## 9  0.6455 0.7190 0.3911 0.4375 0.1601 0.5570 0.3705 0.4960        0.5986
## 10 0.8379 0.8854 0.9174 0.8695 0.7968 0.9375 0.9634 0.8549 0.5986

DSO2008 2023-2024 Working with Big Data

Eliyahu V. Sapir

28 November 2023

Topic Modeling

Housekeeping

Loading needed libraries

Read Reddit data from a CSV file

Unnesting tokens

TF-IDF

Document-Term Matrix

Latent Dirichlet Allocation model

Topic model results