\[\\[0.2in]\]
\[\\[0.1in]\]
library(dplyr)
library(ggplot2)
library(wordcloud2)
library(tm)
library(topicmodels)
library(tidytext)
library(Hmisc)
\[\\[0.1in]\]
reddit <- read.csv("tut1b.reddit.csv", header = TRUE)
\[\\[0.001in]\]
\[\\[0.01in]\]
reddit.tm <- tidytext::unnest_tokens(reddit, word, cleaner_text)
word.freq <- reddit.tm %>%
dplyr::group_by(author) %>%
dplyr::count(word, sort = TRUE)
word.freq %>%
dplyr::ungroup() %>%
dplyr::count(word, sort = T) %>%
filter(!is.na(word)) %>%
top_n(25)%>%
ggplot(aes(x= reorder(word, n), y= n)) +
geom_bar(stat ="identity", fill= "darkorange")+
coord_flip()+
theme_classic()+
labs(title="Top words (Reddit corpus)",
x = "Word",
y = "Frequency")
tfidf <- word.freq %>%
tidytext::bind_tf_idf(word, author, n)
psych::describe(tfidf[3:6])
## vars n mean sd median trimmed mad min max range skew kurtosis
## n 1 41308 1.29 1.17 1.00 1.07 0.00 1.00 81.00 80.00 18.86 807.05
## tf 2 41308 0.02 0.05 0.01 0.01 0.01 0.00 1.00 1.00 8.54 115.98
## idf 3 41165 4.23 1.57 4.06 4.21 1.77 1.02 6.89 5.87 0.12 -0.91
## tf_idf 4 41165 0.09 0.23 0.03 0.05 0.03 0.00 6.89 6.89 10.63 195.29
## se
## n 0.01
## tf 0.00
## idf 0.01
## tf_idf 0.00
# Create Document-Term Matrix
reddit.dtm <- reddit.tm %>%
dplyr::count(author, word) %>%
tidytext::cast_dtm(author, word, n)
dtm_matrix <- as.matrix(reddit.dtm)
knitr::kable(as.data.frame(as.matrix(reddit.dtm[1:50, 1:10])))
| buy | confusing | cover | dollar | eth | foolishly | gas | im | messing | seventeen | |
|---|---|---|---|---|---|---|---|---|---|---|
| -DapperDuck- | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
| -Monero | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| [deleted] | 0 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 |
| MOM | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Commando | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| _DeanRiding | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| _dekappatated | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| jtru | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| _Stealthy | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 0 |
| TheWolfOfWalmart | 1 | 0 | 0 | 6 | 3 | 0 | 0 | 0 | 0 | 0 |
| thewoodsiestoak | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 0ysterhead | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 101100101000100101 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1078Garage | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 12161986 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1nfinitus | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2020redditlurker | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 20yroldentrepreneur | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 218-69 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2LostFlamingos | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3utt5lut | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 80UNC3EBACK | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| aastha55 | 1 | 0 | 0 | 2 | 3 | 0 | 0 | 0 | 0 | 1 |
| abesWaves | 0 | 0 | 0 | 20 | 5 | 0 | 0 | 1 | 0 | 0 |
| Abject-Government-13 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| ablablababla | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Abovemeis | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Accomplished-Disk-68 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| accountnumber569704 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| AdInternational2534 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| adoxxvegas | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Adventurous_Dingo351 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| AdvisorSecret5301 | 1 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| Aerocryptic | 1 | 0 | 0 | 0 | 4 | 0 | 0 | 2 | 0 | 0 |
| Affectionate_Hand_76 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| After_Sock_3550 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| aIIstarz | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 1 | 0 | 0 |
| akagi_misha | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| aliensmadeus | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| allstater2007 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| AllThingsEvil | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| alltimecards | 1 | 0 | 0 | 1 | 3 | 0 | 0 | 4 | 0 | 0 |
| AllwellBeloved | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Alpha3K | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| Altruistic_Box4462 | 3 | 0 | 0 | 0 | 1 | 0 | 2 | 0 | 0 | 0 |
| Altruistic_Duck3485 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| AltTimeHigh | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ambyent | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| andrew7231 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Andynr | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
set.seed(1234)
reddit.lda <- LDA(reddit.dtm, method = "Gibbs", k = 10, control = list(alpha = 0.1))
terms(reddit.lda, 25)
## Topic 1 Topic 2 Topic 3 Topic 4
## [1,] "wallet" "im" "bull" "skeptical"
## [2,] "vault" "dont" "btc" "reddit"
## [3,] "metamask" "deleted" "market" "tongue"
## [4,] "send" "removed" "crypto" "gt"
## [5,] "transfer" "ive" "etf" "sticking"
## [6,] "access" "ill" "run" "https"
## [7,] "network" "didnt" "bitcoin" "fu"
## [8,] "reddit" NA "bear" "questions"
## [9,] "seed" "youre" "halving" "rcryptocurrency"
## [10,] "token" "whats" "sec" "moons"
## [11,] "type" "wont" "approved" "action"
## [12,] "address" "isnt" "real" "bot"
## [13,] "rugged" "doesnt" "percent" "automatically"
## [14,] "sell" "id" "spot" "econ"
## [15,] "supply" "dollar" "news" "spez"
## [16,] "phrase" "theyre" "approval" "excuse"
## [17,] "nova" "wasnt" "cycle" "contact"
## [18,] "amount" "stupid" "pump" "concerns"
## [19,] "transaction" "buying" "list" "answer"
## [20,] "exchange" "youll" "trust" "moderators"
## [21,] "trade" "sadly" "amp" "performed"
## [22,] "donuts" "wow" "drop" "subredditmessagecomposeto"
## [23,] "gas" "cdc" "months" "question"
## [24,] "whats" "couldnt" "start" "message"
## [25,] "meme" "dips" "waiting" "found"
## Topic 5 Topic 6 Topic 7 Topic 8 Topic 9 Topic 10
## [1,] "reddit" "people" "skeptical" "dollar" "moons" "world"
## [2,] "moons" "surprise" "https" "thousand" "crypto" "hopium"
## [3,] "people" "god" "moons" "hundred" "time" "watching"
## [4,] "money" "lot" "thousand" "btc" "dollar" "coming"
## [5,] "shit" "changed" "hundred" "eth" "people" "bro"
## [6,] "bitcoin" "awesome" "ama" "hundredths" "btc" "live"
## [7,] "community" "yep" "october" "coinbase" "laughing" "rest"
## [8,] "crypto" "remindme" "burned" "percent" "buy" "doge"
## [9,] "fuck" "sense" "dollar" "matic" "percent" "deep"
## [10,] "mods" "stupid" "banner" "dot" "sell" "coffee"
## [11,] "contract" "black" "continue" "liquidity" "price" "common"
## [12,] "users" "tos" "questions" "ago" "moon" "hahaha"
## [13,] "literally" "invested" "team" "bought" "loud" "shit"
## [14,] "smart" "dex" "fore" "loss" "day" "horror"
## [15,] "token" "straight" "reddit" "sol" "money" "pretty"
## [16,] "program" "paul" "november" "usd" "eth" "investors"
## [17,] "control" "questions" "month" "asset" "excuse" "heart"
## [18,] "hard" "pretty" "live" "usdt" "thousand" "whale"
## [19,] "tokens" "etf" "daily" "minus" "bull" "bit"
## [20,] "lost" "fud" "mod" "trade" "sticking" "holder"
## [21,] "projects" "emmer" "host" "ada" "market" "spent"
## [22,] "time" "dash" "network" "thousandths" "hope" "lambo"
## [23,] "fucking" "level" "amas" "earned" "tongue" "born"
## [24,] "gold" "closer" "booked" "started" "yeah" "night"
## [25,] "lot" "luck" "time" "token" NA "retire"
# Tidy the LDA Results
lda_topics <- tidytext::tidy(reddit.lda, matrix = "beta")
# Get the topics
top_terms_table <- lda_topics %>%
group_by(topic) %>%
slice_max(beta, n = 10) %>%
ungroup() %>%
arrange(topic, desc(beta)) %>%
select(topic, term, beta) # Select the columns you want to include
# Get the top terms
top_terms <- top_terms_table %>%
dplyr::group_by(topic) %>%
dplyr::slice_max(beta, n = 10) %>%
dplyr::ungroup() %>%
dplyr::arrange(topic, -beta)
# Get the posterior terms
redditldaresults <- posterior(reddit.lda)
theta <- redditldaresults$topics
doc_topic_matrix <- as.matrix(theta)
doc_topic_df <- as.data.frame(as.table(as.matrix(theta)))
# Top topics
knitr::kable(as.matrix(doc_topic_df[1:50, 1:3]))
| Var1 | Var2 | Freq |
|---|---|---|
| -DapperDuck- | 1 | 0.1952380952 |
| -Monero | 1 | 0.0090909091 |
| [deleted] | 1 | 0.0004716981 |
| MOM | 1 | 0.0500000000 |
| Commando | 1 | 0.1375000000 |
| _DeanRiding | 1 | 0.0100000000 |
| _dekappatated | 1 | 0.0034482759 |
| jtru | 1 | 0.0333333333 |
| _Stealthy | 1 | 0.0816091954 |
| TheWolfOfWalmart | 1 | 0.2182051282 |
| thewoodsiestoak | 1 | 0.3666666667 |
| 0ysterhead | 1 | 0.2833333333 |
| 101100101000100101 | 1 | 0.4555555556 |
| 1078Garage | 1 | 0.0107843137 |
| 12161986 | 1 | 0.0050000000 |
| 1nfinitus | 1 | 0.0100000000 |
| 2020redditlurker | 1 | 0.0125000000 |
| 20yroldentrepreneur | 1 | 0.0500000000 |
| 218-69 | 1 | 0.1833333333 |
| 2LostFlamingos | 1 | 0.0200000000 |
| 3utt5lut | 1 | 0.4114754098 |
| 80UNC3EBACK | 1 | 0.0020833333 |
| aastha55 | 1 | 0.0012658228 |
| abesWaves | 1 | 0.0003095975 |
| Abject-Government-13 | 1 | 0.0085937500 |
| ablablababla | 1 | 0.0100000000 |
| Abovemeis | 1 | 0.0500000000 |
| Accomplished-Disk-68 | 1 | 0.1640000000 |
| accountnumber569704 | 1 | 0.0047619048 |
| AdInternational2534 | 1 | 0.5250000000 |
| adoxxvegas | 1 | 0.0083333333 |
| Adventurous_Dingo351 | 1 | 0.0250000000 |
| AdvisorSecret5301 | 1 | 0.0028571429 |
| Aerocryptic | 1 | 0.0057894737 |
| Affectionate_Hand_76 | 1 | 0.0022222222 |
| After_Sock_3550 | 1 | 0.0035714286 |
| aIIstarz | 1 | 0.0174603175 |
| akagi_misha | 1 | 0.0076923077 |
| aliensmadeus | 1 | 0.0034482759 |
| allstater2007 | 1 | 0.0034482759 |
| AllThingsEvil | 1 | 0.3637681159 |
| alltimecards | 1 | 0.0006024096 |
| AllwellBeloved | 1 | 0.5500000000 |
| Alpha3K | 1 | 0.0062500000 |
| Altruistic_Box4462 | 1 | 0.1794736842 |
| Altruistic_Duck3485 | 1 | 0.2584158416 |
| AltTimeHigh | 1 | 0.0125000000 |
| ambyent | 1 | 0.0045454545 |
| andrew7231 | 1 | 0.0083333333 |
| Andynr | 1 | 0.0013333333 |
top_terms <- top_terms_table %>%
dplyr::group_by(topic) %>%
dplyr::slice_max(beta, n = 10) %>%
dplyr::ungroup() %>%
dplyr::arrange(topic, -beta)
top_terms %>%
dplyr::mutate(term = tidytext::reorder_within(term, beta, topic)) %>%
ggplot(aes(beta, term, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
tidytext::scale_y_reordered() +
labs(title = "Top 10 tokens per topic",
x = "Proportion",
y = "Token") +
theme_minimal()
# Topic Distribution Barplot
barplot(colSums(doc_topic_matrix), xlab = "Topics", ylab = "Number of Comments", main = "Topic Distribution")
# Topic Distribution heatmap
ggplot(doc_topic_df, aes(x = Var2, y = Var1, fill = Freq)) +
geom_tile() +
scale_fill_gradient(low = "white", high = "blue") +
labs(title = "Document-Topic Matrix", x = "Topics", y = "") + # Removed y-axis label
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
axis.text.y = element_blank()) # Hide y-axis text
# Topic Correlations
rcorr(cor(theta))
## 1 2 3 4 5 6 7 8 9 10
## 1 1.00 -0.08 -0.08 0.00 -0.04 -0.09 -0.05 -0.09 -0.17 -0.07
## 2 -0.08 1.00 -0.02 -0.08 -0.12 -0.08 -0.04 -0.11 -0.13 -0.05
## 3 -0.08 -0.02 1.00 -0.04 -0.06 -0.07 -0.04 -0.05 -0.31 -0.04
## 4 0.00 -0.08 -0.04 1.00 0.05 -0.06 0.01 0.01 -0.28 -0.06
## 5 -0.04 -0.12 -0.06 0.05 1.00 -0.02 0.02 -0.09 -0.48 -0.09
## 6 -0.09 -0.08 -0.07 -0.06 -0.02 1.00 0.01 -0.03 -0.21 0.03
## 7 -0.05 -0.04 -0.04 0.01 0.02 0.01 1.00 0.02 -0.32 -0.02
## 8 -0.09 -0.11 -0.05 0.01 -0.09 -0.03 0.02 1.00 -0.24 -0.07
## 9 -0.17 -0.13 -0.31 -0.28 -0.48 -0.21 -0.32 -0.24 1.00 -0.19
## 10 -0.07 -0.05 -0.04 -0.06 -0.09 0.03 -0.02 -0.07 -0.19 1.00
##
## n= 10
##
##
## P
## 1 2 3 4 5 6 7 8 9 10
## 1 0.8162 0.8225 0.9908 0.9035 0.8151 0.8893 0.7989 0.6455 0.8379
## 2 0.8162 0.9594 0.8353 0.7386 0.8224 0.9161 0.7581 0.7190 0.8854
## 3 0.8225 0.9594 0.9063 0.8651 0.8413 0.9178 0.8956 0.3911 0.9174
## 4 0.9908 0.8353 0.9063 0.8855 0.8718 0.9856 0.9705 0.4375 0.8695
## 5 0.9035 0.7386 0.8651 0.8855 0.9526 0.9508 0.8009 0.1601 0.7968
## 6 0.8151 0.8224 0.8413 0.8718 0.9526 0.9741 0.9259 0.5570 0.9375
## 7 0.8893 0.9161 0.9178 0.9856 0.9508 0.9741 0.9454 0.3705 0.9634
## 8 0.7989 0.7581 0.8956 0.9705 0.8009 0.9259 0.9454 0.4960 0.8549
## 9 0.6455 0.7190 0.3911 0.4375 0.1601 0.5570 0.3705 0.4960 0.5986
## 10 0.8379 0.8854 0.9174 0.8695 0.7968 0.9375 0.9634 0.8549 0.5986