1. Create Dataframe from CSV file
posts_df <- readtext("Jan_2018_FINAL.csv", text_field = "final_document")
posts_df$year <- substr(posts_df$date, 1, 4)
posts_df$year <- as.numeric(posts_df$year)
posts_df$date <- as.Date(posts_df$date, format = "%Y-%m-%d")
posts_df <- posts_df %>%
mutate(score_analysis = case_when(
(posts_df$score<300) ~ "low",
(posts_df$score>=300 & posts_df$score <= 600) ~ "medium",
(posts_df$score >600) ~ "high"
))
posts_df <- posts_df %>%
mutate(date_analysis = case_when(
(posts_df$date<='2018-01-08') ~ "jan_one",
(posts_df$date>'2018-01-08' & posts_df$date <= '2018-01-16') ~ "jan_two",
(posts_df$date>'2018-01-16' & posts_df$date <= '2018-01-24') ~ "jan_thr",
(posts_df$date>'2018-01-24') ~ "jan_fou",
))
glimpse(posts_df)
## Rows: 245
## Columns: 10
## $ doc_id <chr> "Jan_2018_FINAL.csv.1", "Jan_2018_FINAL.csv.2", …
## $ text <chr> "Robinhood saved me over $8k in fees! What was …
## $ date <date> 2018-01-01, 2018-01-01, 2018-01-01, 2018-01-02,…
## $ score <int> 1780, 394, 2016, 202, 106, 134, 155, 757, 477, 4…
## $ up_ratio <dbl> 0.95, 0.96, 0.93, 0.97, 0.93, 0.94, 0.94, 0.94, …
## $ total_awards_received <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ comments <int> 118, 97, 238, 52, 4, 47, 47, 43, 99, 31, 43, 31,…
## $ year <dbl> 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018, …
## $ score_analysis <chr> "high", "medium", "high", "low", "low", "low", "…
## $ date_analysis <chr> "jan_one", "jan_one", "jan_one", "jan_one", "jan…
2. Create Corpus with document level variables
posts_corp <- corpus(posts_df)
summary(posts_corp,5)
## Corpus consisting of 245 documents, showing 5 documents:
##
## Text Types Tokens Sentences date score up_ratio
## Jan_2018_FINAL.csv.1 668 1824 45 2018-01-01 1780 0.95
## Jan_2018_FINAL.csv.2 824 2480 65 2018-01-01 394 0.96
## Jan_2018_FINAL.csv.3 1408 5644 189 2018-01-01 2016 0.93
## Jan_2018_FINAL.csv.4 15 15 1 2018-01-02 202 0.97
## Jan_2018_FINAL.csv.5 13 14 1 2018-01-02 106 0.93
## total_awards_received comments year score_analysis date_analysis
## 0 118 2018 high jan_one
## 0 97 2018 medium jan_one
## 0 238 2018 high jan_one
## 0 52 2018 low jan_one
## 0 4 2018 low jan_one
3. Generate Tokens
myStopWords = c("one", "two", "three", "first", "second", "third","gets","faggot",
"may", "also", "can", "whether","gt","make","good","well","take",
"just", "like", "got","will","lol","right","fucking","actually","wsb",
"A","at", "but","For", "in", "for","how","How","several","man",
"Tue", "but", "doesnt", "from", "The","From","see","much","people",
"like", "just", "shit", "fuck","can","spx","still","go","say","went",
"have", "been", "has", "than","with","To","even","now","since","better",
"use","who","of","to","show","and","look","guy","made","great","someone",
"include", "includes", "including", "included","though","thing","post",
"uses", "using", "used", "comprises","something","probably","yes","don",
"on","said","were","by","that","is","sure","really","way","new","feel",
"as","was","an","it","which","its","think","sub","yeah","need","want","cramer",
"had","are","they","he","be","us","na","back","already","around","going",
"get", "NA", "know", "think","deleted","everyone","gets","many","makes","cramer's")
posts_toks <- tokens(posts_corp, remove_punct = TRUE, remove_numbers = TRUE,
remove_symbols = TRUE, remove_url = TRUE, split_hyphens = FALSE) %>%
tokens_remove(pattern = c(stopwords("en"), myStopWords)) %>%
tokens_keep(min_nchar = 2)
#Create compound tokens
toks_nostop_col <- textstat_collocations(posts_toks, min_count = 10)
head(toks_nostop_col, 5)
## collocation count count_nested length lambda z
## 1 last year 56 0 2 4.882448 28.40296
## 2 percent change 37 0 2 7.497311 25.24040
## 3 date change 32 0 2 6.698862 25.18586
## 4 nbsp bot 28 0 2 7.136430 24.36797
## 5 __year's low 31 0 2 6.610580 24.24902
posts_toks_comp <- tokens_compound(
posts_toks, pattern = toks_nostop_col[toks_nostop_col$z > 10])
head(posts_toks_comp, 5)
## Tokens consisting of 5 documents and 8 docvars.
## Jan_2018_FINAL.csv.1 :
## [1] "Robinhood" "saved" "8k" "fees" "riding" "short"
## [7] "bus" "kid" "jelly" "cause" "couldnt" "ride"
## [ ... and 570 more ]
##
## Jan_2018_FINAL.csv.2 :
## [1] "Saved" "commissions" "build" "SP500" "ETF"
## [6] "hand" "kinda" "robinhood" "portfolio" "gives"
## [11] "dividend" "every_single"
## [ ... and 835 more ]
##
## Jan_2018_FINAL.csv.3 :
## [1] "added" "every" "paycheck" "little" "years" "buy"
## [7] "Toyota" "Camry" "Best" "car" "miles" "per"
## [ ... and 1,723 more ]
##
## Jan_2018_FINAL.csv.4 :
## [1] "Best" "Worst" "amp" "stocks"
##
## Jan_2018_FINAL.csv.5 :
## [1] "cellphone" "choice" "autist" "making" "trades"
4. Generate document feature matrix
posts_dfmat <- dfm(posts_toks_comp, tolower = TRUE) %>%
dfm_trim(min_termfreq = 5, min_docfreq = 10)
posts_dfmat
## Document-feature matrix of: 245 documents, 1,067 features (91.39% sparse) and 8 docvars.
## features
## docs robinhood fees short kid cause ride planning rope anyone
## Jan_2018_FINAL.csv.1 1 1 6 1 2 1 1 2 2
## Jan_2018_FINAL.csv.2 4 2 1 1 0 1 0 0 1
## Jan_2018_FINAL.csv.3 14 0 1 1 1 0 0 1 3
## Jan_2018_FINAL.csv.4 0 0 0 0 0 0 0 0 0
## Jan_2018_FINAL.csv.5 0 0 0 0 0 0 0 0 0
## Jan_2018_FINAL.csv.6 0 0 0 0 0 0 0 0 0
## features
## docs pretty
## Jan_2018_FINAL.csv.1 1
## Jan_2018_FINAL.csv.2 2
## Jan_2018_FINAL.csv.3 1
## Jan_2018_FINAL.csv.4 0
## Jan_2018_FINAL.csv.5 0
## Jan_2018_FINAL.csv.6 0
## [ reached max_ndoc ... 239 more documents, reached max_nfeat ... 1,057 more features ]
#See top 30 features
topfeatures(posts_dfmat, 30)
## buy money time amd sell options market stock
## 460 456 325 295 285 271 264 263
## bought calls long day price year short gains
## 223 217 211 206 203 192 159 153
## mu put stocks never company trading today pretty
## 153 151 149 147 146 145 144 141
## earnings buying sold call real work
## 138 138 138 136 133 131
5. Generate Relative Frequency analysis
posts_tstat_key <- textstat_keyness(
posts_dfmat, target = posts_dfmat$score <= 500)
textplot_keyness(posts_tstat_key, n=10, min_count = 10)

5a. Identify number of topics
#Convert document-feature matrix to document-term representations used in topicmodels
posts_dtmat = quanteda::convert(posts_dfmat, to="topicmodels")
posts_dtmat
## <<DocumentTermMatrix (documents: 238, terms: 1067)>>
## Non-/sparse entries: 22511/231435
## Sparsity : 91%
## Maximal term length: 17
## Weighting : term frequency (tf)
train_posts_dtmat <- corpus_subset(posts_corp)[1:125,] %>%
tokens(remove_punct = TRUE, remove_numbers = TRUE,
remove_symbols = TRUE, remove_url = TRUE) %>%
dfm(tolower = TRUE) %>%
dfm_remove(myStopWords) %>%
dfm_trim(min_termfreq = 5, min_docfreq = 10) %>%
quanteda::convert(to="topicmodels")
test_posts_dtmat <- corpus_subset(posts_corp)[126:245,] %>%
tokens(remove_punct = TRUE, remove_numbers = TRUE,
remove_symbols = TRUE, remove_url = TRUE) %>%
dfm(tolower = TRUE) %>%
dfm_remove(myStopWords) %>%
dfm_trim(min_termfreq = 5, min_docfreq = 10) %>%
quanteda::convert(to="topicmodels")
train_posts_lda5 <- LDA(train_posts_dtmat, k = 5, control = list(seed = 123))
perplexity(train_posts_lda5, test_posts_dtmat) # perplexity when k = 5
## [1] 250.8326
n_topics_vec = 2:5 # try different num of topics: 2, 3, 4, 5
# this analysis will take a while
perplexity_vec = map_dbl(n_topics_vec, function(kk) {
message(kk)
train_posts_ldaK <- LDA(train_posts_dtmat, k = kk, control = list(seed = 123))
perp = perplexity(train_posts_ldaK, test_posts_dtmat)
})
## 2
## 3
## 4
## 5
lda_perplexity_result = tibble(
n_topics = n_topics_vec, perplexity = perplexity_vec
)
plot(lda_perplexity_result, type="l")
lda_ldatuning_result <- FindTopicsNumber(
posts_dtmat, topics = n_topics_vec,
metrics = c("CaoJuan2009", "Arun2010", "Deveaud2014"),
method = "VEM", control = list(seed = 123), mc.cores = 4L, verbose = TRUE
)
## fit models... done.
## calculate metrics:
## CaoJuan2009... done.
## Arun2010... done.
## Deveaud2014... done.
FindTopicsNumber_plot(lda_ldatuning_result)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.


6. Visualize topics as word cloud
K = 5 #number of topics
posts_lda5 <- LDA(posts_dtmat, k = K, control = list(seed = 123))
topicmodels::terms(posts_lda5, 10)
## Topic 1 Topic 2 Topic 3 Topic 4 Topic 5
## [1,] "company" "long" "amd" "options" "buy"
## [2,] "buy" "money" "intel" "money" "sell"
## [3,] "money" "short" "buy" "price" "market"
## [4,] "day" "time" "earnings" "buy" "day"
## [5,] "time" "year" "calls" "put" "money"
## [6,] "stock" "years" "bought" "call" "options"
## [7,] "market" "never" "time" "stock" "calls"
## [8,] "news" "point" "mu" "option" "gains"
## [9,] "shop" "real" "puts" "time" "bought"
## [10,] "cash" "true" "stock" "sell" "stocks"
tmResult <- posterior(posts_lda5)
top5termsPerTopic <- topicmodels::terms(posts_lda5, 5)
topicNames <- apply(top5termsPerTopic, 2, paste, collapse=" ")
# visualize topics as word cloud
topicToViz <- 11 # change for your own topic of interest
topicToViz <- grep('intel', topicNames)[1] # Or select a topic by a term contained in its name
# select to 40 most probable terms from the topic by sorting the term-topic-probability vector in decreasing order
top40terms <- sort(tmResult$terms[topicToViz,], decreasing=TRUE)[1:40]
words <- names(top40terms)
# extract the probabilites of each of the 40 terms
probabilities <- sort(tmResult$terms[topicToViz,], decreasing=TRUE)[1:40]
# visualize the terms as wordcloud
mycolors <- brewer.pal(8, "Dark2")
wordcloud(words, probabilities, random.order = FALSE, color = mycolors)

7. Visualize top terms in topics
posts_lda5_betas <- broom::tidy(posts_lda5)
top_terms_in_topics <- posts_lda5_betas %>%
group_by(topic) %>%
top_n(5, beta) %>%
ungroup() %>%
arrange(topic, -beta)
top_terms_in_topics %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip()

8. Advanced Topic Modeling
8.1. Correlated Topic Model (CTM)
stm_posts_dfmat <- quanteda::convert(posts_dfmat, to = "stm")
## Warning in dfm2stm(x, docvars, omit_empty = TRUE): Dropped empty document(s):
## Jan_2018_FINAL.csv.20, Jan_2018_FINAL.csv.25, Jan_2018_FINAL.csv.51,
## Jan_2018_FINAL.csv.76, Jan_2018_FINAL.csv.82, Jan_2018_FINAL.csv.128,
## Jan_2018_FINAL.csv.173
out <- prepDocuments(
stm_posts_dfmat$documents,
stm_posts_dfmat$vocab,
stm_posts_dfmat$meta
)
str(out, max.level = 1)
## List of 7
## $ documents :List of 238
## $ vocab : chr [1:1067] "10k" "1k" "20k" "2k" ...
## $ meta :'data.frame': 238 obs. of 8 variables:
## $ words.removed : chr(0)
## $ docs.removed : NULL
## $ tokens.removed: int 0
## $ wordcounts : int [1:1067] 11 11 12 11 11 12 29 13 18 11 ...
K=5
posts_tmod_ctm <- stm(out$documents, out$vocab, K = K,
seed = 123, emtol = 1e-3, max.em.its = 150)
## Beginning Spectral Initialization
## Calculating the gram matrix...
## Finding anchor words...
## .....
## Recovering initialization...
## ..........
## Initialization complete.
## .......................................................................................................................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 1 (approx. per word bound = -6.665)
## .......................................................................................................................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 2 (approx. per word bound = -6.653, relative change = 1.842e-03)
## .......................................................................................................................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 3 (approx. per word bound = -6.646, relative change = 1.065e-03)
## .......................................................................................................................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Model Converged
plot(posts_tmod_ctm)

8.1.1 Plots - CTM
#1. summary - plots topic proportions and names.
plot(posts_tmod_ctm, type = "summary", n = 5,
labeltype = "frex", main = "Reddit Stock Topics", text.cex = 0.8)

#2. labels - plots the top words for a specific topic.
plot(posts_tmod_ctm, type="labels", topics=c(1,2), n=4)

#3. perspectives - compares two topics' words.
plot(posts_tmod_ctm, type="perspectives", topics=c(1,2))

#4. hist - a histogram of the expected topic proportions across documents for a topic.
plot(posts_tmod_ctm, type="hist")

8.1.2 Semantic Coherence & Exclusivity (Two ways of measuring topic “interpretability”)
topicQuality(posts_tmod_ctm, out$documents)
## [1] -30.01094 -28.36449 -32.61878 -35.74098 -30.38467
## [1] 9.483095 9.274230 9.143238 9.306454 9.199831

8.2 Structural Topic Model (STM)
names(docvars(posts_corp))
## [1] "date" "score" "up_ratio"
## [4] "total_awards_received" "comments" "year"
## [7] "score_analysis" "date_analysis"
stm_posts_dfmat <- quanteda::convert(posts_dfmat, to = "stm")
## Warning in dfm2stm(x, docvars, omit_empty = TRUE): Dropped empty document(s):
## Jan_2018_FINAL.csv.20, Jan_2018_FINAL.csv.25, Jan_2018_FINAL.csv.51,
## Jan_2018_FINAL.csv.76, Jan_2018_FINAL.csv.82, Jan_2018_FINAL.csv.128,
## Jan_2018_FINAL.csv.173
out <- prepDocuments(
stm_posts_dfmat$documents, stm_posts_dfmat$vocab, stm_posts_dfmat$meta)
8.2.1 Topical prevalence and topical content.
# In STM, metadata can be entered in the topic model in two ways: topical prevalence and topical content.
posts_tmob_stm <- stm(
out$documents, out$vocab, K=K,
prevalence=~date_analysis+score_analysis+s(comments), #for topical prevalence; s() = b-spline
data=out$meta, #for topical content
init.type="Spectral", seed=123)
## Beginning Spectral Initialization
## Calculating the gram matrix...
## Finding anchor words...
## .....
## Recovering initialization...
## ..........
## Initialization complete.
## .......................................................................................................................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 1 (approx. per word bound = -6.665)
## .......................................................................................................................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 2 (approx. per word bound = -6.652, relative change = 1.879e-03)
## .......................................................................................................................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 3 (approx. per word bound = -6.645, relative change = 1.067e-03)
## .......................................................................................................................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 4 (approx. per word bound = -6.641, relative change = 6.509e-04)
## .......................................................................................................................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 5 (approx. per word bound = -6.638, relative change = 4.336e-04)
## Topic 1: sell, options, trading, earnings, call
## Topic 2: money, buy, time, stock, calls
## Topic 3: day, short, stocks, company, every
## Topic 4: market, price, year, gains, put
## Topic 5: amd, bought, mu, pretty, real
## .......................................................................................................................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 6 (approx. per word bound = -6.636, relative change = 3.021e-04)
## .......................................................................................................................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 7 (approx. per word bound = -6.635, relative change = 2.137e-04)
## .......................................................................................................................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 8 (approx. per word bound = -6.634, relative change = 1.521e-04)
## .......................................................................................................................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 9 (approx. per word bound = -6.633, relative change = 1.091e-04)
## .......................................................................................................................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 10 (approx. per word bound = -6.632, relative change = 7.918e-05)
## Topic 1: sell, options, trading, earnings, call
## Topic 2: money, buy, time, stock, calls
## Topic 3: day, short, stocks, company, every
## Topic 4: market, price, year, gains, put
## Topic 5: amd, bought, mu, pretty, real
## .......................................................................................................................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 11 (approx. per word bound = -6.632, relative change = 5.832e-05)
## .......................................................................................................................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 12 (approx. per word bound = -6.632, relative change = 4.389e-05)
## .......................................................................................................................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 13 (approx. per word bound = -6.632, relative change = 3.351e-05)
## .......................................................................................................................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 14 (approx. per word bound = -6.631, relative change = 2.619e-05)
## .......................................................................................................................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 15 (approx. per word bound = -6.631, relative change = 2.085e-05)
## Topic 1: sell, options, trading, earnings, call
## Topic 2: money, buy, time, stock, calls
## Topic 3: day, short, stocks, company, every
## Topic 4: market, price, year, gains, put
## Topic 5: amd, bought, mu, pretty, real
## .......................................................................................................................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 16 (approx. per word bound = -6.631, relative change = 1.692e-05)
## .......................................................................................................................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 17 (approx. per word bound = -6.631, relative change = 1.407e-05)
## .......................................................................................................................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 18 (approx. per word bound = -6.631, relative change = 1.191e-05)
## .......................................................................................................................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Completing Iteration 19 (approx. per word bound = -6.631, relative change = 1.025e-05)
## .......................................................................................................................
## Completed E-Step (0 seconds).
## Completed M-Step.
## Model Converged
8.2.2 Explore STM results using the LDAvis package.
toLDAvis(mod=posts_tmob_stm, docs=out$documents)
## Loading required namespace: servr
8.2.3 Summary plot
plot(posts_tmob_stm, type="summary", n=5)

plot(posts_tmob_stm, type="perspectives", topics=c(1,2))

8.2.4 Visualize top words in each topic
posts_tmob_stm_beta <- tidytext::tidy(posts_tmob_stm)
posts_tmob_stm_beta %>% group_by(topic) %>% top_n(10, beta) %>% ungroup() %>%
mutate(topic = paste0("Topic ", topic),
term = reorder_within(term, beta, topic)) %>%
ggplot(aes(term, beta, fill = as.factor(topic))) +
geom_col(alpha = 0.8, show.legend = FALSE) +
facet_wrap(~topic, scales="free_y") + coord_flip() + scale_x_reordered()

8.2.5 Comparison b/w CTM and STM
topicQuality(posts_tmod_ctm, out$documents)
## [1] -30.01094 -28.36449 -32.61878 -35.74098 -30.38467
## [1] 9.483095 9.274230 9.143238 9.306454 9.199831

topicQuality(posts_tmob_stm, out$documents)
## [1] -30.01094 -28.36449 -32.61878 -35.74098 -30.38467
## [1] 9.499497 9.269133 9.133405 9.330438 9.211293

8.3 Keyword Assisted Topic Model
keyATM_docs <- keyATM_read(texts = posts_dfmat)
## Using quanteda dfm.
## Warning in get_doc_index(W_raw, check = TRUE): Number of documents with 0 length: 7
## This may cause invalid covariates or time index.
## Please review the preprocessing steps.
## Document index to check: 20, 25, 51, 76, 82, 128, 173
summary(keyATM_docs)
## keyATM_docs object of: 245 documents.
## Length of documents:
## Avg: 160.812
## Min: 0
## Max: 1519
## SD: 284.391
## Number of unique words: 1521
8.3.1 Keywords
posts_key_list = list(
top_stocks = c("amd", "intel", "apple","microsoft","amazon","amzn"),
options = c("gain", "loss", "options", "profit","short","calls","put","call","puts"),
stock_market = c("stock","short","share","price","people","market","money","know"),
buy_sell = c("buy","dip","sell","order","bought","buying"),
hold_position = c("hold","line"),
trading_apps = c("robinhood","account","app","trade","webull","trading","fidelity","platform")
)
posts_key_viz <- visualize_keywords(docs = keyATM_docs, keywords = posts_key_list)
## Warning in check_keywords(unique(unlisted), keywords, prune): Keywords will be
## pruned because they do not appear in documents: people, know, webull, fidelity
posts_key_viz

8.3.2 KeyATM Base
# will take a while
posts_tmod_keyatm_base <- keyATM(
docs = keyATM_docs, # text input
no_keyword_topics = 3,
keywords = posts_key_list,
model = "base",
options = list(seed = 123))
## Warning in keyATM_fit(docs, model, no_keyword_topics, keywords,
## model_settings, : Some documents have 0 length. Please review the preprocessing
## steps.
## Initializing the model...
## Warning in check_keywords(info$wd_names, keywords, options$prune): Keywords
## will be pruned because they do not appear in documents: people, know, webull,
## fidelity
## Fitting the model. 1500 iterations...
## Creating an output object. It may take time...
top_words(posts_tmod_keyatm_base, 5)
## 1_top_stocks 2_options 3_stock_market 4_buy_sell 5_hold_position
## 1 amd [✓] options [✓] money [✓] buy [✓] keep
## 2 intel [✓] calls [✓] time sell [✓] pay
## 3 earnings call [✓] price [✓] market [3] hold [✓]
## 4 apple [✓] puts [✓] stock [✓] bought [✓] might
## 5 amzn [✓] week year gains month
## 6_trading_apps Other_1 Other_2 Other_3
## 1 trading [✓] high never long
## 2 real business company mean
## 3 robinhood [✓] job pretty tendies
## 4 account [✓] tell years gonna
## 5 thought reddit amp short [2]
8.3.3 KeyATM Covariates
vars <- docvars(posts_corp)
head(vars)
## date score up_ratio total_awards_received comments year score_analysis
## 1 2018-01-01 1780 0.95 0 118 2018 high
## 2 2018-01-01 394 0.96 0 97 2018 medium
## 3 2018-01-01 2016 0.93 0 238 2018 high
## 4 2018-01-02 202 0.97 0 52 2018 low
## 5 2018-01-02 106 0.93 0 4 2018 low
## 6 2018-01-02 134 0.94 0 47 2018 low
## date_analysis
## 1 jan_one
## 2 jan_one
## 3 jan_one
## 4 jan_one
## 5 jan_one
## 6 jan_one
vars_selected <- vars %>%
as_tibble() %>%
mutate(period = case_when(comments_analysis = (comments ==0 ~ "nodiscussion"),
(comments >0 & comments <=50 ~ "discussion"),
(comments>50 ~ "topdiscussion")
))
posts_tmod_keyatm_cov <- keyATM(
docs = keyATM_docs,
no_keyword_topics = 3,
keywords = posts_key_list,
model = "covariates",
model_settings = list(covariates_data = vars_selected,
covariates_formula = ~date_analysis+score_analysis+period),
options = list(seed = 123))
## Warning in keyATM_fit(docs, model, no_keyword_topics, keywords,
## model_settings, : Some documents have 0 length. Please review the preprocessing
## steps.
## Convert covariates data using `model_settings$covariates_formula`.
## Initializing the model...
## Warning in check_keywords(info$wd_names, keywords, options$prune): Keywords
## will be pruned because they do not appear in documents: people, know, webull,
## fidelity
## Fitting the model. 1500 iterations...
## Creating an output object. It may take time...
8.3.4 Comparison b/w KeyATM Base and KeyATM Covariates
top_words(posts_tmod_keyatm_base, 5)
## 1_top_stocks 2_options 3_stock_market 4_buy_sell 5_hold_position
## 1 amd [✓] options [✓] money [✓] buy [✓] keep
## 2 intel [✓] calls [✓] time sell [✓] pay
## 3 earnings call [✓] price [✓] market [3] hold [✓]
## 4 apple [✓] puts [✓] stock [✓] bought [✓] might
## 5 amzn [✓] week year gains month
## 6_trading_apps Other_1 Other_2 Other_3
## 1 trading [✓] high never long
## 2 real business company mean
## 3 robinhood [✓] job pretty tendies
## 4 account [✓] tell years gonna
## 5 thought reddit amp short [2]
top_words(posts_tmod_keyatm_cov, 5)
## 1_top_stocks 2_options 3_stock_market 4_buy_sell 5_hold_position
## 1 amd [✓] options [✓] money [✓] buy [✓] work
## 2 intel [✓] calls [✓] market [✓] sell [✓] long
## 3 earnings put [✓] stock [✓] gains keep
## 4 amazon [✓] call [✓] price [✓] bought [✓] hold [✓]
## 5 apple [✓] puts [✓] time today companies
## 6_trading_apps Other_1 Other_2 Other_3
## 1 trading [✓] day point cash
## 2 stocks literally pretty please
## 3 years best big stop
## 4 robinhood [✓] reddit days instead
## 5 pay game amp anyone
Comparison with Secondary data
#Reddit data analysis/comparison with Secondary data
postsanalysis_df <- readtext("Jan_2018_FINAL.csv", text_field = "final_document")
postsanalysis_df <- postsanalysis_df %>% select(doc_id, text) %>% filter(text %like% 'AMD')
cnbcanalysis_df <- readtext("cnbc_headlines.csv", text_field = "text")
cnbcanalysis_df <- cnbcanalysis_df %>% select(doc_id, text) %>% filter(text %like% 'AMD')
#Corpus
postsanalysis_corp <- corpus(postsanalysis_df)
cnbcanalysis_corp <- corpus(cnbcanalysis_df)
postsanalysis_toks <- tokens(postsanalysis_corp, remove_punct = TRUE, remove_numbers = TRUE,
remove_symbols = TRUE, remove_url = TRUE, split_hyphens = FALSE) %>%
tokens_remove(pattern = c(stopwords("en"), myStopWords)) %>%
tokens_keep(min_nchar = 2)
#Compound tokens
toks_nostop_col <- textstat_collocations(postsanalysis_toks, min_count = 10)
postsanalysis_toks_comp <- tokens_compound(
postsanalysis_toks, pattern = toks_nostop_col[toks_nostop_col$z > 10])
cnbcanalysis_toks <- tokens(cnbcanalysis_corp, remove_punct = TRUE, remove_numbers = TRUE,
remove_symbols = TRUE, remove_url = TRUE, split_hyphens = FALSE) %>%
tokens_remove(pattern = c(stopwords("en"), myStopWords)) %>%
tokens_keep(min_nchar = 2)
#document feature matrix
postsanalysis_dfmat <- dfm(postsanalysis_toks_comp, tolower = TRUE)
cnbcanalysis_dfmat <- dfm(cnbcanalysis_toks, tolower = TRUE)
#document term matrix
postsanalysis_dtmat = quanteda::convert(postsanalysis_dfmat, to="topicmodels")
cnbcanalysis_dtmat = quanteda::convert(cnbcanalysis_dfmat, to="topicmodels")
##Comparison b/w Reddit data and CNBC News Headline
#Word Cloud for Reddit data
K = 2 #number of topics
postsanalysis_lda5_reddit <- LDA(postsanalysis_dtmat, k = K, control = list(seed = 123))
topicmodels::terms(postsanalysis_lda5_reddit, 10)
## Topic 1 Topic 2
## [1,] "amd" "buy"
## [2,] "money" "amd"
## [3,] "buy" "money"
## [4,] "time" "stock"
## [5,] "market" "market"
## [6,] "intel" "time"
## [7,] "options" "calls"
## [8,] "stock" "sell"
## [9,] "calls" "price"
## [10,] "sell" "options"
tmResult_reddit <- posterior(postsanalysis_lda5_reddit)
top5termsPerTopic_reddit <- topicmodels::terms(postsanalysis_lda5_reddit, 5)
topicNames_reddit <- apply(top5termsPerTopic_reddit, 2, paste, collapse=" ")
# visualize topics as word cloud
topicToViz_reddit <- 11 # change for your own topic of interest
topicToViz_reddit <- grep('amd', topicNames_reddit)[1] # Or select a topic by a term contained in its name
# select to 40 most probable terms from the topic by sorting the term-topic-probability vector in decreasing order
top40terms_reddit <- sort(tmResult_reddit$terms[topicToViz_reddit,], decreasing=TRUE)[1:40]
words_reddit <- names(top40terms_reddit)
# extract the probabilites of each of the 40 terms
probabilities_reddit <- sort(tmResult_reddit$terms[topicToViz_reddit,], decreasing=TRUE)[1:40]
# visualize the terms as wordcloud
mycolors_reddit <- brewer.pal(8, "Dark2")
#Word Cloud for CNBC News Headlines(Secondary Data)
K = 2 #number of topics
cnbcanalysis_lda5_cnbc <- LDA(cnbcanalysis_dtmat, k = K, control = list(seed = 123))
topicmodels::terms(cnbcanalysis_lda5_cnbc, 10)
## Topic 1 Topic 2
## [1,] "amd" "amd"
## [2,] "lightning" "jim"
## [3,] "round" "says"
## [4,] "amd's" "nvidia"
## [5,] "intel" "intel"
## [6,] "earnings" "charts"
## [7,] "buy" "qualcomm"
## [8,] "alphabet" "lot"
## [9,] "ceo" "room"
## [10,] "products" "run"
tmResult_cnbc <- posterior(cnbcanalysis_lda5_cnbc)
top5termsPerTopic_cnbc <- topicmodels::terms(cnbcanalysis_lda5_cnbc, 5)
topicNames_cnbc <- apply(top5termsPerTopic_cnbc, 2, paste, collapse=" ")
# visualize topics as word cloud
topicToViz_cnbc <- 11 # change for your own topic of interest
topicToViz_cnbc <- grep('amd', topicNames_cnbc)[1] # Or select a topic by a term contained in its name
# select to 40 most probable terms from the topic by sorting the term-topic-probability vector in decreasing order
top40terms_cnbc <- sort(tmResult_cnbc$terms[topicToViz_cnbc,], decreasing=TRUE)[1:40]
words_cnbc <- names(top40terms_cnbc)
# extract the probabilites of each of the 40 terms
probabilities_cnbc <- sort(tmResult_cnbc$terms[topicToViz_cnbc,], decreasing=TRUE)[1:40]
# visualize the terms as wordcloud
mycolors_cnbc <- brewer.pal(8, "Dark2")
# Comparison b/w Reddit data and CNBC News Headlines using WordCloud
wordcloud(words_reddit, probabilities_reddit, random.order = FALSE, color = mycolors_reddit)

wordcloud(words_cnbc, probabilities_cnbc, random.order = FALSE, color = mycolors_cnbc)
