1. Create Dataframe from CSV file

posts_df <- readtext("Jan_2018_FINAL.csv", text_field = "final_document")
posts_df$year <- substr(posts_df$date, 1, 4)
posts_df$year <- as.numeric(posts_df$year)
posts_df$date <- as.Date(posts_df$date, format =  "%Y-%m-%d")
posts_df <- posts_df %>%
  mutate(score_analysis = case_when(
    (posts_df$score<300) ~ "low",
    (posts_df$score>=300 & posts_df$score <= 600) ~ "medium",
    (posts_df$score >600) ~ "high"
    ))
posts_df <- posts_df %>%
  mutate(date_analysis = case_when(
    (posts_df$date<='2018-01-08') ~ "jan_one",
    (posts_df$date>'2018-01-08' & posts_df$date <= '2018-01-16') ~ "jan_two",
    (posts_df$date>'2018-01-16' & posts_df$date <= '2018-01-24') ~ "jan_thr",
    (posts_df$date>'2018-01-24') ~ "jan_fou",
    ))
glimpse(posts_df)
## Rows: 245
## Columns: 10
## $ doc_id                <chr> "Jan_2018_FINAL.csv.1", "Jan_2018_FINAL.csv.2", …
## $ text                  <chr> "Robinhood saved me over $8k in fees!  What was …
## $ date                  <date> 2018-01-01, 2018-01-01, 2018-01-01, 2018-01-02,…
## $ score                 <int> 1780, 394, 2016, 202, 106, 134, 155, 757, 477, 4…
## $ up_ratio              <dbl> 0.95, 0.96, 0.93, 0.97, 0.93, 0.94, 0.94, 0.94, …
## $ total_awards_received <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ comments              <int> 118, 97, 238, 52, 4, 47, 47, 43, 99, 31, 43, 31,…
## $ year                  <dbl> 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018, …
## $ score_analysis        <chr> "high", "medium", "high", "low", "low", "low", "…
## $ date_analysis         <chr> "jan_one", "jan_one", "jan_one", "jan_one", "jan…

2. Create Corpus with document level variables

posts_corp <- corpus(posts_df)
summary(posts_corp,5)
## Corpus consisting of 245 documents, showing 5 documents:
## 
##                  Text Types Tokens Sentences       date score up_ratio
##  Jan_2018_FINAL.csv.1   668   1824        45 2018-01-01  1780     0.95
##  Jan_2018_FINAL.csv.2   824   2480        65 2018-01-01   394     0.96
##  Jan_2018_FINAL.csv.3  1408   5644       189 2018-01-01  2016     0.93
##  Jan_2018_FINAL.csv.4    15     15         1 2018-01-02   202     0.97
##  Jan_2018_FINAL.csv.5    13     14         1 2018-01-02   106     0.93
##  total_awards_received comments year score_analysis date_analysis
##                      0      118 2018           high       jan_one
##                      0       97 2018         medium       jan_one
##                      0      238 2018           high       jan_one
##                      0       52 2018            low       jan_one
##                      0        4 2018            low       jan_one

3. Generate Tokens

myStopWords = c("one", "two", "three", "first", "second", "third","gets","faggot",
                "may", "also", "can", "whether","gt","make","good","well","take",
                "just", "like", "got","will","lol","right","fucking","actually","wsb",
                "A","at", "but","For", "in", "for","how","How","several","man",
                "Tue", "but", "doesnt", "from", "The","From","see","much","people",
                "like", "just", "shit", "fuck","can","spx","still","go","say","went",
                "have", "been", "has", "than","with","To","even","now","since","better",
                "use","who","of","to","show","and","look","guy","made","great","someone",
                "include", "includes", "including", "included","though","thing","post",
                "uses", "using", "used", "comprises","something","probably","yes","don",
                "on","said","were","by","that","is","sure","really","way","new","feel",
                "as","was","an","it","which","its","think","sub","yeah","need","want","cramer",
                "had","are","they","he","be","us","na","back","already","around","going",
                "get", "NA", "know", "think","deleted","everyone","gets","many","makes","cramer's")
posts_toks <- tokens(posts_corp, remove_punct = TRUE, remove_numbers = TRUE,
                     remove_symbols = TRUE, remove_url = TRUE, split_hyphens = FALSE) %>%
  tokens_remove(pattern = c(stopwords("en"), myStopWords)) %>%
  tokens_keep(min_nchar = 2)

#Create compound tokens
toks_nostop_col <- textstat_collocations(posts_toks, min_count = 10)
head(toks_nostop_col, 5)
##      collocation count count_nested length   lambda        z
## 1      last year    56            0      2 4.882448 28.40296
## 2 percent change    37            0      2 7.497311 25.24040
## 3    date change    32            0      2 6.698862 25.18586
## 4       nbsp bot    28            0      2 7.136430 24.36797
## 5   __year's low    31            0      2 6.610580 24.24902
posts_toks_comp <- tokens_compound(
  posts_toks, pattern = toks_nostop_col[toks_nostop_col$z > 10])
head(posts_toks_comp, 5)
## Tokens consisting of 5 documents and 8 docvars.
## Jan_2018_FINAL.csv.1 :
##  [1] "Robinhood" "saved"     "8k"        "fees"      "riding"    "short"    
##  [7] "bus"       "kid"       "jelly"     "cause"     "couldnt"   "ride"     
## [ ... and 570 more ]
## 
## Jan_2018_FINAL.csv.2 :
##  [1] "Saved"        "commissions"  "build"        "SP500"        "ETF"         
##  [6] "hand"         "kinda"        "robinhood"    "portfolio"    "gives"       
## [11] "dividend"     "every_single"
## [ ... and 835 more ]
## 
## Jan_2018_FINAL.csv.3 :
##  [1] "added"    "every"    "paycheck" "little"   "years"    "buy"     
##  [7] "Toyota"   "Camry"    "Best"     "car"      "miles"    "per"     
## [ ... and 1,723 more ]
## 
## Jan_2018_FINAL.csv.4 :
## [1] "Best"   "Worst"  "amp"    "stocks"
## 
## Jan_2018_FINAL.csv.5 :
## [1] "cellphone" "choice"    "autist"    "making"    "trades"

4. Generate document feature matrix

posts_dfmat <- dfm(posts_toks_comp, tolower = TRUE) %>%
  dfm_trim(min_termfreq = 5, min_docfreq = 10)
posts_dfmat
## Document-feature matrix of: 245 documents, 1,067 features (91.39% sparse) and 8 docvars.
##                       features
## docs                   robinhood fees short kid cause ride planning rope anyone
##   Jan_2018_FINAL.csv.1         1    1     6   1     2    1        1    2      2
##   Jan_2018_FINAL.csv.2         4    2     1   1     0    1        0    0      1
##   Jan_2018_FINAL.csv.3        14    0     1   1     1    0        0    1      3
##   Jan_2018_FINAL.csv.4         0    0     0   0     0    0        0    0      0
##   Jan_2018_FINAL.csv.5         0    0     0   0     0    0        0    0      0
##   Jan_2018_FINAL.csv.6         0    0     0   0     0    0        0    0      0
##                       features
## docs                   pretty
##   Jan_2018_FINAL.csv.1      1
##   Jan_2018_FINAL.csv.2      2
##   Jan_2018_FINAL.csv.3      1
##   Jan_2018_FINAL.csv.4      0
##   Jan_2018_FINAL.csv.5      0
##   Jan_2018_FINAL.csv.6      0
## [ reached max_ndoc ... 239 more documents, reached max_nfeat ... 1,057 more features ]
#See top 30 features
topfeatures(posts_dfmat, 30)
##      buy    money     time      amd     sell  options   market    stock 
##      460      456      325      295      285      271      264      263 
##   bought    calls     long      day    price     year    short    gains 
##      223      217      211      206      203      192      159      153 
##       mu      put   stocks    never  company  trading    today   pretty 
##      153      151      149      147      146      145      144      141 
## earnings   buying     sold     call     real     work 
##      138      138      138      136      133      131

5. Generate Relative Frequency analysis

posts_tstat_key <- textstat_keyness(
  posts_dfmat, target = posts_dfmat$score <= 500)
textplot_keyness(posts_tstat_key, n=10, min_count = 10)

5a. Identify number of topics

#Convert document-feature matrix to document-term representations used in topicmodels  
posts_dtmat = quanteda::convert(posts_dfmat, to="topicmodels")
posts_dtmat
## <<DocumentTermMatrix (documents: 238, terms: 1067)>>
## Non-/sparse entries: 22511/231435
## Sparsity           : 91%
## Maximal term length: 17
## Weighting          : term frequency (tf)
train_posts_dtmat <- corpus_subset(posts_corp)[1:125,] %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE,
         remove_symbols = TRUE, remove_url = TRUE) %>%
  dfm(tolower = TRUE) %>%
  dfm_remove(myStopWords) %>%
  dfm_trim(min_termfreq = 5, min_docfreq = 10) %>%
  quanteda::convert(to="topicmodels")

test_posts_dtmat <- corpus_subset(posts_corp)[126:245,] %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE,
         remove_symbols = TRUE, remove_url = TRUE) %>%
  dfm(tolower = TRUE) %>%
  dfm_remove(myStopWords) %>%
  dfm_trim(min_termfreq = 5, min_docfreq = 10) %>%
  quanteda::convert(to="topicmodels")

train_posts_lda5 <- LDA(train_posts_dtmat, k = 5, control = list(seed = 123))
perplexity(train_posts_lda5, test_posts_dtmat) # perplexity when k = 5
## [1] 250.8326
n_topics_vec = 2:5 # try different num of topics: 2, 3, 4, 5
# this analysis will take a while
perplexity_vec = map_dbl(n_topics_vec, function(kk) {
  message(kk)
  train_posts_ldaK <- LDA(train_posts_dtmat, k = kk, control = list(seed = 123))
  perp = perplexity(train_posts_ldaK, test_posts_dtmat)
})
## 2
## 3
## 4
## 5
lda_perplexity_result = tibble(
  n_topics = n_topics_vec, perplexity = perplexity_vec
)

plot(lda_perplexity_result, type="l")


lda_ldatuning_result <- FindTopicsNumber(
  posts_dtmat, topics = n_topics_vec,
  metrics = c("CaoJuan2009", "Arun2010", "Deveaud2014"),
  method = "VEM", control = list(seed = 123), mc.cores = 4L, verbose = TRUE
)
## fit models... done.
## calculate metrics:
##   CaoJuan2009... done.
##   Arun2010... done.
##   Deveaud2014... done.
FindTopicsNumber_plot(lda_ldatuning_result)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

6. Visualize topics as word cloud

K = 5 #number of topics
posts_lda5 <- LDA(posts_dtmat, k = K, control = list(seed = 123))
topicmodels::terms(posts_lda5, 10)
##       Topic 1   Topic 2 Topic 3    Topic 4   Topic 5  
##  [1,] "company" "long"  "amd"      "options" "buy"    
##  [2,] "buy"     "money" "intel"    "money"   "sell"   
##  [3,] "money"   "short" "buy"      "price"   "market" 
##  [4,] "day"     "time"  "earnings" "buy"     "day"    
##  [5,] "time"    "year"  "calls"    "put"     "money"  
##  [6,] "stock"   "years" "bought"   "call"    "options"
##  [7,] "market"  "never" "time"     "stock"   "calls"  
##  [8,] "news"    "point" "mu"       "option"  "gains"  
##  [9,] "shop"    "real"  "puts"     "time"    "bought" 
## [10,] "cash"    "true"  "stock"    "sell"    "stocks"
tmResult <- posterior(posts_lda5)

top5termsPerTopic <- topicmodels::terms(posts_lda5, 5)
topicNames <- apply(top5termsPerTopic, 2, paste, collapse=" ")

# visualize topics as word cloud
topicToViz <- 11 # change for your own topic of interest
topicToViz <- grep('intel', topicNames)[1] # Or select a topic by a term contained in its name
# select to 40 most probable terms from the topic by sorting the term-topic-probability vector in decreasing order
top40terms <- sort(tmResult$terms[topicToViz,], decreasing=TRUE)[1:40]
words <- names(top40terms)
# extract the probabilites of each of the 40 terms
probabilities <- sort(tmResult$terms[topicToViz,], decreasing=TRUE)[1:40]
# visualize the terms as wordcloud

mycolors <- brewer.pal(8, "Dark2")
wordcloud(words, probabilities, random.order = FALSE, color = mycolors)

7. Visualize top terms in topics

posts_lda5_betas <- broom::tidy(posts_lda5)

top_terms_in_topics <- posts_lda5_betas %>%
  group_by(topic) %>%
  top_n(5, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

top_terms_in_topics %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

8. Advanced Topic Modeling

8.1. Correlated Topic Model (CTM)

stm_posts_dfmat <- quanteda::convert(posts_dfmat, to = "stm")
## Warning in dfm2stm(x, docvars, omit_empty = TRUE): Dropped empty document(s):
## Jan_2018_FINAL.csv.20, Jan_2018_FINAL.csv.25, Jan_2018_FINAL.csv.51,
## Jan_2018_FINAL.csv.76, Jan_2018_FINAL.csv.82, Jan_2018_FINAL.csv.128,
## Jan_2018_FINAL.csv.173
out <- prepDocuments(
  stm_posts_dfmat$documents,
  stm_posts_dfmat$vocab,
  stm_posts_dfmat$meta
)
str(out, max.level = 1)
## List of 7
##  $ documents     :List of 238
##  $ vocab         : chr [1:1067] "10k" "1k" "20k" "2k" ...
##  $ meta          :'data.frame':  238 obs. of  8 variables:
##  $ words.removed : chr(0) 
##  $ docs.removed  : NULL
##  $ tokens.removed: int 0
##  $ wordcounts    : int [1:1067] 11 11 12 11 11 12 29 13 18 11 ...
K=5
posts_tmod_ctm <- stm(out$documents, out$vocab, K = K,
                   seed = 123, emtol = 1e-3, max.em.its = 150)
## Beginning Spectral Initialization 
##   Calculating the gram matrix...
##   Finding anchor words...
##      .....
##   Recovering initialization...
##      ..........
## Initialization complete.
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 1 (approx. per word bound = -6.665) 
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 2 (approx. per word bound = -6.653, relative change = 1.842e-03) 
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 3 (approx. per word bound = -6.646, relative change = 1.065e-03) 
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Model Converged
plot(posts_tmod_ctm)

8.1.1 Plots - CTM
#1. summary - plots topic proportions and names.
plot(posts_tmod_ctm, type = "summary", n = 5,
     labeltype = "frex", main = "Reddit Stock Topics", text.cex = 0.8)

#2. labels - plots the top words for a specific topic.
plot(posts_tmod_ctm, type="labels", topics=c(1,2), n=4)

#3. perspectives - compares two topics' words.
plot(posts_tmod_ctm, type="perspectives", topics=c(1,2))

#4. hist - a histogram of the expected topic proportions across documents for a topic.
plot(posts_tmod_ctm, type="hist")

8.1.2 Semantic Coherence & Exclusivity (Two ways of measuring topic “interpretability”)
topicQuality(posts_tmod_ctm, out$documents)
## [1] -30.01094 -28.36449 -32.61878 -35.74098 -30.38467
## [1] 9.483095 9.274230 9.143238 9.306454 9.199831

8.2 Structural Topic Model (STM)

names(docvars(posts_corp))
## [1] "date"                  "score"                 "up_ratio"             
## [4] "total_awards_received" "comments"              "year"                 
## [7] "score_analysis"        "date_analysis"
stm_posts_dfmat <- quanteda::convert(posts_dfmat, to = "stm")
## Warning in dfm2stm(x, docvars, omit_empty = TRUE): Dropped empty document(s):
## Jan_2018_FINAL.csv.20, Jan_2018_FINAL.csv.25, Jan_2018_FINAL.csv.51,
## Jan_2018_FINAL.csv.76, Jan_2018_FINAL.csv.82, Jan_2018_FINAL.csv.128,
## Jan_2018_FINAL.csv.173
out <- prepDocuments(
  stm_posts_dfmat$documents, stm_posts_dfmat$vocab, stm_posts_dfmat$meta)
8.2.1 Topical prevalence and topical content.
#  In STM, metadata can be entered in the topic model in two ways: topical prevalence and topical content.
posts_tmob_stm <- stm(
  out$documents, out$vocab, K=K,
  prevalence=~date_analysis+score_analysis+s(comments), #for topical prevalence; s() = b-spline
  data=out$meta, #for topical content
  init.type="Spectral", seed=123)
## Beginning Spectral Initialization 
##   Calculating the gram matrix...
##   Finding anchor words...
##      .....
##   Recovering initialization...
##      ..........
## Initialization complete.
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 1 (approx. per word bound = -6.665) 
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 2 (approx. per word bound = -6.652, relative change = 1.879e-03) 
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 3 (approx. per word bound = -6.645, relative change = 1.067e-03) 
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 4 (approx. per word bound = -6.641, relative change = 6.509e-04) 
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 5 (approx. per word bound = -6.638, relative change = 4.336e-04) 
## Topic 1: sell, options, trading, earnings, call 
##  Topic 2: money, buy, time, stock, calls 
##  Topic 3: day, short, stocks, company, every 
##  Topic 4: market, price, year, gains, put 
##  Topic 5: amd, bought, mu, pretty, real 
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 6 (approx. per word bound = -6.636, relative change = 3.021e-04) 
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 7 (approx. per word bound = -6.635, relative change = 2.137e-04) 
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 8 (approx. per word bound = -6.634, relative change = 1.521e-04) 
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 9 (approx. per word bound = -6.633, relative change = 1.091e-04) 
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 10 (approx. per word bound = -6.632, relative change = 7.918e-05) 
## Topic 1: sell, options, trading, earnings, call 
##  Topic 2: money, buy, time, stock, calls 
##  Topic 3: day, short, stocks, company, every 
##  Topic 4: market, price, year, gains, put 
##  Topic 5: amd, bought, mu, pretty, real 
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 11 (approx. per word bound = -6.632, relative change = 5.832e-05) 
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 12 (approx. per word bound = -6.632, relative change = 4.389e-05) 
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 13 (approx. per word bound = -6.632, relative change = 3.351e-05) 
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 14 (approx. per word bound = -6.631, relative change = 2.619e-05) 
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 15 (approx. per word bound = -6.631, relative change = 2.085e-05) 
## Topic 1: sell, options, trading, earnings, call 
##  Topic 2: money, buy, time, stock, calls 
##  Topic 3: day, short, stocks, company, every 
##  Topic 4: market, price, year, gains, put 
##  Topic 5: amd, bought, mu, pretty, real 
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 16 (approx. per word bound = -6.631, relative change = 1.692e-05) 
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 17 (approx. per word bound = -6.631, relative change = 1.407e-05) 
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 18 (approx. per word bound = -6.631, relative change = 1.191e-05) 
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 19 (approx. per word bound = -6.631, relative change = 1.025e-05) 
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Model Converged
8.2.2 Explore STM results using the LDAvis package.
toLDAvis(mod=posts_tmob_stm, docs=out$documents)
## Loading required namespace: servr
8.2.3 Summary plot
plot(posts_tmob_stm, type="summary", n=5)

plot(posts_tmob_stm, type="perspectives", topics=c(1,2))

8.2.4 Visualize top words in each topic
posts_tmob_stm_beta <- tidytext::tidy(posts_tmob_stm)
posts_tmob_stm_beta %>% group_by(topic) %>% top_n(10, beta) %>% ungroup() %>%
  mutate(topic = paste0("Topic ", topic),
         term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = as.factor(topic))) +
  geom_col(alpha = 0.8, show.legend = FALSE) +
  facet_wrap(~topic, scales="free_y") + coord_flip() + scale_x_reordered()

8.2.5 Comparison b/w CTM and STM
topicQuality(posts_tmod_ctm, out$documents) 
## [1] -30.01094 -28.36449 -32.61878 -35.74098 -30.38467
## [1] 9.483095 9.274230 9.143238 9.306454 9.199831

topicQuality(posts_tmob_stm, out$documents)
## [1] -30.01094 -28.36449 -32.61878 -35.74098 -30.38467
## [1] 9.499497 9.269133 9.133405 9.330438 9.211293

8.3 Keyword Assisted Topic Model

keyATM_docs <- keyATM_read(texts = posts_dfmat)
## Using quanteda dfm.
## Warning in get_doc_index(W_raw, check = TRUE): Number of documents with 0 length: 7
## This may cause invalid covariates or time index.
## Please review the preprocessing steps.
## Document index to check: 20, 25, 51, 76, 82, 128, 173
summary(keyATM_docs)
## keyATM_docs object of: 245 documents.
## Length of documents:
##   Avg: 160.812
##   Min: 0
##   Max: 1519
##    SD: 284.391
## Number of unique words: 1521
8.3.1 Keywords
posts_key_list = list(
  top_stocks = c("amd", "intel", "apple","microsoft","amazon","amzn"),
  options = c("gain", "loss", "options", "profit","short","calls","put","call","puts"),
  stock_market = c("stock","short","share","price","people","market","money","know"),
  buy_sell = c("buy","dip","sell","order","bought","buying"),
  hold_position = c("hold","line"),
  trading_apps = c("robinhood","account","app","trade","webull","trading","fidelity","platform")
)
posts_key_viz <- visualize_keywords(docs = keyATM_docs, keywords = posts_key_list)
## Warning in check_keywords(unique(unlisted), keywords, prune): Keywords will be
## pruned because they do not appear in documents: people, know, webull, fidelity
posts_key_viz

8.3.2 KeyATM Base
# will take a while
posts_tmod_keyatm_base <- keyATM(
  docs = keyATM_docs, # text input
  no_keyword_topics = 3,
  keywords = posts_key_list,
  model = "base",
  options = list(seed = 123))
## Warning in keyATM_fit(docs, model, no_keyword_topics, keywords,
## model_settings, : Some documents have 0 length. Please review the preprocessing
## steps.
## Initializing the model...
## Warning in check_keywords(info$wd_names, keywords, options$prune): Keywords
## will be pruned because they do not appear in documents: people, know, webull,
## fidelity
## Fitting the model. 1500 iterations...
## Creating an output object. It may take time...
top_words(posts_tmod_keyatm_base, 5)
##   1_top_stocks   2_options 3_stock_market 4_buy_sell 5_hold_position
## 1      amd [✓] options [✓]      money [✓]    buy [✓]            keep
## 2    intel [✓]   calls [✓]           time   sell [✓]             pay
## 3     earnings    call [✓]      price [✓] market [3]        hold [✓]
## 4    apple [✓]    puts [✓]      stock [✓] bought [✓]           might
## 5     amzn [✓]        week           year      gains           month
##   6_trading_apps  Other_1 Other_2   Other_3
## 1    trading [✓]     high   never      long
## 2           real business company      mean
## 3  robinhood [✓]      job  pretty   tendies
## 4    account [✓]     tell   years     gonna
## 5        thought   reddit     amp short [2]
8.3.3 KeyATM Covariates
vars <- docvars(posts_corp)
head(vars)
##         date score up_ratio total_awards_received comments year score_analysis
## 1 2018-01-01  1780     0.95                     0      118 2018           high
## 2 2018-01-01   394     0.96                     0       97 2018         medium
## 3 2018-01-01  2016     0.93                     0      238 2018           high
## 4 2018-01-02   202     0.97                     0       52 2018            low
## 5 2018-01-02   106     0.93                     0        4 2018            low
## 6 2018-01-02   134     0.94                     0       47 2018            low
##   date_analysis
## 1       jan_one
## 2       jan_one
## 3       jan_one
## 4       jan_one
## 5       jan_one
## 6       jan_one
vars_selected <- vars %>%
  as_tibble() %>%
  mutate(period = case_when(comments_analysis = (comments ==0 ~ "nodiscussion"),
                            (comments >0 & comments <=50 ~ "discussion"),
                            (comments>50 ~ "topdiscussion")
                            ))
posts_tmod_keyatm_cov <- keyATM(
  docs              = keyATM_docs,
  no_keyword_topics = 3,
  keywords          = posts_key_list,
  model             = "covariates",
  model_settings    = list(covariates_data    = vars_selected,
                           covariates_formula = ~date_analysis+score_analysis+period),
  options           = list(seed = 123))
## Warning in keyATM_fit(docs, model, no_keyword_topics, keywords,
## model_settings, : Some documents have 0 length. Please review the preprocessing
## steps.
## Convert covariates data using `model_settings$covariates_formula`.
## Initializing the model...
## Warning in check_keywords(info$wd_names, keywords, options$prune): Keywords
## will be pruned because they do not appear in documents: people, know, webull,
## fidelity
## Fitting the model. 1500 iterations...
## Creating an output object. It may take time...
8.3.4 Comparison b/w KeyATM Base and KeyATM Covariates
top_words(posts_tmod_keyatm_base, 5)
##   1_top_stocks   2_options 3_stock_market 4_buy_sell 5_hold_position
## 1      amd [✓] options [✓]      money [✓]    buy [✓]            keep
## 2    intel [✓]   calls [✓]           time   sell [✓]             pay
## 3     earnings    call [✓]      price [✓] market [3]        hold [✓]
## 4    apple [✓]    puts [✓]      stock [✓] bought [✓]           might
## 5     amzn [✓]        week           year      gains           month
##   6_trading_apps  Other_1 Other_2   Other_3
## 1    trading [✓]     high   never      long
## 2           real business company      mean
## 3  robinhood [✓]      job  pretty   tendies
## 4    account [✓]     tell   years     gonna
## 5        thought   reddit     amp short [2]
top_words(posts_tmod_keyatm_cov, 5) 
##   1_top_stocks   2_options 3_stock_market 4_buy_sell 5_hold_position
## 1      amd [✓] options [✓]      money [✓]    buy [✓]            work
## 2    intel [✓]   calls [✓]     market [✓]   sell [✓]            long
## 3     earnings     put [✓]      stock [✓]      gains            keep
## 4   amazon [✓]    call [✓]      price [✓] bought [✓]        hold [✓]
## 5    apple [✓]    puts [✓]           time      today       companies
##   6_trading_apps   Other_1 Other_2 Other_3
## 1    trading [✓]       day   point    cash
## 2         stocks literally  pretty  please
## 3          years      best     big    stop
## 4  robinhood [✓]    reddit    days instead
## 5            pay      game     amp  anyone

Comparison with Secondary data

#Reddit data analysis/comparison with Secondary data
postsanalysis_df <- readtext("Jan_2018_FINAL.csv", text_field = "final_document")
postsanalysis_df <- postsanalysis_df %>% select(doc_id, text) %>% filter(text %like% 'AMD')
cnbcanalysis_df <- readtext("cnbc_headlines.csv", text_field = "text")
cnbcanalysis_df <- cnbcanalysis_df %>% select(doc_id, text) %>% filter(text %like% 'AMD')
#Corpus
postsanalysis_corp <- corpus(postsanalysis_df)
cnbcanalysis_corp <- corpus(cnbcanalysis_df)
postsanalysis_toks <- tokens(postsanalysis_corp, remove_punct = TRUE, remove_numbers = TRUE,
                     remove_symbols = TRUE, remove_url = TRUE, split_hyphens = FALSE) %>%
  tokens_remove(pattern = c(stopwords("en"), myStopWords)) %>%
  tokens_keep(min_nchar = 2)
#Compound tokens
toks_nostop_col <- textstat_collocations(postsanalysis_toks, min_count = 10)
postsanalysis_toks_comp <- tokens_compound(
  postsanalysis_toks, pattern = toks_nostop_col[toks_nostop_col$z > 10])
cnbcanalysis_toks <- tokens(cnbcanalysis_corp, remove_punct = TRUE, remove_numbers = TRUE,
                    remove_symbols = TRUE, remove_url = TRUE, split_hyphens = FALSE) %>%
  tokens_remove(pattern = c(stopwords("en"), myStopWords)) %>%
  tokens_keep(min_nchar = 2)
#document feature matrix
postsanalysis_dfmat <- dfm(postsanalysis_toks_comp, tolower = TRUE)
cnbcanalysis_dfmat <- dfm(cnbcanalysis_toks, tolower = TRUE)
#document term matrix
postsanalysis_dtmat = quanteda::convert(postsanalysis_dfmat, to="topicmodels")
cnbcanalysis_dtmat = quanteda::convert(cnbcanalysis_dfmat, to="topicmodels")

##Comparison b/w Reddit data and CNBC News Headline
#Word Cloud for Reddit data
K = 2 #number of topics
postsanalysis_lda5_reddit <- LDA(postsanalysis_dtmat, k = K, control = list(seed = 123))
topicmodels::terms(postsanalysis_lda5_reddit, 10)
##       Topic 1   Topic 2  
##  [1,] "amd"     "buy"    
##  [2,] "money"   "amd"    
##  [3,] "buy"     "money"  
##  [4,] "time"    "stock"  
##  [5,] "market"  "market" 
##  [6,] "intel"   "time"   
##  [7,] "options" "calls"  
##  [8,] "stock"   "sell"   
##  [9,] "calls"   "price"  
## [10,] "sell"    "options"
tmResult_reddit <- posterior(postsanalysis_lda5_reddit)
top5termsPerTopic_reddit <- topicmodels::terms(postsanalysis_lda5_reddit, 5)
topicNames_reddit <- apply(top5termsPerTopic_reddit, 2, paste, collapse=" ")
# visualize topics as word cloud
topicToViz_reddit <- 11 # change for your own topic of interest
topicToViz_reddit <- grep('amd', topicNames_reddit)[1] # Or select a topic by a term contained in its name
# select to 40 most probable terms from the topic by sorting the term-topic-probability vector in decreasing order
top40terms_reddit <- sort(tmResult_reddit$terms[topicToViz_reddit,], decreasing=TRUE)[1:40]
words_reddit <- names(top40terms_reddit)
# extract the probabilites of each of the 40 terms
probabilities_reddit <- sort(tmResult_reddit$terms[topicToViz_reddit,], decreasing=TRUE)[1:40]
# visualize the terms as wordcloud
mycolors_reddit <- brewer.pal(8, "Dark2")

#Word Cloud for CNBC News Headlines(Secondary Data)
K = 2 #number of topics
cnbcanalysis_lda5_cnbc <- LDA(cnbcanalysis_dtmat, k = K, control = list(seed = 123))
topicmodels::terms(cnbcanalysis_lda5_cnbc, 10)
##       Topic 1     Topic 2   
##  [1,] "amd"       "amd"     
##  [2,] "lightning" "jim"     
##  [3,] "round"     "says"    
##  [4,] "amd's"     "nvidia"  
##  [5,] "intel"     "intel"   
##  [6,] "earnings"  "charts"  
##  [7,] "buy"       "qualcomm"
##  [8,] "alphabet"  "lot"     
##  [9,] "ceo"       "room"    
## [10,] "products"  "run"
tmResult_cnbc <- posterior(cnbcanalysis_lda5_cnbc)
top5termsPerTopic_cnbc <- topicmodels::terms(cnbcanalysis_lda5_cnbc, 5)
topicNames_cnbc <- apply(top5termsPerTopic_cnbc, 2, paste, collapse=" ")
# visualize topics as word cloud
topicToViz_cnbc <- 11 # change for your own topic of interest
topicToViz_cnbc <- grep('amd', topicNames_cnbc)[1] # Or select a topic by a term contained in its name
# select to 40 most probable terms from the topic by sorting the term-topic-probability vector in decreasing order
top40terms_cnbc <- sort(tmResult_cnbc$terms[topicToViz_cnbc,], decreasing=TRUE)[1:40]
words_cnbc <- names(top40terms_cnbc)
# extract the probabilites of each of the 40 terms
probabilities_cnbc <- sort(tmResult_cnbc$terms[topicToViz_cnbc,], decreasing=TRUE)[1:40]
# visualize the terms as wordcloud
mycolors_cnbc <- brewer.pal(8, "Dark2")

# Comparison b/w Reddit data and CNBC News Headlines using WordCloud
wordcloud(words_reddit, probabilities_reddit, random.order = FALSE, color = mycolors_reddit)

wordcloud(words_cnbc, probabilities_cnbc, random.order = FALSE, color = mycolors_cnbc)