Group5_FinalProject

1. Create Dataframe from CSV file

posts_df <- readtext("Jan_2018_FINAL.csv", text_field = "final_document")
posts_df$year <- substr(posts_df$date, 1, 4)
posts_df$year <- as.numeric(posts_df$year)
posts_df$date <- as.Date(posts_df$date, format =  "%Y-%m-%d")
posts_df <- posts_df %>%
  mutate(score_analysis = case_when(
    (posts_df$score<300) ~ "low",
    (posts_df$score>=300 & posts_df$score <= 600) ~ "medium",
    (posts_df$score >600) ~ "high"
    ))
posts_df <- posts_df %>%
  mutate(date_analysis = case_when(
    (posts_df$date<='2018-01-08') ~ "jan_one",
    (posts_df$date>'2018-01-08' & posts_df$date <= '2018-01-16') ~ "jan_two",
    (posts_df$date>'2018-01-16' & posts_df$date <= '2018-01-24') ~ "jan_thr",
    (posts_df$date>'2018-01-24') ~ "jan_fou",
    ))
glimpse(posts_df)

## Rows: 245
## Columns: 10
## $ doc_id                <chr> "Jan_2018_FINAL.csv.1", "Jan_2018_FINAL.csv.2", …
## $ text                  <chr> "Robinhood saved me over $8k in fees!  What was …
## $ date                  <date> 2018-01-01, 2018-01-01, 2018-01-01, 2018-01-02,…
## $ score                 <int> 1780, 394, 2016, 202, 106, 134, 155, 757, 477, 4…
## $ up_ratio              <dbl> 0.95, 0.96, 0.93, 0.97, 0.93, 0.94, 0.94, 0.94, …
## $ total_awards_received <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ comments              <int> 118, 97, 238, 52, 4, 47, 47, 43, 99, 31, 43, 31,…
## $ year                  <dbl> 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018, …
## $ score_analysis        <chr> "high", "medium", "high", "low", "low", "low", "…
## $ date_analysis         <chr> "jan_one", "jan_one", "jan_one", "jan_one", "jan…

2. Create Corpus with document level variables

posts_corp <- corpus(posts_df)
summary(posts_corp,5)

## Corpus consisting of 245 documents, showing 5 documents:
## 
##                  Text Types Tokens Sentences       date score up_ratio
##  Jan_2018_FINAL.csv.1   668   1824        45 2018-01-01  1780     0.95
##  Jan_2018_FINAL.csv.2   824   2480        65 2018-01-01   394     0.96
##  Jan_2018_FINAL.csv.3  1408   5644       189 2018-01-01  2016     0.93
##  Jan_2018_FINAL.csv.4    15     15         1 2018-01-02   202     0.97
##  Jan_2018_FINAL.csv.5    13     14         1 2018-01-02   106     0.93
##  total_awards_received comments year score_analysis date_analysis
##                      0      118 2018           high       jan_one
##                      0       97 2018         medium       jan_one
##                      0      238 2018           high       jan_one
##                      0       52 2018            low       jan_one
##                      0        4 2018            low       jan_one

3. Generate Tokens

myStopWords = c("one", "two", "three", "first", "second", "third","gets","faggot",
                "may", "also", "can", "whether","gt","make","good","well","take",
                "just", "like", "got","will","lol","right","fucking","actually","wsb",
                "A","at", "but","For", "in", "for","how","How","several","man",
                "Tue", "but", "doesnt", "from", "The","From","see","much","people",
                "like", "just", "shit", "fuck","can","spx","still","go","say","went",
                "have", "been", "has", "than","with","To","even","now","since","better",
                "use","who","of","to","show","and","look","guy","made","great","someone",
                "include", "includes", "including", "included","though","thing","post",
                "uses", "using", "used", "comprises","something","probably","yes","don",
                "on","said","were","by","that","is","sure","really","way","new","feel",
                "as","was","an","it","which","its","think","sub","yeah","need","want","cramer",
                "had","are","they","he","be","us","na","back","already","around","going",
                "get", "NA", "know", "think","deleted","everyone","gets","many","makes","cramer's")
posts_toks <- tokens(posts_corp, remove_punct = TRUE, remove_numbers = TRUE,
                     remove_symbols = TRUE, remove_url = TRUE, split_hyphens = FALSE) %>%
  tokens_remove(pattern = c(stopwords("en"), myStopWords)) %>%
  tokens_keep(min_nchar = 2)

#Create compound tokens
toks_nostop_col <- textstat_collocations(posts_toks, min_count = 10)
head(toks_nostop_col, 5)

##      collocation count count_nested length   lambda        z
## 1      last year    56            0      2 4.882448 28.40296
## 2 percent change    37            0      2 7.497311 25.24040
## 3    date change    32            0      2 6.698862 25.18586
## 4       nbsp bot    28            0      2 7.136430 24.36797
## 5   __year's low    31            0      2 6.610580 24.24902

posts_toks_comp <- tokens_compound(
  posts_toks, pattern = toks_nostop_col[toks_nostop_col$z > 10])
head(posts_toks_comp, 5)

## Tokens consisting of 5 documents and 8 docvars.
## Jan_2018_FINAL.csv.1 :
##  [1] "Robinhood" "saved"     "8k"        "fees"      "riding"    "short"    
##  [7] "bus"       "kid"       "jelly"     "cause"     "couldnt"   "ride"     
## [ ... and 570 more ]
## 
## Jan_2018_FINAL.csv.2 :
##  [1] "Saved"        "commissions"  "build"        "SP500"        "ETF"         
##  [6] "hand"         "kinda"        "robinhood"    "portfolio"    "gives"       
## [11] "dividend"     "every_single"
## [ ... and 835 more ]
## 
## Jan_2018_FINAL.csv.3 :
##  [1] "added"    "every"    "paycheck" "little"   "years"    "buy"     
##  [7] "Toyota"   "Camry"    "Best"     "car"      "miles"    "per"     
## [ ... and 1,723 more ]
## 
## Jan_2018_FINAL.csv.4 :
## [1] "Best"   "Worst"  "amp"    "stocks"
## 
## Jan_2018_FINAL.csv.5 :
## [1] "cellphone" "choice"    "autist"    "making"    "trades"

4. Generate document feature matrix

posts_dfmat <- dfm(posts_toks_comp, tolower = TRUE) %>%
  dfm_trim(min_termfreq = 5, min_docfreq = 10)
posts_dfmat

## Document-feature matrix of: 245 documents, 1,067 features (91.39% sparse) and 8 docvars.
##                       features
## docs                   robinhood fees short kid cause ride planning rope anyone
##   Jan_2018_FINAL.csv.1         1    1     6   1     2    1        1    2      2
##   Jan_2018_FINAL.csv.2         4    2     1   1     0    1        0    0      1
##   Jan_2018_FINAL.csv.3        14    0     1   1     1    0        0    1      3
##   Jan_2018_FINAL.csv.4         0    0     0   0     0    0        0    0      0
##   Jan_2018_FINAL.csv.5         0    0     0   0     0    0        0    0      0
##   Jan_2018_FINAL.csv.6         0    0     0   0     0    0        0    0      0
##                       features
## docs                   pretty
##   Jan_2018_FINAL.csv.1      1
##   Jan_2018_FINAL.csv.2      2
##   Jan_2018_FINAL.csv.3      1
##   Jan_2018_FINAL.csv.4      0
##   Jan_2018_FINAL.csv.5      0
##   Jan_2018_FINAL.csv.6      0
## [ reached max_ndoc ... 239 more documents, reached max_nfeat ... 1,057 more features ]

#See top 30 features
topfeatures(posts_dfmat, 30)

##      buy    money     time      amd     sell  options   market    stock 
##      460      456      325      295      285      271      264      263 
##   bought    calls     long      day    price     year    short    gains 
##      223      217      211      206      203      192      159      153 
##       mu      put   stocks    never  company  trading    today   pretty 
##      153      151      149      147      146      145      144      141 
## earnings   buying     sold     call     real     work 
##      138      138      138      136      133      131

5. Generate Relative Frequency analysis

posts_tstat_key <- textstat_keyness(
  posts_dfmat, target = posts_dfmat$score <= 500)
textplot_keyness(posts_tstat_key, n=10, min_count = 10)

5a. Identify number of topics

#Convert document-feature matrix to document-term representations used in topicmodels  
posts_dtmat = quanteda::convert(posts_dfmat, to="topicmodels")
posts_dtmat

## <<DocumentTermMatrix (documents: 238, terms: 1067)>>
## Non-/sparse entries: 22511/231435
## Sparsity           : 91%
## Maximal term length: 17
## Weighting          : term frequency (tf)

train_posts_dtmat <- corpus_subset(posts_corp)[1:125,] %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE,
         remove_symbols = TRUE, remove_url = TRUE) %>%
  dfm(tolower = TRUE) %>%
  dfm_remove(myStopWords) %>%
  dfm_trim(min_termfreq = 5, min_docfreq = 10) %>%
  quanteda::convert(to="topicmodels")

test_posts_dtmat <- corpus_subset(posts_corp)[126:245,] %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE,
         remove_symbols = TRUE, remove_url = TRUE) %>%
  dfm(tolower = TRUE) %>%
  dfm_remove(myStopWords) %>%
  dfm_trim(min_termfreq = 5, min_docfreq = 10) %>%
  quanteda::convert(to="topicmodels")

train_posts_lda5 <- LDA(train_posts_dtmat, k = 5, control = list(seed = 123))
perplexity(train_posts_lda5, test_posts_dtmat) # perplexity when k = 5

## [1] 250.8326

n_topics_vec = 2:5 # try different num of topics: 2, 3, 4, 5
# this analysis will take a while
perplexity_vec = map_dbl(n_topics_vec, function(kk) {
  message(kk)
  train_posts_ldaK <- LDA(train_posts_dtmat, k = kk, control = list(seed = 123))
  perp = perplexity(train_posts_ldaK, test_posts_dtmat)
})

## 2

## 3

## 4

## 5

lda_perplexity_result = tibble(
  n_topics = n_topics_vec, perplexity = perplexity_vec
)

plot(lda_perplexity_result, type="l")


lda_ldatuning_result <- FindTopicsNumber(
  posts_dtmat, topics = n_topics_vec,
  metrics = c("CaoJuan2009", "Arun2010", "Deveaud2014"),
  method = "VEM", control = list(seed = 123), mc.cores = 4L, verbose = TRUE
)

## fit models... done.
## calculate metrics:
##   CaoJuan2009... done.
##   Arun2010... done.
##   Deveaud2014... done.

FindTopicsNumber_plot(lda_ldatuning_result)

## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

6. Visualize topics as word cloud

K = 5 #number of topics
posts_lda5 <- LDA(posts_dtmat, k = K, control = list(seed = 123))
topicmodels::terms(posts_lda5, 10)

##       Topic 1   Topic 2 Topic 3    Topic 4   Topic 5  
##  [1,] "company" "long"  "amd"      "options" "buy"    
##  [2,] "buy"     "money" "intel"    "money"   "sell"   
##  [3,] "money"   "short" "buy"      "price"   "market" 
##  [4,] "day"     "time"  "earnings" "buy"     "day"    
##  [5,] "time"    "year"  "calls"    "put"     "money"  
##  [6,] "stock"   "years" "bought"   "call"    "options"
##  [7,] "market"  "never" "time"     "stock"   "calls"  
##  [8,] "news"    "point" "mu"       "option"  "gains"  
##  [9,] "shop"    "real"  "puts"     "time"    "bought" 
## [10,] "cash"    "true"  "stock"    "sell"    "stocks"

tmResult <- posterior(posts_lda5)

top5termsPerTopic <- topicmodels::terms(posts_lda5, 5)
topicNames <- apply(top5termsPerTopic, 2, paste, collapse=" ")

# visualize topics as word cloud
topicToViz <- 11 # change for your own topic of interest
topicToViz <- grep('intel', topicNames)[1] # Or select a topic by a term contained in its name
# select to 40 most probable terms from the topic by sorting the term-topic-probability vector in decreasing order
top40terms <- sort(tmResult$terms[topicToViz,], decreasing=TRUE)[1:40]
words <- names(top40terms)
# extract the probabilites of each of the 40 terms
probabilities <- sort(tmResult$terms[topicToViz,], decreasing=TRUE)[1:40]
# visualize the terms as wordcloud

mycolors <- brewer.pal(8, "Dark2")
wordcloud(words, probabilities, random.order = FALSE, color = mycolors)

7. Visualize top terms in topics

posts_lda5_betas <- broom::tidy(posts_lda5)

top_terms_in_topics <- posts_lda5_betas %>%
  group_by(topic) %>%
  top_n(5, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

top_terms_in_topics %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

8. Advanced Topic Modeling

8.1. Correlated Topic Model (CTM)

stm_posts_dfmat <- quanteda::convert(posts_dfmat, to = "stm")

## Warning in dfm2stm(x, docvars, omit_empty = TRUE): Dropped empty document(s):
## Jan_2018_FINAL.csv.20, Jan_2018_FINAL.csv.25, Jan_2018_FINAL.csv.51,
## Jan_2018_FINAL.csv.76, Jan_2018_FINAL.csv.82, Jan_2018_FINAL.csv.128,
## Jan_2018_FINAL.csv.173

out <- prepDocuments(
  stm_posts_dfmat$documents,
  stm_posts_dfmat$vocab,
  stm_posts_dfmat$meta
)
str(out, max.level = 1)

## List of 7
##  $ documents     :List of 238
##  $ vocab         : chr [1:1067] "10k" "1k" "20k" "2k" ...
##  $ meta          :'data.frame':  238 obs. of  8 variables:
##  $ words.removed : chr(0) 
##  $ docs.removed  : NULL
##  $ tokens.removed: int 0
##  $ wordcounts    : int [1:1067] 11 11 12 11 11 12 29 13 18 11 ...

K=5
posts_tmod_ctm <- stm(out$documents, out$vocab, K = K,
                   seed = 123, emtol = 1e-3, max.em.its = 150)

## Beginning Spectral Initialization 
##   Calculating the gram matrix...
##   Finding anchor words...
##      .....
##   Recovering initialization...
##      ..........
## Initialization complete.
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 1 (approx. per word bound = -6.665) 
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 2 (approx. per word bound = -6.653, relative change = 1.842e-03) 
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 3 (approx. per word bound = -6.646, relative change = 1.065e-03) 
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Model Converged

plot(posts_tmod_ctm)

8.1.1 Plots - CTM

#1. summary - plots topic proportions and names.
plot(posts_tmod_ctm, type = "summary", n = 5,
     labeltype = "frex", main = "Reddit Stock Topics", text.cex = 0.8)

#2. labels - plots the top words for a specific topic.
plot(posts_tmod_ctm, type="labels", topics=c(1,2), n=4)

#3. perspectives - compares two topics' words.
plot(posts_tmod_ctm, type="perspectives", topics=c(1,2))

#4. hist - a histogram of the expected topic proportions across documents for a topic.
plot(posts_tmod_ctm, type="hist")

8.1.2 Semantic Coherence & Exclusivity (Two ways of measuring topic “interpretability”)

topicQuality(posts_tmod_ctm, out$documents)

## [1] -30.01094 -28.36449 -32.61878 -35.74098 -30.38467
## [1] 9.483095 9.274230 9.143238 9.306454 9.199831

8.2 Structural Topic Model (STM)

names(docvars(posts_corp))

## [1] "date"                  "score"                 "up_ratio"             
## [4] "total_awards_received" "comments"              "year"                 
## [7] "score_analysis"        "date_analysis"

stm_posts_dfmat <- quanteda::convert(posts_dfmat, to = "stm")

## Warning in dfm2stm(x, docvars, omit_empty = TRUE): Dropped empty document(s):
## Jan_2018_FINAL.csv.20, Jan_2018_FINAL.csv.25, Jan_2018_FINAL.csv.51,
## Jan_2018_FINAL.csv.76, Jan_2018_FINAL.csv.82, Jan_2018_FINAL.csv.128,
## Jan_2018_FINAL.csv.173

out <- prepDocuments(
  stm_posts_dfmat$documents, stm_posts_dfmat$vocab, stm_posts_dfmat$meta)

8.2.1 Topical prevalence and topical content.

#  In STM, metadata can be entered in the topic model in two ways: topical prevalence and topical content.
posts_tmob_stm <- stm(
  out$documents, out$vocab, K=K,
  prevalence=~date_analysis+score_analysis+s(comments), #for topical prevalence; s() = b-spline
  data=out$meta, #for topical content
  init.type="Spectral", seed=123)

## Beginning Spectral Initialization 
##   Calculating the gram matrix...
##   Finding anchor words...
##      .....
##   Recovering initialization...
##      ..........
## Initialization complete.
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 1 (approx. per word bound = -6.665) 
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 2 (approx. per word bound = -6.652, relative change = 1.879e-03) 
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 3 (approx. per word bound = -6.645, relative change = 1.067e-03) 
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 4 (approx. per word bound = -6.641, relative change = 6.509e-04) 
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 5 (approx. per word bound = -6.638, relative change = 4.336e-04) 
## Topic 1: sell, options, trading, earnings, call 
##  Topic 2: money, buy, time, stock, calls 
##  Topic 3: day, short, stocks, company, every 
##  Topic 4: market, price, year, gains, put 
##  Topic 5: amd, bought, mu, pretty, real 
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 6 (approx. per word bound = -6.636, relative change = 3.021e-04) 
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 7 (approx. per word bound = -6.635, relative change = 2.137e-04) 
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 8 (approx. per word bound = -6.634, relative change = 1.521e-04) 
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 9 (approx. per word bound = -6.633, relative change = 1.091e-04) 
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 10 (approx. per word bound = -6.632, relative change = 7.918e-05) 
## Topic 1: sell, options, trading, earnings, call 
##  Topic 2: money, buy, time, stock, calls 
##  Topic 3: day, short, stocks, company, every 
##  Topic 4: market, price, year, gains, put 
##  Topic 5: amd, bought, mu, pretty, real 
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 11 (approx. per word bound = -6.632, relative change = 5.832e-05) 
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 12 (approx. per word bound = -6.632, relative change = 4.389e-05) 
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 13 (approx. per word bound = -6.632, relative change = 3.351e-05) 
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 14 (approx. per word bound = -6.631, relative change = 2.619e-05) 
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 15 (approx. per word bound = -6.631, relative change = 2.085e-05) 
## Topic 1: sell, options, trading, earnings, call 
##  Topic 2: money, buy, time, stock, calls 
##  Topic 3: day, short, stocks, company, every 
##  Topic 4: market, price, year, gains, put 
##  Topic 5: amd, bought, mu, pretty, real 
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 16 (approx. per word bound = -6.631, relative change = 1.692e-05) 
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 17 (approx. per word bound = -6.631, relative change = 1.407e-05) 
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 18 (approx. per word bound = -6.631, relative change = 1.191e-05) 
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 19 (approx. per word bound = -6.631, relative change = 1.025e-05) 
## .......................................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Model Converged

8.2.2 Explore STM results using the LDAvis package.

toLDAvis(mod=posts_tmob_stm, docs=out$documents)

## Loading required namespace: servr

8.2.3 Summary plot

plot(posts_tmob_stm, type="summary", n=5)

plot(posts_tmob_stm, type="perspectives", topics=c(1,2))

8.2.4 Visualize top words in each topic

posts_tmob_stm_beta <- tidytext::tidy(posts_tmob_stm)
posts_tmob_stm_beta %>% group_by(topic) %>% top_n(10, beta) %>% ungroup() %>%
  mutate(topic = paste0("Topic ", topic),
         term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = as.factor(topic))) +
  geom_col(alpha = 0.8, show.legend = FALSE) +
  facet_wrap(~topic, scales="free_y") + coord_flip() + scale_x_reordered()

8.2.5 Comparison b/w CTM and STM

topicQuality(posts_tmod_ctm, out$documents)

## [1] -30.01094 -28.36449 -32.61878 -35.74098 -30.38467
## [1] 9.483095 9.274230 9.143238 9.306454 9.199831

topicQuality(posts_tmob_stm, out$documents)

## [1] -30.01094 -28.36449 -32.61878 -35.74098 -30.38467
## [1] 9.499497 9.269133 9.133405 9.330438 9.211293

8.3 Keyword Assisted Topic Model

keyATM_docs <- keyATM_read(texts = posts_dfmat)

## Using quanteda dfm.

## Warning in get_doc_index(W_raw, check = TRUE): Number of documents with 0 length: 7
## This may cause invalid covariates or time index.
## Please review the preprocessing steps.
## Document index to check: 20, 25, 51, 76, 82, 128, 173

summary(keyATM_docs)

## keyATM_docs object of: 245 documents.
## Length of documents:
##   Avg: 160.812
##   Min: 0
##   Max: 1519
##    SD: 284.391
## Number of unique words: 1521

8.3.1 Keywords

posts_key_list = list(
  top_stocks = c("amd", "intel", "apple","microsoft","amazon","amzn"),
  options = c("gain", "loss", "options", "profit","short","calls","put","call","puts"),
  stock_market = c("stock","short","share","price","people","market","money","know"),
  buy_sell = c("buy","dip","sell","order","bought","buying"),
  hold_position = c("hold","line"),
  trading_apps = c("robinhood","account","app","trade","webull","trading","fidelity","platform")
)
posts_key_viz <- visualize_keywords(docs = keyATM_docs, keywords = posts_key_list)

## Warning in check_keywords(unique(unlisted), keywords, prune): Keywords will be
## pruned because they do not appear in documents: people, know, webull, fidelity

posts_key_viz

8.3.2 KeyATM Base

# will take a while
posts_tmod_keyatm_base <- keyATM(
  docs = keyATM_docs, # text input
  no_keyword_topics = 3,
  keywords = posts_key_list,
  model = "base",
  options = list(seed = 123))

## Warning in keyATM_fit(docs, model, no_keyword_topics, keywords,
## model_settings, : Some documents have 0 length. Please review the preprocessing
## steps.

## Initializing the model...

## Warning in check_keywords(info$wd_names, keywords, options$prune): Keywords
## will be pruned because they do not appear in documents: people, know, webull,
## fidelity

## Fitting the model. 1500 iterations...

## Creating an output object. It may take time...

top_words(posts_tmod_keyatm_base, 5)

##   1_top_stocks   2_options 3_stock_market 4_buy_sell 5_hold_position
## 1      amd [✓] options [✓]      money [✓]    buy [✓]            keep
## 2    intel [✓]   calls [✓]           time   sell [✓]             pay
## 3     earnings    call [✓]      price [✓] market [3]        hold [✓]
## 4    apple [✓]    puts [✓]      stock [✓] bought [✓]           might
## 5     amzn [✓]        week           year      gains           month
##   6_trading_apps  Other_1 Other_2   Other_3
## 1    trading [✓]     high   never      long
## 2           real business company      mean
## 3  robinhood [✓]      job  pretty   tendies
## 4    account [✓]     tell   years     gonna
## 5        thought   reddit     amp short [2]

8.3.3 KeyATM Covariates

vars <- docvars(posts_corp)
head(vars)

##         date score up_ratio total_awards_received comments year score_analysis
## 1 2018-01-01  1780     0.95                     0      118 2018           high
## 2 2018-01-01   394     0.96                     0       97 2018         medium
## 3 2018-01-01  2016     0.93                     0      238 2018           high
## 4 2018-01-02   202     0.97                     0       52 2018            low
## 5 2018-01-02   106     0.93                     0        4 2018            low
## 6 2018-01-02   134     0.94                     0       47 2018            low
##   date_analysis
## 1       jan_one
## 2       jan_one
## 3       jan_one
## 4       jan_one
## 5       jan_one
## 6       jan_one

vars_selected <- vars %>%
  as_tibble() %>%
  mutate(period = case_when(comments_analysis = (comments ==0 ~ "nodiscussion"),
                            (comments >0 & comments <=50 ~ "discussion"),
                            (comments>50 ~ "topdiscussion")
                            ))
posts_tmod_keyatm_cov <- keyATM(
  docs              = keyATM_docs,
  no_keyword_topics = 3,
  keywords          = posts_key_list,
  model             = "covariates",
  model_settings    = list(covariates_data    = vars_selected,
                           covariates_formula = ~date_analysis+score_analysis+period),
  options           = list(seed = 123))

## Warning in keyATM_fit(docs, model, no_keyword_topics, keywords,
## model_settings, : Some documents have 0 length. Please review the preprocessing
## steps.

## Convert covariates data using `model_settings$covariates_formula`.

## Initializing the model...

## Warning in check_keywords(info$wd_names, keywords, options$prune): Keywords
## will be pruned because they do not appear in documents: people, know, webull,
## fidelity

## Fitting the model. 1500 iterations...

## Creating an output object. It may take time...

8.3.4 Comparison b/w KeyATM Base and KeyATM Covariates

top_words(posts_tmod_keyatm_base, 5)

##   1_top_stocks   2_options 3_stock_market 4_buy_sell 5_hold_position
## 1      amd [✓] options [✓]      money [✓]    buy [✓]            keep
## 2    intel [✓]   calls [✓]           time   sell [✓]             pay
## 3     earnings    call [✓]      price [✓] market [3]        hold [✓]
## 4    apple [✓]    puts [✓]      stock [✓] bought [✓]           might
## 5     amzn [✓]        week           year      gains           month
##   6_trading_apps  Other_1 Other_2   Other_3
## 1    trading [✓]     high   never      long
## 2           real business company      mean
## 3  robinhood [✓]      job  pretty   tendies
## 4    account [✓]     tell   years     gonna
## 5        thought   reddit     amp short [2]

top_words(posts_tmod_keyatm_cov, 5)

##   1_top_stocks   2_options 3_stock_market 4_buy_sell 5_hold_position
## 1      amd [✓] options [✓]      money [✓]    buy [✓]            work
## 2    intel [✓]   calls [✓]     market [✓]   sell [✓]            long
## 3     earnings     put [✓]      stock [✓]      gains            keep
## 4   amazon [✓]    call [✓]      price [✓] bought [✓]        hold [✓]
## 5    apple [✓]    puts [✓]           time      today       companies
##   6_trading_apps   Other_1 Other_2 Other_3
## 1    trading [✓]       day   point    cash
## 2         stocks literally  pretty  please
## 3          years      best     big    stop
## 4  robinhood [✓]    reddit    days instead
## 5            pay      game     amp  anyone

Comparison with Secondary data

#Reddit data analysis/comparison with Secondary data
postsanalysis_df <- readtext("Jan_2018_FINAL.csv", text_field = "final_document")
postsanalysis_df <- postsanalysis_df %>% select(doc_id, text) %>% filter(text %like% 'AMD')
cnbcanalysis_df <- readtext("cnbc_headlines.csv", text_field = "text")
cnbcanalysis_df <- cnbcanalysis_df %>% select(doc_id, text) %>% filter(text %like% 'AMD')
#Corpus
postsanalysis_corp <- corpus(postsanalysis_df)
cnbcanalysis_corp <- corpus(cnbcanalysis_df)
postsanalysis_toks <- tokens(postsanalysis_corp, remove_punct = TRUE, remove_numbers = TRUE,
                     remove_symbols = TRUE, remove_url = TRUE, split_hyphens = FALSE) %>%
  tokens_remove(pattern = c(stopwords("en"), myStopWords)) %>%
  tokens_keep(min_nchar = 2)
#Compound tokens
toks_nostop_col <- textstat_collocations(postsanalysis_toks, min_count = 10)
postsanalysis_toks_comp <- tokens_compound(
  postsanalysis_toks, pattern = toks_nostop_col[toks_nostop_col$z > 10])
cnbcanalysis_toks <- tokens(cnbcanalysis_corp, remove_punct = TRUE, remove_numbers = TRUE,
                    remove_symbols = TRUE, remove_url = TRUE, split_hyphens = FALSE) %>%
  tokens_remove(pattern = c(stopwords("en"), myStopWords)) %>%
  tokens_keep(min_nchar = 2)
#document feature matrix
postsanalysis_dfmat <- dfm(postsanalysis_toks_comp, tolower = TRUE)
cnbcanalysis_dfmat <- dfm(cnbcanalysis_toks, tolower = TRUE)
#document term matrix
postsanalysis_dtmat = quanteda::convert(postsanalysis_dfmat, to="topicmodels")
cnbcanalysis_dtmat = quanteda::convert(cnbcanalysis_dfmat, to="topicmodels")

##Comparison b/w Reddit data and CNBC News Headline
#Word Cloud for Reddit data
K = 2 #number of topics
postsanalysis_lda5_reddit <- LDA(postsanalysis_dtmat, k = K, control = list(seed = 123))
topicmodels::terms(postsanalysis_lda5_reddit, 10)

##       Topic 1   Topic 2  
##  [1,] "amd"     "buy"    
##  [2,] "money"   "amd"    
##  [3,] "buy"     "money"  
##  [4,] "time"    "stock"  
##  [5,] "market"  "market" 
##  [6,] "intel"   "time"   
##  [7,] "options" "calls"  
##  [8,] "stock"   "sell"   
##  [9,] "calls"   "price"  
## [10,] "sell"    "options"

tmResult_reddit <- posterior(postsanalysis_lda5_reddit)
top5termsPerTopic_reddit <- topicmodels::terms(postsanalysis_lda5_reddit, 5)
topicNames_reddit <- apply(top5termsPerTopic_reddit, 2, paste, collapse=" ")
# visualize topics as word cloud
topicToViz_reddit <- 11 # change for your own topic of interest
topicToViz_reddit <- grep('amd', topicNames_reddit)[1] # Or select a topic by a term contained in its name
# select to 40 most probable terms from the topic by sorting the term-topic-probability vector in decreasing order
top40terms_reddit <- sort(tmResult_reddit$terms[topicToViz_reddit,], decreasing=TRUE)[1:40]
words_reddit <- names(top40terms_reddit)
# extract the probabilites of each of the 40 terms
probabilities_reddit <- sort(tmResult_reddit$terms[topicToViz_reddit,], decreasing=TRUE)[1:40]
# visualize the terms as wordcloud
mycolors_reddit <- brewer.pal(8, "Dark2")

#Word Cloud for CNBC News Headlines(Secondary Data)
K = 2 #number of topics
cnbcanalysis_lda5_cnbc <- LDA(cnbcanalysis_dtmat, k = K, control = list(seed = 123))
topicmodels::terms(cnbcanalysis_lda5_cnbc, 10)

##       Topic 1     Topic 2   
##  [1,] "amd"       "amd"     
##  [2,] "lightning" "jim"     
##  [3,] "round"     "says"    
##  [4,] "amd's"     "nvidia"  
##  [5,] "intel"     "intel"   
##  [6,] "earnings"  "charts"  
##  [7,] "buy"       "qualcomm"
##  [8,] "alphabet"  "lot"     
##  [9,] "ceo"       "room"    
## [10,] "products"  "run"

tmResult_cnbc <- posterior(cnbcanalysis_lda5_cnbc)
top5termsPerTopic_cnbc <- topicmodels::terms(cnbcanalysis_lda5_cnbc, 5)
topicNames_cnbc <- apply(top5termsPerTopic_cnbc, 2, paste, collapse=" ")
# visualize topics as word cloud
topicToViz_cnbc <- 11 # change for your own topic of interest
topicToViz_cnbc <- grep('amd', topicNames_cnbc)[1] # Or select a topic by a term contained in its name
# select to 40 most probable terms from the topic by sorting the term-topic-probability vector in decreasing order
top40terms_cnbc <- sort(tmResult_cnbc$terms[topicToViz_cnbc,], decreasing=TRUE)[1:40]
words_cnbc <- names(top40terms_cnbc)
# extract the probabilites of each of the 40 terms
probabilities_cnbc <- sort(tmResult_cnbc$terms[topicToViz_cnbc,], decreasing=TRUE)[1:40]
# visualize the terms as wordcloud
mycolors_cnbc <- brewer.pal(8, "Dark2")

# Comparison b/w Reddit data and CNBC News Headlines using WordCloud
wordcloud(words_reddit, probabilities_reddit, random.order = FALSE, color = mycolors_reddit)

wordcloud(words_cnbc, probabilities_cnbc, random.order = FALSE, color = mycolors_cnbc)

Group5_FinalProject_v2

Narender Reddy Konuganti

10/1/2022

1. Create Dataframe from CSV file

2. Create Corpus with document level variables

3. Generate Tokens

4. Generate document feature matrix

5. Generate Relative Frequency analysis

5a. Identify number of topics

6. Visualize topics as word cloud

7. Visualize top terms in topics

8. Advanced Topic Modeling

8.1. Correlated Topic Model (CTM)

8.1.1 Plots - CTM

8.1.2 Semantic Coherence & Exclusivity (Two ways of measuring topic “interpretability”)

8.2 Structural Topic Model (STM)

8.2.1 Topical prevalence and topical content.

8.2.2 Explore STM results using the LDAvis package.

8.2.3 Summary plot

8.2.4 Visualize top words in each topic

8.2.5 Comparison b/w CTM and STM

8.3 Keyword Assisted Topic Model

8.3.1 Keywords

8.3.2 KeyATM Base

8.3.3 KeyATM Covariates

8.3.4 Comparison b/w KeyATM Base and KeyATM Covariates

Comparison with Secondary data