1 Recap the basics of regression modelling in R

2 Read relevant literature

3 Learn (textual analysis) modelling by example

A. Download and import the dataset

  • All files are available at this shared folder

  • For this tutorial, the sub-folder S04 is relevant; download the txt files to your local computer. Please note that the files contain the Management Discussion and Analysis (MD&A) sections reported by US-listed firms. The files were extracted from the EDGAR database using the R package edgar

  • Set a working directory using the command setwd("C:/…/…/…/…"), pointing to the folder where the downloaded files are stored and where outputs will be exported (e.g., setwd("C:/Users/MyPC/Documents/R/Paper01"))

  • Read the txt files from the working directory

library(quanteda)
# Collect the full paths of the downloaded MD&A text files.
# `pattern` is a regular expression, so the dot is escaped and the match is
# anchored to the end of the file name. The unescaped ".txt" would also match
# names that merely contain "txt" after any character (e.g. "old_txt.csv").
docs <- list.files(
  path = "C:/Users/MyPC/Documents/R/Paper01",
  pattern = "\\.txt$",
  all.files = TRUE,
  full.names = TRUE
)
  • Convert the files into a corpus
library(readtext)
# Read every file listed in `docs`, then wrap the texts in a quanteda corpus.
mdna_texts <- readtext(docs)
corpus2019 <- corpus(mdna_texts)

B. Analyse the data

  1. Inspect the corpus
# Print one row per document with its number of types, tokens and sentences.
summary(corpus2019)
## Corpus consisting of 30 documents, showing 30 documents:
## 
##              Text Types Tokens Sentences
##           ABB.txt  2191  11734       336
##      ALPHABET.txt  1730  10806       304
##        AMAZON.txt  1547   9359       267
##        ANTHEM.txt  2673  20146       591
##         APPLE.txt  1273   5886       155
##        BOEING.txt  2350  15144       497
##   CATERPILLAR.txt  3338  26786       850
##         CISCO.txt  2201  14270       387
##      COCACOLA.txt  3178  31491       986
##       COMCAST.txt  2318  16984       385
##        CONOCO.txt  3338  26223       879
##        COSTCO.txt  1291   5975       193
##        DISNEY.txt  2316  18861       418
##         EXXON.txt  3185  16939       576
##      FACEBOOK.txt  1745  11375       325
##     HOMEDEPOT.txt  1182   5877       206
##           IBM.txt   896   5902       160
##       JOHNSON.txt  2398  13456       427
##      LOCKHEED.txt  2895  26751       843
##         LOWES.txt  1954  12748       378
##         MERCK.txt  3142  26948       834
##          NIKE.txt  2125  14331       433
##        ORACLE.txt  2086  15634       347
##       PEPSICO.txt  4195  44393      1111
##       PROCTER.txt  2482  18564       652
##         TESLA.txt  4473  48860      1368
##       TMOBILE.txt  4008  36932       919
##  UNITEDHEALTH.txt  1788   7936       228
##           UPS.txt  2831  24017       704
##       WALMART.txt  1837  11890       322

Note: If you would like to see the text of a report, e.g. for AMAZON, use the code as.character(corpus2019)["AMAZON.txt"] or as.character(corpus2019)[3].

  1. Reorder the corpus based on the number of words (tokens)
# summary() on a corpus reports only the first 100 documents by default, so
# ask for all of them explicitly; otherwise larger corpora are silently
# truncated in the table and in the plot built from it.
tokeninfo2019 <- summary(corpus2019, n = ndoc(corpus2019))
# Make `Text` a factor whose levels run from the longest to the shortest
# report (negated token count), which fixes the x-axis order when plotting.
tokeninfo2019$Text <- with(tokeninfo2019, reorder(Text, -Tokens))
  1. Plot the length of the documents (based on the number of tokens)
library(ggplot2)
# Dot plot of report length (tokens) per firm; the horizontal line marks the
# mean length across all reports.
ggplot(tokeninfo2019, aes(x = Text, y = Tokens)) +
  geom_point() +
  geom_hline(
    yintercept = mean(tokeninfo2019$Tokens),
    colour = "#b10061"
  ) +
  labs(x = "FIRM", y = "MD&A words length") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

  1. Find all occurrences of the word “merger”, including its context
# Tokenise the corpus, then list every token matching the glob "*merger*"
# together with five tokens of context on each side (the kwic defaults,
# spelled out here for readability).
tokens2019 <- tokens(corpus2019)
kwic(
  tokens2019,
  pattern = "*merger*",
  valuetype = "glob",
  window = 5
)
## Keyword-in-context with 57 matches.                                                                              
##     [ANTHEM.txt, 3312]                  the Agreement and Plan of |   Merger  
##     [ANTHEM.txt, 3316]                        of Merger, or Cigna |   Merger  
##     [ANTHEM.txt, 3357]                     the terms of the Cigna |   Merger  
##     [ANTHEM.txt, 3371]            litigation related to the Cigna |   Merger  
##     [ANTHEM.txt, 3389] Regulatory Proceedings - Cigna Corporation |   Merger  
##     [ANTHEM.txt, 4385]                   as a result of strategic |  mergers  
##     [ANTHEM.txt, 8876]            related to the terminated Cigna |   Merger  
##     [ANTHEM.txt, 9060]            related to the terminated Cigna |   Merger  
##     [ANTHEM.txt, 9298]            related to the terminated Cigna |   Merger  
##  [COCACOLA.txt, 11692]            , capital improvement programs, |   merger  
##        [IBM.txt, 1432]                    . Agreement and Plan of |   Merger  
##    [PEPSICO.txt, 2923]               otherwise noted, reflect all |  mergers  
##    [PEPSICO.txt, 3929]          impairment charges 0.18 0.16 0.09 |   Merger  
##    [PEPSICO.txt, 5028]            gains or losses associated with |  mergers  
##    [PEPSICO.txt, 5387]               and 2014 Productivity Plans, |   merger  
##    [PEPSICO.txt, 6384]                            272 36 56 1 251 |   Merger  
##    [PEPSICO.txt, 7458]     our consolidated financial statements. |   Merger  
##    [PEPSICO.txt, 7467]                       In 2018, we incurred |   merger  
##   [PEPSICO.txt, 10764]                               c ) 63 53 60 |   Merger  
##   [PEPSICO.txt, 11056]                  Table of Contents and the |   merger  
##   [PEPSICO.txt, 14681]             impairment charges 0.4 0.3 0.1 |   Merger  
##   [PEPSICO.txt, 39852]               Stock In connection with our |   merger  
##   [PEPSICO.txt, 41159]      accounting for business combinations, |   merger  
##   [PEPSICO.txt, 41192]                       In 2018, we incurred |   merger  
##   [PEPSICO.txt, 43641]         Acquisitions and divestitures: all |  mergers  
##     [TMOBILE.txt, 136]                         as a result of the |   Merger  
##     [TMOBILE.txt, 168]              and broadband industries. The |   Merger  
##    [TMOBILE.txt, 4007]                  expenses. Joint ventures, |  mergers  
##    [TMOBILE.txt, 7347]                   of spectrum; engaging in |  mergers  
##    [TMOBILE.txt, 8504]                acquisition, investment, or |   merger  
##    [TMOBILE.txt, 8529]                      of, investments in or |  mergers  
##    [TMOBILE.txt, 8836]                acquisition, investment, or |   merger  
##   [TMOBILE.txt, 12022]                   incorporation, a sale or |   merger  
##   [TMOBILE.txt, 12279]          conclusion of the proposed Sprint |   Merger  
##   [TMOBILE.txt, 13014]          consolidation, or other strategic |  mergers  
##   [TMOBILE.txt, 13043]          activities, including the pending |   Merger  
##   [TMOBILE.txt, 13056]           likelihood the conditions to the |   Merger  
##   [TMOBILE.txt, 13066]            the anticipated benefits of the |   Merger  
##   [TMOBILE.txt, 13381]            stock repurchase program if the |   Merger  
##   [TMOBILE.txt, 14110]                 after giving effect to the |   Merger  
##   [TMOBILE.txt, 14306]                  . Failure to complete the |   Merger  
##   [TMOBILE.txt, 14332]                      of operations. If the |   Merger  
##   [TMOBILE.txt, 14373]               a market assumption that the |   Merger  
##   [TMOBILE.txt, 14455]                    is expected that if the |   Merger  
##   [TMOBILE.txt, 14497]                          . Further, if the |   Merger  
##   [TMOBILE.txt, 16064]                   the full benefits of the |   Merger  
##   [TMOBILE.txt, 16147]          projected accretive effect of the |   Merger  
##   [TMOBILE.txt, 16160]             our common stock following the |   Merger  
##   [TMOBILE.txt, 16275]                 expect to conduct, certain | pre-Merger
##   [TMOBILE.txt, 16306]                 After giving effect to the | pre-Merger
##   [TMOBILE.txt, 17120]                   of spectrum; engaging in |  mergers  
##   [TMOBILE.txt, 17454]          long-term financings prior to the |   merger  
##   [TMOBILE.txt, 19987]                         as a result of the |   Merger  
##   [TMOBILE.txt, 20021]      industries. Immediately following the |   Merger  
##   [TMOBILE.txt, 20096]                              31, 2018. The |   Merger  
##   [TMOBILE.txt, 31135]                 to the consummation of the |   Merger  
##   [TMOBILE.txt, 31188]               Comprehensive Income. If the |   Merger  
##                                               
##  | , or Cigna Merger Agreement                
##  | Agreement, between us and                  
##  | Agreement. For additional information      
##  | Agreement, see Note 13                     
##  | Litigation, of the Notes                   
##  | and acquisitions, as well                  
##  | Agreement. Our selling,                    
##  | Agreement. For the year                    
##  | Agreement, the excess tax                  
##  | and acquisition plans, and                 
##  | , dated as of October                      
##  | and acquisitions activity, including       
##  | and integration charges 0.05 Net           
##  | , acquisitions, divestitures and           
##  | and integration charges associated with    
##  | and integration charges ( 75               
##  | and Integration Charges In 2018            
##  | and integration charges of $               
##  | and integration charges ( c                
##  | and integration charges related to         
##  | and integration charges 0.1 Net            
##  | with The Quaker Oats Company               
##  | and integration costs are not              
##  | and integration charges of $               
##  | and acquisitions activity, including       
##  | , is expected to be                        
##  | is subject to regulatory approvals         
##  | , acquisitions and strategic alliances     
##  | , acquisitions, business combinations      
##  | may subject us to significant              
##  | with businesses, technologies,             
##  | may have a material adverse                
##  | of our Company and other                   
##  | . The current royalty rate                 
##  | or acquisition activities involving us     
##  | and views of market participants           
##  | will be satisfied and the                  
##  | will be realized; disruptions              
##  | fails to close. Our                        
##  | ) from at least two                        
##  | could negatively impact us and             
##  | is not completed for any                   
##  | will be completed. In                      
##  | is not completed, we                       
##  | is not completed, it                       
##  | may not be realized,                       
##  | , and negatively impact the                
##  | . As a result,                             
##  | financing transactions, which will         
##  | financing transactions and the Transactions
##  | , acquisitions, business combinations      
##  | closing. However, there                    
##  | , is expected to be                        
##  | , it is anticipated that                   
##  | is subject to regulatory approvals         
##  | . 47 Table of Contents                     
##  | is consummated, we will
  1. Find 50 most frequent words
# Build the document-feature matrix: tokenise without punctuation, symbols
# and numbers, drop English stopwords, then count features per document.
dfm2019 <- corpus2019 |>
  tokens(
    remove_punct = TRUE,
    remove_symbols = TRUE,
    remove_numbers = TRUE
  ) |>
  tokens_remove(pattern = stopwords("en")) |>
  dfm()
# Show the 50 features with the highest total frequency.
topfeatures(dfm2019, n = 50)
##          net      million          tax        sales    operating    financial 
##         2329         2290         2036         2015         1905         1883 
##         cash          due      billion        costs       income    primarily 
##         1881         1735         1705         1650         1599         1475 
##     increase      related    increased      revenue     december      company 
##         1345         1313         1277         1273         1269         1256 
##     business          may consolidated         year   operations       fiscal 
##         1216         1178         1158         1138         1097         1086 
##       assets      certain         rate        total   statements        value 
##         1079         1052         1052         1050         1008          986 
##    including       higher      expense       impact      results         cost 
##          984          977          969          968          958          949 
##          u.s     products     compared     interest     expenses     revenues 
##          941          935          932          898          886          882 
##       growth      changes         debt       offset       future        table 
##          879          832          826          802          779          749 
##     services   activities 
##          743          723
  1. Plot 50 most frequent words
library(quanteda.textstats)
# Frequency table of the 50 most frequent features.
features2019 <- textstat_frequency(dfm2019, n = 50)
# Order the factor levels by descending frequency so the x-axis is sorted.
features2019$feature <- reorder(
  features2019$feature,
  -features2019$frequency
)
ggplot(features2019, aes(x = feature, y = frequency)) +
  geom_point(colour = "#b10061") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(x = "Word", y = "Frequency")

  1. Find 50 most frequent bigrams
library(tidytext)
library(dplyr)
# Build bigrams from the token stream of each report.
# The previous version ran unnest_tokens() on the individual features of
# dfm2019. A dfm is a bag of words, so this only split hyphenated terms
# ("long-term" -> "long term") and returned NA for every single-word feature;
# it never paired adjacent words.
# padding = TRUE leaves a gap where punctuation, numbers or stopwords were
# removed, so only words that were truly adjacent in the text form a bigram.
# NOTE: the sample output printed after this block in the tutorial was
# produced by the earlier approach and will differ from what this code prints.
Bigram2019 <- corpus2019 |>
  tokens(
    remove_punct = TRUE,
    remove_symbols = TRUE,
    remove_numbers = TRUE,
    padding = TRUE
  ) |>
  tokens_remove(stopwords("en"), padding = TRUE) |>
  tokens_ngrams(n = 2, concatenator = " ") |>
  dfm() |>
  tidy() |>
  rename(bigram = term) |>
  filter(nzchar(bigram))
# Sum the per-document counts and show the 50 most frequent bigrams.
Bigram2019 |>
  count(bigram, wt = count, sort = TRUE) |>
  print(n = 50)
## # A tibble: 1,331 × 2
##    bigram               n
##    <chr>            <int>
##  1 <NA>             45404
##  2 10 k                30
##  3 long term           30
##  4 short term          24
##  5 off balance         23
##  6 third party         23
##  7 2019 02             18
##  8 non gaap            18
##  9 one time            18
## 10 non cash            16
## 11 indefinite lived    15
## 12 weighted average    15
## 13 forward looking     14
## 14 over year           14
## 15 year end            14
## 16 year over           14
## 17 long lived          13
## 18 stock based         13
## 19 multi year          12
## 20 percentage point    12
## 21 pre tax             12
## 22 share based         12
## 23 straight line       12
## 24 five year           11
## 25 fixed rate          10
## 26 for sale            10
## 27 high quality        10
## 28 near term           10
## 29 non u.s             10
## 30 three year          10
## 31 available for        9
## 32 write down           9
## 33 364 day              8
## 34 after tax            8
## 35 to market            8
## 36 write downs          8
## 37 low taxed            7
## 38 non recurring        7
## 39 self insurance       7
## 40 year to              7
## 41 1 3                  6
## 42 2014 09              6
## 43 8 k                  6
## 44 full year            6
## 45 mark to              6
## 46 non operating        6
## 47 one year             6
## 48 self insured         6
## 49 4 5                  5
## 50 a 1                  5
## # ℹ 1,281 more rows
  1. Find 25 most frequent trigrams
# Build trigrams from the token stream of each report.
# The previous version ran unnest_tokens() on the individual features of
# dfm2019, which only split multi-part hyphenated terms and returned NA for
# everything else; the output column was also misnamed `bigram`.
# padding = TRUE leaves a gap where punctuation, numbers or stopwords were
# removed, so only words that were truly adjacent in the text form a trigram.
# NOTE: the sample output printed after this block in the tutorial was
# produced by the earlier approach and will differ from what this code prints.
Trigram2019 <- corpus2019 |>
  tokens(
    remove_punct = TRUE,
    remove_symbols = TRUE,
    remove_numbers = TRUE,
    padding = TRUE
  ) |>
  tokens_remove(stopwords("en"), padding = TRUE) |>
  tokens_ngrams(n = 3, concatenator = " ") |>
  dfm() |>
  tidy() |>
  rename(trigram = term) |>
  filter(nzchar(trigram))
# Sum the per-document counts and show the 25 most frequent trigrams.
Trigram2019 |>
  count(trigram, wt = count, sort = TRUE) |>
  print(n = 25)
## # A tibble: 185 × 2
##    bigram                   n
##    <chr>                <int>
##  1 <NA>                 47210
##  2 year over year          14
##  3 available for sale       9
##  4 mark to market           6
##  5 period to period         5
##  6 year to year             5
##  7 direct to consumer       4
##  8 other than temporary     4
##  9 2019 02 27               3
## 10 build to suit            3
## 11 end to end               3
## 12 over the counter         3
## 13 2019 01 31               2
## 14 2019 02 08               2
## 15 2019 02 19               2
## 16 2019 02 20               2
## 17 2019 02 21               2
## 18 2019 03 28               2
## 19 go to market             2
## 20 hart scott rodino        2
## 21 out of date              2
## 22 period over period       2
## 23 point of sale            2
## 24 right of use             2
## 25 year on year             2
## # ℹ 160 more rows
  1. Plot the dispersion of the terms “cash flow”, “income”, “profit” within the documents
library(quanteda.textplots)
# Tokenise the corpus once and reuse the result; the earlier version
# re-tokenised all reports six times (once per index() and kwic() call).
toks_xray <- tokens(corpus2019)
# Locate each search term; phrase() makes "cash flow" match as a two-token
# sequence rather than as two separate patterns.
idx1 <- index(toks_xray, phrase("cash flow"))
idx2 <- index(toks_xray, phrase("income"))
idx3 <- index(toks_xray, phrase("profit"))
# Lexical dispersion ("x-ray") plot: one panel row per document, one column
# per term.
textplot_xray(
  kwic(toks_xray, index = idx1),
  kwic(toks_xray, index = idx2),
  kwic(toks_xray, index = idx3),
  sort = TRUE
)

  1. Identify 10 most frequently occurring topics
library(stm)
# Document-feature matrix for topic modelling: no punctuation, numbers or
# English stopwords, and only features occurring at least 10 times overall.
quant2019 <- corpus2019 |>
  tokens(remove_punct = TRUE, remove_numbers = TRUE) |>
  tokens_remove(stopwords("en")) |>
  dfm() |>
  dfm_trim(min_termfreq = 10)
# Fix the RNG state, fit a 10-topic structural topic model and plot it.
set.seed(100)
topic_model <- stm(quant2019, K = 10, verbose = FALSE)
plot(topic_model)

  1. Visualise the connections among words
# Feature co-occurrence matrix: lower-cased tokens without punctuation and
# stopwords, counting co-occurrences within a 5-token window.
network2019 <- corpus2019 |>
  tokens(remove_punct = TRUE) |>
  tokens_tolower() |>
  tokens_remove(stopwords("english"), padding = FALSE) |>
  fcm(context = "window", window = 5, tri = FALSE)
# Keep only the 50 top features so the network stays readable.
topfeats2019 <- names(topfeatures(network2019, 50))
# Fix the RNG state before drawing the network.
set.seed(100)
textplot_network(
  fcm_select(network2019, topfeats2019),
  min_freq = 10,
  edge_color = "#b10061"
) +
  ggtitle("Words network")

  1. Calculate (cosine) similarity between each pair of the documents
# Document-feature matrix of stemmed content words.
# Stopwords are removed BEFORE stemming: the stopword list holds unstemmed
# forms, so stemming first turns words such as "because" or "during" into
# stems that no longer match the list and would stay in the matrix, inflating
# the similarity between all documents.
# NOTE: the similarity values printed later in the tutorial were produced
# with the earlier order (stem, then remove) and will differ slightly.
dfmat2019 <- corpus2019 |>
  tokens(remove_punct = TRUE, remove_symbols = TRUE, remove_numbers = TRUE) |>
  tokens_remove(stopwords("en")) |>
  tokens_wordstem(language = "en") |>
  dfm()
# Pairwise cosine similarity between documents.
tstat2019 <- textstat_simil(dfmat2019, margin = "documents", method = "cosine")
  1. Extract the similarity indices for the APPLE company and visualise them
# Similarities between APPLE and every other report.
# Use the exact document name with [[ ]]: `$"APPLE"` only worked through
# partial matching of "APPLE.txt" and would return NULL as soon as a second
# document name started with "APPLE".
as.list(tstat2019)[["APPLE.txt"]]
##      JOHNSON.txt        MERCK.txt        LOWES.txt      PROCTER.txt 
##        0.7610193        0.7014893        0.6778258        0.6666915 
##       AMAZON.txt     COCACOLA.txt       COSTCO.txt      PEPSICO.txt 
##        0.6639099        0.6537156        0.6523390        0.6503047 
##          ABB.txt  CATERPILLAR.txt      WALMART.txt        CISCO.txt 
##        0.6407577        0.6386842        0.6358225        0.6117830 
##     FACEBOOK.txt     LOCKHEED.txt    HOMEDEPOT.txt       ORACLE.txt 
##        0.5918962        0.5886400        0.5776497        0.5740082 
##      TMOBILE.txt       CONOCO.txt       ANTHEM.txt         NIKE.txt 
##        0.5687208        0.5652326        0.5606723        0.5432352 
##       DISNEY.txt      COMCAST.txt       BOEING.txt UNITEDHEALTH.txt 
##        0.5367939        0.5097931        0.5087365        0.5054977 
##          UPS.txt     ALPHABET.txt        TESLA.txt        EXXON.txt 
##        0.4889719        0.4720312        0.4612674        0.4224644 
##          IBM.txt 
##        0.2208358
# Dot chart of APPLE's similarity to every other report.
# Use the exact document name with [[ ]]: `$"APPLE"` only worked through
# partial matching of "APPLE.txt" and would return NULL as soon as a second
# document name started with "APPLE".
dotchart(
  as.list(tstat2019)[["APPLE.txt"]],
  cex = 0.6,
  pt.cex = 1.0,
  pch = 19,
  lcolor = "#b10061"
)

  1. Convert Document-feature matrix (quanteda package) to Document-term matrix (tm package)
library(tm)
# Convert the quanteda dfm into a tm DocumentTermMatrix, using tm's
# term-frequency weighting (weightTf) for the cell values.
dtm2019 <- as.DocumentTermMatrix(dfm2019, weighting = weightTf)
  1. Convert the document-term matrix into a tidy table (one row per document-term pair)
library(tidytext)
# Reshape the matrix into a long table with one row per document-term pair
# (columns: document, term, count).
td2019 <- tidy(dtm2019)
  1. Assess the sentiment of the reports, using a general lexicon, e.g. Bing
library(dplyr)
# Keep only the terms that appear in the Bing lexicon and attach their
# sentiment label (`term` in the tidy table matches `word` in the lexicon).
bing_lexicon <- get_sentiments("bing")
sentiments_Bing <- inner_join(td2019, bing_lexicon, by = c("term" = "word"))
print(sentiments_Bing)
## # A tibble: 3,842 × 4
##    document         term       count sentiment
##    <chr>            <chr>      <dbl> <chr>    
##  1 ABB.txt          dedicated      1 positive 
##  2 COCACOLA.txt     dedicated      1 positive 
##  3 EXXON.txt        dedicated      1 positive 
##  4 PROCTER.txt      dedicated      1 positive 
##  5 TMOBILE.txt      dedicated      1 positive 
##  6 UNITEDHEALTH.txt dedicated      2 positive 
##  7 ABB.txt          innovation     2 positive 
##  8 CISCO.txt        innovation     2 positive 
##  9 COCACOLA.txt     innovation     5 positive 
## 10 JOHNSON.txt      innovation     1 positive 
## # ℹ 3,832 more rows
  1. Visualise the most frequent positive and negative words for the Bing lexicon
# Total occurrences per sentiment/term across all reports; keep terms used at
# least 100 times and plot negative terms as negative bars.
sentiments_Bing %>%
  count(sentiment, term, wt = count) %>%
  filter(n >= 100) %>%
  mutate(
    n = ifelse(sentiment == "negative", -n, n),
    term = reorder(term, n)
  ) %>%
  ggplot(aes(x = term, y = n, fill = sentiment)) +
  geom_col() +
  scale_fill_manual(values = c("black", "#b10061")) +
  labs(x = "", y = "Contribution to sentiment") +
  coord_flip()

Note: Are terms such as “cloud” or “gross” really negative in the context of financial reporting?

  1. Assess the sentiment of the reports, using a financial lexicon, e.g., the one developed by Loughran & McDonald (2011)
library(dplyr)
# Keep only the terms that appear in the Loughran lexicon and attach their
# sentiment label (`term` in the tidy table matches `word` in the lexicon).
loughran_lexicon <- get_sentiments("loughran")
sentiments_Loughran <- inner_join(
  td2019,
  loughran_lexicon,
  by = c("term" = "word")
)
print(sentiments_Loughran)
## # A tibble: 5,264 × 4
##    document        term      count sentiment
##    <chr>           <chr>     <dbl> <chr>    
##  1 ABB.txt         accession     1 litigious
##  2 ALPHABET.txt    accession     1 litigious
##  3 AMAZON.txt      accession     1 litigious
##  4 ANTHEM.txt      accession     1 litigious
##  5 APPLE.txt       accession     1 litigious
##  6 BOEING.txt      accession     1 litigious
##  7 CATERPILLAR.txt accession     1 litigious
##  8 CISCO.txt       accession     1 litigious
##  9 COCACOLA.txt    accession     1 litigious
## 10 COMCAST.txt     accession     1 litigious
## # ℹ 5,254 more rows
  1. Visualise the most frequent positive and negative words for the Loughran&McDonald lexicon
# Total occurrences per sentiment/term across all reports; keep terms used at
# least 100 times and plot negative terms as negative bars.
# NOTE(review): no filter on sentiment is applied, so every category present
# in the joined data is drawn, not only positive and negative - confirm this
# is intended.
sentiments_Loughran %>%
  count(sentiment, term, wt = count) %>%
  filter(n >= 100) %>%
  mutate(
    n = ifelse(sentiment == "negative", -n, n),
    term = reorder(term, n)
  ) %>%
  ggplot(aes(x = term, y = n, fill = sentiment)) +
  geom_col() +
  labs(x = "", y = "Contribution to sentiment") +
  coord_flip()

  • or alternatively
# Ten most frequent terms per sentiment category, one facet per category.
# `wt = count` sums the word occurrences; without it count() only tallies the
# number of reports in which a term appears, which is inconsistent with the
# other sentiment plots and with the "Contribution to sentiment" axis label.
# slice_max() replaces the superseded top_n() and names the ranking column
# explicitly; like top_n(), it keeps ties.
sentiments_Loughran %>%
  count(term, sentiment, wt = count, sort = TRUE) %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  mutate(term = reorder(term, n)) %>%
  ggplot(aes(term, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  coord_flip()

  1. Extract sentiment data by firm for a follow-up analysis
# Bing: number of matched words per report and sentiment class
sentiments_Bing %>%
  count(sentiment, document, wt = count) %>%
  group_by(document, sentiment) %>%
  summarise(words = sum(n))
## # A tibble: 60 × 3
## # Groups:   document [30]
##    document     sentiment words
##    <chr>        <chr>     <dbl>
##  1 ABB.txt      negative    117
##  2 ABB.txt      positive    250
##  3 ALPHABET.txt negative     70
##  4 ALPHABET.txt positive    113
##  5 AMAZON.txt   negative     84
##  6 AMAZON.txt   positive    199
##  7 ANTHEM.txt   negative    219
##  8 ANTHEM.txt   positive    437
##  9 APPLE.txt    negative     87
## 10 APPLE.txt    positive     55
## # ℹ 50 more rows
# Loughran: number of matched words per report and sentiment class
sentiments_Loughran %>%
  count(sentiment, document, wt = count) %>%
  group_by(document, sentiment) %>%
  summarise(words = sum(n))
## # A tibble: 165 × 3
## # Groups:   document [30]
##    document     sentiment    words
##    <chr>        <chr>        <dbl>
##  1 ABB.txt      constraining    68
##  2 ABB.txt      litigious       43
##  3 ABB.txt      negative       100
##  4 ABB.txt      positive       119
##  5 ABB.txt      superfluous      8
##  6 ABB.txt      uncertainty    154
##  7 ALPHABET.txt constraining    32
##  8 ALPHABET.txt litigious       22
##  9 ALPHABET.txt negative        80
## 10 ALPHABET.txt positive        49
## # ℹ 155 more rows