1 Overview

In this post, text mining is conducted on the Fed Beige Book to compare the sentiment of reports across time. We will focus on the use of the sentometrics package to perform this analysis.

library(tidytext)
library(stringr)
library(dplyr)
library(tidyverse)
library(lubridate)
library(quanteda)
library(sentometrics)
library(data.table)
library(ggplot2)
library(gridExtra)
library(kableExtra)
library(pdftools)

2 Data Import

We will explore the Beige Book, which gathers anecdotal information on current economic conditions across the Federal Reserve Districts. As these reports collect anecdotes from various businesses across the country, they seem like a great candidate for fruitful text analysis.

path_root="."
path_data=file.path(path_root, "data")
path_pdfs=file.path(path_data, "pdfs")

load(file=file.path(path_data, "fed_raw_text.Rdata"))
beige_links %>% select(url, report=report.date) %>% format.dt.f(.)
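
For context, fed_raw_text.Rdata was prepared in an earlier step that is not reproduced here. Below is a minimal, illustrative sketch of how the line-level text could be pulled from the downloaded PDFs with pdftools; the build_raw_text() helper and the assumption that beige_links pairs each PDF with a report date are hypothetical, and the actual raw_text object carries additional columns (e.g. col_id).

# Hypothetical sketch: read each Beige Book PDF, split pages into lines,
# and keep report metadata alongside the text. The saved fed_raw_text.Rdata
# was produced by a similar (but not identical) step.
build_raw_text <- function(pdf_files, report_dates) {
  purrr::map2_dfr(pdf_files, report_dates, function(f, d) {
    pages <- pdftools::pdf_text(f)   # one character string per page
    lines <- strsplit(pages, "\n")   # split each page into its lines
    tibble::tibble(
      report = tools::file_path_sans_ext(basename(f)),
      rdate  = as.Date(d),
      page   = rep(seq_along(lines), times = lengths(lines)),
      ctext  = unlist(lines)
    ) %>%
      dplyr::group_by(page) %>%
      dplyr::mutate(line = dplyr::row_number()) %>%  # line number within each page
      dplyr::ungroup()
  })
}

# e.g. build_raw_text(list.files(path_pdfs, full.names = TRUE), beige_links$report.date)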

3 Data Hierarchy

Let’s split the data into two hierarchical levels:

  • Line Level - A more granular, line-level dataset used to understand the relationships and occurrences of words within each document.

  • Document Level - An overall document-level dataset with the text of each report aggregated into a single row.

Let’s take a look at what the two datasets look like:

#== Line Level ======
sentocorp.line <- raw_text %>% 
  select(date = rdate, texts = ctext) %>% 
  mutate(
    id = row_number(),
    fed = 1) %>% 
  select(id,date,texts,fed) %>% 
  arrange(id)
# sentocorp.line

#== Document Level ======
sentocorp.doc <- raw_text %>% 
  select(id = report, date = rdate, texts = ctext) %>% 
  group_by(id,date) %>% 
  summarise(texts = paste(texts, collapse = " ")) %>% # join lines with a space so words don't run together
  ungroup() %>% 
  mutate(fed = 1) %>% 
  select(id,date,texts,fed) %>% 
  arrange(id)
# sentocorp.doc

Once both datasets are ready, we use the sento_corpus() function to convert them into sento_corpus objects:

fedcorpus.line <- sento_corpus(corpusdf = sentocorp.line,do.clean = TRUE)
fedcorpus.doc <- sento_corpus(corpusdf = sentocorp.doc,do.clean = TRUE)

As shown below, our document-level dataset comprises 115 documents.

fedcorpus.doc
A sento_corpus consisting of 115 documents and 2 docvars.
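
Since a sento_corpus is built on top of a quanteda corpus, the usual quanteda accessors can be used as a quick sanity check (an optional, illustrative snippet):

quanteda::ndoc(fedcorpus.doc)          # number of report-level documents
quanteda::ndoc(fedcorpus.line)         # number of line-level documents
head(quanteda::docvars(fedcorpus.doc)) # the date and `fed` feature docvars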

3.1 Frequency of Word Occurrences

Now that we have prepared the two hierarchical levels of the corpus, we will first use the line-level dataset to extract the number of times certain expressions or words have been used in the documents. We will examine the number of occurrences of words related to the following topics:

  1. Election & Policies
  2. Pandemic & Market Fear
  3. Market Strength & Weakness

For each topic, we will examine the appearance of related words and attempt to derive some insights from the results.

3.1.1 Election & Policies

# == Regular expression =====

# 1.Election related words
# Election & Party related expressions
election.regex <- paste0("\\belection[s]?\\b|\\brepublican[s]?\\b|\\bdemocrat[s]?\\b")

# Add Features
fedcorpus_1 <- add_features(
  fedcorpus.line,
  keywords = list(
    election = election.regex,
    policies = c('policies', 'policy','administration')
  ),
  do.binary = FALSE, do.regex = c(TRUE,FALSE)
)
docvars(fedcorpus_1, "fed") <- NULL # removing features
summ <- corpus_summarize(fedcorpus_1, by = "month")

summ$plots$feature_plot +
  theme(
    legend.direction = "vertical",
    legend.box = "horizontal",
    legend.position = "right",
    plot.title = element_text(size=13)
  )+
  labs(
    title = "Feature count of words related to Elections & Policies",
    caption = "Source: US FED Beige Book"
  )

While it is intuitive that election-related words peak every few years (coinciding with the elections every four years), it is surprising to see words related to policies spike during these periods as well.

Perhaps one can infer that a large majority of policies are laid out at the beginning of each presidential term.
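
To put numbers behind the plot, the aggregated monthly counts can be pulled out of the ggplot object returned by corpus_summarize(). This is illustrative only, and assumes feature_plot stores its data in the same long (date, variable, value) layout as the token plot used later in this post:

# peek at the months with the most election-related mentions
summ$plots$feature_plot$data %>%
  as.data.frame() %>%
  filter(variable == "election") %>%
  arrange(desc(value)) %>%
  head()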

3.1.2 Pandemic & Market Fear

Now let’s take a look at pandemic & market fears across the years:

# 2. pandemic and market related
# Words used in reference to covid & uncertainty 
covid.regex <- paste0("\\bcovid[.-]?\\b|\\bpandemic[.-]?\\b|\\bvirus\\b")
in.regex <- paste0("\\buncertainty\\b|\\buncertainties\\b|\\bbubble\\b")

fedcorpus_2 <- add_features(
  fedcorpus.line,
  keywords = list(pandemic = covid.regex, uncertainty = in.regex),
  do.binary = FALSE, 
  do.regex = c(TRUE, TRUE)
)
docvars(fedcorpus_2, "fed") <- NULL # removing features
summ <- corpus_summarize(fedcorpus_2, by = "month")

summ$plots$feature_plot +
  theme(
    legend.direction = "vertical",
    legend.box = "horizontal",
    legend.position = "right",
    plot.title = element_text(size=13)
  )+
  labs(
    title = "Feature count of words related to Pandemic & Market Fears",
    caption = "Source: US FED Beige Book"
  )

It is interesting to see a large spike in the use of words related to uncertainty from 2016 to 2017. Evidently, the use of words related to the Covid pandemic occurs in 2020.

3.1.3 Market Strength & Weakness

We will be using words that are usually used in conjunction with market/economy strength.

# 3. Strength of market
# Words used in reference to strong and weak 
strong.regex <- c("strong","stronger","strongest","strengthen",
                  "strengthened","strongly","strength","strengthening")

weak.regex <- c("weak","weaken","weakened","weaker","weakest","weakly","weakening")

fedcorpus_3 <- add_features(
  fedcorpus.line,
  keywords = list(strong = strong.regex, weak = weak.regex),
  do.binary = FALSE, 
  do.regex = c(FALSE, FALSE)
)
docvars(fedcorpus_3, "fed") <- NULL # removing features
summ <- corpus_summarize(fedcorpus_3, by = "month")

summ$plots$feature_plot +
  theme(
    legend.direction = "vertical",
    legend.box = "horizontal",
    legend.position = "right",
    plot.title = element_text(size=13)
  )+
  labs(
    title = "Feature count of words showing strength & weakness",
    caption = "Source: US FED Beige Book"
  )

Interestingly, the graph shows an inverse relationship between the use of “strong” and “weak” words. Moreover, the largest dips in market strength occur in 2008 and 2020, coinciding with the 2008 Global Financial Crisis and the 2020 Covid pandemic. We also see a sharp increase in “strong” words in the latter half of 2020, which coincides with the booming equity market during this period.
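
To put a rough number on the claimed inverse relationship, the monthly “strong” and “weak” counts can be correlated. As above, this is an illustrative sketch that assumes feature_plot$data uses the long (date, variable, value) layout:

strength_wide <- summ$plots$feature_plot$data %>%
  as.data.frame() %>%
  tidyr::pivot_wider(names_from = variable, values_from = value)

# a negative value would support the inverse relationship seen in the plot
cor(strength_wide$strong, strength_wide$weak, use = "complete.obs")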

3.2 Total Number of Words across the years

In this segment, we will use the document-level data to illustrate how to extract the total number of words. Thankfully, the sentometrics package allows us to simply extract these stats after applying the corpus_summarize() function:

docvars(fedcorpus.doc, "fed") <- NULL # removing features
summ <- corpus_summarize(fedcorpus.doc, by = "month")

summ$stats %>%
  rmarkdown::paged_table()

Evidently, as there is only one document per period, only totalTokens is relevant to us. Let’s extract this data from the plot and chart our results:

token_count <- summ$plots$token_plot$data %>% 
  as.data.frame() %>% 
  filter(variable == "maxTokens")

ggplot(data = token_count, aes(x=date, y = value))+
  geom_line(color = "#27408b")+
  geom_point(shape = 21, fill = "white", color = "#27408b", size = 1, stroke = 1.1)+
  scale_y_continuous(labels = scales::comma)+
  theme(plot.title = element_text(size=13))+
  labs(
    x = "year", 
    y = "number of words",
    title = "Number of words in Federal Reserve Biege Book",
    subtitle = "March 2008 - Sept 2022",
    caption = "Source: US FED Beige Book"
  )

Surprisingly, we see a sharp drop in the total number of words in one of the months of 2019, but apart from that, the total number of words per document remains consistent at around 14,000 to 16,000 words.

Evidently, the sentometrics package makes it easy to extract insights based on word occurrences in the line-level data as well as document-level statistics.

4 Sentiment Computation

We supply the built-in Loughran & McDonald and Henry word lists, and the more generic Harvard General Inquirer word list. We also add five other lexicons from the R package lexicon: the NRC lexicon, the Hu & Liu lexicon, the SentiWord lexicon, the Jockers lexicon, and the SenticNet lexicon. This gives L = 8 lexicons in total.

data("list_valence_shifters", package = "sentometrics")
data("list_lexicons", package = "sentometrics")
lexiconsIn <- c(
  list_lexicons[c("LM_en", "HENRY_en", "GI_en")],
  list(
    NRC = lexicon::hash_sentiment_nrc,
    HULIU = lexicon::hash_sentiment_huliu,
    SENTIWORD = lexicon::hash_sentiment_sentiword,
    JOCKERS = lexicon::hash_sentiment_jockers,
    SENTICNET = lexicon::hash_sentiment_senticnet
  )
)

# Store lexicon lists
lex <- sento_lexicons(
  lexiconsIn = lexiconsIn,
  valenceIn = list_valence_shifters[["en"]]
)
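
For a quick look at what sento_lexicons() produced, each element of lex should be a data.table of words and their polarity scores (a small illustrative check):

names(lex)            # the eight lexicons plus the valence shifters table
head(lex[["LM_en"]])  # word (x) and polarity score (y) pairs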

4.1 Aggregation & Measures

The sentometrics package provides many different weighting schemes to compute sentiment and aggregate it into document-level sentiment and time series. These weighting schemes can be listed with the function get_hows().

get_hows()
$words
[1] "counts"                 "proportional"          
[3] "proportionalPol"        "proportionalSquareRoot"
[5] "UShaped"                "inverseUShaped"        
[7] "exponential"            "inverseExponential"    
[9] "TFIDF"                 

$docs
[1] "equal_weight"        "proportional"       
[3] "inverseProportional" "exponential"        
[5] "inverseExponential" 

$time
[1] "equal_weight" "almon"        "beta"        
[4] "linear"       "exponential"  "own"         

To create sentiment time series, one needs a well-specified aggregation setup defined via the control function ctr_agg(). To compute the measures in one go, the sento_measures() function is used. Sentiment time series allow us to use the entire scope of the package.

ctrAgg <- ctr_agg(
  howWithin = "counts", howDocs = "proportional",
  howTime = c("exponential", "equal_weight"), 
  do.ignoreZeros = TRUE,
  by = "month", fill = "zero", 
  lag = 30, alphasExp = 0.2, 
  do.sentence = TRUE
)
sentMeas <- sento_measures(fedcorpus.doc, lexicons = lex, ctr = ctrAgg)

sentMeasFill <- measures_fill(
  sentMeas, 
  fill = "latest",
  dateBefore = "2021-01-01"
)
rmarkdown::paged_table(as.data.table(sentMeasFill) %>% head())
plot(
  sentMeasFill, group = "lexicons")+
  theme(
    legend.direction = "vertical",
    legend.box = "horizontal",
    legend.position = "right",
    plot.title = element_text(size=13)
  )+
  labs(
    title = "Sentiment through Federal Reserve Biege Book",
    subtitle = "Sentiments across various sentiment measurement metrics",
    caption = "Source: US FED Beige Book"
  )

plot(
  sentMeasFill, group = "time"
)+
  theme(
    legend.direction = "vertical",
    legend.box = "horizontal",
    legend.position = "right",
    plot.title = element_text(size=13)
  )+
  labs(
    title = "Sentiment through Federal Reserve Biege Book",
    subtitle = "Weighted vs Exponential Smoothing",
    caption = "Source: US FED Beige Book"
  )
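
Before zooming in on a single lexicon, it can help to see which dimensions the computed measures span. The sentometrics accessors below should report the lexicon, feature and time dimensions and the total number of series (an optional check):

get_dimensions(sentMeasFill)  # lexicons, features and time weighting schemes
nmeasures(sentMeasFill)       # total number of sentiment time series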

Let’s take a specific look at the sentiment analysis using the Henry (Henry 2008) word lists:

df_Hry <- sentMeasFill$measures[,c(1,4)] %>%   # date column plus the HENRY_en-based measure (selected by position)
  as.data.frame() %>% 
  set_names(c("date","Henry_sentiment")) %>% 
  mutate_at("Henry_sentiment", ~(scale(.) %>% as.vector))  # normalise to z-scores

ggplot(df_Hry,  aes(date, Henry_sentiment, fill = Henry_sentiment>0)) +
  geom_col(show.legend = FALSE) +
  scale_fill_manual(values=c("red","#27408b"))+
  #facet_wrap(~report, ncol = 8, scales = "free_x")+
  labs(x="report (~8 per year)",y="sentiment",
       title="Sentiment in Federal Reserve Beige Book",
       subtitle="Sentiments extracted from the Henry lexicon word lists (Normalized)",
       caption="Source: US FED Beige Book\nDotted line represents Sept 2022 value" )+
  geom_hline(data=. %>% filter(date==max(date)), aes(yintercept=Henry_sentiment),linetype=2)+
  theme(plot.caption=element_text(hjust=0))

5 Similarities Analysis

In this segment, we will attempt to understand the relationship and similarity between the Beige Book reports.

5.1 Sentometric Analysis

Using the sentometrics package, we can pass how = "TFIDF" (one of the weighting options listed by get_hows()) to the compute_sentiment() function to apply tf-idf weighting to the corpus object. Let us take a look at how the document-level sentiment scores from the different lexicons compare:

# from a sento_corpus object - unigrams approach with tf-idf weighting
sent_itidf <- compute_sentiment(fedcorpus.doc, lexicons = lex, how = "TFIDF")
plot(sent_itidf[,4:7])  # pairwise scatter plots of four lexicons' document-level scores

plot(sent_itidf[,8:11]) # pairwise scatter plots of the remaining four lexicons' scores
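
The pairwise plots above can be condensed into a single correlation matrix. Reusing the same column positions as in the plots (columns 4 to 11 holding the eight per-lexicon scores), a short snippet:

# agreement between lexicons on document-level sentiment
round(cor(as.data.frame(sent_itidf)[, 4:11]), 2)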

5.2 TidyText Analysis

While the sentometrics package provides us with a simple way to extract insights from our unstructured dataset, further word-level analysis requires a more manual cleaning and data-manipulation process. Let’s start by expanding the stop words list that is already provided:

custom_stop_words <- 
  bind_rows(
    tibble(
      word = c(
        tolower(month.abb), 
        "one","two","three","four","five","six",
        "seven","eight","nine","ten","eleven","twelve","mam","ered",
        "produc","ing","quar","ters","sug","quar",'fmam',"sug",
        "cient","thirty","pter","ate","aver","attribut",
        "partic","ble","partic","ately","im","ad","hoc",
        "cent","averag","cus","consum",
        tolower(month.abb),tolower(month.name),"ventories","delin",
        "ing","con","slid","ly","ed","er","tion","de","ture","ers",
        # new 'words' fragments
        "ty","manufactur","estly","increas","tinued","transporta",
        "sc","md","struction","cial","manufac","crease","wva","mercial",
        "ness","commer","al","indus","dis","creases","ported","idential",
        "er","es","ers","ii","ued","de","mand","ment","moder","contin",
        "con","tacts", "manu","ments","construc","creased","busi",
        "mod","tions","mained","ed","va","nc","tive","ly",
        "charlottesville","vermont","oregon","antic","condi",
        "na","ern","pre","fort","worth","grand","forks",
        "antici","pres","facturing","tial","pro","confi","activi","als",
        # end new words
        "pants","ter","ening","ances","www.federalreserve.gov",
        "tion","fig","ure","figure","src"
      ),
      lexicon = c("custom")
    ),
    stop_words
  )

We first unnest the tokens from the cleaned text into a single column and then manually calculate the tf-idf value of each word within each report.

We then use the cast_sparse() function to create a sparse report-by-word matrix of these unigram tf-idf weights:

fed_text <- raw_text %>% 
  dplyr::select(report,rdate,page,line,col_id,ctext) %>%
  as_tibble() %>%
  unnest_tokens(word,ctext)


fed_text_by_month2 <- fed_text %>%
  # filter(rdate !="2019-07-17") %>%
  # keep only letters (drop numbers and special symbols)
  mutate(word = gsub("[^A-Za-z ]","",word)) %>%  
  filter(word != "") %>%
  anti_join(custom_stop_words, by="word") %>%
  count(rdate,word,sort=TRUE) %>%
  bind_tf_idf(word, rdate, n) %>%
  arrange(desc(rdate),desc(tf_idf))


sparse_matrix <- 
  fed_text_by_month2 %>%
  cast_sparse(rdate, word, tf_idf)  # rows = report dates, columns = words, values = tf-idf
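
Before computing similarities, a couple of quick sanity checks can help confirm what went into the matrix (optional):

dim(sparse_matrix)  # rows = report dates, columns = vocabulary

# highest tf-idf terms in the most recent report
fed_text_by_month2 %>%
  filter(rdate == max(rdate)) %>%
  slice_max(tf_idf, n = 10)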

We then compute the cosine similarities between the reports from their unigram tf-idf vectors in the sparse matrix, using the sim2() function from the text2vec package, and view the results:

# cosine similarity between reports, based on their tf-idf vectors
similarities <- as.matrix(text2vec::sim2(sparse_matrix, method = "cosine", norm = "l2"))

get_similar_letters <- function(similarities, reference_letter, n_recommendations = 3){
  sort(similarities[reference_letter, ], decreasing = TRUE)[1:(2 + n_recommendations)]
}

get_similar_letters(similarities, 1)  # similarity values of the reports closest to the first report

df_sim <- data.frame(rdate=unique(fed_text_by_month2$rdate)) %>% mutate(id=row_number())

myf <- function(i = 1, j = 1) { similarities[i, j] }  # look up the similarity between report i and report j

df2 <- expand_grid(i=1:nrow(df_sim),j=1:nrow(df_sim)) %>% 
  left_join(df_sim,by=c("i"="id")) %>%
  left_join(rename(df_sim,rdate2=rdate), by=c("j"="id") ) %>%
  mutate(sim=map2(i,j,myf)) %>% unnest(sim)

5.2.1 Similarity Plot

Let’s take a look at the similarity between the Beige Book reports based on the tf-idf statistics:

ggplot(
  data = df2,
  aes(x=-i,y=-j,fill=sim)
)+geom_tile()+
  scale_fill_viridis_c(
    option="C",
    name="Cosine Similarity",
    breaks=c(0.85,0.98),
    labels=c("Less Similar","More Similar")
  )+
  scale_x_continuous(
    labels=(unique(df2$rdate))[c(seq(1,98,8))],
    breaks=c(seq(-1,-98,-8)),expand=c(0.02,0.01)
  )+
  scale_y_continuous(
    labels=(unique(df2$rdate))[c(seq(1,98,8))],
    breaks=c(seq(-1,-98,-8)),expand=c(0,0)
  )+
  # devtools::install_github("lenkiefer/darklyplot")
  # darklyplot::theme_dark2(base_size=18,base_family="Arial")+
  theme(
    legend.position="top",legend.direction="horizontal",
    legend.key.width=unit(2,"cm"),
    plot.caption=element_text(hjust=0)
  )+
  labs(
    x="report date",y="report date",
    title="Similarity between Beige Book Reports",
    subtitle = "Cosine Similarity based on tf-idf statistics",
    caption="Source: US FED Beige Book"
  )

Let’s group the reports by year, then analyse and compare the results again:

df2b <- df2 %>% 
  group_by(
    rdate,
    year2=year(rdate2)
  ) %>% 
  arrange(rdate, rdate2) %>%
  mutate(rnum=row_number()) %>%
  ungroup()
ggplot(
  data=filter(df2b, rdate>="2019-10-16"), 
  aes(
    x=factor(rnum),
    y=year(rdate2),
    fill=sim
  )
)+
  scale_fill_viridis_c(
    option="C",
    name="Cosine Similarity",
    breaks=c(0.85,0.98),
    labels=c("Less Similar","More Similar")
  )+
  geom_tile()+
  facet_wrap(~rdate)+
  # darklyplot::theme_dark2(base_size=18,base_family="Arial")+
  theme(
    legend.position="top",
    legend.direction="horizontal",
    legend.key.width=unit(2,"cm"),
    plot.caption=element_text(hjust=0)
  )+
  labs(
    x="report number",y="report date",
    title="Similarity between Beige Book Reports",
    subtitle = "Cosine Similarity based on tf-idf statistics (~8 reports a year)",
    caption="Source: US FED Beige Book"
  )