In this post, we conduct text mining on the Fed Beige Book to compare the sentiment of reports across time, focusing on the use of the sentometrics package to perform the analysis.
library(tidytext)
library(stringr)
library(dplyr)
library(tidyverse)
library(lubridate)
library(quanteda)
library(sentometrics)
library(data.table)
library(ggplot2)
library(gridExtra)
library(kableExtra)
library(pdftools)
We will explore the Beige Book, which gathers anecdotal information on current economic conditions across the Federal Reserve Districts. As these reports collect anecdotes from various businesses across the country, they seem like a great candidate for fruitful text analysis.
path_root="."
path_data=file.path(path_root, "data")
path_pdfs=file.path(path_data, "pdfs")
load(file=file.path(path_data, "fed_raw_text.Rdata"))
beige_links %>% select(url, report=report.date) %>% format.dt.f(.)
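The raw text itself was scraped in an earlier step and saved to fed_raw_text.Rdata. For completeness, below is a minimal sketch of how such a dataset could be rebuilt from the downloaded PDFs with pdftools; the pdf_file column and the page/line splitting are assumptions about that earlier step, not the original code.
# Sketch only: rebuild a raw_text-like table from the downloaded PDFs.
# Assumes beige_links carries the local file name in a (hypothetical) pdf_file column.
raw_text_sketch <- beige_links %>%
  mutate(
    rdate = report.date,
    text = purrr::map(file.path(path_pdfs, pdf_file), pdftools::pdf_text)
  ) %>%
  tidyr::unnest(text) %>%                    # one row per page
  group_by(rdate) %>%
  mutate(page = row_number()) %>%
  ungroup() %>%
  tidyr::separate_rows(text, sep = "\n") %>% # one row per line
  group_by(rdate, page) %>%
  mutate(line = row_number()) %>%
  ungroup() %>%
  rename(ctext = text)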
Let's split the data into two hierarchical levels:
Line Level - a more granular, line-level dataset, used to understand the relationships and occurrences of words within each document.
Document Level - an overall document-level dataset, with the text of each report aggregated into a single row.
Let's take a look at what the two datasets look like:
#== Line Level ======
sentocorp.line <- raw_text %>%
select(date = rdate, texts = ctext) %>%
mutate(
id = row_number(),
fed = 1) %>%
select(id,date,texts,fed) %>%
arrange(id)
# sentocorp.line
#== Document Level ======
sentocorp.doc <- raw_text %>%
select(id = report, date = rdate, texts = ctext) %>%
group_by(id,date) %>%
summarise(texts = paste0(texts, collapse = "")) %>%
ungroup() %>%
mutate(fed = 1) %>%
select(id,date,texts,fed) %>%
arrange(id)
# sentocorp.doc
Once we are done, we use the sento_corpus() function to convert these datasets into sento_corpus objects:
fedcorpus.line <- sento_corpus(corpusdf = sentocorp.line,do.clean = TRUE)
fedcorpus.doc <- sento_corpus(corpusdf = sentocorp.doc,do.clean = TRUE)
As shown below, our document-level corpus comprises 115 documents.
fedcorpus.doc
A sento_corpus consisting of 115 documents and 2 docvars.
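Since a sento_corpus builds on a quanteda corpus, the usual quanteda accessors can be used to inspect it; a quick illustrative check:
# number of documents and the attached document variables
quanteda::ndoc(fedcorpus.doc)
head(quanteda::docvars(fedcorpus.doc))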
Now that we have prepared the two hierarchical levels of the corpus, we will first use the line-level dataset to count how often certain expressions or words are used in the documents. We will examine the occurrences of words related to the following topics: elections and policies; the pandemic and market fears; and the strength of the market.
For each topic, we will examine the appearance of related words and attempt to derive some insights from the results.
# == Regular expression =====
# 1.Election related words
# Election & Party related expressions
election.regex <- paste0("\\belection[s]?\\b|\\brepublican[s]?\\b|\\bdemocrat[s]?\\b")
# Add Features
fedcorpus_1 <- add_features(
fedcorpus.line,
keywords = list(
election = election.regex,
policies = c('policies', 'policy','administration')
),
do.binary = FALSE, do.regex = c(TRUE,FALSE)
)
docvars(fedcorpus_1, "fed") <- NULL # removing features
summ <- corpus_summarize(fedcorpus_1, by = "month")
summ$plots$feature_plot +
theme(
legend.direction = "vertical",
legend.box = "horizontal",
legend.position = "right",
plot.title = element_text(size=13)
)+
labs(
title = "Feature count of words related to Elections & Policies",
caption = "Source: US FED Beige Book"
)
While it is intuitive that election-related words peak every few years (coinciding with the elections held every four years), it is surprising to see words related to policies spike during these periods as well.
Perhaps one can infer that a large share of policies are laid out at the beginning of each presidential term.
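As a sanity check on these regex-based feature counts, one could also count raw matches per report date directly with stringr; a quick sketch (not part of the sentometrics workflow):
# Sketch: raw regex hits for the election expression, aggregated by report date
raw_text %>%
  mutate(hits = str_count(tolower(ctext), election.regex)) %>%
  group_by(rdate) %>%
  summarise(hits = sum(hits), .groups = "drop") %>%
  arrange(desc(hits)) %>%
  head()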
Now let’s take a look at pandemic & market fears across the years:
# 2. pandemic and market related
# Words used in reference to covid & uncertainty
covid.regex <- paste0("\\bcovid[.-]?\\b|\\bpandemic[.-]?\\b|\\bvirus\\b")
in.regex <- paste0("\\buncertainty\\b|\\buncertainties\\b|\\bbubble\\b")
fedcorpus_2 <- add_features(
fedcorpus.line,
keywords = list(pandemic = covid.regex, uncertainty = in.regex),
do.binary = FALSE,
do.regex = c(TRUE, TRUE)
)
docvars(fedcorpus_2, "fed") <- NULL # removing features
summ <- corpus_summarize(fedcorpus_2, by = "month")
summ$plots$feature_plot +
theme(
legend.direction = "vertical",
legend.box = "horizontal",
legend.position = "right",
plot.title = element_text(size=13)
)+
labs(
title = "Feature count of words related to Pandemic & Market Fears",
caption = "Source: US FED Beige Book"
)
It is interesting to see a large spike in the use of words related to uncertainty in 2016-2017. Evidently, the use of words related to the Covid pandemic occurs in 2020.
Next, we will use words that typically appear in conjunction with the strength of the market and economy.
# 3. Strength of market
# Words used in reference to strong and weak
strong.regex <- c("strong","stronger","strongest","strengthen",
"strengthened","strongly","strength","strengthening")
weak.regex <- c("weak","weaken","weakened","weaker","weakest","weakly","weakening")
fedcorpus_3 <- add_features(
fedcorpus.line,
keywords = list(strong = strong.regex, weak = weak.regex),
do.binary = FALSE,
do.regex = c(FALSE, FALSE)
)
docvars(fedcorpus_3, "fed") <- NULL # removing features
summ <- corpus_summarize(fedcorpus_3, by = "month")
summ$plots$feature_plot +
theme(
legend.direction = "vertical",
legend.box = "horizontal",
legend.position = "right",
plot.title = element_text(size=13)
)+
labs(
title = "Feature count of words showing strength & weakness",
caption = "Source: US FED Beige Book"
)
It is interesting to see an inverse relationship between the use of “strong” and “weak” words in the graph. Moreover, the largest dips in market strength occur in 2008 and 2020, coinciding with the 2008 Global Financial Crisis and the 2020 Covid pandemic. We also see a sharp increase in “strong” words in the latter half of 2020, which coincides with the booming equity market during that period.
In this segment we will use the document-level data to illustrate how we can extract the total number of words. Thankfully, the sentometrics package allows us to simply extract these statistics after applying the corpus_summarize() function:
docvars(fedcorpus.doc, "fed") <- NULL # removing features
summ <- corpus_summarize(fedcorpus.doc, by = "month")
summ$stats %>%
rmarkdown::paged_table()
Evidently, as there is only one document per period, only totalTokens is relevant to us. Let's extract this data from the plot object and plot the results:
token_count <- summ$plots$token_plot$data %>%
as.data.frame() %>%
# with a single document per period, maxTokens equals the report's total token count
filter(variable == "maxTokens")
ggplot(data = token_count, aes(x=date, y = value))+
geom_line(color = "#27408b")+
geom_point(shape = 21, fill = "white", color = "#27408b", size = 1, stroke = 1.1)+
scale_y_continuous(labels = scales::comma)+
theme(plot.title = element_text(size=13))+
labs(
x = "year",
y = "number of words",
title = "Number of words in Federal Reserve Biege Book",
subtitle = "March 2008 - Sept 2022",
caption = "Source: US FED Beige Book"
)
Surprisingly, we see a sharp drop in the total number of words in one of the months of 2019, but apart from that, the total number of words per report remains fairly consistent at around 14,000-16,000 words.
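These counts can also be cross-checked directly with quanteda, which sentometrics builds on; note the exact numbers may differ slightly depending on tokenization options. A quick sketch:
# Sketch: tokenize the document-level corpus and summarize tokens per report
tok_counts <- quanteda::ntoken(quanteda::tokens(fedcorpus.doc))
summary(tok_counts)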
Evidently, the sentometrics package makes it easy to extract insights based on word occurrences within the corpus as well as at the document level.
We supply the built-in Loughran & McDonald and Henry word lists, and the more generic Harvard General Inquirer word list. We also add five other lexicons from the R package lexicon: the NRC lexicon, the Hu & Liu lexicon, the SentiWord lexicon, the Jockers lexicon, and the SenticNet lexicon. This gives L = 8.
data("list_valence_shifters", package = "sentometrics")
data("list_lexicons", package = "sentometrics")
lexiconsIn <- c(
list_lexicons[c("LM_en", "HENRY_en", "GI_en")],
list(
NRC = lexicon::hash_sentiment_nrc,
HULIU = lexicon::hash_sentiment_huliu,
SENTIWORD = lexicon::hash_sentiment_sentiword,
JOCKERS = lexicon::hash_sentiment_jockers,
SENTICNET = lexicon::hash_sentiment_senticnet
)
)
# Store lexicon lists
lex <- sento_lexicons(
lexiconsIn = lexiconsIn,
valenceIn = list_valence_shifters[["en"]]
)
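Each element of the returned sento_lexicons object is simply a table of words and sentiment scores (plus the valence shifters), which we can peek at:
# the prepared lexicons: a list of word/score tables
names(lex)
head(lex[["HENRY_en"]])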
The sentometrics package provides many different weighting methods to compute sentiment and aggregate it into document-level sentiment and time series. These weighting methods can be accessed with the get_hows() function.
get_hows()
$words
[1] "counts" "proportional"
[3] "proportionalPol" "proportionalSquareRoot"
[5] "UShaped" "inverseUShaped"
[7] "exponential" "inverseExponential"
[9] "TFIDF"
$docs
[1] "equal_weight" "proportional"
[3] "inverseProportional" "exponential"
[5] "inverseExponential"
$time
[1] "equal_weight" "almon" "beta"
[4] "linear" "exponential" "own"
To create sentiment time series, one needs a well-specified aggregation setup defined via the control function ctr_agg(). To compute the measures in one go, the sento_measures() function is used. Sentiment time series allow us to use the entire scope of the package.
ctrAgg <- ctr_agg(
howWithin = "counts", howDocs = "proportional",
howTime = c("exponential", "equal_weight"),
do.ignoreZeros = TRUE,
by = "month", fill = "zero",
lag = 30, alphasExp = 0.2,
do.sentence = TRUE
)
sentMeas <- sento_measures(fedcorpus.doc, lexicons = lex, ctr = ctrAgg)
sentMeasFill <- measures_fill(
sentMeas,
fill = "latest",
dateBefore = "2021-01-01"
)
rmarkdown::paged_table(as.data.table(sentMeasFill) %>% head())
plot(
sentMeasFill, group = "lexicons")+
theme(
legend.direction = "vertical",
legend.box = "horizontal",
legend.position = "right",
plot.title = element_text(size=13)
)+
labs(
title = "Sentiment through Federal Reserve Biege Book",
subtitle = "Sentiments across various sentiment measurement metrics",
caption = "Source: US FED Beige Book"
)
plot(
sentMeasFill, group = "time"
)+
theme(
legend.direction = "vertical",
legend.box = "horizontal",
legend.position = "right",
plot.title = element_text(size=13)
)+
labs(
title = "Sentiment through Federal Reserve Biege Book",
subtitle = "Weighted vs Exponential Smoothing",
caption = "Source: US FED Beige Book"
)
Let’s take a specific look at the sentiment analysis using the Henry (Henry 2008) word lists:
df_Hry <- sentMeasFill$measures[,c(1,4)] %>%
as.data.frame() %>%
set_names(c("date","Henry_sentiment")) %>%
mutate_at("Henry_sentiment", ~(scale(.) %>% as.vector))
ggplot(df_Hry, aes(date, Henry_sentiment, fill = Henry_sentiment>0)) +
geom_col(show.legend = FALSE) +
scale_fill_manual(values=c("red","#27408b"))+
#facet_wrap(~report, ncol = 8, scales = "free_x")+
labs(x="report (~8 per year)",y="sentiment",
title="Sentiment in Federal Reserve Beige Book",
subtitle="Sentiments extracted from the Henry lexicon word lists (Normalized)",
caption="Source: US FED Beige Book\nDotted line represents Sept 2022 value" )+
geom_hline(data=. %>% filter(date==max(date)), aes(yintercept=Henry_sentiment),linetype=2)+
theme(plot.caption=element_text(hjust=0))
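One way to gauge how much the different word lists agree is a simple correlation matrix over the (filled) sentiment measures; a quick sketch showing a corner of that matrix:
# Sketch: pairwise correlations between the individual sentiment measures
meas <- as.data.table(sentMeasFill)
round(cor(as.matrix(meas[, -1, with = FALSE])), 2)[1:5, 1:5]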
In this segment, we will attempt to understand the relationship and similarity between the reports. Using the sentometrics package, we can set how = "TFIDF" (one of the options listed by get_hows()) to apply a tf-idf weighting when computing sentiment on the corpus object. Let us take a look at how the sentiment scores from the different lexicons compare, using the compute_sentiment() function:
# from a sento_corpus object - unigrams approach with tf-idf weighting
sent_itidf <- compute_sentiment(fedcorpus.doc, lexicons = lex, how = "TFIDF")
plot(sent_itidf[,4:7])
plot(sent_itidf[,8:11])
While the sentometrics package provides us with a simple way to extract insights from our unstructured dataset, further word-level analysis requires a more manual cleaning and data manipulation process. Let's start by expanding the set of stopwords that is already provided:
custom_stop_words <-
bind_rows(
tibble(
word = c(
tolower(month.abb),
"one","two","three","four","five","six",
"seven","eight","nine","ten","eleven","twelve","mam","ered",
"produc","ing","quar","ters","sug","quar",'fmam',"sug",
"cient","thirty","pter","ate","aver","attribut",
"partic","ble","partic","ately","im","ad","hoc",
"cent","averag","cus","consum",
tolower(month.abb),tolower(month.name),"ventories","delin",
"ing","con","slid","ly","ed","er","tion","de","ture","ers",
# new 'words' fragments
"ty","manufactur","estly","increas","tinued","transporta",
"sc","md","struction","cial","manufac","crease","wva","mercial",
"ness","commer","al","indus","dis","creases","ported","idential",
"er","es","ers","ii","ued","de","mand","ment","moder","contin",
"con","tacts", "manu","ments","construc","creased","busi",
"mod","tions","mained","ed","va","nc","tive","ly",
"charlottesville","vermont","oregon","antic","condi",
"na","ern","pre","fort","worth","grand","forks",
"antici","pres","facturing","tial","pro","confi","activi","als",
# end new words
"pants","ter","ening","ances","www.federalreserve.gov",
"tion","fig","ure","figure","src"
),
lexicon = c("custom")
),
stop_words
)
We first unnest the tokens from the clean text into a single column and then manually calculate the tf-idf value of each word within each report. We then use the cast_sparse() function to create a sparse matrix of these unigrams:
fed_text <- raw_text %>%
dplyr::select(report,rdate,page,line,col_id,ctext) %>%
as_tibble() %>%
unnest_tokens(word,ctext)
fed_text_by_month2 <- fed_text %>%
# filter(rdate !="2019-07-17") %>%
# keep only letters (drop numbers and special symbols)
mutate(word = gsub("[^A-Za-z ]","",word)) %>%
filter(word != "") %>%
anti_join(custom_stop_words, by="word") %>%
count(rdate,word,sort=TRUE) %>%
bind_tf_idf(word, rdate, n) %>%
arrange(desc(rdate),desc(tf_idf))
sparse_matrix <-
fed_text_by_month2 %>%
cast_sparse(rdate, word, tf)
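A quick look at the resulting object confirms its shape: one row per report date, one column per unigram, with the term frequency as the cell value.
# report dates in rows, unigrams in columns
dim(sparse_matrix)
sparse_matrix[1:3, 1:5]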
We then compute the cosine similarity between reports (as reflected in the plots below) using the sim2() function from the text2vec package, applied to the unigram weights stored in the sparse matrix, and view the results:
# cosine similarity between the report rows of the sparse matrix
similarities <- text2vec::sim2(sparse_matrix, method = "cosine", norm = "l2")
get_similar_letters <- function(similarities, reference_letter, n_recommendations = 3){
sort(similarities[reference_letter, ], decreasing = TRUE)[1:(2 + n_recommendations)]
}
get_similar_letters(similarities, 1)
df_sim <- data.frame(rdate=unique(fed_text_by_month2$rdate)) %>% mutate(id=row_number())
myf <- function(i=1,j=1){similarities [i,j]}
df2 <- expand_grid(i=1:nrow(df_sim),j=1:nrow(df_sim)) %>%
left_join(df_sim,by=c("i"="id")) %>%
left_join(rename(df_sim,rdate2=rdate), by=c("j"="id") ) %>%
mutate(sim=map2(i,j,myf)) %>% unnest(sim)
Let's take a look at the similarity between the Beige Book reports based on the tf-idf statistics:
ggplot(
data=filter(df2),
aes(x=-i,y=-j,fill=sim)
)+geom_tile()+
scale_fill_viridis_c(
option="C",
name="Cosine Similarity",
breaks=c(0.85,0.98),
labels=c("Less Similar","More Similar")
)+
scale_x_continuous(
labels=(unique(df2$rdate))[c(seq(1,98,8))],
breaks=c(seq(-1,-98,-8)),expand=c(0.02,0.01)
)+
scale_y_continuous(
labels=(unique(df2$rdate))[c(seq(1,98,8))],
breaks=c(seq(-1,-98,-8)),expand=c(0,0)
)+
# devtools::install_github("lenkiefer/darklyplot")
# darklyplot::theme_dark2(base_size=18,base_family="Arial")+
theme(
legend.position="top",legend.direction="horizontal",
legend.key.width=unit(2,"cm"),
plot.caption=element_text(hjust=0)
)+
labs(
x="report date",y="report date",
title="Similarity between Beige Book Reports",
subtitle = "Cosine Similarity based on tf-idf statistics",
caption="Source: US FED Beige Book"
)
Let's group the reports by year and then compare the results again:
df2b <- df2 %>%
group_by(
rdate,
year2=year(rdate2)
) %>%
arrange(rdate, rdate2) %>%
mutate(rnum=row_number()) %>%
ungroup()
ggplot(
data=filter(df2b, rdate>="2019-10-16"),
aes(
x=factor(rnum),
y=year(rdate2),
fill=sim
)
)+
scale_fill_viridis_c(
option="C",
name="Cosine Similarity",
breaks=c(0.85,0.98),
labels=c("Less Similar","More Similar")
)+
geom_tile()+
facet_wrap(~rdate)+
# darklyplot::theme_dark2(base_size=18,base_family="Arial")+
theme(
legend.position="top",
legend.direction="horizontal",
legend.key.width=unit(2,"cm"),
plot.caption=element_text(hjust=0)
)+
labs(
x="report number",y="report date",
title="Similarity between Beige Book Reports",
subtitle = "Cosine Similarity based on tf-idf statistics (~8 reports a year)",
caption="Source: US FED Beige Book"
)