Read Data
fomcStatements <- readRDS(file = "fomc_corrected_data_v1.rds") %>%
  select(statement.dates, statement.content)
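The chunks below call functions from several packages without attaching them first. A minimal setup sketch; the package-to-function mapping is my reading of the code, not part of the original:
library(dplyr)             # %>%, mutate(), select(), count(), joins
library(stringr)           # str_extract()
library(tidyr)             # unnest(), spread()
library(tidytext)          # unnest_tokens(), stop_words, bind_tf_idf(), get_sentiments()
library(ggplot2)           # plots
library(forcats)           # fct_reorder()
library(ggridges)          # theme_ridges()
library(tm)                # stripWhitespace()
library(wordcloud)         # wordcloud()
library(RColorBrewer)      # brewer.pal()
library(SentimentAnalysis) # analyzeSentiment() and friends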
Exploratory Analysis
fomcS <- fomcStatements %>%
  mutate(date = statement.dates,
         year = as.numeric(str_extract(statement.dates, '\\d{4}')),
         text = statement.content) %>%
  unnest(text) %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words) %>%
  count(date, year, word, sort = TRUE) %>%
  mutate(frequency = n) %>%
  select(date, year, word, frequency)
## Joining, by = "word"
head(fomcS)
## # A tibble: 6 x 4
## date year word frequency
## <chr> <dbl> <chr> <int>
## 1 20140319 2014 committee 19
## 2 20140430 2014 committee 18
## 3 20140618 2014 committee 18
## 4 20141029 2014 committee 18
## 5 20141217 2014 committee 18
## 6 20130619 2013 committee 17
tf-idf
fomc_x <- fomcS %>%
  bind_tf_idf(word, year, frequency) %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  group_by(year) %>%
  mutate(id = row_number()) %>%
  ungroup()
head(fomc_x)
## # A tibble: 6 x 8
## date year word frequency tf idf tf_idf id
## <chr> <dbl> <fct> <int> <dbl> <dbl> <dbl> <int>
## 1 20081008 2008 central 7 0.00392 1.18 0.00462 1
## 2 20190320 2019 result 1 0.00152 2.56 0.00390 1
## 3 20091216 2009 2010 5 0.00240 1.47 0.00352 1
## 4 20120620 2012 continuation 3 0.00127 2.56 0.00326 1
## 5 20081008 2008 canada 2 0.00112 2.56 0.00287 2
## 6 20081008 2008 england 2 0.00112 2.56 0.00287 3
tail(fomc_x)
## # A tibble: 6 x 8
## date year word frequency tf idf tf_idf id
## <chr> <dbl> <fct> <int> <dbl> <dbl> <dbl> <int>
## 1 20190130 2019 committee 6 0.00913 -2.06 -0.0188 433
## 2 20190320 2019 committee 6 0.00913 -2.06 -0.0188 434
## 3 20190501 2019 committee 6 0.00913 -2.06 -0.0188 435
## 4 20190130 2019 inflation 9 0.0137 -2.05 -0.0281 436
## 5 20190320 2019 inflation 9 0.0137 -2.05 -0.0281 437
## 6 20190501 2019 inflation 9 0.0137 -2.05 -0.0281 438
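Note the negative idf values in the tail: bind_tf_idf() treats each row as one document-term observation, and fomcS still carries one row per meeting date, so a word like committee, which shows up in essentially every one of the ~102 statements across 13 years, gets idf ≈ ln(13/102) ≈ -2.06. A sketch of a fix under that reading, collapsing to one row per (year, word) before computing tf-idf (fomc_year is a name introduced here):
fomc_year <- fomcS %>%
  group_by(year, word) %>%
  summarise(frequency = sum(frequency), .groups = "drop") %>%  # one row per (year, word)
  bind_tf_idf(word, year, frequency) %>%
  arrange(desc(tf_idf))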
What are they talking about?
fed_text <- fomcStatements %>%
  mutate(date = statement.dates,
         year = as.numeric(str_extract(statement.dates, '\\d{4}')),
         text = statement.content) %>%
  unnest(text) %>%
  unnest_tokens(word, text) %>%
  mutate(word = stripWhitespace(gsub("[^A-Za-z ]", " ", word))) %>%  # keep letters only
  filter(word != "", word != " ") %>%
  anti_join(stop_words) %>%
  select(date, year, word)
## Joining, by = "word"
f_text <- fed_text %>%
  group_by(year) %>%
  count(word, sort = TRUE) %>%   # word counts within each year
  mutate(rank = row_number()) %>%
  ungroup() %>%
  arrange(rank, year)
head(f_text)
## # A tibble: 6 x 4
## year word n rank
## <dbl> <chr> <int> <int>
## 1 2007 inflation 34 1
## 2 2008 federal 48 1
## 3 2009 federal 67 1
## 4 2010 committee 50 1
## 5 2011 committee 89 1
## 6 2012 committee 87 1
gg <- f_text %>%
  filter(rank < 11) %>%
  ggplot(aes(y = n, x = fct_reorder(word, n))) +
  geom_col(fill = "#27408b") +
  facet_wrap(~year, scales = "free", ncol = 4) +
  coord_flip() +
  theme_ridges(font_size = 11) +
  labs(x = "", y = "",
       title = "Most Frequent Words in FOMC Statements by Year (2007 - 2019)")
gg
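One caveat: fct_reorder(word, n) orders words once, across all years, so the bars inside an individual facet are not guaranteed to be sorted. tidytext's reorder_within() and scale_x_reordered() handle per-facet ordering; a sketch of the same plot with that change:
f_text %>%
  filter(rank < 11) %>%
  mutate(word = reorder_within(word, n, year)) %>%  # order words within each year
  ggplot(aes(y = n, x = word)) +
  geom_col(fill = "#27408b") +
  scale_x_reordered() +                             # strip the internal ___year suffix from labels
  facet_wrap(~year, scales = "free", ncol = 4) +
  coord_flip() +
  theme_ridges(font_size = 11) +
  labs(x = "", y = "")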

wordcloud(words = f_text$word, freq = f_text$n, min.freq = 1,
          max.words = 200, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
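Because f_text keeps one row per (year, word), frequent words reach wordcloud() once per year. Aggregating to a single count per word first avoids duplicate entries in the cloud; a sketch (f_agg is a name introduced here):
f_agg <- f_text %>%
  group_by(word) %>%
  summarise(n = sum(n), .groups = "drop")

wordcloud(words = f_agg$word, freq = f_agg$n, min.freq = 1,
          max.words = 200, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))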

Recession period (2008)
fed_textb <- fed_text %>%
  count(date, year, word, sort = TRUE) %>%
  bind_tf_idf(word, date, n) %>%   # each meeting date is a document here
  arrange(desc(tf_idf))

fT <- fed_textb %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  group_by(date) %>%
  mutate(id = row_number()) %>%
  ungroup() %>%
  arrange(desc(n)) %>%
  filter(id < 100)
dt <- subset(fT, year == 2008) %>% filter(id < 100)
ggplot(head(dt, 180), aes(word, tf_idf, fill = date)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~date, scales = "free", ncol = 4) +
  coord_flip() +
  theme_ridges(font_size = 10) +
  theme(axis.text.x = element_blank()) +
  labs(x = "", y = "tf-idf",
       title = "Highest tf-idf Words in FOMC Statements During the 2008 Recession")

Sentiment classification using the Bing lexicon
fed_sentiment <- fed_text %>%
  inner_join(get_sentiments("bing")) %>%
  mutate(index = row_number()) %>%   # index is unique per word, so each score below is +/-1
  count(index, year, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
head(fed_text)
## date year word
## 1 20070131 2007 federal
## 2 20070131 2007 market
## 3 20070131 2007 committee
## 4 20070131 2007 decided
## 5 20070131 2007 target
## 6 20070131 2007 federal
ff <- fed_text %>%
  inner_join(get_sentiments("bing")) %>%
  mutate(linenumber = row_number()) %>%
  count(date, year, index = linenumber %/% 10, sentiment) %>%  # bin every 10 sentiment-bearing words
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
head(ff)
## # A tibble: 6 x 6
## date year index negative positive sentiment
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 20070131 2007 0 4 2 -2
## 2 20070321 2007 0 3 0 -3
## 3 20070321 2007 1 1 0 -1
## 4 20070509 2007 1 5 0 -5
## 5 20070628 2007 1 2 2 0
## 6 20070628 2007 2 2 0 -2
ggplot(ff, aes(index, sentiment, fill = sentiment > 0)) +
  geom_col() +
  facet_wrap(~year, scales = "free", ncol = 4)
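The facets show within-statement swings; to compare years at a glance, the same join can be collapsed to one net score per year (a sketch, not part of the original analysis):
fed_text %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(year, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(net = positive - negative) %>%
  arrange(year)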

Sentiment Analysis
fomcX <- fomcStatements %>%
  mutate(date = statement.dates,
         year = as.numeric(str_extract(statement.dates, '\\d{4}')),
         text = statement.content) %>%
  select(date, year, text)
Unoptimised: by default, analyzeSentiment() evaluates every built-in rule, covering the Harvard General Inquirer (GI), Henry (HE), Loughran-McDonald (LM), and QDAP dictionaries.
sentiment1 <- analyzeSentiment(fomcX$text, language = "english",
                               aggregate = fomcX$year,
                               removeStopwords = TRUE, stemming = TRUE)
head(sentiment1)
## WordCount SentimentGI NegativityGI PositivityGI SentimentHE NegativityHE
## 1 118 0.08474576 0.07627119 0.1610169 0.025423729 0.01694915
## 2 109 0.07339450 0.09174312 0.1651376 0.009174312 0.01834862
## 3 111 0.08108108 0.07207207 0.1531532 0.018018018 0.01801802
## 4 115 0.10434783 0.06956522 0.1739130 0.026086957 0.01739130
## 5 134 0.09701493 0.06716418 0.1641791 0.044776119 0.02238806
## 6 83 0.06024096 0.04819277 0.1084337 0.024096386 0.03614458
## PositivityHE SentimentLM NegativityLM PositivityLM RatioUncertaintyLM
## 1 0.04237288 0.025423729 0.008474576 0.033898305 0.08474576
## 2 0.02752294 -0.027522936 0.036697248 0.009174312 0.04587156
## 3 0.03603604 -0.027027027 0.045045045 0.018018018 0.04504505
## 4 0.04347826 -0.008695652 0.034782609 0.026086957 0.03478261
## 5 0.06716418 -0.029850746 0.044776119 0.014925373 0.05223881
## 6 0.06024096 -0.012048193 0.036144578 0.024096386 0.06024096
## SentimentQDAP NegativityQDAP PositivityQDAP
## 1 0.05932203 0.07627119 0.13559322
## 2 0.06422018 0.07339450 0.13761468
## 3 0.05405405 0.08108108 0.13513514
## 4 0.07826087 0.06956522 0.14782609
## 5 0.05970149 0.08208955 0.14179104
## 6 0.00000000 0.07228916 0.07228916
Generate the analysis based on the Loughran-McDonald financial dictionary only.
Optimised: restricting rules to SentimentLM skips the other dictionaries, so it runs noticeably faster.
sentiment <- analyzeSentiment(fomcX$text, language = "english",
                              aggregate = fomcX$year,
                              removeStopwords = TRUE, stemming = TRUE,
                              rules = list("SentimentLM" = list(ruleSentiment, loadDictionaryLM())))
head(sentiment)
## SentimentLM
## 1 0.025423729
## 2 -0.027522936
## 3 -0.027027027
## 4 -0.008695652
## 5 -0.029850746
## 6 -0.012048193
summary(sentiment$SentimentLM)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.062937 -0.025385 -0.014851 -0.016910 -0.008362 0.025424
Count statements classified as positive or negative. Note that convertToBinaryResponse() classifies whole statements (102 in total), not individual words; under the LM dictionary most statements lean negative.
table(convertToBinaryResponse(sentiment$SentimentLM))
##
## negative positive
## 89 13
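convertToBinaryResponse() splits the scores at zero into two classes; SentimentAnalysis also provides convertToDirection(), which keeps a neutral class for scores of exactly zero. A sketch:
table(convertToDirection(sentiment$SentimentLM))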
Quick plot of the results
sentimentData <- sentiment$SentimentLM
plotSentiment(sentimentData)

hist(sentiment$SentimentLM, probability = TRUE,
     main = "Density of the SentimentLM Scores")
lines(density(sentiment$SentimentLM))

Compute pairwise correlations with the other dictionaries. The weak, even negative, correlation between LM and QDAP shows that dictionary choice materially affects the results.
cor(sentiment1[, c("SentimentLM", "SentimentHE", "SentimentQDAP")])
## SentimentLM SentimentHE SentimentQDAP
## SentimentLM 1.00000000 0.3515640 -0.04534553
## SentimentHE 0.35156396 1.0000000 -0.21018848
## SentimentQDAP -0.04534553 -0.2101885 1.00000000
plotSentiment(sentiment$SentimentLM, x = fomcX$year, cumsum = TRUE, xlab = "year")
