Read Data
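The code in this post assumes the following packages are attached; the list is inferred from the functions used below and is a sketch rather than part of the original script.

library(tidyverse)          # dplyr, tidyr, stringr, ggplot2, forcats
library(tidytext)           # unnest_tokens, stop_words, bind_tf_idf, get_sentiments
library(tm)                 # stripWhitespace
library(ggridges)           # theme_ridges
library(wordcloud)          # wordcloud
library(RColorBrewer)       # brewer.pal
library(SentimentAnalysis)  # analyzeSentiment, plotSentiment, convertToBinaryResponse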
fomcStatements <- readRDS(file = "fomc_corrected_data_v1.rds") %>%
  select(statement.dates, statement.content)
Exploratory Analysis
fomcS <- fomcStatements %>%
  mutate(date = statement.dates,
         year = as.numeric(str_extract(statement.dates, '\\d{4}')),
         text = statement.content) %>%
  unnest(text) %>%
  unnest_tokens(word, text) %>%      # one row per word
  anti_join(stop_words) %>%          # drop common English stop words
  count(date, year, word, sort = TRUE) %>%
  mutate(frequency = n) %>%
  select(date, year, word, frequency)
## Joining, by = "word"
head(fomcS)
## # A tibble: 6 x 4
##   date      year word      frequency
##   <chr>    <dbl> <chr>         <int>
## 1 20140319  2014 committee        19
## 2 20140430  2014 committee        18
## 3 20140618  2014 committee        18
## 4 20141029  2014 committee        18
## 5 20141217  2014 committee        18
## 6 20130619  2013 committee        17
tf-idf
fomc_x <- fomcS %>%
  bind_tf_idf(word, year, frequency) %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  group_by(year) %>%
  mutate(id = row_number()) %>%
  ungroup()
head(fomc_x)
## # A tibble: 6 x 8
##   date      year word         frequency      tf   idf  tf_idf    id
##   <chr>    <dbl> <fct>            <int>   <dbl> <dbl>   <dbl> <int>
## 1 20081008  2008 central              7 0.00392  1.18 0.00462     1
## 2 20190320  2019 result               1 0.00152  2.56 0.00390     1
## 3 20091216  2009 2010                 5 0.00240  1.47 0.00352     1
## 4 20120620  2012 continuation         3 0.00127  2.56 0.00326     1
## 5 20081008  2008 canada               2 0.00112  2.56 0.00287     2
## 6 20081008  2008 england              2 0.00112  2.56 0.00287     3
tail(fomc_x)
## # A tibble: 6 x 8
##   date      year word      frequency      tf   idf  tf_idf    id
##   <chr>    <dbl> <fct>         <int>   <dbl> <dbl>   <dbl> <int>
## 1 20190130  2019 committee         6 0.00913 -2.06 -0.0188   433
## 2 20190320  2019 committee         6 0.00913 -2.06 -0.0188   434
## 3 20190501  2019 committee         6 0.00913 -2.06 -0.0188   435
## 4 20190130  2019 inflation         9 0.0137  -2.05 -0.0281   436
## 5 20190320  2019 inflation         9 0.0137  -2.05 -0.0281   437
## 6 20190501  2019 inflation         9 0.0137  -2.05 -0.0281   438
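The negative idf and tf_idf values at the tail arise because fomcS carries one row per statement date, so the same year/word pair enters bind_tf_idf once per meeting and inflates the term's document frequency. One way around this (a sketch, not part of the original analysis) is to aggregate the counts to the year level before computing tf-idf:

fomc_year <- fomcS %>%
  group_by(year, word) %>%
  summarise(frequency = sum(frequency), .groups = "drop") %>%  # one row per year/word
  bind_tf_idf(word, year, frequency) %>%
  arrange(desc(tf_idf))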

What are they talking about?

fed_text <- fomcStatements %>%
  mutate(date = statement.dates,
         year = as.numeric(str_extract(statement.dates, '\\d{4}')),
         text = statement.content) %>%
  unnest(text) %>%
  unnest_tokens(word, text) %>%
  mutate(word = stripWhitespace(gsub("[^A-Za-z ]", " ", word))) %>%
  filter(word != "", word != " ") %>%
  anti_join(stop_words) %>%
  select(date, year, word)
## Joining, by = "word"
f_text <- fed_text %>%
  group_by(year) %>%
  count(word, sort = TRUE) %>%
  mutate(rank = row_number()) %>%
  ungroup() %>%
  arrange(rank, year)
head(f_text)
## # A tibble: 6 x 4
##    year word          n  rank
##   <dbl> <chr>     <int> <int>
## 1  2007 inflation    34     1
## 2  2008 federal      48     1
## 3  2009 federal      67     1
## 4  2010 committee    50     1
## 5  2011 committee    89     1
## 6  2012 committee    87     1
gg <- f_text %>%
  filter(rank < 11) %>%
  ggplot(aes(y = n, x = fct_reorder(word, n))) +
  geom_col(fill = "#27408b") +
  facet_wrap(~year, scales = "free", ncol = 4) +
  coord_flip() +
  theme_ridges(font_size = 11) +
  labs(x = "", y = "",
       title = "Most Frequent Words in FOMC Statements by Year (2007 - 2019)")

gg

wordcloud(words = f_text$word, freq = f_text$n, min.freq = 1,
          max.words=200, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"))
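Note that f_text holds one count per year/word pair, so the same word can enter the cloud more than once. If a single cloud over the whole 2007 - 2019 period is wanted, the counts can be aggregated first (a sketch, not in the original post):

f_total <- fed_text %>% count(word, sort = TRUE)
wordcloud(words = f_total$word, freq = f_total$n, min.freq = 1,
          max.words = 200, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))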

Recession period (2008)

fed_textb <- fed_text %>%
  count(date, year, word, sort = TRUE) %>%
  bind_tf_idf(word, date, n) %>%   # here each statement date is a document
  arrange(desc(tf_idf))

fT <- fed_textb %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  group_by(date) %>%
  mutate(id = row_number()) %>%
  ungroup() %>%
  arrange(desc(n)) %>%
  filter(id < 100)

dt <- fT %>% filter(year == 2008)   # fT is already limited to id < 100

ggplot(head(dt, 180), aes(word, tf_idf, fill = date)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~date, scales = "free", ncol = 4) +
  coord_flip() +
  theme_ridges(font_size = 10) +
  theme(axis.text.x = element_blank()) +
  labs(x = "", y = "tf-idf",
       title = "Highest tf-idf words in FOMC statements during the 2008 recession")

Sentiment classification using the Bing lexicon

fed_sentiment <- fed_text %>%
  inner_join(get_sentiments("bing")) %>%
  mutate(index = row_number()) %>%
  count(index, year, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
head(fed_text)
##       date year      word
## 1 20070131 2007   federal
## 2 20070131 2007    market
## 3 20070131 2007 committee
## 4 20070131 2007   decided
## 5 20070131 2007    target
## 6 20070131 2007   federal
ff <- fed_text %>%
  inner_join(get_sentiments("bing")) %>%
  mutate(linenumber = row_number()) %>%
  count(date, year, index = linenumber %/% 10, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
head(ff)
## # A tibble: 6 x 6
##   date      year index negative positive sentiment
##   <chr>    <dbl> <dbl>    <dbl>    <dbl>     <dbl>
## 1 20070131  2007     0        4        2        -2
## 2 20070321  2007     0        3        0        -3
## 3 20070321  2007     1        1        0        -1
## 4 20070509  2007     1        5        0        -5
## 5 20070628  2007     1        2        2         0
## 6 20070628  2007     2        2        0        -2
ggplot(ff, aes(index, sentiment, fill = sentiment > 0)) +
  geom_col() +
  facet_wrap(~year, scales = "free", ncol = 4)
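The facets show the per-chunk scores within each year. For a single net figure per year, the chunk counts can be rolled up (a sketch, not part of the original analysis):

ff_year <- ff %>%
  group_by(year) %>%
  summarise(negative = sum(negative),
            positive = sum(positive),
            sentiment = positive - negative,
            .groups = "drop")
ff_year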

Sentiment Analysis

fomcX <- fomcStatements %>%
  mutate(date = statement.dates,
         year = as.numeric(str_extract(statement.dates, '\\d{4}')),
         text = statement.content) %>%
  select(date, year, text)
Unoptimised - scores every built-in dictionary (GI, Henry, Loughran-McDonald, QDAP)
sentiment1 <- analyzeSentiment(fomcX$text, language = "english",
                               aggregate = fomcX$year,
                               removeStopwords = TRUE, stemming = TRUE)
head(sentiment1)
##   WordCount SentimentGI NegativityGI PositivityGI SentimentHE NegativityHE
## 1       118  0.08474576   0.07627119    0.1610169 0.025423729   0.01694915
## 2       109  0.07339450   0.09174312    0.1651376 0.009174312   0.01834862
## 3       111  0.08108108   0.07207207    0.1531532 0.018018018   0.01801802
## 4       115  0.10434783   0.06956522    0.1739130 0.026086957   0.01739130
## 5       134  0.09701493   0.06716418    0.1641791 0.044776119   0.02238806
## 6        83  0.06024096   0.04819277    0.1084337 0.024096386   0.03614458
##   PositivityHE  SentimentLM NegativityLM PositivityLM RatioUncertaintyLM
## 1   0.04237288  0.025423729  0.008474576  0.033898305         0.08474576
## 2   0.02752294 -0.027522936  0.036697248  0.009174312         0.04587156
## 3   0.03603604 -0.027027027  0.045045045  0.018018018         0.04504505
## 4   0.04347826 -0.008695652  0.034782609  0.026086957         0.03478261
## 5   0.06716418 -0.029850746  0.044776119  0.014925373         0.05223881
## 6   0.06024096 -0.012048193  0.036144578  0.024096386         0.06024096
##   SentimentQDAP NegativityQDAP PositivityQDAP
## 1    0.05932203     0.07627119     0.13559322
## 2    0.06422018     0.07339450     0.13761468
## 3    0.05405405     0.08108108     0.13513514
## 4    0.07826087     0.06956522     0.14782609
## 5    0.05970149     0.08208955     0.14179104
## 6    0.00000000     0.07228916     0.07228916

Generate the analysis based on the Loughran-McDonald financial dictionary

Optimised - runs faster because the rules argument restricts scoring to the Loughran-McDonald dictionary (SentimentLM) only
sentiment <- analyzeSentiment(fomcX$text, language = "english",
                              aggregate = fomcX$year,
                              removeStopwords = TRUE, stemming = TRUE,
                              rules = list("SentimentLM" = list(ruleSentiment, loadDictionaryLM())))
head(sentiment)
##    SentimentLM
## 1  0.025423729
## 2 -0.027522936
## 3 -0.027027027
## 4 -0.008695652
## 5 -0.029850746
## 6 -0.012048193
summary(sentiment$SentimentLM)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -0.062937 -0.025385 -0.014851 -0.016910 -0.008362  0.025424
Count statements classified as positive or negative
table(convertToBinaryResponse(sentiment$SentimentLM))
## 
## negative positive 
##       89       13
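To see which statements fall in each class, the binary responses can be put next to the statement dates (a sketch; the responses object and its column names are illustrative):

responses <- data.frame(date = fomcX$date,
                        sentimentLM = sentiment$SentimentLM,
                        response = convertToBinaryResponse(sentiment$SentimentLM))
head(responses[order(responses$sentimentLM, decreasing = TRUE), ])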

Quick plot of the results

sentimentData <- sentiment$SentimentLM   
plotSentiment(sentimentData)

hist(sentiment$SentimentLM, probability = TRUE,
     main = "Distribution of the Loughran-McDonald sentiment scores")
lines(density(sentiment$SentimentLM))

Compute correlation with the other dictionaries

cor(sentiment1[, c("SentimentLM", "SentimentHE", "SentimentQDAP")])
##               SentimentLM SentimentHE SentimentQDAP
## SentimentLM    1.00000000   0.3515640   -0.04534553
## SentimentHE    0.35156396   1.0000000   -0.21018848
## SentimentQDAP -0.04534553  -0.2101885    1.00000000
plotSentiment(sentiment$SentimentLM, x=fomcX$year, cumsum=TRUE, xlab = "year")