DATA607 Final Project

  # list of reports, comments indicate important events around release of report
# Monetary Policy Report PDFs, newest first (July 2018 back to July 1996).
# Reports from 2007 onwards and reports up to 2006 sit under different paths
# on federalreserve.gov, so the vector is assembled from two base URLs.
# Letter case in the older paths ("July" vs "july", "FullReport" vs
# "fullreport") differs between years and is kept exactly as listed.
reportLinks <- c(
  paste0(
    "https://www.federalreserve.gov/monetarypolicy/files/",
    c(
      "20180713",
      "20170707",
      "20160621",  # released in June, will be labelled July
      "20150715",  # July 2015 (before lift-off)
      "20140715",
      "20130717",  # July 2013 (after Taper Tantrum)
      "20120717",
      "20110713",  # July 2011 (early recovery)
      "20100721",
      "20090721",  # July 2009 (end of Great Recession)
      "20080715",
      "20070718"   # July 2007 (eve of Great Recession)
    ),
    "_mprfullreport.pdf"
  ),
  paste0(
    "https://www.federalreserve.gov/boarddocs/hh/",
    c(
      "2006/july/fullreport.pdf",
      "2005/july/fullreport.pdf",  # July 2005 (housing boom)
      "2004/july/fullreport.pdf",
      "2003/july/FullReport.pdf",  # July 2003 (deflation fears)
      "2002/july/FullReport.pdf",
      "2001/july/FullReport.pdf",  # July 2001 (dot-com recession)
      "2000/July/FullReport.pdf",
      "1999/July/FullReport.pdf",  # July 1999 (eve of dot-com recession)
      "1998/july/FullReport.pdf",
      "1997/july/FullReport.pdf",  # July 1997 (irrational exuberance)
      "1996/july/FullReport.pdf"
    )
  )
)

# Apply pdf_text() to each report URL in turn; the result is a list with one
# element per entry of reportLinks, in the same order.
fed_reports <- reportLinks %>%
  map(pdf_text)

# Sanity check: number of reports that were read.
length(fed_reports)

# Build a long table with one row per text line of every report.
# Columns: report ("Jul-YYYY" label, in the same newest-first order as
# reportLinks), page (element index within the report's pdf_text() result),
# text (one line of text) and line (running line number within the report).
#
# Line endings are normalised to CRLF before splitting. The original code
# split on "\r" only, which separates lines only when pdf_text() returns
# CRLF line breaks; with LF-only text each page stayed a single element and
# the later removal of "\n" glued the last word of one line to the first
# word of the next, distorting every word count downstream. Where the text
# already uses CRLF the result is unchanged.
#
# The result is left grouped by report, as before.
fed <- data.frame(
  report = c("Jul-2018", paste0("Jul-", seq(2017, 1996, -1))),
  stringsAsFactors = FALSE
) %>%
  # attach each report's text as a list column, then expand to one row per page
  mutate(text = fed_reports) %>%
  unnest(text) %>%
  group_by(report) %>%
  mutate(page = row_number()) %>%
  ungroup() %>%
  # make every line break "\r\n" so the split below is platform-independent
  mutate(text = gsub("\r?\n", "\r\n", text)) %>%
  mutate(text = strsplit(text, "\r")) %>%
  unnest(text) %>%
  # drop the "\n" left at the start of each piece by the split on "\r"
  mutate(text = gsub("\n", "", text)) %>%
  group_by(report) %>%
  mutate(line = row_number())

# Persist the parsed lines to disk, then work from the saved copy.
# write.csv() keeps its default row names, so the file read back carries an
# extra leading column (named X by read.csv()).
write.csv(x = fed, file = "fed_reports.csv")

fed_r <- read.csv(file = "fed_reports.csv", stringsAsFactors = FALSE)

# Preview the first rows of the re-loaded table.
head(fed_r)

##   X   report page                                        text line
## 1 1 Jul-2018    3                       Letter of Transmittal    1
## 2 2 Jul-2018    3                   Board of Governors of the    2
## 3 3 Jul-2018    3                      Federal Reserve System    3
## 4 4 Jul-2018    3             Washington, D.C., July 13, 2018    4
## 5 5 Jul-2018    3                 The President of the Senate    5
## 6 6 Jul-2018    3 The Speaker of the House of Representatives    6

# Keep only the four analysis columns; this drops the row-name column X that
# the CSV round-trip added.
fed_x <- dplyr::select(fed_r, report, page, text, line)

# Preview the trimmed table.
head(fed_x)

##     report page                                        text line
## 1 Jul-2018    3                       Letter of Transmittal    1
## 2 Jul-2018    3                   Board of Governors of the    2
## 3 Jul-2018    3                      Federal Reserve System    3
## 4 Jul-2018    3             Washington, D.C., July 13, 2018    4
## 5 Jul-2018    3                 The President of the Senate    5
## 6 Jul-2018    3 The Speaker of the House of Representatives    6

# Tokenise every line into words and count how often each word occurs in
# each report (most frequent first).
words <- fed_x %>%
  unnest_tokens(output = word, input = text) %>%
  count(report, word, sort = TRUE) %>%
  ungroup()

# Total number of word tokens per report: the per-word counts summed up
# within each report.
total_words <- words %>%
  group_by(report) %>%
  summarize(total = sum(n))

Comparing the total word count of each report

# Line-and-point chart of the total word count of each report by year.
# The year is parsed from the "Jul-YYYY" report label instead of being
# supplied as a hard-coded seq(1996, 2018). The hard-coded vector only lined
# up with the data while total_words had exactly 23 rows sorted from 1996 to
# 2018; deriving it from `report` keeps every point tied to its own report.
ggplot(
  data = total_words,
  aes(x = as.integer(sub("^Jul-", "", report)), y = total)
) +
  geom_line(color = "#27408b") +
  geom_point(shape = 21, fill = "white", color = "#27408b",
             size = 3, stroke = 1.1) +
  scale_y_continuous(labels = scales::comma) +
  theme_ridges() +
  labs(x = "year", y = "Words count",
       title = "Words count in Federal Reserve Monetary Policy Reports",
       subtitle = "For July of each year 1996-2018")

Listing the most frequently occurring words in each report, after removing stop words, numbers, and special characters

# One row per word token; the report, page and line columns are carried
# along from fed_x.
fed_text <- unnest_tokens(fed_x, word, text)

# Preview the tokenised table.
head(fed_text)

##       report page line        word
## 1   Jul-2018    3    1      letter
## 1.1 Jul-2018    3    1          of
## 1.2 Jul-2018    3    1 transmittal
## 2   Jul-2018    3    2       board
## 2.1 Jul-2018    3    2          of
## 2.2 Jul-2018    3    2   governors

# Ten most frequent words of each report.
# Steps: strip every character that is not an ASCII letter or a space from
# each token, drop tokens that end up empty, remove stop words, count the
# remaining words per report and rank them by descending count.
fedText <- fed_text %>%
  mutate(word = gsub("[^A-Za-z ]", "", word)) %>%
  filter(word != "") %>%
  # join column is left implicit, as before (see the message echoed below)
  anti_join(stop_words) %>%
  group_by(report) %>%
  count(word, sort = TRUE) %>%
  # rows are sorted by descending n, so the row index within a report is
  # that word's rank in the report
  mutate(rank = row_number()) %>%
  ungroup() %>%
  filter(rank <= 10) %>%
  arrange(rank, report)

## Joining, by = "word"

# First rows: the rank-1 word of the earliest reports.
head(fedText, n = 6L)

## # A tibble: 6 x 4
##   report   word        n  rank
##   <chr>    <chr>   <int> <int>
## 1 Jul-1996 percent   129     1
## 2 Jul-1997 quarter   139     1
## 3 Jul-1998 percent   159     1
## 4 Jul-1999 percent   157     1
## 5 Jul-2000 percent   150     1
## 6 Jul-2001 percent   171     1

# Last rows: the rank-10 word of the most recent reports.
tail(fedText, n = 6L)

## # A tibble: 6 x 4
##   report   word             n  rank
##   <chr>    <chr>        <int> <int>
## 1 Jul-2013 reserve         88    10
## 2 Jul-2014 monetary        96    10
## 3 Jul-2015 monetary        96    10
## 4 Jul-2016 policy          69    10
## 5 Jul-2017 participants   125    10
## 6 Jul-2018 participants   129    10

Stop-words are gone

# Horizontal bar chart of the ten most frequent words, one facet per report.
# fct_reorder(word, n) builds a single ordering of `word` across all
# reports, so bars inside a facet were not sorted by that facet's own
# counts. reorder_within() (tidytext) orders the words separately for each
# report, and scale_x_reordered() strips the per-report suffix it appends
# from the axis labels.
ggplot(fedText, aes(y = n, x = reorder_within(word, n, report))) +
  geom_col(fill = "#27408b") +
  scale_x_reordered() +
  facet_wrap(~report, scales = "free", ncol = 5) +
  coord_flip() +
  theme_ridges(font_size = 10) +
  labs(x = "", y = "",
       title = "Most Frequent Words Federal Reserve Monetary Policy Report")

DATA607 Final Project

Henry Otuadinma

17 March 2019

Comparing the total word count of each report

Listing the most frequently occurring words in each report, after removing stop words, numbers, and special characters

Stop-words are gone