1️⃣ Setup and Install Required Packages

# Install Required Packages
target_packages <- c("tidyverse", "tidytext", "tm", "SentimentAnalysis", "ggplot2", "readr", "quantmod", "tokenizers")
missing_packages <- target_packages[!(target_packages %in% installed.packages()[,"Package"])]
if(length(missing_packages)) install.packages(missing_packages)

# Load Required Libraries
library(tidyverse)
library(tidytext)
library(tm)
library(SentimentAnalysis)
library(ggplot2)  # already attached by tidyverse; loaded explicitly for completeness
library(readr)    # already attached by tidyverse; loaded explicitly for completeness
library(quantmod)
library(tokenizers)
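
These packages print attach messages and masking warnings when loaded (the tidyverse conflict notes, the xts warning about dplyr::lag(), and so on), which clutter a knitted document. A minimal alternative, using base R's suppressPackageStartupMessages(), loads everything quietly:

# Load every package without attach messages; invisible() hides lapply()'s return value
invisible(lapply(target_packages, function(pkg) {
  suppressPackageStartupMessages(library(pkg, character.only = TRUE))
}))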

2️⃣ Load and Preprocess FOMC Text Data

# Function to Read Text Files with UTF-8 Encoding
read_text_files <- function(path) {
  files <- list.files(path, pattern = "\\.txt$", full.names = TRUE)
  texts <- tibble(
    file = basename(files),
    content = map_chr(files, ~ read_file(.x, locale = locale(encoding = "UTF-8")))
  )
  return(texts)
}

# Load FOMC Statements from the same directory as the R Markdown file
fomc_statements <- read_text_files("./")

# Scrub any stray non-UTF-8 bytes that survive the initial read
fomc_statements$content <- iconv(fomc_statements$content, from = "latin1", to = "UTF-8", sub = "")

# Keep the file name as a document ID so tokens can be traced back to their statement
fomc_df <- data.frame(doc_id = fomc_statements$file,
                      text = as.character(fomc_statements$content),
                      stringsAsFactors = FALSE)
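
If the statement files encode their release date in the name (for example, a hypothetical fomc_20240131.txt convention; the pattern below assumes an 8-digit date and should be adapted to the actual file names), extracting it now makes it easy to align sentiment with market data from quantmod later:

# Pull an 8-digit date out of file names like "fomc_20240131.txt"
# (hypothetical naming convention; adjust the pattern to your files)
fomc_df$date <- as.Date(str_extract(fomc_df$doc_id, "\\d{8}"), format = "%Y%m%d")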

3️⃣ Text Cleaning: Tokenization & Preprocessing

# Tokenization and Removing Stopwords
fomc_tokens <- fomc_df %>%
  unnest_tokens(word, text) %>%
  anti_join(get_stopwords(), by = "word")

# Remove tokens that contain digits
fomc_tokens <- fomc_tokens %>% filter(!str_detect(word, "\\d"))

# Perform Stemming (Optional: Replace with Lemmatization if Needed)
fomc_tokens <- fomc_tokens %>% mutate(word = SnowballC::wordStem(word))
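
Stemming can leave truncated tokens such as "inflat" or "polici". If readable lemmas are preferred, one option, assuming the textstem package is installed, is its lemmatize_words() helper, applied here to a fresh copy of the tokens for comparison:

# Dictionary-based lemmatization via textstem (assumes install.packages("textstem") has been run)
fomc_tokens_lemma <- fomc_df %>%
  unnest_tokens(word, text) %>%
  anti_join(get_stopwords(), by = "word") %>%
  filter(!str_detect(word, "\\d")) %>%
  mutate(word = textstem::lemmatize_words(word))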

# Create Document-Term Matrix (DTM)
# doc_id (the source file name) carries through unnest_tokens(), so each
# statement is one document rather than each token getting its own row
dtm <- fomc_tokens %>%
  count(doc_id, word) %>%
  cast_dtm(document = doc_id, term = word, value = n)
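
A quick look at the DTM's dimensions confirms that one row per statement (not per token) came through; tm's removeSparseTerms() can additionally drop rare terms, with the 0.99 threshold below chosen arbitrarily:

# Rows should equal the number of statements; columns the number of distinct stems
dim(dtm)

# Optionally drop terms absent from more than 99% of documents (arbitrary threshold)
dtm_trimmed <- removeSparseTerms(dtm, sparse = 0.99)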
# Count word frequencies and keep the 20 most frequent terms
top_words <- fomc_tokens %>%
  count(word, sort = TRUE) %>%
  slice_max(n, n = 20)  # slice_max() supersedes the older top_n()

# Plot Top 20 Words
ggplot(top_words, aes(x = reorder(word, n), y = n)) +
  geom_col(fill = "steelblue") +  # geom_col() is the idiomatic bar-of-values geom
  coord_flip() +  # Flip axes so long words stay readable
  labs(title = "Top 20 Most Frequent Words in FOMC Statements",
       x = "Word",
       y = "Frequency") +
  theme_minimal()
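
Raw counts are dominated by terms common to every statement. A tf-idf weighting with tidytext's bind_tf_idf() instead surfaces the words that distinguish one statement from the rest:

# Weight terms by tf-idf to highlight statement-specific vocabulary
fomc_tfidf <- fomc_tokens %>%
  count(doc_id, word) %>%
  bind_tf_idf(word, doc_id, n) %>%
  arrange(desc(tf_idf))

head(fomc_tfidf, 10)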

4️⃣ Sentiment Analysis: LM & GI Dictionaries

# Load Sentiment Dictionaries (LM = Loughran-McDonald, finance-specific; GI = Harvard General Inquirer)
dict_lm <- loadDictionaryLM()
dict_gi <- loadDictionaryGI()

# Run Sentiment Analysis on FOMC Statements
# analyzeSentiment() scores each text with its built-in dictionaries (LM and GI among them)
sentiment_scores <- analyzeSentiment(fomc_df$text)

# analyzeSentiment() returns a data.frame with one row per document and one
# column per dictionary score. Note that is.list() is TRUE for a data.frame,
# so testing is.list() here would wrongly trigger an rbind() that mangles the
# result; test is.data.frame() instead.
if (!is.data.frame(sentiment_scores)) {
  sentiment_scores <- as.data.frame(sentiment_scores)
}
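
With the scores in a data.frame (one row per statement), the finance-specific LM column can be compared against GI and converted into a positive/negative label with SentimentAnalysis's convertToBinaryResponse(); a brief sketch:

# Attach document IDs and compare the LM (finance-specific) and GI scores
sentiment_scores$doc_id <- fomc_df$doc_id
summary(sentiment_scores[, c("SentimentLM", "SentimentGI")])

# Classify each statement as positive or negative by its LM polarity
sentiment_scores$direction_lm <- convertToBinaryResponse(sentiment_scores$SentimentLM)
table(sentiment_scores$direction_lm)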