# Load required libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidytext)
library(SentimentAnalysis)
## 
## Attaching package: 'SentimentAnalysis'
## 
## The following object is masked from 'package:base':
## 
##     write
library(ggplot2)
library(readr)
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## 
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(syuzhet)
library(knitr)
library(broom)
library(quantmod)
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'xts'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
# Define paths
fomc_data_path <- "/files/textmining"
lmd_file <- "Loughran-McDonald_MasterDictionary_1993-2023.csv"

# Load text files
fomc_files <- list.files(fomc_data_path, pattern = "\\.txt$", full.names = TRUE)
fomc_contents <- tibble(filename = fomc_files) %>%
  mutate(statement_text = map_chr(filename, read_file))
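
# Optional refactor (a sketch, assuming each filename contains a four-digit year):
# extract the year once here so later steps do not each re-derive it from the path.
fomc_contents <- fomc_contents %>%
  mutate(year = as.integer(str_extract(filename, "\\d{4}")))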

# Load the Harvard General Inquirer sentiment dictionary
harvard_lexicon <- SentimentAnalysis::loadDictionaryGI()
harvard_positive_list <- tolower(harvard_lexicon$positiveWords)
harvard_negative_list <- tolower(harvard_lexicon$negativeWords)

# Tokenize text
fomc_tokenized <- fomc_contents %>%
  unnest_tokens(token, statement_text) %>%
  mutate(token = tolower(token))

# Compute yearly net sentiment scores using the Harvard dictionary
total_words_per_year <- fomc_tokenized %>%
  mutate(year = as.integer(str_extract(filename, "\\d{4}"))) %>%
  group_by(year) %>%
  summarise(total_words = n())

sentiment_counts <- fomc_tokenized %>%
  mutate(year = as.integer(str_extract(filename, "\\d{4}"))) %>%
  filter(token %in% c(harvard_positive_list, harvard_negative_list)) %>%
  mutate(sentiment = ifelse(token %in% harvard_positive_list, "positive", "negative")) %>%
  group_by(year, sentiment) %>%
  summarise(count = n(), .groups = "drop") %>%
  pivot_wider(names_from = sentiment, values_from = count, values_fill = 0)
sentiment_evaluation <- left_join(sentiment_counts, total_words_per_year, by = "year") %>%
  mutate(sentiment_score = (positive - negative) / total_words)
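
# Optional cross-check (a sketch, not part of the original pipeline): syuzhet is loaded
# above but otherwise unused, so one way to sanity-check the dictionary-based score is to
# score each full statement with syuzhet::get_sentiment() and average by year; the scale
# differs from the Harvard-based score, but the direction should broadly agree.
syuzhet_by_year <- fomc_contents %>%
  mutate(year = as.integer(str_extract(filename, "\\d{4}")),
         syuzhet_score = get_sentiment(statement_text, method = "syuzhet")) %>%
  group_by(year) %>%
  summarise(mean_syuzhet = mean(syuzhet_score), .groups = "drop")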

# Plot sentiment over time
ggplot(sentiment_evaluation, aes(x = year, y = sentiment_score)) +
  geom_line(color = "blue") +
  geom_point(size = 2, color = "red") +
  labs(title = "Monetary Policy Sentiment Over Time",
       x = "Year",
       y = "Sentiment Score") +
  theme_minimal()

# Load Loughran-McDonald dictionary
lmd <- read.csv(lmd_file, header = TRUE)
lmd$Word <- tolower(lmd$Word)
lm_positive_words <- as.character(lmd$Word[lmd$Positive > 0])
lm_negative_words <- as.character(lmd$Word[lmd$Negative > 0])

# Count positive and negative word hits under the Loughran-McDonald dictionary
positive_hits <- fomc_tokenized %>% filter(token %in% lm_positive_words) %>% count(token, sort = TRUE)
negative_hits <- fomc_tokenized %>% filter(token %in% lm_negative_words) %>% count(token, sort = TRUE)
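
# For a direct dictionary comparison (a sketch mirroring the Harvard-based score above,
# not part of the original output), the LM word lists can also be turned into a yearly
# net-sentiment score and set alongside the Harvard-based one.
lm_sentiment_by_year <- fomc_tokenized %>%
  mutate(year = as.integer(str_extract(filename, "\\d{4}")),
         sentiment = case_when(
           token %in% lm_positive_words ~ "positive",
           token %in% lm_negative_words ~ "negative",
           TRUE ~ NA_character_)) %>%
  filter(!is.na(sentiment)) %>%
  count(year, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  left_join(total_words_per_year, by = "year") %>%
  mutate(lm_sentiment_score = (positive - negative) / total_words)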

# Plot the most frequent positive and negative words under the Loughran-McDonald dictionary
ggplot() +
  geom_col(data = head(positive_hits, 20), aes(x = reorder(token, n), y = n, fill = "Positive"), position = "dodge") +
  geom_col(data = head(negative_hits, 20), aes(x = reorder(token, n), y = n, fill = "Negative"), position = "dodge") +
  coord_flip() +
  scale_fill_manual(values = c("Positive" = "dodgerblue", "Negative" = "firebrick")) +
  labs(title = "Comparison of Top 20 Positive and Negative Word Frequencies",
       x = "Words",
       y = "Frequency") +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 10))

# Fetch S&P 500 data from Yahoo Finance
getSymbols("^GSPC", src = "yahoo", from = "2008-01-01", to = "2024-12-31")
## [1] "GSPC"
# Note: Cl() extracts daily closing price levels, so the "returns" column below
# holds price levels rather than period returns
sp500_returns <- data.frame(date = index(GSPC), coredata(Cl(GSPC))) %>%
  rename(year = date, returns = GSPC.Close) %>%
  mutate(year = as.integer(format(year, "%Y")))
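
# If annual returns (rather than price levels) are intended, a sketch using
# quantmod::yearlyReturn() would look like the following; note that the regression
# below still uses the closing-price series constructed above.
gspc_yearly <- yearlyReturn(Cl(GSPC))
sp500_annual_returns <- data.frame(date = index(gspc_yearly), coredata(gspc_yearly)) %>%
  transmute(year = as.integer(format(date, "%Y")),
            annual_return = yearly.returns)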

# Merge S&P 500 data with sentiment scores (each daily observation inherits its year's score)
combined_data <- left_join(sp500_returns, sentiment_evaluation, by = "year")

# Regress daily S&P 500 closing prices on the annual sentiment score
model <- lm(returns ~ sentiment_score, data = combined_data)
model_summary <- tidy(model)
model_glance <- glance(model)

# Output results
kable(model_summary, caption = "Summary of Regression Analysis", digits=3)
Table: Summary of Regression Analysis

| term            |  estimate | std.error | statistic | p.value |
|:----------------|----------:|----------:|----------:|--------:|
| (Intercept)     |  1867.291 |    45.449 |    41.086 |   0.000 |
| sentiment_score | -5881.268 |  1715.968 |    -3.427 |   0.001 |
kable(model_glance, caption = "Full Regression Statistics", digits=3)
Table: Full Regression Statistics

| r.squared | adj.r.squared |   sigma | statistic | p.value | df |    logLik |      AIC |      BIC |  deviance | df.residual | nobs |
|----------:|--------------:|--------:|----------:|--------:|---:|----------:|---------:|---------:|----------:|------------:|-----:|
|     0.004 |         0.004 | 563.298 |    11.747 |   0.001 |  1 | -21466.35 | 42938.71 | 42956.49 | 877980861 |        2767 | 2769 |
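
# A quick diagnostic sketch (not part of the original output): broom::augment() exposes
# fitted values and residuals, which are worth inspecting given the very low R-squared above.
model_diagnostics <- augment(model)
ggplot(model_diagnostics, aes(x = .fitted, y = .resid)) +
  geom_point(alpha = 0.3) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(title = "Residuals vs. Fitted Values",
       x = "Fitted values",
       y = "Residuals") +
  theme_minimal()
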
# Plot regression
ggplot(combined_data, aes(x = sentiment_score, y = returns)) +
  geom_point() +
  geom_smooth(method = "lm", color = "blue") +
  labs(title = "S&P 500 Returns vs. Monetary Policy Sentiment",
       x = "Monetary Policy Sentiment Score",
       y = "S&P 500 Annual Returns") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1509 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1509 rows containing missing values or values outside the scale range
## (`geom_point()`).