# Load required libraries
library(tidyverse)
library(tidytext)
library(SentimentAnalysis)
library(tm)
library(syuzhet)
library(knitr)
library(broom)
library(quantmod)
# Define paths
fomc_data_path <- "/files/textmining"
lmd_file <- "Loughran-McDonald_MasterDictionary_1993-2023.csv"
# Load text files
fomc_files <- list.files(fomc_data_path, pattern = "\\.txt$", full.names = TRUE)
fomc_contents <- tibble(filename = fomc_files) %>%
  mutate(statement_text = map_chr(filename, read_file))
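The per-year aggregation below depends on every file name carrying a four-digit year, so a quick sanity check is cheap insurance (a sketch; the naming convention is an assumption):

stopifnot(length(fomc_files) > 0)  # fail fast if the directory is empty or the path is wrong
# Assumption: each file name embeds a four-digit year (e.g. "fomc_2015.txt")
bad_files <- fomc_files[!str_detect(basename(fomc_files), "\\d{4}")]
if (length(bad_files) > 0) warning("No year found in: ", paste(bad_files, collapse = ", "))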
# Load the Harvard General Inquirer (GI) sentiment dictionary
harvard_lexicon <- SentimentAnalysis::loadDictionaryGI()
harvard_positive_list <- tolower(harvard_lexicon$positiveWords)
harvard_negative_list <- tolower(harvard_lexicon$negativeWords)
# Tokenize text into one word per row
fomc_tokenized <- fomc_contents %>%
  unnest_tokens(token, statement_text) %>%
  mutate(token = tolower(token))  # unnest_tokens() lowercases by default; kept as a safeguard
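Neither lexicon lists common function words, so stop words never add to the positive or negative counts, but they do inflate the `total_words` denominator. If a stop-word-free denominator is preferred, tidytext's `stop_words` table is one option (a sketch of a variant, not what the pipeline below uses):

# Optional variant: drop English stop words before counting
# (shrinks the total_words denominator; not used in the pipeline below)
fomc_tokens_nostop <- fomc_tokenized %>%
  anti_join(stop_words, by = c("token" = "word"))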
# Compute sentiment scores
total_words_per_year <- fomc_tokenized %>%
  mutate(year = as.integer(str_extract(filename, "\\d{4}"))) %>%
  group_by(year) %>%
  summarise(total_words = n())
sentiment_counts <- fomc_tokenized %>%
  mutate(year = as.integer(str_extract(filename, "\\d{4}"))) %>%
  filter(token %in% c(harvard_positive_list, harvard_negative_list)) %>%
  mutate(sentiment = ifelse(token %in% harvard_positive_list, "positive", "negative")) %>%
  group_by(year, sentiment) %>%
  summarise(count = n(), .groups = "drop") %>%
  pivot_wider(names_from = sentiment, values_from = count, values_fill = 0)
sentiment_evaluation <- left_join(sentiment_counts, total_words_per_year, by = "year") %>%
  mutate(sentiment_score = (positive - negative) / total_words)  # net tone per year, bounded in [-1, 1]
# Plot sentiment over time
ggplot(sentiment_evaluation, aes(x = year, y = sentiment_score)) +
  geom_line(color = "blue") +
  geom_point(size = 2, color = "red") +
  labs(title = "Monetary Policy Sentiment Over Time",
       x = "Year",
       y = "Sentiment Score") +
  theme_minimal()

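syuzhet is loaded but otherwise unused above, so it offers a cheap cross-check of the dictionary-count score (a sketch; the choice of the Bing lexicon is an assumption, and the two scales differ, so correlation is the honest comparison):

# Cross-check with syuzhet's Bing lexicon, normalized by a rough word count
syuzhet_check <- fomc_contents %>%
  mutate(year = as.integer(str_extract(filename, "\\d{4}")),
         bing_score = get_sentiment(statement_text, method = "bing") /
                        str_count(statement_text, "\\S+")) %>%
  group_by(year) %>%
  summarise(bing_score = mean(bing_score))
comparison <- inner_join(sentiment_evaluation, syuzhet_check, by = "year")
cor(comparison$sentiment_score, comparison$bing_score)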
# Load the Loughran-McDonald (LMD) master dictionary
# (the Positive/Negative columns store the year a word was added, 0 otherwise,
# so "> 0" selects current list members)
lmd <- read.csv(lmd_file, header = TRUE)
lmd$Word <- tolower(lmd$Word)
lm_positive_words <- as.character(lmd$Word[lmd$Positive > 0])
lm_negative_words <- as.character(lmd$Word[lmd$Negative > 0])
# Count LMD positive and negative hits across the corpus
positive_hits <- fomc_tokenized %>%
  filter(token %in% lm_positive_words) %>%
  count(token, sort = TRUE)
negative_hits <- fomc_tokenized %>%
  filter(token %in% lm_negative_words) %>%
  count(token, sort = TRUE)
# Plot top 20 positive and negative LMD word frequencies
ggplot() +
  geom_col(data = head(positive_hits, 20),
           aes(x = reorder(token, n), y = n, fill = "Positive")) +
  geom_col(data = head(negative_hits, 20),
           aes(x = reorder(token, n), y = n, fill = "Negative")) +
  coord_flip() +
  scale_fill_manual(values = c("Positive" = "dodgerblue", "Negative" = "firebrick")) +
  labs(title = "Top 20 Positive and Negative Word Frequencies (LMD)",
       x = "Words",
       y = "Frequency",
       fill = "Sentiment") +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 10))

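The LMD counts above are pooled over the whole corpus; for a like-for-like comparison with the Harvard series, the same per-year score can be computed from the LMD lists (a sketch that mirrors the Harvard pipeline above):

# Per-year LMD sentiment score, mirroring the Harvard GI pipeline
lmd_evaluation <- fomc_tokenized %>%
  mutate(year = as.integer(str_extract(filename, "\\d{4}"))) %>%
  filter(token %in% c(lm_positive_words, lm_negative_words)) %>%
  mutate(sentiment = ifelse(token %in% lm_positive_words, "positive", "negative")) %>%
  group_by(year, sentiment) %>%
  summarise(count = n(), .groups = "drop") %>%
  pivot_wider(names_from = sentiment, values_from = count, values_fill = 0) %>%
  left_join(total_words_per_year, by = "year") %>%
  mutate(lmd_score = (positive - negative) / total_words)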
# Fetch S&P 500 daily closing prices from Yahoo Finance
getSymbols("^GSPC", src = "yahoo", from = "2008-01-01", to = "2024-12-31")
## [1] "GSPC"
sp500_prices <- data.frame(date = index(GSPC), coredata(Cl(GSPC))) %>%
  rename(close_price = GSPC.Close) %>%
  mutate(year = as.integer(format(date, "%Y")))
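Note that `Cl(GSPC)` gives daily closing price levels, not returns. If annual returns are the quantity of interest, they can be derived from year-end closes first (a sketch of an alternative; the regression below uses the daily price levels as-is):

# Alternative: annual log returns from year-end closes (not used below)
annual_returns <- sp500_prices %>%
  group_by(year) %>%
  summarise(year_end_close = dplyr::last(close_price)) %>%  # explicit namespace: xts masks dplyr's last()
  mutate(annual_return = log(year_end_close / dplyr::lag(year_end_close)))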
# Merge daily prices with yearly sentiment scores
# (every trading day in a year is paired with that year's sentiment score)
combined_data <- left_join(sp500_prices, sentiment_evaluation, by = "year")
# Regress daily closing price on yearly sentiment
model <- lm(close_price ~ sentiment_score, data = combined_data)
model_summary <- tidy(model)
model_glance <- glance(model)
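Because every trading day within a year shares the same sentiment score, the 2,769 daily observations are far from independent, so the classical standard errors reported below likely overstate precision. Clustering by year is one common remedy (a sketch assuming the sandwich and lmtest packages are installed; they are not loaded above):

# Hypothetical robustness check: standard errors clustered by year
library(sandwich)
library(lmtest)
complete_rows <- combined_data %>% filter(!is.na(sentiment_score))
model_cl <- lm(close_price ~ sentiment_score, data = complete_rows)
coeftest(model_cl, vcov = vcovCL(model_cl, cluster = complete_rows$year))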
# Output results
kable(model_summary, caption = "Summary of Regression Analysis", digits=3)
Table: Summary of Regression Analysis

|term            |  estimate| std.error| statistic| p.value|
|:---------------|---------:|---------:|---------:|-------:|
|(Intercept)     |  1867.291|    45.449|    41.086|   0.000|
|sentiment_score | -5881.268|  1715.968|    -3.427|   0.001|
kable(model_glance, caption = "Full Regression Statistics", digits=3)
Table: Full Regression Statistics

| r.squared| adj.r.squared|   sigma| statistic| p.value| df|    logLik|      AIC|      BIC|  deviance| df.residual| nobs|
|---------:|-------------:|-------:|---------:|-------:|--:|---------:|--------:|--------:|---------:|-----------:|----:|
|     0.004|         0.004| 563.298|    11.747|   0.001|  1| -21466.35| 42938.71| 42956.49| 877980861|        2767| 2769|
# Plot regression
ggplot(combined_data, aes(x = sentiment_score, y = close_price)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, color = "blue") +
  labs(title = "S&P 500 Closing Price vs. Monetary Policy Sentiment",
       x = "Monetary Policy Sentiment Score",
       y = "S&P 500 Daily Closing Price") +
  theme_minimal()
(The 1,509 daily observations from years without a matching sentiment score are dropped by both the regression and the plot.)
