# Load required libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidytext)
library(SentimentAnalysis)
## 
## Attaching package: 'SentimentAnalysis'
## 
## The following object is masked from 'package:base':
## 
##     write
library(ggplot2)
library(readr)
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## 
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(syuzhet)
library(knitr)
library(broom)
library(quantmod)
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'xts'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
# Define paths
fomc_data_path <- "/files/textmining"
lmd_file <- "Loughran-McDonald_MasterDictionary_1993-2023.csv"

# Load text files
fomc_files <- list.files(fomc_data_path, pattern = "\\.txt$", full.names = TRUE)
fomc_contents <- tibble(filename = fomc_files) %>%
  mutate(statement_text = map_chr(filename, read_file))
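
# Optional refactor (a sketch, assuming each filename contains a four-digit year):
# extract the year once here so later steps do not each re-derive it from the path.
fomc_contents <- fomc_contents %>%
  mutate(year = as.integer(str_extract(filename, "\\d{4}")))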

# Load the Harvard General Inquirer sentiment dictionary
harvard_lexicon <- SentimentAnalysis::loadDictionaryGI()
harvard_positive_list <- tolower(harvard_lexicon$positiveWords)
harvard_negative_list <- tolower(harvard_lexicon$negativeWords)

# Tokenize text
fomc_tokenized <- fomc_contents %>%
  unnest_tokens(token, statement_text) %>%
  mutate(token = tolower(token))

# Compute yearly net sentiment scores using the Harvard dictionary
total_words_per_year <- fomc_tokenized %>%
  mutate(year = as.integer(str_extract(filename, "\\d{4}"))) %>%
  group_by(year) %>%
  summarise(total_words = n())

sentiment_counts <- fomc_tokenized %>%
  mutate(year = as.integer(str_extract(filename, "\\d{4}"))) %>%
  filter(token %in% c(harvard_positive_list, harvard_negative_list)) %>%
  mutate(sentiment = ifelse(token %in% harvard_positive_list, "positive", "negative")) %>%
  group_by(year, sentiment) %>%
  summarise(count = n(), .groups = "drop") %>%
  pivot_wider(names_from = sentiment, values_from = count, values_fill = 0)
sentiment_evaluation <- left_join(sentiment_counts, total_words_per_year, by = "year") %>%
  mutate(sentiment_score = (positive - negative) / total_words)
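
# Optional cross-check (a sketch, not part of the original pipeline): syuzhet is loaded
# above but otherwise unused, so one way to sanity-check the dictionary-based score is to
# score each full statement with syuzhet::get_sentiment() and average by year; the scale
# differs from the Harvard-based score, but the direction should broadly agree.
syuzhet_by_year <- fomc_contents %>%
  mutate(year = as.integer(str_extract(filename, "\\d{4}")),
         syuzhet_score = get_sentiment(statement_text, method = "syuzhet")) %>%
  group_by(year) %>%
  summarise(mean_syuzhet = mean(syuzhet_score), .groups = "drop")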

# Plot sentiment over time
ggplot(sentiment_evaluation, aes(x = year, y = sentiment_score)) +
  geom_line(color = "blue") +
  geom_point(size = 2, color = "red") +
  labs(title = "Monetary Policy Sentiment Over Time",
       x = "Year",
       y = "Sentiment Score") +
  theme_minimal()

# Load Loughran-McDonald dictionary
lmd <- read.csv(lmd_file, header = TRUE)
lmd$Word <- tolower(lmd$Word)
lm_positive_words <- as.character(lmd$Word[lmd$Positive > 0])
lm_negative_words <- as.character(lmd$Word[lmd$Negative > 0])

# Count positive and negative word hits under the Loughran-McDonald dictionary
positive_hits <- fomc_tokenized %>% filter(token %in% lm_positive_words) %>% count(token, sort = TRUE)
negative_hits <- fomc_tokenized %>% filter(token %in% lm_negative_words) %>% count(token, sort = TRUE)
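
# For a direct dictionary comparison (a sketch mirroring the Harvard-based score above,
# not part of the original output), the LM word lists can also be turned into a yearly
# net-sentiment score and set alongside the Harvard-based one.
lm_sentiment_by_year <- fomc_tokenized %>%
  mutate(year = as.integer(str_extract(filename, "\\d{4}")),
         sentiment = case_when(
           token %in% lm_positive_words ~ "positive",
           token %in% lm_negative_words ~ "negative",
           TRUE ~ NA_character_)) %>%
  filter(!is.na(sentiment)) %>%
  count(year, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  left_join(total_words_per_year, by = "year") %>%
  mutate(lm_sentiment_score = (positive - negative) / total_words)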

# Plot the most frequent positive and negative words under the Loughran-McDonald dictionary
ggplot() +
  geom_col(data = head(positive_hits, 20), aes(x = reorder(token, n), y = n, fill = "Positive"), position = "dodge") +
  geom_col(data = head(negative_hits, 20), aes(x = reorder(token, n), y = n, fill = "Negative"), position = "dodge") +
  coord_flip() +
  scale_fill_manual(values = c("Positive" = "dodgerblue", "Negative" = "firebrick")) +
  labs(title = "Comparison of Top 20 Positive and Negative Word Frequencies",
       x = "Words",
       y = "Frequency") +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 10))

# Fetch S&P 500 data from Yahoo Finance
getSymbols("^GSPC", src = "yahoo", from = "2008-01-01", to = "2024-12-31")
## [1] "GSPC"
# Note: Cl() extracts daily closing price levels, so the "returns" column below
# holds price levels rather than period returns
sp500_returns <- data.frame(date = index(GSPC), coredata(Cl(GSPC))) %>%
  rename(year = date, returns = GSPC.Close) %>%
  mutate(year = as.integer(format(year, "%Y")))
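
# If annual returns (rather than price levels) are intended, a sketch using
# quantmod::yearlyReturn() would look like the following; note that the regression
# below still uses the closing-price series constructed above.
gspc_yearly <- yearlyReturn(Cl(GSPC))
sp500_annual_returns <- data.frame(date = index(gspc_yearly), coredata(gspc_yearly)) %>%
  transmute(year = as.integer(format(date, "%Y")),
            annual_return = yearly.returns)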

# Merge S&P 500 data with sentiment scores (each daily observation inherits its year's score)
combined_data <- left_join(sp500_returns, sentiment_evaluation, by = "year")

# Regress daily S&P 500 closing prices on the annual sentiment score
model <- lm(returns ~ sentiment_score, data = combined_data)
model_summary <- tidy(model)
model_glance <- glance(model)

# Output results
kable(model_summary, caption = "Summary of Regression Analysis", digits=3)
Table: Summary of Regression Analysis

| term            |  estimate | std.error | statistic | p.value |
|:----------------|----------:|----------:|----------:|--------:|
| (Intercept)     |  1867.291 |    45.449 |    41.086 |   0.000 |
| sentiment_score | -5881.268 |  1715.968 |    -3.427 |   0.001 |
kable(model_glance, caption = "Full Regression Statistics", digits=3)
Table: Full Regression Statistics

| r.squared | adj.r.squared |   sigma | statistic | p.value | df |    logLik |      AIC |      BIC |  deviance | df.residual | nobs |
|----------:|--------------:|--------:|----------:|--------:|---:|----------:|---------:|---------:|----------:|------------:|-----:|
|     0.004 |         0.004 | 563.298 |    11.747 |   0.001 |  1 | -21466.35 | 42938.71 | 42956.49 | 877980861 |        2767 | 2769 |
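
# A quick diagnostic sketch (not part of the original output): broom::augment() exposes
# fitted values and residuals, which are worth inspecting given the very low R-squared above.
model_diagnostics <- augment(model)
ggplot(model_diagnostics, aes(x = .fitted, y = .resid)) +
  geom_point(alpha = 0.3) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(title = "Residuals vs. Fitted Values",
       x = "Fitted values",
       y = "Residuals") +
  theme_minimal()
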
# Plot regression
ggplot(combined_data, aes(x = sentiment_score, y = returns)) +
  geom_point() +
  geom_smooth(method = "lm", color = "blue") +
  labs(title = "S&P 500 Returns vs. Monetary Policy Sentiment",
       x = "Monetary Policy Sentiment Score",
       y = "S&P 500 Annual Returns") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1509 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1509 rows containing missing values or values outside the scale range
## (`geom_point()`).