# Load necessary libraries
library(tidyverse)
library(tidytext)
library(lubridate)
library(ggplot2)
library(textdata)
library(wordcloud)
library(corrplot)
BAIS-462 Final Project
Yahoo Finance
Final Project: Ford vs. Tesla
Load Data
# Import the raw CSV into `data`. The path is relative, so the file must sit in
# the current working directory.
data <- read.csv(file = "Yahoo Finance business information.csv")
Data Cleaning
# Tokenize the summary text: lower-case it, split it into one row per word,
# then drop every word listed in `stop_words`.
data_cleaned <- data %>%
  mutate(summary_cleaned = tolower(summary)) %>%
  unnest_tokens(word, summary_cleaned) %>%
  # Join on `word` explicitly, as the NRC join in this file does. Without `by`
  # the join key is every column name shared with `stop_words`, which would
  # silently change if `data` ever gained such a column, and a "Joining with"
  # message is printed into the report.
  anti_join(stop_words, by = "word")
Sentiment Analysis Using NRC Lexicon
# Fetch the NRC lexicon used to label words with sentiments.
nrc_sentiments <- get_sentiments(lexicon = "nrc")

# Keep only tokens that appear in the lexicon, then tally rows per company and
# sentiment (largest counts first). `sentiment_score` is a copy of the count
# column `n`.
sentiment_analysis <- inner_join(
  data_cleaned,
  nrc_sentiments,
  by = "word"
) %>%
  count(company_id, sentiment, sort = TRUE) %>%
  mutate(sentiment_score = n)
Part 1:
Area of Interest
For this project, I am interested in exploring the relationship between stock prices and sentiment surrounding two major automotive companies: Ford and Tesla. As these companies are key players in the electric vehicle (EV) and traditional automotive industries, their stock price movements are often influenced by both internal and external factors, such as market trends, earnings reports, and public sentiment.
Research Question
The key research question I aim to explore is:
*How does sentiment surrounding Ford and Tesla correlate with their stock price movements before and after earnings reports?*
Hypothesis
I hypothesize that positive sentiment (whether from social media, news, or financial analysts) about Ford and Tesla will correlate with an increase in stock prices, while negative sentiment will correspond to a decrease in stock prices. This analysis will provide insights into how sentiment can act as an indicator for stock price movement, especially around key financial events like earnings reports.
Dataset Selection
For this analysis, I have chosen a dataset containing stock price and sentiment data for both Ford and Tesla. The dataset includes the following columns:
- company_id: The stock ticker for the company (e.g., ‘F’ for Ford, ‘TSLA’ for Tesla).
- earnings_date: The date of earnings report release.
- sentiment_score: A sentiment score categorized as “positive,” “negative,” or “neutral.”
- stock_price: The closing stock price on the earnings report date.
You can find the dataset I will use [here](https://your_link_to_the_dataset).
Data Dictionary
- company_id: The company identifier (‘F’ for Ford, ‘TSLA’ for Tesla)
- earnings_date: The date of the earnings report release (in YYYY-MM-DD format)
- sentiment_score: The sentiment score of the earnings report (“positive,” “negative,” “neutral”)
- stock_price: The closing stock price of the company on the earnings report date
Summary Statistics
Here are some interesting summary statistics for the dataset:
# Keep only the rows whose ticker is Ford ("F") or Tesla ("TSLA").
data_filtered <- filter(data, company_id %in% c("F", "TSLA"))

# Per-company mean and standard deviation of the closing price, ignoring NAs.
summary_data <- data_filtered %>%
  group_by(company_id) %>%
  summarise(
    mean_price = mean(closing_price, na.rm = TRUE),
    sd_price = sd(closing_price, na.rm = TRUE)
  )

# Boxplot of closing prices per company. Each company's mean is overlaid as a
# red dot and labelled with its value rounded to two decimals.
ggplot(
  data = data_filtered,
  mapping = aes(x = company_id, y = closing_price, fill = company_id)
) +
  geom_boxplot() +
  geom_point(
    data = summary_data,
    mapping = aes(x = company_id, y = mean_price),
    color = "red",
    size = 3
  ) +
  geom_text(
    data = summary_data,
    mapping = aes(
      x = company_id,
      y = mean_price,
      label = paste0("Mean: ", round(mean_price, 2))
    ),
    color = "black",
    vjust = -1
  ) +
  labs(
    title = "Stock Price Distribution for Ford and Tesla with Mean",
    x = "Company",
    y = "Stock Price"
  ) +
  theme_minimal() +
  # The fill legend would only repeat the x-axis labels, so it is hidden.
  theme(legend.position = "none")
Part 2
Sentiment vs Stock Price: Ford vs Tesla
In this section, we will perform a descriptive analysis to explore the relationship between sentiment and stock price movements for both Ford and Tesla. We will create visualizations that highlight how sentiment impacts stock prices before and after earnings reports.
1. Boxplot of Closing Price by Entity Type
Question: How does the closing price vary by entity type?
# Restrict to Tesla and Ford. Filter the un-tokenized `data`, not
# `data_cleaned`: the tokenized table repeats each source row once per summary
# word, so any count or plot built on it measures word tokens instead of
# records.
df_filtered <- data %>%
  filter(company_id %in% c("TSLA", "F"))

# 1. Boxplot of Closing Price by Entity Type (for Tesla and Ford)
# geom_boxplot() draws the distribution that this section's heading and
# description refer to.
ggplot(
  df_filtered,
  aes(x = entity_type, y = closing_price, color = company_id)
) +
  geom_boxplot() +
  labs(
    title = "Closing Price by Entity Type (Tesla & Ford)",
    x = "Entity Type",
    y = "Closing Price"
  ) +
  theme_minimal()
# Calculate word frequencies
# Twenty most frequent tokens, most frequent first. slice_max() replaces the
# superseded top_n(); `with_ties = TRUE` keeps top_n()'s behaviour of returning
# every word tied at the cut-off.
word_frequencies <- data_cleaned %>%
  count(word, sort = TRUE) %>%
  slice_max(order_by = n, n = 20, with_ties = TRUE)
# Display word frequency table
word_frequencies
word n
1 quote 581
2 delayed 565
3 usd 510
4 nasdaq 402
5 price 402
6 real 402
7 time 402
8 nyse 285
9 eur 256
10 nasdaqgs 130
11 frankfurt 124
12 jpy 82
13 tokyo 81
14 markets 77
15 otc 77
16 otcpk 60
17 xetra 42
18 paris 29
19 hkd 22
20 hkse 22
This boxplot shows how the closing price varies across different entity types (e.g., public, private). It will highlight the central tendency, spread, and outliers for each category.
2. Correlation Matrix for Continuous Variables
Question: What are the relationships between key continuous variables?
# Columns to correlate (assumed numeric; cor() requires numeric input).
continuous_vars <- select(
  df_filtered,
  closing_price, pe_ratio, eps, beta, market_cap, volume, avg_volume
)

# Pairwise correlations, using only rows with no missing value in any of the
# selected columns.
cor_matrix <- cor(x = continuous_vars, use = "complete.obs")

# Upper-triangle correlation plot drawn with circles. `mar` reserves two lines
# of top margin for the title.
corrplot(
  corr = cor_matrix,
  method = "circle",
  type = "upper",
  title = "Correlation Matrix of Financial Variables (Tesla & Ford)",
  mar = c(0, 0, 2, 0)
)
- This heatmap shows the correlation between key continuous variables like closing price, pe ratio, eps, beta, market cap, and volume. The color intensity reflects the strength of the correlation.
3. Frequency Table of Recommendation Ratings
Question: What is the distribution of recommendation ratings across companies?
# Tally how many rows of `df_filtered` carry each recommendation rating value,
# then print the table.
recommendation_freq <- table(df_filtered[["recommendation_rating"]])
print(recommendation_freq)
2.77 2.8
6 6
# Rating frequencies for Tesla and Ford as side-by-side bars, one fill colour
# per company name, each bar outlined in black.
ggplot(
  data = df_filtered,
  mapping = aes(x = recommendation_rating, fill = name)
) +
  geom_bar(color = "black", position = "dodge") +
  labs(
    title = "Frequency of Recommendation Ratings (Tesla & Ford)",
    x = "Recommendation Rating",
    y = "Frequency"
  ) +
  theme_minimal()
This bar chart visualizes the frequency of different recommendation rating values. It helps you see how many companies fall into each rating category (e.g., “buy”, “hold”, “sell”).
4. Bar Chart of Companies by Industry
Question: What is the distribution of companies across different industries?
# Number of rows per industry, largest count first.
industry_count <- count(df_filtered, company_profile_industry, sort = TRUE)

# Horizontal bar chart: industries are ordered by their count and the
# precomputed `n` is used directly as the bar length.
ggplot(
  data = industry_count,
  mapping = aes(
    x = reorder(company_profile_industry, n),
    y = n,
    fill = company_profile_industry
  )
) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Companies by Industry (Tesla & Ford)",
    x = "Industry",
    y = "Count"
  ) +
  theme_minimal()
This bar chart shows the count of companies in each industry, providing insights into the distribution of industries represented in your dataset.
5. Time Series of Closing Price vs. Earnings Date
Question: How does the closing price change around earnings dates?
# Convert the earnings dates (parsed as YYYY-MM-DD) to Date so the x-axis is a
# real time axis.
df_filtered$earnings_date <- as.Date(
  df_filtered$earnings_date,
  format = "%Y-%m-%d"
)

# Plot closing price over time with earnings dates for Tesla and Ford: one line
# per company, with every observation marked by a red point.
ggplot(df_filtered, aes(x = earnings_date, y = closing_price, color = name)) +
  geom_line() +
  # `color` is set outside aes() so the points are literally red. Inside aes()
  # the string "red" is treated as a data value and mapped through the colour
  # scale, which assigns it a palette colour rather than red.
  geom_point(color = "red", size = 2) +
  labs(
    title = "Closing Price vs. Earnings Date (Tesla & Ford)",
    x = "Earnings Date",
    y = "Closing Price"
  ) +
  theme_minimal() +
  theme(legend.position = "none")
This line graph shows the variation in closing price over time, with earnings dates marked in red. It provides a visual representation of how stock prices behave around earnings announcements.
Part 3
# Word cloud of the cleaned tokens: count each word, then draw at most the 100
# most frequent ones sized by their count.
with(
  count(data_cleaned, word, sort = TRUE),
  wordcloud(words = word, freq = n, max.words = 100)
)