library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(gapminder)
library(ggthemes)
library(ggplot2)
library(dplyr)
# Load the merged S&P 500 price/company table.
# The path defaults to the author's local copy; set the SP500_CSV environment
# variable to point at the file when running on any other machine.
data_path <- Sys.getenv("SP500_CSV", unset = "")
if (!nzchar(data_path)) {
  data_path <- "C:/Users/aiden/OneDrive/mergedfile.csv"
}
# Fail with a clear message instead of a bare connection error.
if (!file.exists(data_path)) {
  stop("Stock data file not found: ", data_path, call. = FALSE)
}
data <- read.csv(data_path)
My data set is composed of stock data over a 15-year span. Its quantitative variables are the open, close, high, and low prices and the trading volume. It also has qualitative variables describing each record and company, including date, sector, industry, and state. The data set and its documentation are available at https://www.kaggle.com/datasets/andrewmvd/sp-500-stocks?select=sp500_stocks.csv
The main question I would like to answer is whether we can see differences in performance across different categories, including volume, sector, price range, or even state.
# Parse the Date column into Date objects so it can be grouped on and drawn
# on a continuous time axis.
data[["Date"]] <- as.Date(data[["Date"]])
# Mean adjusted close over all companies, one row per date
# (missing prices are ignored).
average_price <- summarize(
  group_by(data, Date),
  avg_price = mean(Adj.Close, na.rm = TRUE)
)
# Line chart of the market-wide mean adjusted close against date
average_price |>
  ggplot(mapping = aes(Date, avg_price)) +
  geom_line(colour = "blue", linewidth = 1) +
  ggtitle("Average Price of S&P 500 Companies Over Time") +
  xlab("Date") +
  ylab("Average Adjusted Close Price") +
  theme_minimal() +
  # Tilt the date labels so neighbouring ticks stay legible
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Mean traded volume over all companies, one row per date
# (missing volumes are ignored).
average_volume <- summarize(
  group_by(data, Date),
  avg_volume = mean(Volume, na.rm = TRUE)
)
# Line chart of the market-wide mean volume against date
average_volume |>
  ggplot(mapping = aes(Date, avg_volume)) +
  geom_line(colour = "blue", linewidth = 1) +
  labs(
    y = "Average Volume",
    x = "Date",
    title = "Average Volume of S&P 500 Companies Over Time"
  ) +
  theme_minimal() +
  # Tilt the date labels so neighbouring ticks stay legible
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Going forward in this project, I plan to test every combination of variables I can think of to find correlations with price. My goal for the project is to better predict stock prices.
# Split out the rows of the two sectors being compared
tech_data <- filter(data, Sector == "Technology")
consumer_defensive_data <- filter(data, Sector == "Consumer Defensive")
# Daily mean adjusted close for the Technology sector, tagged with the sector
# name so the series can be stacked with the other sector.
average_tech_price <- mutate(
  summarize(
    group_by(tech_data, Date),
    avg_price = mean(Adj.Close, na.rm = TRUE)
  ),
  Sector = "Technology"
)
# Same calculation for the Consumer Defensive sector
average_consumer_defensive_price <- mutate(
  summarize(
    group_by(consumer_defensive_data, Date),
    avg_price = mean(Adj.Close, na.rm = TRUE)
  ),
  Sector = "Consumer Defensive"
)
# Stack the two per-sector series into one long table (Technology rows first)
combined_data <- average_tech_price |>
  bind_rows(average_consumer_defensive_price)
# One line per sector, coloured by the Sector column
combined_data |>
  ggplot(mapping = aes(Date, avg_price, colour = Sector)) +
  geom_line(linewidth = 1) +
  ggtitle("Average Adjusted Close Price: Technology vs Consumer Defensive") +
  xlab("Date") +
  ylab("Average Adjusted Close Price") +
  theme_minimal() +
  # Tilt the date labels so neighbouring ticks stay legible
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Percentage change in adjusted close between each company's earliest and
# latest available price.
# - order_by = Date picks the earliest/latest observation explicitly instead
#   of relying on a prior arrange() of the whole table.
# - na_rm = TRUE skips missing prices, so a company whose first or last row
#   has no Adj.Close is measured from its first available price rather than
#   yielding NA and silently dropping out of the sector averages.
# - .groups = "drop" returns an ungrouped table, one row per Symbol/Sector.
percent_change <- data |>
  group_by(Symbol, Sector) |>
  summarize(
    first_price = first(Adj.Close, order_by = Date, na_rm = TRUE),
    last_price = last(Adj.Close, order_by = Date, na_rm = TRUE),
    .groups = "drop"
  ) |>
  mutate(percent_change = ((last_price - first_price) / first_price) * 100)
## `summarise()` has grouped output by 'Symbol'. You can override using the
## `.groups` argument.
# Mean of the company-level percent changes within each sector, ranked with
# the best-performing sector first (companies with a missing change are
# ignored).
average_percent_change <- arrange(
  summarize(
    group_by(percent_change, Sector),
    avg_percent_change = mean(percent_change, na.rm = TRUE)
  ),
  desc(avg_percent_change)
)
# Show the ranked sector table
print(average_percent_change)
## # A tibble: 11 × 2
## Sector avg_percent_change
## <chr> <dbl>
## 1 Technology 2257.
## 2 Industrials 1822.
## 3 Financial Services 843.
## 4 Consumer Cyclical 825.
## 5 Communication Services 701.
## 6 Healthcare 466.
## 7 Utilities 463.
## 8 Real Estate 394.
## 9 Basic Materials 303.
## 10 Consumer Defensive 255.
## 11 Energy -0.242
# Keep only the rows whose ticker symbol is BA (Boeing)
boeing_data <- filter(data, Symbol == "BA")
# Mock table of dated Boeing headlines: one row per headline, with the
# headline date (Date) and its text (News).
negative_news <- local({
  event_dates <- c("2019-03-10", "2019-12-18", "2020-01-06", "2020-11-09")
  headlines <- c(
    "737 MAX crash in Ethiopia",
    "737 MAX grounded worldwide",
    "Investigations into 737 MAX continue",
    "FAA approves return of 737 MAX"
  )
  data.frame(Date = as.Date(event_dates), News = headlines)
})
# Join the stock data with the negative news.
# An exact-date join silently loses any headline dated on a day without a
# price row (2019-03-10, the Ethiopia crash, is a Sunday), so each headline is
# first moved forward to the next date present in the Boeing price data.
trading_days <- sort(unique(boeing_data$Date))
# left.open = TRUE counts the trading days strictly before each headline, so
# adding one gives the first trading day on or after it (NA past the end).
next_session <- findInterval(
  as.numeric(negative_news$Date),
  as.numeric(trading_days),
  left.open = TRUE
) + 1L
news_by_session <- negative_news |>
  mutate(session = trading_days[next_session]) |>
  # Drop headlines that fall outside the period covered by the price data.
  filter(!is.na(session), Date >= trading_days[1]) |>
  # Headlines landing on the same session share one row, so the join below
  # cannot duplicate price rows.
  summarize(News = paste(News, collapse = "; "), .by = session) |>
  rename(Date = session)
boeing_data <- boeing_data |>
  left_join(news_by_session, by = "Date")
# Boeing adjusted close over time with the dated headlines overlaid.
# Only rows that picked up a headline in the join have a non-missing News.
news_rows <- boeing_data[!is.na(boeing_data$News), ]
boeing_data |>
  ggplot(mapping = aes(Date, Adj.Close)) +
  # Price history
  geom_line(colour = "blue", linewidth = 1) +
  # Red markers on the headline dates
  geom_point(
    data = news_rows,
    shape = 21, size = 3, colour = "red", fill = "red"
  ) +
  # Headline text, placed above and to the left of each marker
  geom_text(
    data = news_rows,
    mapping = aes(label = News),
    colour = "red", size = 3, hjust = 1, vjust = -1
  ) +
  ggtitle("Boeing (BA) Stock Price and Negative News Correlation") +
  xlab("Date") +
  ylab("Adjusted Close Price") +
  theme_minimal() +
  # Tilt the date labels so neighbouring ticks stay legible
  theme(axis.text.x = element_text(angle = 45, hjust = 1))