library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(gapminder)
library(ggthemes)
library(ggplot2)
library(dplyr)
# Load the merged stock data set from a local CSV file into a data frame.
# NOTE(review): the path is absolute and machine-specific; confirm it exists
# before running this script on another computer.
data <- read.csv(file = "C:/Users/aiden/OneDrive/mergedfile.csv")

Summary

My data set consists of stock data over a 15-year span. It has open, close, high, low, and volume as quantitative values. It also has qualitative values describing each company, including date, sector, industry, and state. The URL for this data set and its documentation is https://www.kaggle.com/datasets/andrewmvd/sp-500-stocks?select=sp500_stocks.csv

The main question I would like to answer is: can we see differences in performance across different categories, including volume, sector, price range, or even state?

  1. My first graph will show an overall baseline across all companies. This will give us an average return to compare against the other findings. It will act almost like an S&P 500 index fund.
  2. My second graph will follow a similar concept, but with volume. From this I might be able to find a connection between volume and price drops or big gap-ups.

Plots

# Convert the Date column to Date objects so it can be used as a time axis
data[["Date"]] <- as.Date(data[["Date"]])

# Mean adjusted close across all companies, one row per distinct Date
average_price <- summarise(
  group_by(data, Date),
  avg_price = mean(Adj.Close, na.rm = TRUE)
)

# Line chart of the cross-company average adjusted close over time
ggplot(data = average_price, mapping = aes(x = Date, y = avg_price)) +
  geom_line(colour = "blue", linewidth = 1) +
  theme_minimal() +
  # Tilt the x-axis labels so they remain readable
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Average Price of S&P 500 Companies Over Time",
    x = "Date",
    y = "Average Adjusted Close Price"
  )

# Mean traded volume across all companies, one row per distinct Date
average_volume <- summarise(
  group_by(data, Date),
  avg_volume = mean(Volume, na.rm = TRUE)
)

# Line chart of the cross-company average volume over time
ggplot(data = average_volume, mapping = aes(x = Date, y = avg_volume)) +
  geom_line(colour = "blue", linewidth = 1) +
  theme_minimal() +
  # Tilt the x-axis labels so they remain readable
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Average Volume of S&P 500 Companies Over Time",
    x = "Date",
    y = "Average Volume"
  )

  • Your plan moving forward (i.e., what is on your to-do list?)

Going forward in this project, I plan to test every combination of variables I can think of to find correlations with price. My goal for the project is to better predict stock prices.

Initial Findings.

  1. The tech sector should be one of the highest-performing sectors, given the United States' recent growth in technology.
  2. We are able to correlate price drops with negative press about the company.

Plots

# Split out the rows belonging to each of the two sectors being compared
tech_data <- filter(data, Sector == "Technology")
consumer_defensive_data <- filter(data, Sector == "Consumer Defensive")

# Per-date mean adjusted close for Technology, tagged with its sector label
average_tech_price <- tech_data |>
  group_by(Date) |>
  summarise(avg_price = mean(Adj.Close, na.rm = TRUE)) |>
  mutate(Sector = "Technology")

# Per-date mean adjusted close for Consumer Defensive, tagged the same way
average_consumer_defensive_price <- consumer_defensive_data |>
  group_by(Date) |>
  summarise(avg_price = mean(Adj.Close, na.rm = TRUE)) |>
  mutate(Sector = "Consumer Defensive")

# Stack both sector series into one long table keyed by the Sector column
combined_data <- bind_rows(
  average_tech_price,
  average_consumer_defensive_price
)

# One line per sector, distinguished by colour
ggplot(
  data = combined_data,
  mapping = aes(x = Date, y = avg_price, colour = Sector)
) +
  geom_line(linewidth = 1) +
  theme_minimal() +
  # Tilt the x-axis labels so they remain readable
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Average Adjusted Close Price: Technology vs Consumer Defensive",
    x = "Date",
    y = "Average Adjusted Close Price"
  )

# Percent change in adjusted close from each company's earliest to latest date.
#
# first()/last() are ordered explicitly by Date, so the result does not depend
# on the row order of `data` (the previous arrange() call sat after group_by()
# and only worked because arrange() ignores grouping by default).
# `.groups = "drop"` returns an ungrouped table and silences the summarise()
# regrouping message.
#
# NOTE(review): if a company's earliest or latest row has a missing Adj.Close,
# its percent_change is NA; the sector averages computed later drop such
# companies via na.rm = TRUE.
percent_change <- data |>
  group_by(Symbol, Sector) |>
  summarize(
    first_price = first(Adj.Close, order_by = Date),
    last_price = last(Adj.Close, order_by = Date),
    .groups = "drop"
  ) |>
  mutate(percent_change = ((last_price - first_price) / first_price) * 100)
# Average the per-company percent changes within each sector
average_percent_change <- percent_change |>
  group_by(Sector) |>
  summarise(avg_percent_change = mean(percent_change, na.rm = TRUE))

# Rank the sectors from the largest average gain to the smallest
average_percent_change <- arrange(
  average_percent_change,
  desc(avg_percent_change)
)

# Show the ranked sector table
print(average_percent_change)
## # A tibble: 11 × 2
##    Sector                 avg_percent_change
##    <chr>                               <dbl>
##  1 Technology                       2257.   
##  2 Industrials                      1822.   
##  3 Financial Services                843.   
##  4 Consumer Cyclical                 825.   
##  5 Communication Services            701.   
##  6 Healthcare                        466.   
##  7 Utilities                         463.   
##  8 Real Estate                       394.   
##  9 Basic Materials                   303.   
## 10 Consumer Defensive                255.   
## 11 Energy                             -0.242
# Filter data for Boeing (BA)
boeing_data <- data |>
  filter(Symbol == "BA")

# Create a mock dataset for negative news headlines about Boeing
negative_news <- data.frame(
  Date = as.Date(c("2019-03-10", "2019-12-18", "2020-01-06", "2020-11-09")),
  News = c("737 MAX crash in Ethiopia",
           "737 MAX grounded worldwide",
           "Investigations into 737 MAX continue",
           "FAA approves return of 737 MAX")
)

# Align every headline with the first date present in boeing_data that falls
# on or after the headline's calendar date. An exact-date join silently drops
# any headline whose date has no price row (2019-03-10 is a Sunday).
trading_dates <- sort(unique(boeing_data$Date))
# With left.open = TRUE, findInterval() counts the trading dates strictly
# before each headline; adding 1 gives the index of the first date >= headline.
next_trading_idx <- findInterval(
  as.numeric(negative_news$Date),
  as.numeric(trading_dates),
  left.open = TRUE
) + 1L

# Headlines later than the last available date index past the end of
# trading_dates and become NA; they are removed. Headlines that land on the
# same date are merged so the join below cannot duplicate price rows.
news_by_trading_day <- negative_news |>
  mutate(Date = trading_dates[next_trading_idx]) |>
  filter(!is.na(Date)) |>
  summarize(News = paste(News, collapse = "; "), .by = Date)

# Join the stock data with the aligned news; rows without news get News = NA
boeing_data <- boeing_data |>
  left_join(news_by_trading_day, by = "Date")

# Rows carrying a headline, shared by the marker layer and the label layer
news_points <- boeing_data |>
  filter(!is.na(News))

# Plotting
ggplot(boeing_data, aes(x = Date, y = Adj.Close)) +
  geom_line(color = "blue", linewidth = 1) +  # Line for stock price
  geom_point(data = news_points,
             color = "red", size = 3, shape = 21, fill = "red") +  # News markers
  geom_text(data = news_points, aes(label = News),
            vjust = -1, hjust = 1, size = 3, color = "red") +  # News labels
  labs(title = "Boeing (BA) Stock Price and Negative News Correlation",
       x = "Date", y = "Adjusted Close Price") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate x-axis labels