Set up

library(tidyverse)
library(tidyquant)
library(lubridate)  # For date manipulation
library(Matrix)     # For sparse matrix handling
library(broom)
library(umap)
library(plotly)

Data

# Get info on companies listed in S&P500
sp500_index_tbl <- tq_index("SP500")

# Get individual stocks from S&P500
sp500_symbols <- sp500_index_tbl %>% distinct(symbol) %>% pull() 

# Get stock prices of the companies
sp500_prices_tbl <- tq_get(sp500_symbols, from = "2020-04-01")

write.csv(sp500_index_tbl, "../00_data/sp500_index_tbl.csv")
write.csv(sp500_prices_tbl, "../00_data/sp500_prices_tbl.csv")

Import data

sp500_index_tbl <- read_csv("../00_data/sp500_index_tbl.csv")
sp500_prices_tbl <- read_csv("../00_data/sp500_prices_tbl.csv")
sp500_index_tbl %>% glimpse()
## Rows: 505
## Columns: 9
## $ ...1           <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, …
## $ symbol         <chr> "MSFT", "AAPL", "NVDA", "AMZN", "META", "GOOGL", "BRK-B…
## $ company        <chr> "MICROSOFT CORP", "APPLE INC", "NVIDIA CORP", "AMAZON.C…
## $ identifier     <chr> "594918104", "037833100", "67066G104", "023135106", "30…
## $ sedol          <chr> "2588173", "2046251", "2379504", "2000019", "B7TL820", …
## $ weight         <dbl> 0.070781439, 0.056357717, 0.050531591, 0.037332751, 0.0…
## $ sector         <chr> "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", …
## $ shares_held    <dbl> 89663945, 175158625, 29805583, 110304496, 26548208, 711…
## $ local_currency <chr> "USD", "USD", "USD", "USD", "USD", "USD", "USD", "USD",…
sp500_prices_tbl %>% glimpse()
## Rows: 502,541
## Columns: 9
## $ ...1     <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18…
## $ symbol   <chr> "MSFT", "MSFT", "MSFT", "MSFT", "MSFT", "MSFT", "MSFT", "MSFT…
## $ date     <date> 2020-04-01, 2020-04-02, 2020-04-03, 2020-04-06, 2020-04-07, …
## $ open     <dbl> 153.00, 151.86, 155.10, 160.32, 169.59, 165.67, 166.36, 164.3…
## $ high     <dbl> 157.75, 155.48, 157.38, 166.50, 170.00, 166.67, 167.37, 165.5…
## $ low      <dbl> 150.82, 150.36, 152.19, 157.58, 163.26, 163.50, 163.33, 162.3…
## $ close    <dbl> 152.11, 155.26, 153.83, 165.27, 163.49, 165.13, 165.14, 165.5…
## $ volume   <dbl> 57969900, 49630700, 41243300, 67111700, 62769000, 48318200, 5…
## $ adjusted <dbl> 146.7080, 149.7461, 148.3670, 159.4007, 157.6839, 159.2657, 1…

Question

Which stock prices behave similarly?

Our main objective is to identify stocks that exhibit similar price behaviors over time. By doing so, we aim to gain insights into the relationships between different companies, uncovering potential competitors and sector affiliations.

Why It Matters Understanding which companies are related is crucial for various reasons:

Assignment Details Your task is to analyze the historical price data of various stocks and determine which stocks behave similarly. We will employ clustering techniques to accomplish this task effectively.

1 Convert data to standardized form

To compare data effectively, it must be standardized or normalized. Why? Because comparing values (like stock prices) of vastly different magnitudes is impractical. So, we’ll standardize by converting from adjusted stock price (in dollars) to daily returns (as percent change from the previous day). Here’s the formula:

\[ return_{daily} = \frac{price_{i}-price_{i-1}}{price_{i-1}} \]

# Apply your data transformation skills!

# Convert from adjusted prices to daily returns
price_data <- sp500_prices_tbl %>%
  select(symbol, date, adjusted) %>%
  group_by(symbol) %>%
  arrange(date) %>%
  mutate(daily_return = (adjusted / lag(adjusted)) - 1) %>%  # Calculate daily returns
  filter(!is.na(daily_return)) %>%  # Remove NA values
  ungroup() %>%
  distinct(symbol, date, .keep_all = TRUE)  # Ensure no duplicates exist
# Aggregate data to weekly returns
# Reduce the number of columns by aggregating returns to weekly averages
price_data <- price_data %>%
  mutate(week = floor_date(date, "week")) %>%  # Group by week
  group_by(symbol, week) %>%
  summarize(weekly_return = mean(daily_return, na.rm = TRUE), .groups = "drop")

2 Spread to object-characteristics format

We’ll convert the daily returns (percentage change from one day to the next) to object-characteristics format, also known as the user-item format. Users are identified by the symbol (company), and items are represented by the pct_return at each date.

# Convert to a sparse matrix to handle large data efficiently
returns_matrix <- price_data %>%
  pivot_wider(names_from = week, values_from = weekly_return) %>%
  column_to_rownames(var = "symbol") %>%
  as.matrix()

# Convert to sparse matrix for memory efficiency
returns_matrix <- Matrix(returns_matrix, sparse = TRUE)
# Handle NA, NaN, or Inf values in the matrix
returns_matrix[is.na(returns_matrix)] <- 0       # Replace NA with 0
returns_matrix[is.nan(returns_matrix)] <- 0     # Replace NaN with 0
returns_matrix[is.infinite(returns_matrix)] <- 0 # Replace Inf with 0

3 Perform k-means clustering

# Perform k-means clustering to group stocks with similar price behaviors

set.seed(123)  # Set seed for reproducibility
kmeans_result <- kmeans(returns_matrix, centers = 5, nstart = 10)  # Initial assumption of 5 clusters

4 Select optimal number of clusters

# Using elbow method to find the best number of clusters
wss <- map_dbl(1:10, function(k) {
  kmeans(returns_matrix, centers = k, nstart = 10)$tot.withinss
})

elbow_plot <- tibble(k = 1:10, wss = wss) %>%
  ggplot(aes(x = k, y = wss)) +
  geom_line() +
  geom_point() +
  labs(title = "Elbow Plot for Optimal Number of Clusters",
       x = "Number of Clusters (k)",
       y = "Total Within-Cluster Sum of Squares")

print(elbow_plot)

5 Reduce dimension using UMAP

umap_results <- umap(returns_matrix, n_neighbors = 15, min_dist = 0.1, metric = "euclidean")
umap_layout <- as_tibble(umap_results$layout) %>%
  mutate(cluster = as.factor(kmeans_result$cluster),
         symbol = rownames(returns_matrix))

6 Visualize clusters by adding k-means results

umap_plot <- umap_layout %>%
  ggplot(aes(x = V1, y = V2, color = cluster, text = symbol)) +
  geom_point(alpha = 0.7, size = 2) +
  labs(title = "UMAP Visualization of Clusters",
       x = "UMAP Dimension 1",
       y = "UMAP Dimension 2",
       color = "Cluster") +
  theme_minimal()

# Convert to an interactive plot
interactive_umap_plot <- ggplotly(umap_plot, tooltip = "text")

# Print interactive plot
print(interactive_umap_plot)