Set up

library(tidyverse)
library(tidyquant) # for financial analysis
library(broom) # for tidy model results
library(umap)  # for dimension reduction
library(plotly) # for interactive visualization

Data

# Get info on companies listed in S&P500
sp500_index_tbl <- tq_index("SP500")

# Get individual stocks from S&P500
sp500_symbols <- sp500_index_tbl %>% distinct(symbol) %>% pull()

# Get stock prices of the companies
sp500_prices_tbl <- tq_get(sp500_symbols, from = "2020-04-01")

write.csv(sp500_index_tbl, "00_Data/sp500_index_tbl.csv")
write.csv(sp500_prices_tbl, "00_Data/sp500_prices_tbl.csv")

Import data

sp500_index_tbl  <- read_csv("00_Data/sp500_index_tbl.csv")
sp500_prices_tbl <- read_csv("00_Data/sp500_prices_tbl.csv")
sp500_index_tbl %>% glimpse()
## Rows: 505
## Columns: 9
## $ ...1           <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, …
## $ symbol         <chr> "NVDA", "AAPL", "MSFT", "AMZN", "GOOGL", "AVGO", "GOOG"…
## $ company        <chr> "NVIDIA CORP", "APPLE INC", "MICROSOFT CORP", "AMAZON.C…
## $ identifier     <chr> "67066G104", "037833100", "594918104", "023135106", "02…
## $ sedol          <chr> "2379504", "2046251", "2588173", "2000019", "BYVY8G0", …
## $ weight         <dbl> 0.080116158, 0.064846379, 0.050622747, 0.040243831, 0.0…
## $ sector         <chr> "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", …
## $ shares_held    <dbl> 286670335, 173194653, 87599857, 115240427, 68680304, 55…
## $ local_currency <chr> "USD", "USD", "USD", "USD", "USD", "USD", "USD", "USD",…
sp500_prices_tbl %>% glimpse()
## Rows: 754,075
## Columns: 9
## $ ...1     <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18…
## $ symbol   <chr> "NVDA", "NVDA", "NVDA", "NVDA", "NVDA", "NVDA", "NVDA", "NVDA…
## $ date     <date> 2020-04-01, 2020-04-02, 2020-04-03, 2020-04-06, 2020-04-07, …
## $ open     <dbl> 6.39125, 6.10600, 6.34900, 6.38100, 6.93250, 6.58525, 6.80000…
## $ high     <dbl> 6.53825, 6.40000, 6.39075, 6.74700, 6.95625, 6.69875, 6.82300…
## $ low      <dbl> 6.03200, 6.05775, 5.95975, 6.32325, 6.43250, 6.51500, 6.51050…
## $ close    <dbl> 6.07675, 6.38675, 6.09775, 6.71000, 6.47575, 6.67375, 6.57375…
## $ volume   <dbl> 656912000, 675764000, 663212000, 727884000, 784520000, 542444…
## $ adjusted <dbl> 6.053018, 6.361806, 6.073935, 6.683794, 6.450459, 6.647686, 6…

Question

Which stock prices behave similarly?

Our main objective is to identify stocks that exhibit similar price behaviors over time. By doing so, we aim to gain insights into the relationships between different companies, uncovering potential competitors and sector affiliations.

Why It Matters Understanding which companies are related is crucial for various reasons:

Assignment Details Your task is to analyze the historical price data of various stocks and determine which stocks behave similarly. We will employ clustering techniques to accomplish this task effectively.

# Convert adjusted prices to daily returns
returns_tbl <- sp500_prices_tbl %>%
  select(symbol, date, adjusted) %>%
  group_by(symbol) %>%
  arrange(date) %>%
  mutate(daily_return = (adjusted - lag(adjusted)) / lag(adjusted)) %>%
  ungroup() %>%
  drop_na()
# 2. Convert to wide format
returns_wide_tbl <- returns_tbl %>%
  select(symbol, date, daily_return) %>%
  pivot_wider(names_from = date, values_from = daily_return)

# 3. Remove missing values
cluster_data_tbl <- returns_wide_tbl %>%
  drop_na()

# 4. Create matrix for clustering
cluster_matrix <- cluster_data_tbl %>%
  column_to_rownames("symbol") %>%
  as.matrix()

# 5. K-means clustering
set.seed(123)

kmeans_obj <- kmeans(cluster_matrix, centers = 3)

kmeans_tbl <- cluster_data_tbl %>%
  select(symbol) %>%
  mutate(.cluster = as.factor(kmeans_obj$cluster))

# 6. UMAP for visualization
umap_obj <- umap(cluster_matrix)

kmeans_umap_tbl <- umap_obj$layout %>%
  as_tibble() %>%
  set_names(c("V1", "V2")) %>%
  bind_cols(kmeans_tbl)

# 7. Plot
g <- kmeans_umap_tbl %>%
  
  mutate(text_label = str_glue("Symbol: {symbol}
                               Cluster: {.cluster}")) %>%
  
  ggplot(aes(V1, V2, color = .cluster, text = text_label)) +
  geom_point()

g %>% ggplotly(tooltip = "text")

1 Convert data to standardized form

To compare data effectively, it must be standardized or normalized. Why? Because comparing values (like stock prices) of vastly different magnitudes is impractical. So, we’ll standardize by converting from adjusted stock price (in dollars) to daily returns (as percent change from the previous day). Here’s the formula:

\[ return_{daily} = \frac{price_{i}-price_{i-1}}{price_{i-1}} \]