Set up

library(tidyverse)
library(tidyquant) # for financial analysis
library(broom) # for tidy model results
library(umap)  # for dimension reduction
library(plotly) # for interactive visualization

Data

# Get info on companies listed in S&P500
sp500_index_tbl <- tq_index("SP500")

# Get individual stocks from S&P500
sp500_symbols <- sp500_index_tbl %>% distinct(symbol) %>% pull() 

# Get stock prices of the companies
sp500_prices_tbl <- tq_get(sp500_symbols, from = "2020-04-01")

write.csv(sp500_index_tbl, "../00_data/sp500_index_tbl.csv")
write.csv(sp500_prices_tbl, "../00_data/sp500_prices_tbl.csv")

Import data

sp500_index_tbl <- read_csv("../00_data/sp500_index_tbl.csv")
sp500_prices_tbl <- read_csv("../00_data/sp500_prices_tbl.csv")
sp500_index_tbl %>% glimpse()

## Rows: 505
## Columns: 9
## $ ...1           <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, …
## $ symbol         <chr> "MSFT", "AAPL", "NVDA", "AMZN", "META", "GOOGL", "BRK-B…
## $ company        <chr> "MICROSOFT CORP", "APPLE INC", "NVIDIA CORP", "AMAZON.C…
## $ identifier     <chr> "594918104", "037833100", "67066G104", "023135106", "30…
## $ sedol          <chr> "2588173", "2046251", "2379504", "2000019", "B7TL820", …
## $ weight         <dbl> 0.070781439, 0.056357717, 0.050531591, 0.037332751, 0.0…
## $ sector         <chr> "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", …
## $ shares_held    <dbl> 89663945, 175158625, 29805583, 110304496, 26548208, 711…
## $ local_currency <chr> "USD", "USD", "USD", "USD", "USD", "USD", "USD", "USD",…

sp500_prices_tbl %>% glimpse()

## Rows: 502,541
## Columns: 9
## $ ...1     <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18…
## $ symbol   <chr> "MSFT", "MSFT", "MSFT", "MSFT", "MSFT", "MSFT", "MSFT", "MSFT…
## $ date     <date> 2020-04-01, 2020-04-02, 2020-04-03, 2020-04-06, 2020-04-07, …
## $ open     <dbl> 153.00, 151.86, 155.10, 160.32, 169.59, 165.67, 166.36, 164.3…
## $ high     <dbl> 157.75, 155.48, 157.38, 166.50, 170.00, 166.67, 167.37, 165.5…
## $ low      <dbl> 150.82, 150.36, 152.19, 157.58, 163.26, 163.50, 163.33, 162.3…
## $ close    <dbl> 152.11, 155.26, 153.83, 165.27, 163.49, 165.13, 165.14, 165.5…
## $ volume   <dbl> 57969900, 49630700, 41243300, 67111700, 62769000, 48318200, 5…
## $ adjusted <dbl> 146.7080, 149.7461, 148.3670, 159.4007, 157.6839, 159.2657, 1…

Question

Which stock prices behave similarly?

Our main objective is to identify stocks that exhibit similar price behaviors over time. By doing so, we aim to gain insights into the relationships between different companies, uncovering potential competitors and sector affiliations.

1 Convert data to standardized form

sp_500_daily_returns_tbl <- sp500_prices_tbl %>%
    select(symbol, date, adjusted) %>%
    filter(date >= ymd("2018-01-01")) %>%
    group_by(symbol) %>%
    mutate(lag_1 = lag(adjusted)) %>%
    ungroup() %>%
    filter(!is.na(lag_1)) %>%
    mutate(diff = adjusted - lag_1) %>%
    mutate(pct_return = diff / lag_1) %>%
    select(symbol, date, pct_return)

sp_500_daily_returns_tbl

## # A tibble: 502,038 × 3
##    symbol date       pct_return
##    <chr>  <date>          <dbl>
##  1 MSFT   2020-04-02  0.0207   
##  2 MSFT   2020-04-03 -0.00921  
##  3 MSFT   2020-04-06  0.0744   
##  4 MSFT   2020-04-07 -0.0108   
##  5 MSFT   2020-04-08  0.0100   
##  6 MSFT   2020-04-09  0.0000605
##  7 MSFT   2020-04-13  0.00224  
##  8 MSFT   2020-04-14  0.0495   
##  9 MSFT   2020-04-15 -0.0105   
## 10 MSFT   2020-04-16  0.0300   
## # ℹ 502,028 more rows

2 Spread to object-characteristics format

stock_date_matrix_tbl <- sp_500_daily_returns_tbl %>%
    spread(key = date, value = pct_return, fill = 0)

stock_date_matrix_tbl

## # A tibble: 503 × 1,005
##    symbol `2020-04-02` `2020-04-03` `2020-04-06` `2020-04-07` `2020-04-08`
##    <chr>         <dbl>        <dbl>        <dbl>        <dbl>        <dbl>
##  1 A           0.0489     -0.0259         0.0559     -0.00444       0.0359
##  2 AAL        -0.0589     -0.0666         0.0117      0.0758        0.109 
##  3 AAPL        0.0167     -0.0144         0.0872     -0.0116        0.0256
##  4 ABBV        0.0233     -0.0234         0.0322     -0.00449       0.0420
##  5 ABNB        0           0              0           0             0     
##  6 ABT         0.0375      0.000126       0.0413     -0.00967       0.0369
##  7 ACGL        0.0115     -0.0650         0.0983      0.0314        0.0291
##  8 ACN         0.0103     -0.0264         0.0914     -0.0116        0.0464
##  9 ADBE        0.00913    -0.0341         0.0869     -0.0320        0.0267
## 10 ADI         0.0429     -0.0130         0.107       0.00470       0.0535
## # ℹ 493 more rows
## # ℹ 999 more variables: `2020-04-09` <dbl>, `2020-04-13` <dbl>,
## #   `2020-04-14` <dbl>, `2020-04-15` <dbl>, `2020-04-16` <dbl>,
## #   `2020-04-17` <dbl>, `2020-04-20` <dbl>, `2020-04-21` <dbl>,
## #   `2020-04-22` <dbl>, `2020-04-23` <dbl>, `2020-04-24` <dbl>,
## #   `2020-04-27` <dbl>, `2020-04-28` <dbl>, `2020-04-29` <dbl>,
## #   `2020-04-30` <dbl>, `2020-05-01` <dbl>, `2020-05-04` <dbl>, …

3 Perform K-Means Clustering

set.seed(123)
kmeans_3 <- kmeans(stock_date_matrix_tbl %>% select(-symbol), centers = 3, nstart = 20)
clustered_tbl <- augment(kmeans_3, stock_date_matrix_tbl)

4 Determine Optimal Number of Clusters

k_values <- tibble(k = 1:9) %>%
    mutate(kclust = map(k, ~kmeans(stock_date_matrix_tbl %>% select(-symbol), centers = .x, nstart = 20)),
           glanced = map(kclust, glance)) %>%
    unnest(glanced)

k_values %>%
    ggplot(aes(x = k, y = tot.withinss)) +
    geom_line() +
    geom_point() +
    labs(title = "Elbow Method for Choosing K",
         x = "Number of Clusters",
         y = "Total Within-Cluster Sum of Squares")

5 Visualize the Clusters Using UMAP

umap_results <- stock_date_matrix_tbl %>%
    select(-symbol) %>%
    umap()

umap_layout <- umap_results$layout %>%
    as_tibble() %>%
    rename(UMAP1 = V1, UMAP2 = V2) %>%
    bind_cols(clustered_tbl %>% select(symbol, .cluster))

# Plot

ggplot(umap_layout, aes(x = UMAP1, y = UMAP2, color = as.factor(.cluster))) +
    geom_point(alpha = 0.7) +
    labs(title = "UMAP Projection of Stock Returns",
         subtitle = "Colored by K-Means Cluster",
         color = "Cluster")

Optional: Interactive Plot

umap_layout <- umap_layout %>%
    left_join(sp500_index_tbl %>% select(symbol, company = company), by = "symbol") %>%
    mutate(text_label = str_glue("Company: {company}
                                  Symbol: {symbol}
                                  Cluster: {.cluster}"))

ggplotly(
    ggplot(umap_layout, aes(x = UMAP1, y = UMAP2, color = as.factor(.cluster), text = text_label)) +
        geom_point(alpha = 0.7) +
        labs(title = "UMAP Projection of Stock Clusters", color = "Cluster"),
    tooltip = "text"
)

Apply 9: Cluster Model for S&P500

Ada Cilley

2025-04-07