sp500_index_tbl  <- read_csv("../00 data/sp500_index_tbl.csv")

## New names:
## Rows: 505 Columns: 9
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (6): symbol, company, identifier, sedol, sector, local_currency dbl (3): ...1,
## weight, shares_held
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`

sp500_prices_tbl <- read_csv("../00 data/sp500_prices_tbl.csv")

## New names:
## Rows: 502541 Columns: 9
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): symbol dbl (7): ...1, open, high, low, close, volume, adjusted date (1):
## date
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`

sp500_index_tbl %>% glimpse()

## Rows: 505
## Columns: 9
## $ ...1           <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, …
## $ symbol         <chr> "MSFT", "AAPL", "NVDA", "AMZN", "META", "GOOGL", "BRK-B…
## $ company        <chr> "MICROSOFT CORP", "APPLE INC", "NVIDIA CORP", "AMAZON.C…
## $ identifier     <chr> "594918104", "037833100", "67066G104", "023135106", "30…
## $ sedol          <chr> "2588173", "2046251", "2379504", "2000019", "B7TL820", …
## $ weight         <dbl> 0.070781439, 0.056357717, 0.050531591, 0.037332751, 0.0…
## $ sector         <chr> "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", …
## $ shares_held    <dbl> 89663945, 175158625, 29805583, 110304496, 26548208, 711…
## $ local_currency <chr> "USD", "USD", "USD", "USD", "USD", "USD", "USD", "USD",…

sp500_prices_tbl %>% glimpse()

## Rows: 502,541
## Columns: 9
## $ ...1     <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18…
## $ symbol   <chr> "MSFT", "MSFT", "MSFT", "MSFT", "MSFT", "MSFT", "MSFT", "MSFT…
## $ date     <date> 2020-04-01, 2020-04-02, 2020-04-03, 2020-04-06, 2020-04-07, …
## $ open     <dbl> 153.00, 151.86, 155.10, 160.32, 169.59, 165.67, 166.36, 164.3…
## $ high     <dbl> 157.75, 155.48, 157.38, 166.50, 170.00, 166.67, 167.37, 165.5…
## $ low      <dbl> 150.82, 150.36, 152.19, 157.58, 163.26, 163.50, 163.33, 162.3…
## $ close    <dbl> 152.11, 155.26, 153.83, 165.27, 163.49, 165.13, 165.14, 165.5…
## $ volume   <dbl> 57969900, 49630700, 41243300, 67111700, 62769000, 48318200, 5…
## $ adjusted <dbl> 146.7080, 149.7461, 148.3670, 159.4007, 157.6839, 159.2657, 1…

Question

Which stocks have similar trading volume behavior over time?

1 Convert data to standardized form

sp_500_daily_volume_returns_tbl <- sp500_prices_tbl %>%
    
    select(symbol, date, volume) %>%
    
    filter(date >= ymd("2020-04-01")) %>%
    
    group_by(symbol) %>%
    mutate(lag_1 = lag(volume)) %>%
    ungroup() %>%
    
    filter(!is.na(lag_1)) %>%
    
    mutate(diff = volume - lag_1) %>%
    mutate(pct_return = diff / lag_1) %>%
    
    # Fix bad values that cause the UMAP error
    mutate(pct_return = ifelse(!is.finite(pct_return), 0, pct_return)) %>%
    
    select(symbol, date, pct_return)

sp_500_daily_volume_returns_tbl

## # A tibble: 502,038 × 3
##    symbol date       pct_return
##    <chr>  <date>          <dbl>
##  1 MSFT   2020-04-02    -0.144 
##  2 MSFT   2020-04-03    -0.169 
##  3 MSFT   2020-04-06     0.627 
##  4 MSFT   2020-04-07    -0.0647
##  5 MSFT   2020-04-08    -0.230 
##  6 MSFT   2020-04-09     0.0635
##  7 MSFT   2020-04-13    -0.184 
##  8 MSFT   2020-04-14     0.262 
##  9 MSFT   2020-04-15    -0.226 
## 10 MSFT   2020-04-16     0.233 
## # ℹ 502,028 more rows

2 Spread to object-characteristics format

stock_date_matrix_tbl <- sp_500_daily_volume_returns_tbl %>%
    spread(key = date, value = pct_return, fill = 0) %>% mutate(across(-symbol, ~ replace_na(., 0))) %>%
    mutate(across(-symbol, ~ ifelse(!is.finite(.), 0, .)))

stock_date_matrix_tbl

## # A tibble: 503 × 1,005
##    symbol `2020-04-02` `2020-04-03` `2020-04-06` `2020-04-07` `2020-04-08`
##    <chr>         <dbl>        <dbl>        <dbl>        <dbl>        <dbl>
##  1 A           -0.153       0.115         0.126       0.0631        0.0997
##  2 AAL          0.153       0.328         0.0834      0.464        -0.401 
##  3 AAPL        -0.0584     -0.217         0.554       0.00529      -0.168 
##  4 ABBV        -0.376      -0.00187       0.273       0.129         0.0541
##  5 ABNB         0           0             0           0             0     
##  6 ABT         -0.167      -0.101         0.354      -0.121        -0.148 
##  7 ACGL         0.148       0.00778       0.265      -0.0200       -0.364 
##  8 ACN          0.0191      0.243        -0.180      -0.0283       -0.179 
##  9 ADBE        -0.0803      0.0961        0.308       0.0539       -0.262 
## 10 ADI          0.126      -0.381         0.630       0.0384       -0.213 
## # ℹ 493 more rows
## # ℹ 999 more variables: `2020-04-09` <dbl>, `2020-04-13` <dbl>,
## #   `2020-04-14` <dbl>, `2020-04-15` <dbl>, `2020-04-16` <dbl>,
## #   `2020-04-17` <dbl>, `2020-04-20` <dbl>, `2020-04-21` <dbl>,
## #   `2020-04-22` <dbl>, `2020-04-23` <dbl>, `2020-04-24` <dbl>,
## #   `2020-04-27` <dbl>, `2020-04-28` <dbl>, `2020-04-29` <dbl>,
## #   `2020-04-30` <dbl>, `2020-05-01` <dbl>, `2020-05-04` <dbl>, …

3 Perform k-means clustering

symbol_date_cluster <- kmeans(
    stock_date_matrix_tbl %>% select(-symbol),
    centers = 3,
    nstart = 20
)

summary(symbol_date_cluster)

##              Length Class  Mode   
## cluster       503   -none- numeric
## centers      3012   -none- numeric
## totss           1   -none- numeric
## withinss        3   -none- numeric
## tot.withinss    1   -none- numeric
## betweenss       1   -none- numeric
## size            3   -none- numeric
## iter            1   -none- numeric
## ifault          1   -none- numeric

tidy(symbol_date_cluster)

## # A tibble: 3 × 1,007
##   `2020-04-02` `2020-04-03` `2020-04-06` `2020-04-07` `2020-04-08` `2020-04-09`
##          <dbl>        <dbl>        <dbl>        <dbl>        <dbl>        <dbl>
## 1       0.509       -0.567         1.24       -0.227        -0.214       -0.234
## 2      -0.139       -0.233         0.361      -0.0809       -0.208        0.498
## 3       0.0593      -0.0613        0.223       0.0110       -0.147        0.249
## # ℹ 1,001 more variables: `2020-04-13` <dbl>, `2020-04-14` <dbl>,
## #   `2020-04-15` <dbl>, `2020-04-16` <dbl>, `2020-04-17` <dbl>,
## #   `2020-04-20` <dbl>, `2020-04-21` <dbl>, `2020-04-22` <dbl>,
## #   `2020-04-23` <dbl>, `2020-04-24` <dbl>, `2020-04-27` <dbl>,
## #   `2020-04-28` <dbl>, `2020-04-29` <dbl>, `2020-04-30` <dbl>,
## #   `2020-05-01` <dbl>, `2020-05-04` <dbl>, `2020-05-05` <dbl>,
## #   `2020-05-06` <dbl>, `2020-05-07` <dbl>, `2020-05-08` <dbl>, …

glance(symbol_date_cluster)

## # A tibble: 1 × 4
##        totss tot.withinss  betweenss  iter
##        <dbl>        <dbl>      <dbl> <int>
## 1 237397091.      108858. 237288234.     3

augment(symbol_date_cluster, stock_date_matrix_tbl) %>%
    ggplot(aes(`2020-04-02`, `2020-04-14`, color = .cluster)) +
    geom_point()

4 Select optimal number of clusters

kclusts <- tibble(k = 1:9) %>%
    mutate(
        kclust = map(
            .x = k,
            .f = ~ kmeans(stock_date_matrix_tbl %>% select(-symbol), centers = .x, nstart = 20)
        ),
        glanced = map(.x = kclust, .f = glance)
    )

kclusts %>%
    unnest(glanced) %>%
    ggplot(aes(k, tot.withinss)) +
    geom_point() +
    geom_line()

Final clustering model

final_cluster <- kmeans(
    stock_date_matrix_tbl %>% select(-symbol),
    centers = 5,
    nstart = 20
)

augment(final_cluster, stock_date_matrix_tbl) %>%
    ggplot(aes(`2020-04-02`, `2020-04-14`, color = .cluster)) +
    geom_point()

5 Reduce dimension using UMAP

umap_results <- stock_date_matrix_tbl %>%
    select(-symbol) %>%
    umap()

umap_results_tbl <- umap_results$layout %>%
    as_tibble() %>%
    bind_cols(stock_date_matrix_tbl %>% select(symbol))

## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
## `.name_repair` is omitted as of tibble 2.0.0.
## ℹ Using compatibility `.name_repair`.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

umap_results_tbl

## # A tibble: 503 × 3
##        V1     V2 symbol
##     <dbl>  <dbl> <chr> 
##  1  1.37  -0.613 A     
##  2 -0.788  2.90  AAL   
##  3  0.728  0.964 AAPL  
##  4  0.802 -0.530 ABBV  
##  5  0.363  0.819 ABNB  
##  6 -0.796  0.258 ABT   
##  7 -0.404 -1.24  ACGL  
##  8  0.177  0.900 ACN   
##  9  1.01   0.744 ADBE  
## 10  1.85   0.956 ADI   
## # ℹ 493 more rows

umap_results_tbl %>%
    ggplot(aes(V1, V2)) +
    geom_point()

6 Visualize clusters by adding k-means results

kmeans_umap_tbl <- final_cluster %>%
    augment(stock_date_matrix_tbl) %>%
    select(symbol, .cluster) %>%
    left_join(umap_results_tbl, by = "symbol")

kmeans_umap_tbl

## # A tibble: 503 × 4
##    symbol .cluster     V1     V2
##    <chr>  <fct>     <dbl>  <dbl>
##  1 A      1         1.37  -0.613
##  2 AAL    1        -0.788  2.90 
##  3 AAPL   1         0.728  0.964
##  4 ABBV   1         0.802 -0.530
##  5 ABNB   1         0.363  0.819
##  6 ABT    1        -0.796  0.258
##  7 ACGL   1        -0.404 -1.24 
##  8 ACN    1         0.177  0.900
##  9 ADBE   1         1.01   0.744
## 10 ADI    1         1.85   0.956
## # ℹ 493 more rows

g <- kmeans_umap_tbl %>%
    mutate(
        text_label = str_glue(
            "Symbol: {symbol}
Cluster: {.cluster}"
        )
    ) %>%
    ggplot(aes(V1, V2, color = .cluster, text = text_label)) +
    geom_point()

g %>% ggplotly(tooltip = "text")

Apply 9

Tyler Roden

2026-04-14

Question

1 Convert data to standardized form

2 Spread to object-characteristics format

3 Perform k-means clustering

4 Select optimal number of clusters

Final clustering model

5 Reduce dimension using UMAP

6 Visualize clusters by adding k-means results