sp500_index_tbl <- read_csv("../00 data/sp500_index_tbl.csv")
## New names:
## Rows: 505 Columns: 9
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (6): symbol, company, identifier, sedol, sector, local_currency dbl (3): ...1,
## weight, shares_held
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
sp500_prices_tbl <- read_csv("../00 data/sp500_prices_tbl.csv")
## New names:
## Rows: 502541 Columns: 9
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): symbol dbl (7): ...1, open, high, low, close, volume, adjusted date (1):
## date
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
sp500_index_tbl %>% glimpse()
## Rows: 505
## Columns: 9
## $ ...1 <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, …
## $ symbol <chr> "MSFT", "AAPL", "NVDA", "AMZN", "META", "GOOGL", "BRK-B…
## $ company <chr> "MICROSOFT CORP", "APPLE INC", "NVIDIA CORP", "AMAZON.C…
## $ identifier <chr> "594918104", "037833100", "67066G104", "023135106", "30…
## $ sedol <chr> "2588173", "2046251", "2379504", "2000019", "B7TL820", …
## $ weight <dbl> 0.070781439, 0.056357717, 0.050531591, 0.037332751, 0.0…
## $ sector <chr> "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", …
## $ shares_held <dbl> 89663945, 175158625, 29805583, 110304496, 26548208, 711…
## $ local_currency <chr> "USD", "USD", "USD", "USD", "USD", "USD", "USD", "USD",…
sp500_prices_tbl %>% glimpse()
## Rows: 502,541
## Columns: 9
## $ ...1 <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18…
## $ symbol <chr> "MSFT", "MSFT", "MSFT", "MSFT", "MSFT", "MSFT", "MSFT", "MSFT…
## $ date <date> 2020-04-01, 2020-04-02, 2020-04-03, 2020-04-06, 2020-04-07, …
## $ open <dbl> 153.00, 151.86, 155.10, 160.32, 169.59, 165.67, 166.36, 164.3…
## $ high <dbl> 157.75, 155.48, 157.38, 166.50, 170.00, 166.67, 167.37, 165.5…
## $ low <dbl> 150.82, 150.36, 152.19, 157.58, 163.26, 163.50, 163.33, 162.3…
## $ close <dbl> 152.11, 155.26, 153.83, 165.27, 163.49, 165.13, 165.14, 165.5…
## $ volume <dbl> 57969900, 49630700, 41243300, 67111700, 62769000, 48318200, 5…
## $ adjusted <dbl> 146.7080, 149.7461, 148.3670, 159.4007, 157.6839, 159.2657, 1…
Which stocks have similar trading volume behavior over time?
sp_500_daily_volume_returns_tbl <- sp500_prices_tbl %>%
select(symbol, date, volume) %>%
filter(date >= ymd("2020-04-01")) %>%
group_by(symbol) %>%
mutate(lag_1 = lag(volume)) %>%
ungroup() %>%
filter(!is.na(lag_1)) %>%
mutate(diff = volume - lag_1) %>%
mutate(pct_return = diff / lag_1) %>%
# Fix bad values that cause the UMAP error
mutate(pct_return = ifelse(!is.finite(pct_return), 0, pct_return)) %>%
select(symbol, date, pct_return)
sp_500_daily_volume_returns_tbl
## # A tibble: 502,038 × 3
## symbol date pct_return
## <chr> <date> <dbl>
## 1 MSFT 2020-04-02 -0.144
## 2 MSFT 2020-04-03 -0.169
## 3 MSFT 2020-04-06 0.627
## 4 MSFT 2020-04-07 -0.0647
## 5 MSFT 2020-04-08 -0.230
## 6 MSFT 2020-04-09 0.0635
## 7 MSFT 2020-04-13 -0.184
## 8 MSFT 2020-04-14 0.262
## 9 MSFT 2020-04-15 -0.226
## 10 MSFT 2020-04-16 0.233
## # ℹ 502,028 more rows
stock_date_matrix_tbl <- sp_500_daily_volume_returns_tbl %>%
spread(key = date, value = pct_return, fill = 0) %>% mutate(across(-symbol, ~ replace_na(., 0))) %>%
mutate(across(-symbol, ~ ifelse(!is.finite(.), 0, .)))
stock_date_matrix_tbl
## # A tibble: 503 × 1,005
## symbol `2020-04-02` `2020-04-03` `2020-04-06` `2020-04-07` `2020-04-08`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 A -0.153 0.115 0.126 0.0631 0.0997
## 2 AAL 0.153 0.328 0.0834 0.464 -0.401
## 3 AAPL -0.0584 -0.217 0.554 0.00529 -0.168
## 4 ABBV -0.376 -0.00187 0.273 0.129 0.0541
## 5 ABNB 0 0 0 0 0
## 6 ABT -0.167 -0.101 0.354 -0.121 -0.148
## 7 ACGL 0.148 0.00778 0.265 -0.0200 -0.364
## 8 ACN 0.0191 0.243 -0.180 -0.0283 -0.179
## 9 ADBE -0.0803 0.0961 0.308 0.0539 -0.262
## 10 ADI 0.126 -0.381 0.630 0.0384 -0.213
## # ℹ 493 more rows
## # ℹ 999 more variables: `2020-04-09` <dbl>, `2020-04-13` <dbl>,
## # `2020-04-14` <dbl>, `2020-04-15` <dbl>, `2020-04-16` <dbl>,
## # `2020-04-17` <dbl>, `2020-04-20` <dbl>, `2020-04-21` <dbl>,
## # `2020-04-22` <dbl>, `2020-04-23` <dbl>, `2020-04-24` <dbl>,
## # `2020-04-27` <dbl>, `2020-04-28` <dbl>, `2020-04-29` <dbl>,
## # `2020-04-30` <dbl>, `2020-05-01` <dbl>, `2020-05-04` <dbl>, …
symbol_date_cluster <- kmeans(
stock_date_matrix_tbl %>% select(-symbol),
centers = 3,
nstart = 20
)
summary(symbol_date_cluster)
## Length Class Mode
## cluster 503 -none- numeric
## centers 3012 -none- numeric
## totss 1 -none- numeric
## withinss 3 -none- numeric
## tot.withinss 1 -none- numeric
## betweenss 1 -none- numeric
## size 3 -none- numeric
## iter 1 -none- numeric
## ifault 1 -none- numeric
tidy(symbol_date_cluster)
## # A tibble: 3 × 1,007
## `2020-04-02` `2020-04-03` `2020-04-06` `2020-04-07` `2020-04-08` `2020-04-09`
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.509 -0.567 1.24 -0.227 -0.214 -0.234
## 2 -0.139 -0.233 0.361 -0.0809 -0.208 0.498
## 3 0.0593 -0.0613 0.223 0.0110 -0.147 0.249
## # ℹ 1,001 more variables: `2020-04-13` <dbl>, `2020-04-14` <dbl>,
## # `2020-04-15` <dbl>, `2020-04-16` <dbl>, `2020-04-17` <dbl>,
## # `2020-04-20` <dbl>, `2020-04-21` <dbl>, `2020-04-22` <dbl>,
## # `2020-04-23` <dbl>, `2020-04-24` <dbl>, `2020-04-27` <dbl>,
## # `2020-04-28` <dbl>, `2020-04-29` <dbl>, `2020-04-30` <dbl>,
## # `2020-05-01` <dbl>, `2020-05-04` <dbl>, `2020-05-05` <dbl>,
## # `2020-05-06` <dbl>, `2020-05-07` <dbl>, `2020-05-08` <dbl>, …
glance(symbol_date_cluster)
## # A tibble: 1 × 4
## totss tot.withinss betweenss iter
## <dbl> <dbl> <dbl> <int>
## 1 237397091. 108858. 237288234. 3
augment(symbol_date_cluster, stock_date_matrix_tbl) %>%
ggplot(aes(`2020-04-02`, `2020-04-14`, color = .cluster)) +
geom_point()
kclusts <- tibble(k = 1:9) %>%
mutate(
kclust = map(
.x = k,
.f = ~ kmeans(stock_date_matrix_tbl %>% select(-symbol), centers = .x, nstart = 20)
),
glanced = map(.x = kclust, .f = glance)
)
kclusts %>%
unnest(glanced) %>%
ggplot(aes(k, tot.withinss)) +
geom_point() +
geom_line()
final_cluster <- kmeans(
stock_date_matrix_tbl %>% select(-symbol),
centers = 5,
nstart = 20
)
augment(final_cluster, stock_date_matrix_tbl) %>%
ggplot(aes(`2020-04-02`, `2020-04-14`, color = .cluster)) +
geom_point()
umap_results <- stock_date_matrix_tbl %>%
select(-symbol) %>%
umap()
umap_results_tbl <- umap_results$layout %>%
as_tibble() %>%
bind_cols(stock_date_matrix_tbl %>% select(symbol))
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
## `.name_repair` is omitted as of tibble 2.0.0.
## ℹ Using compatibility `.name_repair`.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
umap_results_tbl
## # A tibble: 503 × 3
## V1 V2 symbol
## <dbl> <dbl> <chr>
## 1 1.37 -0.613 A
## 2 -0.788 2.90 AAL
## 3 0.728 0.964 AAPL
## 4 0.802 -0.530 ABBV
## 5 0.363 0.819 ABNB
## 6 -0.796 0.258 ABT
## 7 -0.404 -1.24 ACGL
## 8 0.177 0.900 ACN
## 9 1.01 0.744 ADBE
## 10 1.85 0.956 ADI
## # ℹ 493 more rows
umap_results_tbl %>%
ggplot(aes(V1, V2)) +
geom_point()
kmeans_umap_tbl <- final_cluster %>%
augment(stock_date_matrix_tbl) %>%
select(symbol, .cluster) %>%
left_join(umap_results_tbl, by = "symbol")
kmeans_umap_tbl
## # A tibble: 503 × 4
## symbol .cluster V1 V2
## <chr> <fct> <dbl> <dbl>
## 1 A 1 1.37 -0.613
## 2 AAL 1 -0.788 2.90
## 3 AAPL 1 0.728 0.964
## 4 ABBV 1 0.802 -0.530
## 5 ABNB 1 0.363 0.819
## 6 ABT 1 -0.796 0.258
## 7 ACGL 1 -0.404 -1.24
## 8 ACN 1 0.177 0.900
## 9 ADBE 1 1.01 0.744
## 10 ADI 1 1.85 0.956
## # ℹ 493 more rows
g <- kmeans_umap_tbl %>%
mutate(
text_label = str_glue(
"Symbol: {symbol}
Cluster: {.cluster}"
)
) %>%
ggplot(aes(V1, V2, color = .cluster, text = text_label)) +
geom_point()
g %>% ggplotly(tooltip = "text")