library(tidyverse)
library(tidyquant) # for financial analysis
library(broom) # for tidy model results
library(umap) # for dimension reduction
library(plotly) # for interactive visualization
# Get info on companies listed in S&P500
sp500_index_tbl <- tq_index("SP500")
# Get individual stocks from S&P500
sp500_symbols <- sp500_index_tbl %>% distinct(symbol) %>% pull()
# Get stock prices of the companies
sp500_prices_tbl <- tq_get(sp500_symbols, from = "2020-04-01")
write.csv(sp500_index_tbl, "../00_data/sp500_index_tbl.csv")
write.csv(sp500_prices_tbl, "../00_data/sp500_prices_tbl.csv")
sp500_index_tbl <- read_csv("../00_data/sp500_index_tbl.csv")
sp500_prices_tbl <- read_csv("../00_data/sp500_prices_tbl.csv")
sp500_index_tbl %>% glimpse()
## Rows: 505
## Columns: 9
## $ ...1 <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, …
## $ symbol <chr> "MSFT", "AAPL", "NVDA", "AMZN", "META", "GOOGL", "BRK-B…
## $ company <chr> "MICROSOFT CORP", "APPLE INC", "NVIDIA CORP", "AMAZON.C…
## $ identifier <chr> "594918104", "037833100", "67066G104", "023135106", "30…
## $ sedol <chr> "2588173", "2046251", "2379504", "2000019", "B7TL820", …
## $ weight <dbl> 0.070781439, 0.056357717, 0.050531591, 0.037332751, 0.0…
## $ sector <chr> "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", …
## $ shares_held <dbl> 89663945, 175158625, 29805583, 110304496, 26548208, 711…
## $ local_currency <chr> "USD", "USD", "USD", "USD", "USD", "USD", "USD", "USD",…
sp500_prices_tbl %>% glimpse()
## Rows: 502,541
## Columns: 9
## $ ...1 <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18…
## $ symbol <chr> "MSFT", "MSFT", "MSFT", "MSFT", "MSFT", "MSFT", "MSFT", "MSFT…
## $ date <date> 2020-04-01, 2020-04-02, 2020-04-03, 2020-04-06, 2020-04-07, …
## $ open <dbl> 153.00, 151.86, 155.10, 160.32, 169.59, 165.67, 166.36, 164.3…
## $ high <dbl> 157.75, 155.48, 157.38, 166.50, 170.00, 166.67, 167.37, 165.5…
## $ low <dbl> 150.82, 150.36, 152.19, 157.58, 163.26, 163.50, 163.33, 162.3…
## $ close <dbl> 152.11, 155.26, 153.83, 165.27, 163.49, 165.13, 165.14, 165.5…
## $ volume <dbl> 57969900, 49630700, 41243300, 67111700, 62769000, 48318200, 5…
## $ adjusted <dbl> 146.7080, 149.7461, 148.3670, 159.4007, 157.6839, 159.2657, 1…
Which stock prices behave similarly?
Our main objective is to identify stocks that exhibit similar price behaviors over time. By doing so, we aim to gain insights into the relationships between different companies, uncovering potential competitors and sector affiliations.
sp_500_daily_returns_tbl <- sp500_prices_tbl %>%
select(symbol, date, adjusted) %>%
filter(date >= ymd("2018-01-01")) %>%
group_by(symbol) %>%
mutate(lag_1 = lag(adjusted)) %>%
ungroup() %>%
filter(!is.na(lag_1)) %>%
mutate(diff = adjusted - lag_1) %>%
mutate(pct_return = diff / lag_1) %>%
select(symbol, date, pct_return)
sp_500_daily_returns_tbl
## # A tibble: 502,038 × 3
## symbol date pct_return
## <chr> <date> <dbl>
## 1 MSFT 2020-04-02 0.0207
## 2 MSFT 2020-04-03 -0.00921
## 3 MSFT 2020-04-06 0.0744
## 4 MSFT 2020-04-07 -0.0108
## 5 MSFT 2020-04-08 0.0100
## 6 MSFT 2020-04-09 0.0000605
## 7 MSFT 2020-04-13 0.00224
## 8 MSFT 2020-04-14 0.0495
## 9 MSFT 2020-04-15 -0.0105
## 10 MSFT 2020-04-16 0.0300
## # ℹ 502,028 more rows
stock_date_matrix_tbl <- sp_500_daily_returns_tbl %>%
spread(key = date, value = pct_return, fill = 0)
stock_date_matrix_tbl
## # A tibble: 503 × 1,005
## symbol `2020-04-02` `2020-04-03` `2020-04-06` `2020-04-07` `2020-04-08`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 A 0.0489 -0.0259 0.0559 -0.00444 0.0359
## 2 AAL -0.0589 -0.0666 0.0117 0.0758 0.109
## 3 AAPL 0.0167 -0.0144 0.0872 -0.0116 0.0256
## 4 ABBV 0.0233 -0.0234 0.0322 -0.00449 0.0420
## 5 ABNB 0 0 0 0 0
## 6 ABT 0.0375 0.000126 0.0413 -0.00967 0.0369
## 7 ACGL 0.0115 -0.0650 0.0983 0.0314 0.0291
## 8 ACN 0.0103 -0.0264 0.0914 -0.0116 0.0464
## 9 ADBE 0.00913 -0.0341 0.0869 -0.0320 0.0267
## 10 ADI 0.0429 -0.0130 0.107 0.00470 0.0535
## # ℹ 493 more rows
## # ℹ 999 more variables: `2020-04-09` <dbl>, `2020-04-13` <dbl>,
## # `2020-04-14` <dbl>, `2020-04-15` <dbl>, `2020-04-16` <dbl>,
## # `2020-04-17` <dbl>, `2020-04-20` <dbl>, `2020-04-21` <dbl>,
## # `2020-04-22` <dbl>, `2020-04-23` <dbl>, `2020-04-24` <dbl>,
## # `2020-04-27` <dbl>, `2020-04-28` <dbl>, `2020-04-29` <dbl>,
## # `2020-04-30` <dbl>, `2020-05-01` <dbl>, `2020-05-04` <dbl>, …
set.seed(123)
kmeans_3 <- kmeans(stock_date_matrix_tbl %>% select(-symbol), centers = 3, nstart = 20)
clustered_tbl <- augment(kmeans_3, stock_date_matrix_tbl)
k_values <- tibble(k = 1:9) %>%
mutate(kclust = map(k, ~kmeans(stock_date_matrix_tbl %>% select(-symbol), centers = .x, nstart = 20)),
glanced = map(kclust, glance)) %>%
unnest(glanced)
k_values %>%
ggplot(aes(x = k, y = tot.withinss)) +
geom_line() +
geom_point() +
labs(title = "Elbow Method for Choosing K",
x = "Number of Clusters",
y = "Total Within-Cluster Sum of Squares")
umap_results <- stock_date_matrix_tbl %>%
select(-symbol) %>%
umap()
umap_layout <- umap_results$layout %>%
as_tibble() %>%
rename(UMAP1 = V1, UMAP2 = V2) %>%
bind_cols(clustered_tbl %>% select(symbol, .cluster))
# Plot
ggplot(umap_layout, aes(x = UMAP1, y = UMAP2, color = as.factor(.cluster))) +
geom_point(alpha = 0.7) +
labs(title = "UMAP Projection of Stock Returns",
subtitle = "Colored by K-Means Cluster",
color = "Cluster")
umap_layout <- umap_layout %>%
left_join(sp500_index_tbl %>% select(symbol, company = company), by = "symbol") %>%
mutate(text_label = str_glue("Company: {company}
Symbol: {symbol}
Cluster: {.cluster}"))
ggplotly(
ggplot(umap_layout, aes(x = UMAP1, y = UMAP2, color = as.factor(.cluster), text = text_label)) +
geom_point(alpha = 0.7) +
labs(title = "UMAP Projection of Stock Clusters", color = "Cluster"),
tooltip = "text"
)