library(tidyverse)
library(tidyquant) # for financial analysis
library(broom) # for tidy model results
library(umap) # for dimension reduction
library(plotly) # for interactive visualization
# Get info on companies listed in S&P500
sp500_index_tbl <- tq_index("SP500")
# Get individual stocks from S&P500
sp500_symbols <- sp500_index_tbl %>%
  distinct(symbol) %>%
  pull()
# Get stock prices of the companies
sp500_prices_tbl <- tq_get(sp500_symbols, from = "2020-04-01")
# Cache both tables on disk. row.names = FALSE keeps the row index out of the
# files; otherwise read_csv() brings it back as a spurious `...1` column.
write.csv(sp500_index_tbl, "../00_data/sp500_index_tbl.csv", row.names = FALSE)
write.csv(sp500_prices_tbl, "../00_data/sp500_prices_tbl.csv", row.names = FALSE)
Import data
# Reload the index and price tables from the CSV files written earlier
sp500_index_tbl <- read_csv(file = "../00_data/sp500_index_tbl.csv")
sp500_prices_tbl <- read_csv(file = "../00_data/sp500_prices_tbl.csv")
# Show the columns and types of the index table
glimpse(sp500_index_tbl)
## Rows: 504
## Columns: 9
## $ ...1 <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, …
## $ symbol <chr> "NVDA", "AAPL", "MSFT", "AMZN", "META", "GOOGL", "GOOG"…
## $ company <chr> "NVIDIA CORP", "APPLE INC", "MICROSOFT CORP", "AMAZON.C…
## $ identifier <chr> "67066G104", "037833100", "594918104", "023135106", "30…
## $ sedol <chr> "2379504", "2046251", "2588173", "2000019", "B7TL820", …
## $ weight <dbl> 0.070339526, 0.069431688, 0.062504001, 0.038087149, 0.0…
## $ sector <chr> "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", …
## $ shares_held <dbl> 301446769, 186310490, 91083580, 114471018, 26772470, 71…
## $ local_currency <chr> "USD", "USD", "USD", "USD", "USD", "USD", "USD", "USD",…
# Show the columns and types of the price table
glimpse(sp500_prices_tbl)
## Rows: 575,165
## Columns: 9
## $ ...1 <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18…
## $ symbol <chr> "NVDA", "NVDA", "NVDA", "NVDA", "NVDA", "NVDA", "NVDA", "NVDA…
## $ date <date> 2020-04-01, 2020-04-02, 2020-04-03, 2020-04-06, 2020-04-07, …
## $ open <dbl> 6.39125, 6.10600, 6.34900, 6.38100, 6.93250, 6.58525, 6.80000…
## $ high <dbl> 6.53825, 6.40000, 6.39075, 6.74700, 6.95625, 6.69875, 6.82300…
## $ low <dbl> 6.03200, 6.05775, 5.95975, 6.32325, 6.43250, 6.51500, 6.51050…
## $ close <dbl> 6.07675, 6.38675, 6.09775, 6.71000, 6.47575, 6.67375, 6.57375…
## $ volume <dbl> 656912000, 675764000, 663212000, 727884000, 784520000, 542444…
## $ adjusted <dbl> 6.055418, 6.364329, 6.076344, 6.686445, 6.453017, 6.650321, 6…
Which stock prices behave similarly?
Our main objective is to identify stocks that exhibit similar price behaviors over time. By doing so, we aim to gain insights into the relationships between different companies, uncovering potential competitors and sector affiliations.
Why It Matters: Understanding which companies are related is crucial for various reasons, such as spotting potential competitors and recognizing shared sector affiliations.
Assignment Details: Your task is to analyze the historical price data of various stocks and determine which stocks behave similarly. We will employ clustering techniques to accomplish this task effectively.
To compare data effectively, it must be standardized or normalized. Why? Because comparing values (like stock prices) of vastly different magnitudes is impractical. So, we’ll standardize by converting from adjusted stock price (in dollars) to daily returns (as percent change from the previous day). Here’s the formula:
\[ \text{return}_{\text{daily}} = \frac{\text{price}_{i} - \text{price}_{i-1}}{\text{price}_{i-1}} \]
# Re-inspect the price table before transforming it into returns
glimpse(sp500_prices_tbl)
## Rows: 575,165
## Columns: 9
## $ ...1 <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18…
## $ symbol <chr> "NVDA", "NVDA", "NVDA", "NVDA", "NVDA", "NVDA", "NVDA", "NVDA…
## $ date <date> 2020-04-01, 2020-04-02, 2020-04-03, 2020-04-06, 2020-04-07, …
## $ open <dbl> 6.39125, 6.10600, 6.34900, 6.38100, 6.93250, 6.58525, 6.80000…
## $ high <dbl> 6.53825, 6.40000, 6.39075, 6.74700, 6.95625, 6.69875, 6.82300…
## $ low <dbl> 6.03200, 6.05775, 5.95975, 6.32325, 6.43250, 6.51500, 6.51050…
## $ close <dbl> 6.07675, 6.38675, 6.09775, 6.71000, 6.47575, 6.67375, 6.57375…
## $ volume <dbl> 656912000, 675764000, 663212000, 727884000, 784520000, 542444…
## $ adjusted <dbl> 6.055418, 6.364329, 6.076344, 6.686445, 6.453017, 6.650321, 6…
# Convert adjusted prices into daily returns per symbol:
# (price_i - price_{i-1}) / price_{i-1}
sp_500_daily_returns_tbl <- sp500_prices_tbl %>%
  select(symbol, date, adjusted) %>%
  filter(date >= ymd("2018-01-01")) %>%
  group_by(symbol) %>%
  # order_by = date makes the "previous day" lookup independent of the row
  # order in the input, while leaving the row order of the result unchanged
  mutate(lag_1 = lag(adjusted, order_by = date)) %>%
  ungroup() %>%
  # The earliest observation of each symbol has no previous price; drop it
  filter(!is.na(lag_1)) %>%
  mutate(pct_return = (adjusted - lag_1) / lag_1) %>%
  select(symbol, date, pct_return)
sp_500_daily_returns_tbl
## # A tibble: 574,662 × 3
## symbol date pct_return
## <chr> <date> <dbl>
## 1 NVDA 2020-04-02 0.0510
## 2 NVDA 2020-04-03 -0.0452
## 3 NVDA 2020-04-06 0.100
## 4 NVDA 2020-04-07 -0.0349
## 5 NVDA 2020-04-08 0.0306
## 6 NVDA 2020-04-09 -0.0150
## 7 NVDA 2020-04-13 0.0262
## 8 NVDA 2020-04-14 0.0523
## 9 NVDA 2020-04-15 -0.0110
## 10 NVDA 2020-04-16 0.0494
## # ℹ 574,652 more rows
We’ll convert the daily returns (percentage change from one day to the next) to object-characteristics format, also known as the user-item format. Users are identified by the symbol (company), and items are represented by the pct_return at each date.
# Reshape to one row per symbol and one column per date (user-item format).
# pivot_wider() replaces the superseded spread(); the extra arguments and steps
# keep spread()'s behaviour: date columns in sorted order, rows sorted by
# symbol, and every missing return (absent or NA) set to 0.
stock_date_matrix_tbl <- sp_500_daily_returns_tbl %>%
  pivot_wider(
    names_from = date,
    values_from = pct_return,
    values_fill = 0,
    names_sort = TRUE
  ) %>%
  # values_fill only covers absent symbol/date pairs; also zero explicit NAs
  mutate(across(-symbol, ~ replace_na(.x, 0))) %>%
  arrange(symbol)
stock_date_matrix_tbl
## # A tibble: 503 × 1,158
## symbol `2020-04-02` `2020-04-03` `2020-04-06` `2020-04-07` `2020-04-08`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 A 0.0489 -0.0259 0.0560 -0.00444 0.0359
## 2 AAPL 0.0167 -0.0144 0.0872 -0.0116 0.0256
## 3 ABBV 0.0233 -0.0234 0.0322 -0.00449 0.0420
## 4 ABNB 0 0 0 0 0
## 5 ABT 0.0375 0.000126 0.0413 -0.00967 0.0369
## 6 ACGL 0.0115 -0.0650 0.0983 0.0314 0.0291
## 7 ACN 0.0103 -0.0264 0.0914 -0.0116 0.0464
## 8 ADBE 0.00913 -0.0341 0.0869 -0.0320 0.0267
## 9 ADI 0.0429 -0.0130 0.107 0.00470 0.0535
## 10 ADM 0.0136 0.00932 0.0326 0.00559 0.0139
## # ℹ 493 more rows
## # ℹ 1,152 more variables: `2020-04-09` <dbl>, `2020-04-13` <dbl>,
## # `2020-04-14` <dbl>, `2020-04-15` <dbl>, `2020-04-16` <dbl>,
## # `2020-04-17` <dbl>, `2020-04-20` <dbl>, `2020-04-21` <dbl>,
## # `2020-04-22` <dbl>, `2020-04-23` <dbl>, `2020-04-24` <dbl>,
## # `2020-04-27` <dbl>, `2020-04-28` <dbl>, `2020-04-29` <dbl>,
## # `2020-04-30` <dbl>, `2020-05-01` <dbl>, `2020-05-04` <dbl>, …
# Fit k-means with three centers on the return columns (symbol dropped),
# using 20 random starts
stock_date_matrix_cluster <- stock_date_matrix_tbl %>%
  select(-symbol) %>%
  kmeans(centers = 3, nstart = 20)
summary(stock_date_matrix_cluster)
## Length Class Mode
## cluster 503 -none- numeric
## centers 3471 -none- numeric
## totss 1 -none- numeric
## withinss 3 -none- numeric
## tot.withinss 1 -none- numeric
## betweenss 1 -none- numeric
## size 3 -none- numeric
## iter 1 -none- numeric
## ifault 1 -none- numeric
# Per-cluster summary of the fitted model (one row per cluster)
stock_date_matrix_cluster %>% tidy()
## # A tibble: 3 × 1,160
## `2020-04-02` `2020-04-03` `2020-04-06` `2020-04-07` `2020-04-08` `2020-04-09`
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.00636 -0.0192 0.103 0.0317 0.0612 0.0372
## 2 0.0114 -0.0193 0.0931 -0.00156 0.0382 0.00745
## 3 0.0179 -0.0132 0.0636 0.000700 0.0377 0.0231
## # ℹ 1,154 more variables: `2020-04-13` <dbl>, `2020-04-14` <dbl>,
## # `2020-04-15` <dbl>, `2020-04-16` <dbl>, `2020-04-17` <dbl>,
## # `2020-04-20` <dbl>, `2020-04-21` <dbl>, `2020-04-22` <dbl>,
## # `2020-04-23` <dbl>, `2020-04-24` <dbl>, `2020-04-27` <dbl>,
## # `2020-04-28` <dbl>, `2020-04-29` <dbl>, `2020-04-30` <dbl>,
## # `2020-05-01` <dbl>, `2020-05-04` <dbl>, `2020-05-05` <dbl>,
## # `2020-05-06` <dbl>, `2020-05-07` <dbl>, `2020-05-08` <dbl>, …
# One-row model summary: total, within and between sums of squares
stock_date_matrix_cluster %>% glance()
## # A tibble: 1 × 4
## totss tot.withinss betweenss iter
## <dbl> <dbl> <dbl> <int>
## 1 184. 158. 25.7 3
# Attach the three-cluster assignment to each stock and plot the returns of two
# dates against each other, coloured by cluster
stock_date_matrix_cluster %>%
  augment(stock_date_matrix_tbl) %>%
  ggplot(mapping = aes(x = `2020-04-02`, y = `2020-06-11`, color = .cluster)) +
  geom_point()
# Fit k-means for k = 1..9 and keep each model's one-row summary
kclusts <- tibble(k = 1:9) %>%
  mutate(
    kclust = map(k, function(n_centers) {
      kmeans(
        select(stock_date_matrix_tbl, -symbol),
        centers = n_centers,
        nstart = 20
      )
    }),
    glanced = map(kclust, glance)
  )
# Elbow plot: total within-cluster sum of squares for each k
ggplot(
  data = unnest(kclusts, glanced),
  mapping = aes(x = k, y = tot.withinss)
) +
  geom_point() +
  geom_line()
# Fit the final model with five centers on the return columns
final_cluster <- stock_date_matrix_tbl %>%
  select(-symbol) %>%
  kmeans(centers = 5, nstart = 20)
# Plot the assignments of the final five-cluster model (previously this plotted
# the earlier three-cluster fit by mistake)
final_cluster %>%
  augment(stock_date_matrix_tbl) %>%
  ggplot(aes(`2020-04-02`, `2020-04-03`, color = .cluster)) +
  geom_point()
# Project the return columns (symbol dropped) into a low-dimensional layout
umap_results <- stock_date_matrix_tbl %>%
  select(-symbol) %>%
  umap()
# Name the layout columns explicitly so the non-deprecated as_tibble() can be
# used without relying on compatibility name repair; V1/V2 are the names the
# rest of the analysis expects.
umap_layout <- umap_results$layout
colnames(umap_layout) <- c("V1", "V2")
umap_results_tbl <- umap_layout %>%
  as_tibble() %>%
  # Rows are in the same order as stock_date_matrix_tbl, so bind symbols back on
  bind_cols(stock_date_matrix_tbl %>% select(symbol))
umap_results_tbl
## # A tibble: 503 × 3
## V1 V2 symbol
## <dbl> <dbl> <chr>
## 1 2.60 1.85 A
## 2 1.24 3.17 AAPL
## 3 2.64 -0.513 ABBV
## 4 -2.25 0.799 ABNB
## 5 2.43 0.910 ABT
## 6 -1.16 -0.737 ACGL
## 7 1.20 1.88 ACN
## 8 1.19 3.51 ADBE
## 9 -0.228 3.76 ADI
## 10 -0.560 -0.558 ADM
## # ℹ 493 more rows
# Scatter plot of the two UMAP coordinates
ggplot(data = umap_results_tbl, mapping = aes(x = V1, y = V2)) +
  geom_point()
# Combine cluster assignment, UMAP coordinates and company name per symbol
kmeans_umap_tbl <- final_cluster %>%
  augment(stock_date_matrix_tbl) %>%
  select(symbol, .cluster) %>%
  # Add umap results; the join key is explicit so it cannot silently change
  left_join(umap_results_tbl, by = "symbol") %>%
  # Add company name
  left_join(
    sp500_index_tbl %>% select(symbol, company),
    by = "symbol"
  )
kmeans_umap_tbl
## # A tibble: 503 × 5
## symbol .cluster V1 V2 company
## <chr> <fct> <dbl> <dbl> <chr>
## 1 A 4 2.60 1.85 AGILENT TECHNOLOGIES INC
## 2 AAPL 2 1.24 3.17 APPLE INC
## 3 ABBV 4 2.64 -0.513 ABBVIE INC
## 4 ABNB 2 -2.25 0.799 AIRBNB INC CLASS A
## 5 ABT 4 2.43 0.910 ABBOTT LABORATORIES
## 6 ACGL 1 -1.16 -0.737 ARCH CAPITAL GROUP LTD
## 7 ACN 1 1.20 1.88 ACCENTURE PLC CL A
## 8 ADBE 2 1.19 3.51 ADOBE INC
## 9 ADI 2 -0.228 3.76 ANALOG DEVICES INC
## 10 ADM 1 -0.560 -0.558 ARCHER DANIELS MIDLAND CO
## # ℹ 493 more rows
# Build the UMAP scatter coloured by cluster, with a per-point text label
g <- kmeans_umap_tbl %>%
  mutate(
    text_label = str_glue("Stock: {symbol}
Cluster: {.cluster}
Company: {company}")
  ) %>%
  ggplot(mapping = aes(x = V1, y = V2, color = .cluster, text = text_label)) +
  geom_point()
# Render interactively, showing only the text label in the tooltip
ggplotly(g, tooltip = "text")