Set up

library(tidyverse)
library(tidyquant) # for financial analysis
library(broom) # for tidy model results
library(umap)  # for dimension reduction
library(plotly) # for interactive visualization

Data

# Get info on companies listed in S&P500
sp500_index_tbl <- tq_index("SP500")

# Get individual stocks from S&P500
sp500_symbols <- sp500_index_tbl %>% distinct(symbol) %>% pull() 

# Get stock prices of the companies
sp500_prices_tbl <- tq_get(sp500_symbols, from = "2020-04-01")

write.csv(sp500_index_tbl, "../00_data/sp500_index_tbl.csv")
write.csv(sp500_prices_tbl, "../00_data/sp500_prices_tbl.csv")

Import data

sp500_index_tbl <- read_csv("../00_data/sp500_index_tbl.csv")
sp500_prices_tbl <- read_csv("../00_data/sp500_prices_tbl.csv")
sp500_index_tbl %>% glimpse()
## Rows: 504
## Columns: 9
## $ ...1           <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, …
## $ symbol         <chr> "AAPL", "MSFT", "NVDA", "AMZN", "META", "BRK-B", "GOOGL…
## $ company        <chr> "APPLE INC", "MICROSOFT CORP", "NVIDIA CORP", "AMAZON.C…
## $ identifier     <chr> "037833100", "594918104", "67066G104", "023135106", "30…
## $ sedol          <chr> "2046251", "2588173", "2379504", "2000019", "B7TL820", …
## $ weight         <dbl> 0.063852496, 0.063282113, 0.058805166, 0.038154398, 0.0…
## $ sector         <chr> "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", …
## $ shares_held    <dbl> 185532921, 91814848, 302468387, 116491471, 27046746, 22…
## $ local_currency <chr> "USD", "USD", "USD", "USD", "USD", "USD", "USD", "USD",…
sp500_prices_tbl %>% glimpse()
## Rows: 629,218
## Columns: 9
## $ ...1     <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18…
## $ symbol   <chr> "AAPL", "AAPL", "AAPL", "AAPL", "AAPL", "AAPL", "AAPL", "AAPL…
## $ date     <date> 2020-04-01, 2020-04-02, 2020-04-03, 2020-04-06, 2020-04-07, …
## $ open     <dbl> 61.6250, 60.0850, 60.7000, 62.7250, 67.7000, 65.6850, 67.1750…
## $ high     <dbl> 62.1800, 61.2875, 61.4250, 65.7775, 67.9250, 66.8425, 67.5175…
## $ low      <dbl> 59.7825, 59.2250, 59.7425, 62.3450, 64.7500, 65.3075, 66.1750…
## $ close    <dbl> 60.2275, 61.2325, 60.3525, 65.6175, 64.8575, 66.5175, 66.9975…
## $ volume   <dbl> 176218400, 165934000, 129880000, 201820400, 202887200, 168895…
## $ adjusted <dbl> 58.46381, 59.43937, 58.58514, 63.69596, 62.95821, 64.56960, 6…

Question

Which stock prices behave similarly?

Our main objective is to identify stocks that exhibit similar price behaviors over time. By doing so, we aim to gain insights into the relationships between different companies, uncovering potential competitors and sector affiliations.

Why It Matters Understanding which companies are related is crucial for various reasons:

Assignment Details Your task is to analyze the historical price data of various stocks and determine which stocks behave similarly. We will employ clustering techniques to accomplish this task effectively.

1 Convert data to standardized form

To compare data effectively, it must be standardized or normalized. Why? Because comparing values (like stock prices) of vastly different magnitudes is impractical. So, we’ll standardize by converting from adjusted stock price (in dollars) to daily returns (as percent change from the previous day). Here’s the formula:

\[ return_{daily} = \frac{price_{i}-price_{i-1}}{price_{i-1}} \]

sp500_prices_tbl %>% glimpse()
## Rows: 629,218
## Columns: 9
## $ ...1     <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18…
## $ symbol   <chr> "AAPL", "AAPL", "AAPL", "AAPL", "AAPL", "AAPL", "AAPL", "AAPL…
## $ date     <date> 2020-04-01, 2020-04-02, 2020-04-03, 2020-04-06, 2020-04-07, …
## $ open     <dbl> 61.6250, 60.0850, 60.7000, 62.7250, 67.7000, 65.6850, 67.1750…
## $ high     <dbl> 62.1800, 61.2875, 61.4250, 65.7775, 67.9250, 66.8425, 67.5175…
## $ low      <dbl> 59.7825, 59.2250, 59.7425, 62.3450, 64.7500, 65.3075, 66.1750…
## $ close    <dbl> 60.2275, 61.2325, 60.3525, 65.6175, 64.8575, 66.5175, 66.9975…
## $ volume   <dbl> 176218400, 165934000, 129880000, 201820400, 202887200, 168895…
## $ adjusted <dbl> 58.46381, 59.43937, 58.58514, 63.69596, 62.95821, 64.56960, 6…
# Apply your data transformation skills!
sp_500_daily_returns_tbl <- sp500_prices_tbl %>%
    
    select(symbol, date, adjusted) %>%
    
    filter(date >= ymd("2018-01-01")) %>%
    
    group_by(symbol) %>%
    mutate(lag_1 = lag(adjusted)) %>%
    ungroup() %>%
    
    filter(!is.na(lag_1)) %>%
    
    mutate(diff = adjusted - lag_1) %>%
    mutate(pct_return = diff / lag_1) %>%
    
    select(symbol, date, pct_return)

sp_500_daily_returns_tbl
## # A tibble: 628,715 × 3
##    symbol date       pct_return
##    <chr>  <date>          <dbl>
##  1 AAPL   2020-04-02    0.0167 
##  2 AAPL   2020-04-03   -0.0144 
##  3 AAPL   2020-04-06    0.0872 
##  4 AAPL   2020-04-07   -0.0116 
##  5 AAPL   2020-04-08    0.0256 
##  6 AAPL   2020-04-09    0.00722
##  7 AAPL   2020-04-13    0.0196 
##  8 AAPL   2020-04-14    0.0505 
##  9 AAPL   2020-04-15   -0.00913
## 10 AAPL   2020-04-16    0.00795
## # ℹ 628,705 more rows

2 Spread to object-characteristics format

We’ll convert the daily returns (percentage change from one day to the next) to object-characteristics format, also known as the user-item format. Users are identified by the symbol (company), and items are represented by the pct_return at each date.

stock_date_matrix_tbl <- sp_500_daily_returns_tbl %>%
    spread(key = date, value = pct_return, fill = 0)

stock_date_matrix_tbl
## # A tibble: 503 × 1,264
##    symbol `2020-04-02` `2020-04-03` `2020-04-06` `2020-04-07` `2020-04-08`
##    <chr>         <dbl>        <dbl>        <dbl>        <dbl>        <dbl>
##  1 A           0.0489     -0.0259         0.0559     -0.00444       0.0359
##  2 AAPL        0.0167     -0.0144         0.0872     -0.0116        0.0256
##  3 ABBV        0.0233     -0.0234         0.0322     -0.00449       0.0420
##  4 ABNB        0           0              0           0             0     
##  5 ABT         0.0375      0.000126       0.0413     -0.00967       0.0369
##  6 ACGL        0.0115     -0.0650         0.0983      0.0314        0.0291
##  7 ACN         0.0103     -0.0264         0.0914     -0.0116        0.0464
##  8 ADBE        0.00913    -0.0341         0.0869     -0.0320        0.0267
##  9 ADI         0.0429     -0.0130         0.107       0.00470       0.0535
## 10 ADM         0.0136      0.00932        0.0326      0.00559       0.0139
## # ℹ 493 more rows
## # ℹ 1,258 more variables: `2020-04-09` <dbl>, `2020-04-13` <dbl>,
## #   `2020-04-14` <dbl>, `2020-04-15` <dbl>, `2020-04-16` <dbl>,
## #   `2020-04-17` <dbl>, `2020-04-20` <dbl>, `2020-04-21` <dbl>,
## #   `2020-04-22` <dbl>, `2020-04-23` <dbl>, `2020-04-24` <dbl>,
## #   `2020-04-27` <dbl>, `2020-04-28` <dbl>, `2020-04-29` <dbl>,
## #   `2020-04-30` <dbl>, `2020-05-01` <dbl>, `2020-05-04` <dbl>, …

3 Performance Kmeans Clustering

stock_cluster <- kmeans(stock_date_matrix_tbl %>% select(-symbol), centers = 3, nstart = 20)
summary(stock_cluster)
##              Length Class  Mode   
## cluster       503   -none- numeric
## centers      3789   -none- numeric
## totss           1   -none- numeric
## withinss        3   -none- numeric
## tot.withinss    1   -none- numeric
## betweenss       1   -none- numeric
## size            3   -none- numeric
## iter            1   -none- numeric
## ifault          1   -none- numeric
tidy(stock_cluster) 
## # A tibble: 3 × 1,266
##   `2020-04-02` `2020-04-03` `2020-04-06` `2020-04-07` `2020-04-08` `2020-04-09`
##          <dbl>        <dbl>        <dbl>        <dbl>        <dbl>        <dbl>
## 1      0.0174       -0.0136       0.0648      0.00118       0.0386      0.0226 
## 2      0.0114       -0.0178       0.0916     -0.00234       0.0357      0.00833
## 3      0.00694      -0.0185       0.102       0.0307        0.0603      0.0379 
## # ℹ 1,260 more variables: `2020-04-13` <dbl>, `2020-04-14` <dbl>,
## #   `2020-04-15` <dbl>, `2020-04-16` <dbl>, `2020-04-17` <dbl>,
## #   `2020-04-20` <dbl>, `2020-04-21` <dbl>, `2020-04-22` <dbl>,
## #   `2020-04-23` <dbl>, `2020-04-24` <dbl>, `2020-04-27` <dbl>,
## #   `2020-04-28` <dbl>, `2020-04-29` <dbl>, `2020-04-30` <dbl>,
## #   `2020-05-01` <dbl>, `2020-05-04` <dbl>, `2020-05-05` <dbl>,
## #   `2020-05-06` <dbl>, `2020-05-07` <dbl>, `2020-05-08` <dbl>, …
glance(stock_cluster)
## # A tibble: 1 × 4
##   totss tot.withinss betweenss  iter
##   <dbl>        <dbl>     <dbl> <int>
## 1  206.         178.      28.0     3
augment(stock_cluster, stock_date_matrix_tbl) %>%
  
  ggplot(aes("2020-04-02","2020-08-20", color = .cluster)) +
  geom_point()

Select Optimal Number of Clusters

kclusters <- tibble(k = 1:9) %>%
  mutate(kclust = map(.x = k, .f = ~ kmeans(stock_date_matrix_tbl %>% select(-symbol), centers = .x, nstart = 20)),
         glanced = map(.x = kclust, .f = glance))

kclusters %>%
  unnest(glanced) %>%
  ggplot(aes(k, tot.withinss)) +
  geom_point() +
  geom_line()

stock_final_cluster <- kmeans(stock_date_matrix_tbl %>% select(-symbol), centers = 5, nstart = 20)
augment(stock_final_cluster, stock_date_matrix_tbl) %>%
  
  ggplot(aes(`2020-04-02`,`2020-08-20`, color = .cluster)) +
  geom_point()

Reduce Dimension Using Umap

stock_umap_results <- stock_date_matrix_tbl %>%
  select(-symbol) %>%
  umap()

umap_results_tbl <- stock_umap_results$layout %>%
  as.tibble() %>%
  bind_cols(stock_date_matrix_tbl %>% select(symbol))

umap_results_tbl
## # A tibble: 503 × 3
##         V1       V2 symbol
##      <dbl>    <dbl> <chr> 
##  1  2.34    1.51    A     
##  2  3.00    0.0493  AAPL  
##  3  0.0527  2.29    ABBV  
##  4  2.53   -0.862   ABNB  
##  5  1.01    1.79    ABT   
##  6 -1.23   -0.939   ACGL  
##  7  1.79    0.00816 ACN   
##  8  3.18   -0.167   ADBE  
##  9  3.02   -1.73    ADI   
## 10 -1.17   -0.545   ADM   
## # ℹ 493 more rows

Visulaizing Clusters by Adding K-Means Clusters

stock_kmeans_umap_tbl <- stock_final_cluster %>%
  augment(stock_date_matrix_tbl) %>%
  select(symbol, .cluster) %>%
  
  # Add Umap Results
  left_join(umap_results_tbl) %>%

  left_join(sp500_index_tbl %>% select(symbol, company, sector), by = "symbol") 

stock_kmeans_umap_tbl
## # A tibble: 503 × 6
##    symbol .cluster      V1       V2 company                   sector
##    <chr>  <fct>      <dbl>    <dbl> <chr>                     <chr> 
##  1 A      1         2.34    1.51    AGILENT TECHNOLOGIES INC  -     
##  2 AAPL   2         3.00    0.0493  APPLE INC                 -     
##  3 ABBV   5         0.0527  2.29    ABBVIE INC                -     
##  4 ABNB   2         2.53   -0.862   AIRBNB INC CLASS A        -     
##  5 ABT    5         1.01    1.79    ABBOTT LABORATORIES       -     
##  6 ACGL   1        -1.23   -0.939   ARCH CAPITAL GROUP LTD    -     
##  7 ACN    1         1.79    0.00816 ACCENTURE PLC CL A        -     
##  8 ADBE   2         3.18   -0.167   ADOBE INC                 -     
##  9 ADI    2         3.02   -1.73    ANALOG DEVICES INC        -     
## 10 ADM    5        -1.17   -0.545   ARCHER DANIELS MIDLAND CO -     
## # ℹ 493 more rows
graph <- stock_kmeans_umap_tbl %>%
  
  # Create text label
  mutate(text_label = str_glue("Ticker: {symbol}
                               Cluster: {.cluster}
                               Company: {company}
                               Sector:  {sector}")) %>%
  
  # Plot
  ggplot(aes(V1, V2, color = .cluster, text = text_label)) +
  geom_point()

graph %>% ggplotly(tooltip = "text")