sp500_index_tbl  <- read_csv("../Apply9/sp500_index_tbl.csv")
## New names:
## Rows: 505 Columns: 9
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (6): symbol, company, identifier, sedol, sector, local_currency dbl (3): ...1,
## weight, shares_held
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
sp500_prices_tbl <- read_csv("../Apply9/sp500_prices_tbl.csv")
## New names:
## Rows: 502541 Columns: 9
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): symbol dbl (7): ...1, open, high, low, close, volume, adjusted date (1):
## date
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
sp500_index_tbl %>% glimpse()
## Rows: 505
## Columns: 9
## $ ...1           <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, …
## $ symbol         <chr> "MSFT", "AAPL", "NVDA", "AMZN", "META", "GOOGL", "BRK-B…
## $ company        <chr> "MICROSOFT CORP", "APPLE INC", "NVIDIA CORP", "AMAZON.C…
## $ identifier     <chr> "594918104", "037833100", "67066G104", "023135106", "30…
## $ sedol          <chr> "2588173", "2046251", "2379504", "2000019", "B7TL820", …
## $ weight         <dbl> 0.070781439, 0.056357717, 0.050531591, 0.037332751, 0.0…
## $ sector         <chr> "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", …
## $ shares_held    <dbl> 89663945, 175158625, 29805583, 110304496, 26548208, 711…
## $ local_currency <chr> "USD", "USD", "USD", "USD", "USD", "USD", "USD", "USD",…
sp500_prices_tbl %>% glimpse()
## Rows: 502,541
## Columns: 9
## $ ...1     <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18…
## $ symbol   <chr> "MSFT", "MSFT", "MSFT", "MSFT", "MSFT", "MSFT", "MSFT", "MSFT…
## $ date     <date> 2020-04-01, 2020-04-02, 2020-04-03, 2020-04-06, 2020-04-07, …
## $ open     <dbl> 153.00, 151.86, 155.10, 160.32, 169.59, 165.67, 166.36, 164.3…
## $ high     <dbl> 157.75, 155.48, 157.38, 166.50, 170.00, 166.67, 167.37, 165.5…
## $ low      <dbl> 150.82, 150.36, 152.19, 157.58, 163.26, 163.50, 163.33, 162.3…
## $ close    <dbl> 152.11, 155.26, 153.83, 165.27, 163.49, 165.13, 165.14, 165.5…
## $ volume   <dbl> 57969900, 49630700, 41243300, 67111700, 62769000, 48318200, 5…
## $ adjusted <dbl> 146.7080, 149.7461, 148.3670, 159.4007, 157.6839, 159.2657, 1…
skimr::skim(sp500_prices_tbl)
Data summary
Name sp500_prices_tbl
Number of rows 502541
Number of columns 9
_______________________
Column type frequency:
character 1
Date 1
numeric 7
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
symbol 0 1 1 5 0 503 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
date 0 1 2020-04-01 2024-03-28 2022-03-30 1005

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
…1 0 1 251271.00 145071.24 1.00 125636.00 251271.00 376906.00 502541.00 ▇▇▇▇▇
open 0 1 168.49 301.27 3.22 56.36 102.67 194.50 8022.92 ▇▁▁▁▁
high 0 1 170.56 305.00 3.29 57.07 104.01 196.78 8158.99 ▇▁▁▁▁
low 0 1 166.41 297.74 3.02 55.63 101.34 192.05 8010.00 ▇▁▁▁▁
close 0 1 168.52 301.44 3.12 56.36 102.70 194.47 8099.96 ▇▁▁▁▁
volume 0 1 5240569.73 12254470.69 0.00 959100.00 2003500.00 4579100.00 666378600.00 ▇▁▁▁▁
adjusted 0 1 164.56 300.98 2.96 53.17 98.79 188.76 8099.96 ▇▁▁▁▁

#Question

Do some S&P 500 stocks move the same way?

Convert Data To Standardized Form

sp500_prices_clean <- sp500_prices_tbl %>%
  janitor::clean_names() %>%
  select(symbol, date, adjusted)

sp500_returns_tbl <- sp500_prices_clean %>%
  group_by(symbol) %>%
  arrange(date) %>%
  mutate(lag_adjusted = lag(adjusted)) %>%
  ungroup() %>%
  filter(!is.na(lag_adjusted)) %>%
  mutate(pct_return = (adjusted - lag_adjusted) / lag_adjusted) %>%
  select(symbol, date, pct_return)

sp500_returns_tbl
## # A tibble: 502,038 × 3
##    symbol date       pct_return
##    <chr>  <date>          <dbl>
##  1 MSFT   2020-04-02    0.0207 
##  2 AAPL   2020-04-02    0.0167 
##  3 NVDA   2020-04-02    0.0510 
##  4 AMZN   2020-04-02    0.00583
##  5 META   2020-04-02   -0.00883
##  6 GOOGL  2020-04-02    0.0135 
##  7 BRK-B  2020-04-02    0.0215 
##  8 GOOG   2020-04-02    0.0138 
##  9 LLY    2020-04-02    0.0422 
## 10 AVGO   2020-04-02    0.0601 
## # ℹ 502,028 more rows

Spread to Object Characteristics Format

stock_date_matrix_tbl <- sp500_returns_tbl %>%
  pivot_wider(names_from = date, values_from = pct_return, values_fill = 0)

stock_date_matrix_tbl
## # A tibble: 503 × 1,005
##    symbol `2020-04-02` `2020-04-03` `2020-04-06` `2020-04-07` `2020-04-08`
##    <chr>         <dbl>        <dbl>        <dbl>        <dbl>        <dbl>
##  1 MSFT        0.0207      -0.00921       0.0744   -0.0108          0.0100
##  2 AAPL        0.0167      -0.0144        0.0872   -0.0116          0.0256
##  3 NVDA        0.0510      -0.0452        0.100    -0.0349          0.0306
##  4 AMZN        0.00583     -0.00638       0.0477    0.00701         0.0156
##  5 META       -0.00883     -0.0253        0.0737    0.0198          0.0323
##  6 GOOGL       0.0135      -0.0218        0.0828   -0.000532        0.0207
##  7 BRK-B       0.0215      -0.00773       0.0387    0.0000540       0.0311
##  8 GOOG        0.0138      -0.0205        0.0811   -0.000345        0.0200
##  9 LLY         0.0422      -0.0177        0.0140    0.00191         0.0306
## 10 AVGO        0.0601      -0.0118        0.0776    0.00329         0.0309
## # ℹ 493 more rows
## # ℹ 999 more variables: `2020-04-09` <dbl>, `2020-04-13` <dbl>,
## #   `2020-04-14` <dbl>, `2020-04-15` <dbl>, `2020-04-16` <dbl>,
## #   `2020-04-17` <dbl>, `2020-04-20` <dbl>, `2020-04-21` <dbl>,
## #   `2020-04-22` <dbl>, `2020-04-23` <dbl>, `2020-04-24` <dbl>,
## #   `2020-04-27` <dbl>, `2020-04-28` <dbl>, `2020-04-29` <dbl>,
## #   `2020-04-30` <dbl>, `2020-05-01` <dbl>, `2020-05-04` <dbl>, …

Perform K Means Clustering

stock_cluster <- kmeans(stock_date_matrix_tbl %>%
  select(-symbol), centers = 3, nstart = 20)

summary(stock_cluster)
##              Length Class  Mode   
## cluster       503   -none- numeric
## centers      3012   -none- numeric
## totss           1   -none- numeric
## withinss        3   -none- numeric
## tot.withinss    1   -none- numeric
## betweenss       1   -none- numeric
## size            3   -none- numeric
## iter            1   -none- numeric
## ifault          1   -none- numeric
tidy(stock_cluster)
## # A tibble: 3 × 1,007
##   `2020-04-02` `2020-04-03` `2020-04-06` `2020-04-07` `2020-04-08` `2020-04-09`
##          <dbl>        <dbl>        <dbl>        <dbl>        <dbl>        <dbl>
## 1      0.0127       -0.0161       0.0931    -0.00113        0.0381      0.00835
## 2      0.00624      -0.0191       0.101      0.0310         0.0609      0.0366 
## 3      0.0180       -0.0139       0.0645     0.000615       0.0385      0.0236 
## # ℹ 1,001 more variables: `2020-04-13` <dbl>, `2020-04-14` <dbl>,
## #   `2020-04-15` <dbl>, `2020-04-16` <dbl>, `2020-04-17` <dbl>,
## #   `2020-04-20` <dbl>, `2020-04-21` <dbl>, `2020-04-22` <dbl>,
## #   `2020-04-23` <dbl>, `2020-04-24` <dbl>, `2020-04-27` <dbl>,
## #   `2020-04-28` <dbl>, `2020-04-29` <dbl>, `2020-04-30` <dbl>,
## #   `2020-05-01` <dbl>, `2020-05-04` <dbl>, `2020-05-05` <dbl>,
## #   `2020-05-06` <dbl>, `2020-05-07` <dbl>, `2020-05-08` <dbl>, …
glance(stock_cluster)
## # A tibble: 1 × 4
##   totss tot.withinss betweenss  iter
##   <dbl>        <dbl>     <dbl> <int>
## 1  163.         139.      24.4     3
augment(stock_cluster, stock_date_matrix_tbl) %>%
  ggplot(aes(`2020-04-02`, `2020-04-03`, color = .cluster)) +
  geom_point()

Select Optimal Number of Clusters

kclusts <- tibble(k = 1:9) %>%
  mutate(kclust = map(.x = k, .f = ~ kmeans(stock_date_matrix_tbl %>%
    select(-symbol), centers = .x, nstart = 20)),
    glanced = map(.x = kclust, .f = glance))

kclusts %>%
  unnest(glanced) %>%
  ggplot(aes(k, tot.withinss)) +
  geom_point() +
  geom_line()

final_cluster <- kmeans(stock_date_matrix_tbl %>%
  select(-symbol), centers = 5, nstart = 20)

augment(final_cluster, stock_date_matrix_tbl) %>%
  ggplot(aes(`2020-04-02`, `2020-04-03`, color = .cluster)) +
  geom_point()

Reduce Dimensions Using UMAP

umap_results <- stock_date_matrix_tbl %>%
  select(-symbol) %>%
  umap()

umap_results_tbl <- umap_results$layout %>%
  as_tibble() %>%
  bind_cols(stock_date_matrix_tbl %>% select(symbol))
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
## `.name_repair` is omitted as of tibble 2.0.0.
## ℹ Using compatibility `.name_repair`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
umap_results_tbl
## # A tibble: 503 × 3
##       V1      V2 symbol
##    <dbl>   <dbl> <chr> 
##  1 3.06   1.28   MSFT  
##  2 2.82   1.13   AAPL  
##  3 3.20   2.00   NVDA  
##  4 3.02   1.21   AMZN  
##  5 2.89   1.28   META  
##  6 2.87   1.17   GOOGL 
##  7 0.205  0.0709 BRK-B 
##  8 2.84   1.08   GOOG  
##  9 0.862 -1.91   LLY   
## 10 2.83   2.21   AVGO  
## # ℹ 493 more rows
umap_results_tbl %>%
  ggplot(aes(V1, V2)) +
  geom_point()

Visualize Clusters

kmeans_umap_tbl <- final_cluster %>%
  augment(stock_date_matrix_tbl) %>%
  select(symbol, .cluster) %>%
  
  left_join(umap_results_tbl, by = "symbol") %>%
  
  left_join(sp500_index_tbl %>%
    janitor::clean_names() %>%
    select(symbol, company, sector), by = "symbol")

kmeans_umap_tbl
## # A tibble: 503 × 6
##    symbol .cluster    V1      V2 company                     sector
##    <chr>  <fct>    <dbl>   <dbl> <chr>                       <chr> 
##  1 MSFT   3        3.06   1.28   MICROSOFT CORP              -     
##  2 AAPL   3        2.82   1.13   APPLE INC                   -     
##  3 NVDA   3        3.20   2.00   NVIDIA CORP                 -     
##  4 AMZN   3        3.02   1.21   AMAZON.COM INC              -     
##  5 META   3        2.89   1.28   META PLATFORMS INC CLASS A  -     
##  6 GOOGL  3        2.87   1.17   ALPHABET INC CL A           -     
##  7 BRK-B  1        0.205  0.0709 BERKSHIRE HATHAWAY INC CL B -     
##  8 GOOG   3        2.84   1.08   ALPHABET INC CL C           -     
##  9 LLY    1        0.862 -1.91   ELI LILLY + CO              -     
## 10 AVGO   3        2.83   2.21   BROADCOM INC                -     
## # ℹ 493 more rows
g <- kmeans_umap_tbl %>%
  
  mutate(text_label = str_glue("Symbol:     {symbol}
                                Company:    {company}
                                Sector:     {sector}
                                Cluster:    {.cluster}")) %>%
  
  ggplot(aes(V1, V2, color = .cluster, text = text_label)) +
  geom_point()

g %>% ggplotly(tooltip = "text")

Conclusion

This analysis shows that some S&P 500 stocks do move in similar ways. By using daily returns, the stocks were compared based on movement instead of price. The k-means clustering grouped similar stocks together, and the UMAP graph made those groups easier to see. Overall, clustering helps show which stocks have similar return patterns in the market.