sp500_index_tbl <- read_csv("../Apply9/sp500_index_tbl.csv")
## New names:
## Rows: 505 Columns: 9
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (6): symbol, company, identifier, sedol, sector, local_currency dbl (3): ...1,
## weight, shares_held
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
sp500_prices_tbl <- read_csv("../Apply9/sp500_prices_tbl.csv")
## New names:
## Rows: 502541 Columns: 9
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): symbol dbl (7): ...1, open, high, low, close, volume, adjusted date (1):
## date
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
sp500_index_tbl %>% glimpse()
## Rows: 505
## Columns: 9
## $ ...1 <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, …
## $ symbol <chr> "MSFT", "AAPL", "NVDA", "AMZN", "META", "GOOGL", "BRK-B…
## $ company <chr> "MICROSOFT CORP", "APPLE INC", "NVIDIA CORP", "AMAZON.C…
## $ identifier <chr> "594918104", "037833100", "67066G104", "023135106", "30…
## $ sedol <chr> "2588173", "2046251", "2379504", "2000019", "B7TL820", …
## $ weight <dbl> 0.070781439, 0.056357717, 0.050531591, 0.037332751, 0.0…
## $ sector <chr> "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", …
## $ shares_held <dbl> 89663945, 175158625, 29805583, 110304496, 26548208, 711…
## $ local_currency <chr> "USD", "USD", "USD", "USD", "USD", "USD", "USD", "USD",…
sp500_prices_tbl %>% glimpse()
## Rows: 502,541
## Columns: 9
## $ ...1 <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18…
## $ symbol <chr> "MSFT", "MSFT", "MSFT", "MSFT", "MSFT", "MSFT", "MSFT", "MSFT…
## $ date <date> 2020-04-01, 2020-04-02, 2020-04-03, 2020-04-06, 2020-04-07, …
## $ open <dbl> 153.00, 151.86, 155.10, 160.32, 169.59, 165.67, 166.36, 164.3…
## $ high <dbl> 157.75, 155.48, 157.38, 166.50, 170.00, 166.67, 167.37, 165.5…
## $ low <dbl> 150.82, 150.36, 152.19, 157.58, 163.26, 163.50, 163.33, 162.3…
## $ close <dbl> 152.11, 155.26, 153.83, 165.27, 163.49, 165.13, 165.14, 165.5…
## $ volume <dbl> 57969900, 49630700, 41243300, 67111700, 62769000, 48318200, 5…
## $ adjusted <dbl> 146.7080, 149.7461, 148.3670, 159.4007, 157.6839, 159.2657, 1…
skimr::skim(sp500_prices_tbl)
| Name | sp500_prices_tbl |
| Number of rows | 502541 |
| Number of columns | 9 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| Date | 1 |
| numeric | 7 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| symbol | 0 | 1 | 1 | 5 | 0 | 503 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| date | 0 | 1 | 2020-04-01 | 2024-03-28 | 2022-03-30 | 1005 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| …1 | 0 | 1 | 251271.00 | 145071.24 | 1.00 | 125636.00 | 251271.00 | 376906.00 | 502541.00 | ▇▇▇▇▇ |
| open | 0 | 1 | 168.49 | 301.27 | 3.22 | 56.36 | 102.67 | 194.50 | 8022.92 | ▇▁▁▁▁ |
| high | 0 | 1 | 170.56 | 305.00 | 3.29 | 57.07 | 104.01 | 196.78 | 8158.99 | ▇▁▁▁▁ |
| low | 0 | 1 | 166.41 | 297.74 | 3.02 | 55.63 | 101.34 | 192.05 | 8010.00 | ▇▁▁▁▁ |
| close | 0 | 1 | 168.52 | 301.44 | 3.12 | 56.36 | 102.70 | 194.47 | 8099.96 | ▇▁▁▁▁ |
| volume | 0 | 1 | 5240569.73 | 12254470.69 | 0.00 | 959100.00 | 2003500.00 | 4579100.00 | 666378600.00 | ▇▁▁▁▁ |
| adjusted | 0 | 1 | 164.56 | 300.98 | 2.96 | 53.17 | 98.79 | 188.76 | 8099.96 | ▇▁▁▁▁ |
#Question
Do some S&P 500 stocks move the same way?
sp500_prices_clean <- sp500_prices_tbl %>%
janitor::clean_names() %>%
select(symbol, date, adjusted)
sp500_returns_tbl <- sp500_prices_clean %>%
group_by(symbol) %>%
arrange(date) %>%
mutate(lag_adjusted = lag(adjusted)) %>%
ungroup() %>%
filter(!is.na(lag_adjusted)) %>%
mutate(pct_return = (adjusted - lag_adjusted) / lag_adjusted) %>%
select(symbol, date, pct_return)
sp500_returns_tbl
## # A tibble: 502,038 × 3
## symbol date pct_return
## <chr> <date> <dbl>
## 1 MSFT 2020-04-02 0.0207
## 2 AAPL 2020-04-02 0.0167
## 3 NVDA 2020-04-02 0.0510
## 4 AMZN 2020-04-02 0.00583
## 5 META 2020-04-02 -0.00883
## 6 GOOGL 2020-04-02 0.0135
## 7 BRK-B 2020-04-02 0.0215
## 8 GOOG 2020-04-02 0.0138
## 9 LLY 2020-04-02 0.0422
## 10 AVGO 2020-04-02 0.0601
## # ℹ 502,028 more rows
stock_date_matrix_tbl <- sp500_returns_tbl %>%
pivot_wider(names_from = date, values_from = pct_return, values_fill = 0)
stock_date_matrix_tbl
## # A tibble: 503 × 1,005
## symbol `2020-04-02` `2020-04-03` `2020-04-06` `2020-04-07` `2020-04-08`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 MSFT 0.0207 -0.00921 0.0744 -0.0108 0.0100
## 2 AAPL 0.0167 -0.0144 0.0872 -0.0116 0.0256
## 3 NVDA 0.0510 -0.0452 0.100 -0.0349 0.0306
## 4 AMZN 0.00583 -0.00638 0.0477 0.00701 0.0156
## 5 META -0.00883 -0.0253 0.0737 0.0198 0.0323
## 6 GOOGL 0.0135 -0.0218 0.0828 -0.000532 0.0207
## 7 BRK-B 0.0215 -0.00773 0.0387 0.0000540 0.0311
## 8 GOOG 0.0138 -0.0205 0.0811 -0.000345 0.0200
## 9 LLY 0.0422 -0.0177 0.0140 0.00191 0.0306
## 10 AVGO 0.0601 -0.0118 0.0776 0.00329 0.0309
## # ℹ 493 more rows
## # ℹ 999 more variables: `2020-04-09` <dbl>, `2020-04-13` <dbl>,
## # `2020-04-14` <dbl>, `2020-04-15` <dbl>, `2020-04-16` <dbl>,
## # `2020-04-17` <dbl>, `2020-04-20` <dbl>, `2020-04-21` <dbl>,
## # `2020-04-22` <dbl>, `2020-04-23` <dbl>, `2020-04-24` <dbl>,
## # `2020-04-27` <dbl>, `2020-04-28` <dbl>, `2020-04-29` <dbl>,
## # `2020-04-30` <dbl>, `2020-05-01` <dbl>, `2020-05-04` <dbl>, …
stock_cluster <- kmeans(stock_date_matrix_tbl %>%
select(-symbol), centers = 3, nstart = 20)
summary(stock_cluster)
## Length Class Mode
## cluster 503 -none- numeric
## centers 3012 -none- numeric
## totss 1 -none- numeric
## withinss 3 -none- numeric
## tot.withinss 1 -none- numeric
## betweenss 1 -none- numeric
## size 3 -none- numeric
## iter 1 -none- numeric
## ifault 1 -none- numeric
tidy(stock_cluster)
## # A tibble: 3 × 1,007
## `2020-04-02` `2020-04-03` `2020-04-06` `2020-04-07` `2020-04-08` `2020-04-09`
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0127 -0.0161 0.0931 -0.00113 0.0381 0.00835
## 2 0.00624 -0.0191 0.101 0.0310 0.0609 0.0366
## 3 0.0180 -0.0139 0.0645 0.000615 0.0385 0.0236
## # ℹ 1,001 more variables: `2020-04-13` <dbl>, `2020-04-14` <dbl>,
## # `2020-04-15` <dbl>, `2020-04-16` <dbl>, `2020-04-17` <dbl>,
## # `2020-04-20` <dbl>, `2020-04-21` <dbl>, `2020-04-22` <dbl>,
## # `2020-04-23` <dbl>, `2020-04-24` <dbl>, `2020-04-27` <dbl>,
## # `2020-04-28` <dbl>, `2020-04-29` <dbl>, `2020-04-30` <dbl>,
## # `2020-05-01` <dbl>, `2020-05-04` <dbl>, `2020-05-05` <dbl>,
## # `2020-05-06` <dbl>, `2020-05-07` <dbl>, `2020-05-08` <dbl>, …
glance(stock_cluster)
## # A tibble: 1 × 4
## totss tot.withinss betweenss iter
## <dbl> <dbl> <dbl> <int>
## 1 163. 139. 24.4 3
augment(stock_cluster, stock_date_matrix_tbl) %>%
ggplot(aes(`2020-04-02`, `2020-04-03`, color = .cluster)) +
geom_point()
kclusts <- tibble(k = 1:9) %>%
mutate(kclust = map(.x = k, .f = ~ kmeans(stock_date_matrix_tbl %>%
select(-symbol), centers = .x, nstart = 20)),
glanced = map(.x = kclust, .f = glance))
kclusts %>%
unnest(glanced) %>%
ggplot(aes(k, tot.withinss)) +
geom_point() +
geom_line()
final_cluster <- kmeans(stock_date_matrix_tbl %>%
select(-symbol), centers = 5, nstart = 20)
augment(final_cluster, stock_date_matrix_tbl) %>%
ggplot(aes(`2020-04-02`, `2020-04-03`, color = .cluster)) +
geom_point()
umap_results <- stock_date_matrix_tbl %>%
select(-symbol) %>%
umap()
umap_results_tbl <- umap_results$layout %>%
as_tibble() %>%
bind_cols(stock_date_matrix_tbl %>% select(symbol))
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
## `.name_repair` is omitted as of tibble 2.0.0.
## ℹ Using compatibility `.name_repair`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
umap_results_tbl
## # A tibble: 503 × 3
## V1 V2 symbol
## <dbl> <dbl> <chr>
## 1 3.06 1.28 MSFT
## 2 2.82 1.13 AAPL
## 3 3.20 2.00 NVDA
## 4 3.02 1.21 AMZN
## 5 2.89 1.28 META
## 6 2.87 1.17 GOOGL
## 7 0.205 0.0709 BRK-B
## 8 2.84 1.08 GOOG
## 9 0.862 -1.91 LLY
## 10 2.83 2.21 AVGO
## # ℹ 493 more rows
umap_results_tbl %>%
ggplot(aes(V1, V2)) +
geom_point()
kmeans_umap_tbl <- final_cluster %>%
augment(stock_date_matrix_tbl) %>%
select(symbol, .cluster) %>%
left_join(umap_results_tbl, by = "symbol") %>%
left_join(sp500_index_tbl %>%
janitor::clean_names() %>%
select(symbol, company, sector), by = "symbol")
kmeans_umap_tbl
## # A tibble: 503 × 6
## symbol .cluster V1 V2 company sector
## <chr> <fct> <dbl> <dbl> <chr> <chr>
## 1 MSFT 3 3.06 1.28 MICROSOFT CORP -
## 2 AAPL 3 2.82 1.13 APPLE INC -
## 3 NVDA 3 3.20 2.00 NVIDIA CORP -
## 4 AMZN 3 3.02 1.21 AMAZON.COM INC -
## 5 META 3 2.89 1.28 META PLATFORMS INC CLASS A -
## 6 GOOGL 3 2.87 1.17 ALPHABET INC CL A -
## 7 BRK-B 1 0.205 0.0709 BERKSHIRE HATHAWAY INC CL B -
## 8 GOOG 3 2.84 1.08 ALPHABET INC CL C -
## 9 LLY 1 0.862 -1.91 ELI LILLY + CO -
## 10 AVGO 3 2.83 2.21 BROADCOM INC -
## # ℹ 493 more rows
g <- kmeans_umap_tbl %>%
mutate(text_label = str_glue("Symbol: {symbol}
Company: {company}
Sector: {sector}
Cluster: {.cluster}")) %>%
ggplot(aes(V1, V2, color = .cluster, text = text_label)) +
geom_point()
g %>% ggplotly(tooltip = "text")
This analysis shows that some S&P 500 stocks do move in similar ways. By using daily returns, the stocks were compared based on movement instead of price. The k-means clustering grouped similar stocks together, and the UMAP graph made those groups easier to see. Overall, clustering helps show which stocks have similar return patterns in the market.