library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyquant) # for financial analysis
## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'xts'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## 
## Attaching package: 'PerformanceAnalytics'
## 
## The following object is masked from 'package:graphics':
## 
##     legend
## 
## Loading required package: quantmod
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(broom) # for tidy model results
library(umap)  # for dimension reduction
## Warning: package 'umap' was built under R version 4.3.3
library(plotly) # for interactive visualization
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
# Get info on companies listed in S&P500
sp500_index_tbll <- tq_index("SP500")
## Getting holdings for SP500
# Get individual stocks from S&P500
sp500_symbols <- sp500_index_tbll %>% distinct(symbol) %>% pull() 

# Get stock prices of the companies
sp500_prices_tbll <- tq_get(sp500_symbols, from = "2020-04-01")
## Warning: There was 1 warning in `dplyr::mutate()`.
## ℹ In argument: `data.. = purrr::map(...)`.
## Caused by warning:
## ! x = '-', get = 'stock.prices': Error in getSymbols.yahoo(Symbols = "-", env = <environment>, verbose = FALSE, : Unable to import "-".
## HTTP error 404.
##  Removing -.
sp500_index_tbl <- read_csv("sp500_index_tbl.csv")
## New names:
## Rows: 505 Columns: 9
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (6): symbol, company, identifier, sedol, sector, local_currency dbl (3): ...1,
## weight, shares_held
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
sp500_prices_tbl <- read_csv("sp500_prices_tbl.csv")
## New names:
## Rows: 502541 Columns: 9
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): symbol dbl (7): ...1, open, high, low, close, volume, adjusted date (1):
## date
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
sp500_index_tbl %>% glimpse()
## Rows: 505
## Columns: 9
## $ ...1           <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, …
## $ symbol         <chr> "MSFT", "AAPL", "NVDA", "AMZN", "META", "GOOGL", "BRK-B…
## $ company        <chr> "MICROSOFT CORP", "APPLE INC", "NVIDIA CORP", "AMAZON.C…
## $ identifier     <chr> "594918104", "037833100", "67066G104", "023135106", "30…
## $ sedol          <chr> "2588173", "2046251", "2379504", "2000019", "B7TL820", …
## $ weight         <dbl> 0.070781439, 0.056357717, 0.050531591, 0.037332751, 0.0…
## $ sector         <chr> "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", …
## $ shares_held    <dbl> 89663945, 175158625, 29805583, 110304496, 26548208, 711…
## $ local_currency <chr> "USD", "USD", "USD", "USD", "USD", "USD", "USD", "USD",…
sp500_prices_tbl %>% glimpse()
## Rows: 502,541
## Columns: 9
## $ ...1     <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18…
## $ symbol   <chr> "MSFT", "MSFT", "MSFT", "MSFT", "MSFT", "MSFT", "MSFT", "MSFT…
## $ date     <date> 2020-04-01, 2020-04-02, 2020-04-03, 2020-04-06, 2020-04-07, …
## $ open     <dbl> 153.00, 151.86, 155.10, 160.32, 169.59, 165.67, 166.36, 164.3…
## $ high     <dbl> 157.75, 155.48, 157.38, 166.50, 170.00, 166.67, 167.37, 165.5…
## $ low      <dbl> 150.82, 150.36, 152.19, 157.58, 163.26, 163.50, 163.33, 162.3…
## $ close    <dbl> 152.11, 155.26, 153.83, 165.27, 163.49, 165.13, 165.14, 165.5…
## $ volume   <dbl> 57969900, 49630700, 41243300, 67111700, 62769000, 48318200, 5…
## $ adjusted <dbl> 146.7080, 149.7461, 148.3670, 159.4007, 157.6839, 159.2657, 1…

#Question Which stock prices behave similarly?

Our main objective is to identify stocks that exhibit similar price behaviors over time. By doing so, we aim to gain insights into the relationships between different companies, uncovering potential competitors and sector affiliations.

Why It Matters Understanding which companies are related is crucial for various reasons:

Identifying Relationships: Clustering similar stocks reveals underlying connections and dependencies in the market. Competitor Analysis: Discovering competitors and sector affiliations aids in strategic planning and market positioning. Market Dynamics: Analyzing stock price similarities provides valuable insights into market dynamics, benefiting diverse fields such as finance, sales, and marketing. Assignment Details Your task is to analyze the historical price data of various stocks and determine which stocks behave similarly. We will employ clustering techniques to accomplish this task effectively.

After looking at the cluster graph, a main argument I see is that many company’s, or company’s around the same field of work and are grouped together. This could indicate competitors in that field.

1 Convert data to standardized form

To compare data effectively, it must be standardized or normalized. Why? Because comparing values (like stock prices) of vastly different magnitudes is impractical. So, we’ll standardize by converting from adjusted stock price (in dollars) to daily returns (as percent change from the previous day). Here’s the formula:

sp500_prices_tbl %>% glimpse()
## Rows: 502,541
## Columns: 9
## $ ...1     <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18…
## $ symbol   <chr> "MSFT", "MSFT", "MSFT", "MSFT", "MSFT", "MSFT", "MSFT", "MSFT…
## $ date     <date> 2020-04-01, 2020-04-02, 2020-04-03, 2020-04-06, 2020-04-07, …
## $ open     <dbl> 153.00, 151.86, 155.10, 160.32, 169.59, 165.67, 166.36, 164.3…
## $ high     <dbl> 157.75, 155.48, 157.38, 166.50, 170.00, 166.67, 167.37, 165.5…
## $ low      <dbl> 150.82, 150.36, 152.19, 157.58, 163.26, 163.50, 163.33, 162.3…
## $ close    <dbl> 152.11, 155.26, 153.83, 165.27, 163.49, 165.13, 165.14, 165.5…
## $ volume   <dbl> 57969900, 49630700, 41243300, 67111700, 62769000, 48318200, 5…
## $ adjusted <dbl> 146.7080, 149.7461, 148.3670, 159.4007, 157.6839, 159.2657, 1…
# Apply your data transformation skills!
sp_500_daily_returns_tbl <- sp500_prices_tbl %>%
    
    select(symbol, date, adjusted) %>%
    
    filter(date >= ymd("2018-01-01")) %>%
    
    group_by(symbol) %>%
    mutate(lag_1 = lag(adjusted)) %>%
    ungroup() %>%
    
    filter(!is.na(lag_1)) %>%
    
    mutate(diff = adjusted - lag_1) %>%
    mutate(pct_return = diff / lag_1) %>%
    
    select(symbol, date, pct_return)

sp_500_daily_returns_tbl
## # A tibble: 502,038 × 3
##    symbol date       pct_return
##    <chr>  <date>          <dbl>
##  1 MSFT   2020-04-02  0.0207   
##  2 MSFT   2020-04-03 -0.00921  
##  3 MSFT   2020-04-06  0.0744   
##  4 MSFT   2020-04-07 -0.0108   
##  5 MSFT   2020-04-08  0.0100   
##  6 MSFT   2020-04-09  0.0000605
##  7 MSFT   2020-04-13  0.00224  
##  8 MSFT   2020-04-14  0.0495   
##  9 MSFT   2020-04-15 -0.0105   
## 10 MSFT   2020-04-16  0.0300   
## # ℹ 502,028 more rows

2 Spread to object-characteristics format

We’ll convert the daily returns (percentage change from one day to the next) to object-characteristics format, also known as the user-item format. Users are identified by the symbol (company), and items are represented by the pct_return at each date.

stock_date_matrix_tbl <- sp_500_daily_returns_tbl %>%
    spread(key = date, value = pct_return, fill = 0)

stock_date_matrix_tbl
## # A tibble: 503 × 1,005
##    symbol `2020-04-02` `2020-04-03` `2020-04-06` `2020-04-07` `2020-04-08`
##    <chr>         <dbl>        <dbl>        <dbl>        <dbl>        <dbl>
##  1 A           0.0489     -0.0259         0.0559     -0.00444       0.0359
##  2 AAL        -0.0589     -0.0666         0.0117      0.0758        0.109 
##  3 AAPL        0.0167     -0.0144         0.0872     -0.0116        0.0256
##  4 ABBV        0.0233     -0.0234         0.0322     -0.00449       0.0420
##  5 ABNB        0           0              0           0             0     
##  6 ABT         0.0375      0.000126       0.0413     -0.00967       0.0369
##  7 ACGL        0.0115     -0.0650         0.0983      0.0314        0.0291
##  8 ACN         0.0103     -0.0264         0.0914     -0.0116        0.0464
##  9 ADBE        0.00913    -0.0341         0.0869     -0.0320        0.0267
## 10 ADI         0.0429     -0.0130         0.107       0.00470       0.0535
## # ℹ 493 more rows
## # ℹ 999 more variables: `2020-04-09` <dbl>, `2020-04-13` <dbl>,
## #   `2020-04-14` <dbl>, `2020-04-15` <dbl>, `2020-04-16` <dbl>,
## #   `2020-04-17` <dbl>, `2020-04-20` <dbl>, `2020-04-21` <dbl>,
## #   `2020-04-22` <dbl>, `2020-04-23` <dbl>, `2020-04-24` <dbl>,
## #   `2020-04-27` <dbl>, `2020-04-28` <dbl>, `2020-04-29` <dbl>,
## #   `2020-04-30` <dbl>, `2020-05-01` <dbl>, `2020-05-04` <dbl>, …

3 Perfrom k-means clustering

occupation_cluster <- kmeans(stock_date_matrix_tbl %>% select(-symbol), centers = 3, nstart = 20)

summary(occupation_cluster)
##              Length Class  Mode   
## cluster       503   -none- numeric
## centers      3012   -none- numeric
## totss           1   -none- numeric
## withinss        3   -none- numeric
## tot.withinss    1   -none- numeric
## betweenss       1   -none- numeric
## size            3   -none- numeric
## iter            1   -none- numeric
## ifault          1   -none- numeric
tidy(occupation_cluster)
## # A tibble: 3 × 1,007
##   `2020-04-02` `2020-04-03` `2020-04-06` `2020-04-07` `2020-04-08` `2020-04-09`
##          <dbl>        <dbl>        <dbl>        <dbl>        <dbl>        <dbl>
## 1      0.0180       -0.0139       0.0645     0.000615       0.0385      0.0236 
## 2      0.00624      -0.0191       0.101      0.0310         0.0609      0.0366 
## 3      0.0127       -0.0161       0.0931    -0.00113        0.0381      0.00835
## # ℹ 1,001 more variables: `2020-04-13` <dbl>, `2020-04-14` <dbl>,
## #   `2020-04-15` <dbl>, `2020-04-16` <dbl>, `2020-04-17` <dbl>,
## #   `2020-04-20` <dbl>, `2020-04-21` <dbl>, `2020-04-22` <dbl>,
## #   `2020-04-23` <dbl>, `2020-04-24` <dbl>, `2020-04-27` <dbl>,
## #   `2020-04-28` <dbl>, `2020-04-29` <dbl>, `2020-04-30` <dbl>,
## #   `2020-05-01` <dbl>, `2020-05-04` <dbl>, `2020-05-05` <dbl>,
## #   `2020-05-06` <dbl>, `2020-05-07` <dbl>, `2020-05-08` <dbl>, …
glance(occupation_cluster)
## # A tibble: 1 × 4
##   totss tot.withinss betweenss  iter
##   <dbl>        <dbl>     <dbl> <int>
## 1  163.         139.      24.4     4
augment(occupation_cluster, stock_date_matrix_tbl) %>%
  
  ggplot(aes('2020-04-02', '2020-04-03', color = .cluster)) +
  geom_point()

4 Select optimal number of clusters

kclusts <- tibble(k = 1:9) %>%
  mutate(kclust = map(.x = k, .f = ~ kmeans(stock_date_matrix_tbl %>% select(-symbol), centers = .x, nstart = 20)),
         glanced = map(.x = kclust, .f = glance))

kclusts %>%
  unnest(glanced) %>%
  ggplot(aes(k, tot.withinss)) +
  geom_point() +
  geom_line()

final_cluster <- kmeans(stock_date_matrix_tbl %>% select(-symbol), centers = 5, nstart = 20)
augment(final_cluster, stock_date_matrix_tbl) %>%
  
  ggplot(aes('2024-03-28', '2020-04-02', color = .cluster)) +
  geom_point()

5 Reduce dimension using UMAP

umap_results <- stock_date_matrix_tbl %>% 
  select(-symbol) %>%
  umap()

umap_results_tb1 <- umap_results$layout %>%
  as.tibble() %>%
  bind_cols(stock_date_matrix_tbl %>% select(symbol))
## Warning: `as.tibble()` was deprecated in tibble 2.0.0.
## ℹ Please use `as_tibble()` instead.
## ℹ The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
## `.name_repair` is omitted as of tibble 2.0.0.
## ℹ Using compatibility `.name_repair`.
## ℹ The deprecated feature was likely used in the tibble package.
##   Please report the issue at <https://github.com/tidyverse/tibble/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
umap_results_tb1
## # A tibble: 503 × 3
##        V1     V2 symbol
##     <dbl>  <dbl> <chr> 
##  1  3.56   1.12  A     
##  2 -2.65   0.367 AAL   
##  3  2.78   2.67  AAPL  
##  4  2.49  -1.14  ABBV  
##  5  1.04   2.19  ABNB  
##  6  3.12   0.523 ABT   
##  7 -0.759 -0.580 ACGL  
##  8  1.97   1.98  ACN   
##  9  2.63   2.66  ADBE  
## 10  1.52   3.44  ADI   
## # ℹ 493 more rows
umap_results_tb1 %>% 
  ggplot(aes(V1, V2)) +
  geom_point()

#6 Visualize cluster by adding k-means results

kmeans_umap_tb1 <- final_cluster %>%
  augment(stock_date_matrix_tbl) %>%
  select(symbol, .cluster) %>%
  
  # Add umap results
  left_join(umap_results_tb1) %>%
  
  # Add employment info 
  left_join(sp500_index_tbl %>%
            select(symbol, company, sector),
            by = "symbol")
## Joining with `by = join_by(symbol)`
kmeans_umap_tb1
## # A tibble: 503 × 6
##    symbol .cluster     V1     V2 company                     sector
##    <chr>  <fct>     <dbl>  <dbl> <chr>                       <chr> 
##  1 A      2         3.56   1.12  AGILENT TECHNOLOGIES INC    -     
##  2 AAL    4        -2.65   0.367 AMERICAN AIRLINES GROUP INC -     
##  3 AAPL   2         2.78   2.67  APPLE INC                   -     
##  4 ABBV   5         2.49  -1.14  ABBVIE INC                  -     
##  5 ABNB   2         1.04   2.19  AIRBNB INC CLASS A          -     
##  6 ABT    5         3.12   0.523 ABBOTT LABORATORIES         -     
##  7 ACGL   1        -0.759 -0.580 ARCH CAPITAL GROUP LTD      -     
##  8 ACN    1         1.97   1.98  ACCENTURE PLC CL A          -     
##  9 ADBE   2         2.63   2.66  ADOBE INC                   -     
## 10 ADI    2         1.52   3.44  ANALOG DEVICES INC          -     
## # ℹ 493 more rows
g <- kmeans_umap_tb1 %>%

mutate(text_label = str_glue("Stock: {symbol}
                               Cluster: {.cluster}
                               Company: {company}
                               Sector: {sector}")) %>%
    
    ggplot(aes(V1, V2, color = .cluster, text = text_label)) +
  geom_point()