library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.1
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.3.0 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(tidyquant)
## Loading required package: lubridate
## Loading required package: timechange
##
## Attaching package: 'lubridate'
##
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
##
## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
##
## Attaching package: 'xts'
##
## The following objects are masked from 'package:dplyr':
##
## first, last
##
##
## Attaching package: 'PerformanceAnalytics'
##
## The following object is masked from 'package:graphics':
##
## legend
##
## Loading required package: quantmod
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
big_tech_stock_prices <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-02-07/big_tech_stock_prices.csv')
## Rows: 45088 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): stock_symbol
## dbl (6): open, high, low, close, adj_close, volume
## date (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
big_tech_companies <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-02-07/big_tech_companies.csv')
## Rows: 14 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): stock_symbol, company
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Explore data
big_tech_stock_prices %>% count(stock_symbol) # META and TSLA have smaller number of rows
## # A tibble: 14 × 2
## stock_symbol n
## <chr> <int>
## 1 AAPL 3271
## 2 ADBE 3271
## 3 AMZN 3271
## 4 CRM 3271
## 5 CSCO 3271
## 6 GOOGL 3271
## 7 IBM 3271
## 8 INTC 3271
## 9 META 2688
## 10 MSFT 3271
## 11 NFLX 3271
## 12 NVDA 3271
## 13 ORCL 3271
## 14 TSLA 3148
big_tech_stock_prices %>%
group_by(stock_symbol) %>%
slice_max(desc(date), n = 1) # META starts in 2012-05-18 and TSLA in 2010-06-30
## # A tibble: 14 × 8
## # Groups: stock_symbol [14]
## stock_symbol date open high low close adj_close volume
## <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AAPL 2010-01-04 7.62 7.66 7.58 7.64 6.52 493729600
## 2 ADBE 2010-01-04 36.7 37.3 36.7 37.1 37.1 4710200
## 3 AMZN 2010-01-04 6.81 6.83 6.66 6.70 6.70 151998000
## 4 CRM 2010-01-04 18.7 18.9 18.5 18.7 18.7 7906000
## 5 CSCO 2010-01-04 24.1 24.8 24.0 24.7 17.4 59853700
## 6 GOOGL 2010-01-04 15.7 15.8 15.6 15.7 15.7 78169752
## 7 IBM 2010-01-04 125. 127. 125. 127. 81.3 6438444
## 8 INTC 2010-01-04 20.8 21.0 20.7 20.9 14.0 47800900
## 9 META 2012-05-18 42.0 45 38 38.2 38.2 573576400
## 10 MSFT 2010-01-04 30.6 31.1 30.6 31.0 23.7 38409100
## 11 NFLX 2010-01-04 7.93 7.96 7.57 7.64 7.64 17239600
## 12 NVDA 2010-01-04 4.63 4.66 4.53 4.62 4.24 80020400
## 13 ORCL 2010-01-04 24.7 25.2 24.7 24.8 20.8 26795000
## 14 TSLA 2010-06-30 1.72 2.03 1.55 1.59 1.59 257806500
big_tech_stock_prices %>%
group_by(stock_symbol) %>%
slice_min(desc(date), n = 1) # META ends in 2023-01-24
## # A tibble: 14 × 8
## # Groups: stock_symbol [14]
## stock_symbol date open high low close adj_close volume
## <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AAPL 2022-12-29 128. 130. 128. 130. 130. 75703700
## 2 ADBE 2022-12-29 331. 340. 330. 338. 338. 1793100
## 3 AMZN 2022-12-29 82.9 84.6 82.6 84.2 84.2 54995900
## 4 CRM 2022-12-29 130. 133. 130. 133. 133. 7628800
## 5 CSCO 2022-12-29 47.3 47.7 47.3 47.5 47.1 11396500
## 6 GOOGL 2022-12-29 86.6 88.8 86.6 88.4 88.4 23333500
## 7 IBM 2022-12-29 141. 142. 140. 141. 141. 2337200
## 8 INTC 2022-12-29 25.8 26.3 25.8 26.2 26.2 30944800
## 9 META 2023-01-24 142. 145 141. 143. 143. 21835300
## 10 MSFT 2022-12-29 236. 242. 236. 241. 241. 19770700
## 11 NFLX 2022-12-29 283. 296. 281. 291. 291. 9588500
## 12 NVDA 2022-12-29 144. 147. 142. 146. 146. 35492300
## 13 ORCL 2022-12-29 81.0 81.8 80.7 81.4 81.1 3867800
## 14 TSLA 2022-12-29 120. 124. 118. 122. 122. 221923300
# Rearrange so all stocks have the same dates
big_tech_stock_prices <- big_tech_stock_prices %>%
# Filter out so all stocks start in 2012-05-18
filter(date >= "2012-05-018", date <= "2022-12-29")
# standardize and normalize
stocks_tidy <- big_tech_stock_prices %>%
group_by(stock_symbol) %>%
tq_transmute(select = adj_close,
mutate_fun = periodReturn,
period = "monthly",
type = "log") %>%
ungroup()
stocks_tidy
## # A tibble: 1,792 × 3
## stock_symbol date monthly.returns
## <chr> <date> <dbl>
## 1 AAPL 2012-05-31 -0.00759
## 2 AAPL 2012-06-29 0.0108
## 3 AAPL 2012-07-31 0.0448
## 4 AAPL 2012-08-31 0.0897
## 5 AAPL 2012-09-28 0.00279
## 6 AAPL 2012-10-31 -0.114
## 7 AAPL 2012-11-30 -0.0125
## 8 AAPL 2012-12-31 -0.0951
## 9 AAPL 2013-01-31 -0.156
## 10 AAPL 2013-02-28 -0.0256
## # … with 1,782 more rows
# Spread to the format required for clustering
stocks_demo <- stocks_tidy %>%
pivot_wider(names_from = date, values_from = monthly.returns)
stocks_demo
## # A tibble: 14 × 129
## stock_symbol 2012-05-3…¹ 2012-0…² 2012-0…³ 2012-0…⁴ 2012-0…⁵ 2012-…⁶ 2012-1…⁷
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AAPL -0.00759 0.0108 0.0448 0.0897 0.00279 -0.114 -0.0125
## 2 ADBE -0.0840 0.0416 -0.0471 0.0126 0.0367 0.0479 0.0169
## 3 AMZN -0.0774 0.0700 0.0214 0.0622 0.0241 -0.0880 0.0791
## 4 CRM -0.124 -0.00260 -0.106 0.155 0.0504 -0.0449 0.0770
## 5 CSCO -0.202 0.0502 -0.0690 0.179 0.00105 -0.100 0.105
## 6 GOOGL -0.0398 -0.00136 0.0873 0.0791 0.0965 -0.104 0.0262
## 7 IBM -0.0712 0.0138 0.00204 -0.00152 0.0627 -0.0643 -0.0188
## 8 INTC -0.106 0.0309 -0.0363 -0.0257 -0.0915 -0.0465 -0.0898
## 9 META -0.256 0.0494 -0.359 -0.184 0.182 -0.0257 0.282
## 10 MSFT -0.0857 0.0468 -0.0373 0.0514 -0.0350 -0.0419 -0.0615
## 11 NFLX -0.249 0.0766 -0.186 0.0493 -0.0926 0.375 0.0307
## 12 NVDA -0.0624 0.106 -0.0205 0.0355 -0.0504 -0.108 0.00560
## 13 ORCL -0.111 0.115 0.0188 0.0469 -0.00602 -0.0102 0.0348
## 14 TSLA -0.135 0.0589 -0.132 0.0393 0.0263 -0.0401 0.184
## # … with 121 more variables: `2012-12-31` <dbl>, `2013-01-31` <dbl>,
## # `2013-02-28` <dbl>, `2013-03-28` <dbl>, `2013-04-30` <dbl>,
## # `2013-05-31` <dbl>, `2013-06-28` <dbl>, `2013-07-31` <dbl>,
## # `2013-08-30` <dbl>, `2013-09-30` <dbl>, `2013-10-31` <dbl>,
## # `2013-11-29` <dbl>, `2013-12-31` <dbl>, `2014-01-31` <dbl>,
## # `2014-02-28` <dbl>, `2014-03-31` <dbl>, `2014-04-30` <dbl>,
## # `2014-05-30` <dbl>, `2014-06-30` <dbl>, `2014-07-31` <dbl>, …
stocks_clust <- kmeans(stocks_demo %>% select(-stock_symbol), centers = 3)
summary(stocks_clust)
## Length Class Mode
## cluster 14 -none- numeric
## centers 384 -none- numeric
## totss 1 -none- numeric
## withinss 3 -none- numeric
## tot.withinss 1 -none- numeric
## betweenss 1 -none- numeric
## size 3 -none- numeric
## iter 1 -none- numeric
## ifault 1 -none- numeric
library(broom)
tidy(stocks_clust)
## # A tibble: 3 × 131
## `2012-05-31` 2012-06…¹ 2012-…² 2012-…³ 2012-…⁴ 2012-…⁵ 2012-…⁶ 2012-…⁷ 2013-…⁸
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -0.102 0.0442 -0.0418 0.0417 0.0228 -0.0582 0.0370 0.0148 0.0302
## 2 -0.135 0.0589 -0.132 0.0393 0.0263 -0.0401 0.184 0.00148 0.102
## 3 -0.249 0.0766 -0.186 0.0493 -0.0926 0.375 0.0307 0.125 0.579
## # … with 122 more variables: `2013-02-28` <dbl>, `2013-03-28` <dbl>,
## # `2013-04-30` <dbl>, `2013-05-31` <dbl>, `2013-06-28` <dbl>,
## # `2013-07-31` <dbl>, `2013-08-30` <dbl>, `2013-09-30` <dbl>,
## # `2013-10-31` <dbl>, `2013-11-29` <dbl>, `2013-12-31` <dbl>,
## # `2014-01-31` <dbl>, `2014-02-28` <dbl>, `2014-03-31` <dbl>,
## # `2014-04-30` <dbl>, `2014-05-30` <dbl>, `2014-06-30` <dbl>,
## # `2014-07-31` <dbl>, `2014-08-29` <dbl>, `2014-09-30` <dbl>, …
augment(stocks_clust, stocks_demo) %>%
ggplot(aes(`2022-12-29`, `2012-05-31`, color = .cluster)) +
geom_point()
kclusts <-
tibble(k = 1:9) %>%
mutate(
kclust = map(k, ~ kmeans(select(stocks_demo, -stock_symbol), .x)),
glanced = map(kclust, glance)
)
kclusts %>%
unnest(cols = c(glanced)) %>%
ggplot(aes(k, tot.withinss)) +
geom_line(alpha = 0.5, size = 1.2, color = "midnightblue") +
geom_point(size = 2, color = "midnightblue")
final_clust <- kmeans(select(stocks_demo, -stock_symbol), centers = 5)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
p <- augment(final_clust, stocks_demo) %>%
ggplot(aes(`2022-12-29`, `2012-05-31`, color = .cluster, name = stock_symbol)) +
geom_point()
ggplotly(p, height = 500)