Explore Data

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   1.0.1 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.3.0      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(tidyquant)
## Loading required package: lubridate
## Loading required package: timechange
## 
## Attaching package: 'lubridate'
## 
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
## 
## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## 
## Attaching package: 'xts'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## 
## Attaching package: 'PerformanceAnalytics'
## 
## The following object is masked from 'package:graphics':
## 
##     legend
## 
## Loading required package: quantmod
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
big_tech_stock_prices <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-02-07/big_tech_stock_prices.csv')
## Rows: 45088 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): stock_symbol
## dbl  (6): open, high, low, close, adj_close, volume
## date (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
big_tech_companies <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-02-07/big_tech_companies.csv')
## Rows: 14 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): stock_symbol, company
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Explore data

big_tech_stock_prices %>% count(stock_symbol) # META and TSLA have smaller number of rows
## # A tibble: 14 × 2
##    stock_symbol     n
##    <chr>        <int>
##  1 AAPL          3271
##  2 ADBE          3271
##  3 AMZN          3271
##  4 CRM           3271
##  5 CSCO          3271
##  6 GOOGL         3271
##  7 IBM           3271
##  8 INTC          3271
##  9 META          2688
## 10 MSFT          3271
## 11 NFLX          3271
## 12 NVDA          3271
## 13 ORCL          3271
## 14 TSLA          3148
big_tech_stock_prices %>%
    group_by(stock_symbol) %>%
    slice_max(desc(date), n = 1)              # META starts in 2012-05-18 and TSLA in 2010-06-30
## # A tibble: 14 × 8
## # Groups:   stock_symbol [14]
##    stock_symbol date         open   high    low  close adj_close    volume
##    <chr>        <date>      <dbl>  <dbl>  <dbl>  <dbl>     <dbl>     <dbl>
##  1 AAPL         2010-01-04   7.62   7.66   7.58   7.64      6.52 493729600
##  2 ADBE         2010-01-04  36.7   37.3   36.7   37.1      37.1    4710200
##  3 AMZN         2010-01-04   6.81   6.83   6.66   6.70      6.70 151998000
##  4 CRM          2010-01-04  18.7   18.9   18.5   18.7      18.7    7906000
##  5 CSCO         2010-01-04  24.1   24.8   24.0   24.7      17.4   59853700
##  6 GOOGL        2010-01-04  15.7   15.8   15.6   15.7      15.7   78169752
##  7 IBM          2010-01-04 125.   127.   125.   127.       81.3    6438444
##  8 INTC         2010-01-04  20.8   21.0   20.7   20.9      14.0   47800900
##  9 META         2012-05-18  42.0   45     38     38.2      38.2  573576400
## 10 MSFT         2010-01-04  30.6   31.1   30.6   31.0      23.7   38409100
## 11 NFLX         2010-01-04   7.93   7.96   7.57   7.64      7.64  17239600
## 12 NVDA         2010-01-04   4.63   4.66   4.53   4.62      4.24  80020400
## 13 ORCL         2010-01-04  24.7   25.2   24.7   24.8      20.8   26795000
## 14 TSLA         2010-06-30   1.72   2.03   1.55   1.59      1.59 257806500
big_tech_stock_prices %>%
    group_by(stock_symbol) %>%
    slice_min(desc(date), n = 1)              # META ends in 2023-01-24
## # A tibble: 14 × 8
## # Groups:   stock_symbol [14]
##    stock_symbol date        open  high   low close adj_close    volume
##    <chr>        <date>     <dbl> <dbl> <dbl> <dbl>     <dbl>     <dbl>
##  1 AAPL         2022-12-29 128.  130.  128.  130.      130.   75703700
##  2 ADBE         2022-12-29 331.  340.  330.  338.      338.    1793100
##  3 AMZN         2022-12-29  82.9  84.6  82.6  84.2      84.2  54995900
##  4 CRM          2022-12-29 130.  133.  130.  133.      133.    7628800
##  5 CSCO         2022-12-29  47.3  47.7  47.3  47.5      47.1  11396500
##  6 GOOGL        2022-12-29  86.6  88.8  86.6  88.4      88.4  23333500
##  7 IBM          2022-12-29 141.  142.  140.  141.      141.    2337200
##  8 INTC         2022-12-29  25.8  26.3  25.8  26.2      26.2  30944800
##  9 META         2023-01-24 142.  145   141.  143.      143.   21835300
## 10 MSFT         2022-12-29 236.  242.  236.  241.      241.   19770700
## 11 NFLX         2022-12-29 283.  296.  281.  291.      291.    9588500
## 12 NVDA         2022-12-29 144.  147.  142.  146.      146.   35492300
## 13 ORCL         2022-12-29  81.0  81.8  80.7  81.4      81.1   3867800
## 14 TSLA         2022-12-29 120.  124.  118.  122.      122.  221923300
# Rearrange so all stocks have the same dates
big_tech_stock_prices <- big_tech_stock_prices %>%
    # Filter out so all stocks start in 2012-05-18
    filter(date >= "2012-05-018", date <= "2022-12-29")
# standardize and normalize
stocks_tidy <- big_tech_stock_prices %>%
    
    group_by(stock_symbol) %>%
    
    tq_transmute(select     = adj_close, 
                 mutate_fun = periodReturn, 
                 period     = "monthly",
                 type       = "log") %>%
    
    ungroup()

stocks_tidy
## # A tibble: 1,792 × 3
##    stock_symbol date       monthly.returns
##    <chr>        <date>               <dbl>
##  1 AAPL         2012-05-31        -0.00759
##  2 AAPL         2012-06-29         0.0108 
##  3 AAPL         2012-07-31         0.0448 
##  4 AAPL         2012-08-31         0.0897 
##  5 AAPL         2012-09-28         0.00279
##  6 AAPL         2012-10-31        -0.114  
##  7 AAPL         2012-11-30        -0.0125 
##  8 AAPL         2012-12-31        -0.0951 
##  9 AAPL         2013-01-31        -0.156  
## 10 AAPL         2013-02-28        -0.0256 
## # … with 1,782 more rows
# Spread to the format required for clustering
stocks_demo <- stocks_tidy %>%
    pivot_wider(names_from = date, values_from = monthly.returns)

stocks_demo
## # A tibble: 14 × 129
##    stock_symbol 2012-05-3…¹ 2012-0…² 2012-0…³ 2012-0…⁴ 2012-0…⁵ 2012-…⁶ 2012-1…⁷
##    <chr>              <dbl>    <dbl>    <dbl>    <dbl>    <dbl>   <dbl>    <dbl>
##  1 AAPL            -0.00759  0.0108   0.0448   0.0897   0.00279 -0.114  -0.0125 
##  2 ADBE            -0.0840   0.0416  -0.0471   0.0126   0.0367   0.0479  0.0169 
##  3 AMZN            -0.0774   0.0700   0.0214   0.0622   0.0241  -0.0880  0.0791 
##  4 CRM             -0.124   -0.00260 -0.106    0.155    0.0504  -0.0449  0.0770 
##  5 CSCO            -0.202    0.0502  -0.0690   0.179    0.00105 -0.100   0.105  
##  6 GOOGL           -0.0398  -0.00136  0.0873   0.0791   0.0965  -0.104   0.0262 
##  7 IBM             -0.0712   0.0138   0.00204 -0.00152  0.0627  -0.0643 -0.0188 
##  8 INTC            -0.106    0.0309  -0.0363  -0.0257  -0.0915  -0.0465 -0.0898 
##  9 META            -0.256    0.0494  -0.359   -0.184    0.182   -0.0257  0.282  
## 10 MSFT            -0.0857   0.0468  -0.0373   0.0514  -0.0350  -0.0419 -0.0615 
## 11 NFLX            -0.249    0.0766  -0.186    0.0493  -0.0926   0.375   0.0307 
## 12 NVDA            -0.0624   0.106   -0.0205   0.0355  -0.0504  -0.108   0.00560
## 13 ORCL            -0.111    0.115    0.0188   0.0469  -0.00602 -0.0102  0.0348 
## 14 TSLA            -0.135    0.0589  -0.132    0.0393   0.0263  -0.0401  0.184  
## # … with 121 more variables: `2012-12-31` <dbl>, `2013-01-31` <dbl>,
## #   `2013-02-28` <dbl>, `2013-03-28` <dbl>, `2013-04-30` <dbl>,
## #   `2013-05-31` <dbl>, `2013-06-28` <dbl>, `2013-07-31` <dbl>,
## #   `2013-08-30` <dbl>, `2013-09-30` <dbl>, `2013-10-31` <dbl>,
## #   `2013-11-29` <dbl>, `2013-12-31` <dbl>, `2014-01-31` <dbl>,
## #   `2014-02-28` <dbl>, `2014-03-31` <dbl>, `2014-04-30` <dbl>,
## #   `2014-05-30` <dbl>, `2014-06-30` <dbl>, `2014-07-31` <dbl>, …

Implement k-means clustering

stocks_clust <- kmeans(stocks_demo %>% select(-stock_symbol), centers = 3)
summary(stocks_clust)
##              Length Class  Mode   
## cluster       14    -none- numeric
## centers      384    -none- numeric
## totss          1    -none- numeric
## withinss       3    -none- numeric
## tot.withinss   1    -none- numeric
## betweenss      1    -none- numeric
## size           3    -none- numeric
## iter           1    -none- numeric
## ifault         1    -none- numeric
library(broom)
tidy(stocks_clust)
## # A tibble: 3 × 131
##   `2012-05-31` 2012-06…¹ 2012-…² 2012-…³ 2012-…⁴ 2012-…⁵ 2012-…⁶ 2012-…⁷ 2013-…⁸
##          <dbl>     <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1       -0.102    0.0442 -0.0418  0.0417  0.0228 -0.0582  0.0370 0.0148   0.0302
## 2       -0.135    0.0589 -0.132   0.0393  0.0263 -0.0401  0.184  0.00148  0.102 
## 3       -0.249    0.0766 -0.186   0.0493 -0.0926  0.375   0.0307 0.125    0.579 
## # … with 122 more variables: `2013-02-28` <dbl>, `2013-03-28` <dbl>,
## #   `2013-04-30` <dbl>, `2013-05-31` <dbl>, `2013-06-28` <dbl>,
## #   `2013-07-31` <dbl>, `2013-08-30` <dbl>, `2013-09-30` <dbl>,
## #   `2013-10-31` <dbl>, `2013-11-29` <dbl>, `2013-12-31` <dbl>,
## #   `2014-01-31` <dbl>, `2014-02-28` <dbl>, `2014-03-31` <dbl>,
## #   `2014-04-30` <dbl>, `2014-05-30` <dbl>, `2014-06-30` <dbl>,
## #   `2014-07-31` <dbl>, `2014-08-29` <dbl>, `2014-09-30` <dbl>, …
augment(stocks_clust, stocks_demo) %>%
  ggplot(aes(`2022-12-29`, `2012-05-31`, color = .cluster)) +
  geom_point()

Choosing K

kclusts <-
  tibble(k = 1:9) %>%
  mutate(
    kclust = map(k, ~ kmeans(select(stocks_demo, -stock_symbol), .x)),
    glanced = map(kclust, glance)
  )

kclusts %>%
  unnest(cols = c(glanced)) %>%
  ggplot(aes(k, tot.withinss)) +
  geom_line(alpha = 0.5, size = 1.2, color = "midnightblue") +
  geom_point(size = 2, color = "midnightblue")

final_clust <- kmeans(select(stocks_demo, -stock_symbol), centers = 5)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
p <- augment(final_clust, stocks_demo) %>%
  ggplot(aes(`2022-12-29`, `2012-05-31`, color = .cluster, name = stock_symbol)) +
  geom_point()

ggplotly(p, height = 500)

Question and Data

What is the research question?

Describe the data briefly.

Which of the variables is the target variable (one that the model is predicting)? How is it different from the target variables from the first four case studies?

Describe the first observation (the first row) in the data.

Data Exploration and Transformation

What are the characteristics of the key variables used in the analysis? For example, which of the predictors seems to have correlations with the outcome variable? Does any variable have a skewed distribution, missing values, or incorrect information?

Describe the differences between the original data and the data transformed for modeling. Why?

Data Preparation and Modeling

What are the names of data preparation steps (e.g., step_dummy) mentioned in the video?

What is the name of the machine learning models (e.g., random forest) used in the analysis?

Describe the steps taken for data preparation. Why?

Describe the characteristics of the models used in the analysis, including their advantages and disadvantages.

Model Evaluation

What metrics are used in the model evaluation?

Describe the characteristics of the metrics used in the analysis, including their advantages and disadvantages.

Conclusion

What are the major findings?