# Core libraries 

library(tidyverse)
library(timetk)
library(tidyquant)

# Explorations
library(DataExplorer)

# Visualization 
library(plotly)
library(highcharter)
library(gt)
library(ggthemes)

0.1 Classifying Companies by Sector

complete_stock_tbl <- read_csv("/Users/nguyenbuiminh/Desktop/coding_python/ml/time_series/all_stocks_2006-01-01_to_2018-01-01.csv")

complete_stock_tbl %>% head(10)
## # A tibble: 10 × 7
##    Date        Open  High   Low Close  Volume Name 
##    <date>     <dbl> <dbl> <dbl> <dbl>   <dbl> <chr>
##  1 2006-01-03  77.8  79.4  77.2  79.1 3117200 MMM  
##  2 2006-01-04  79.5  79.5  78.2  78.7 2558000 MMM  
##  3 2006-01-05  78.4  78.6  77.6  78.0 2529500 MMM  
##  4 2006-01-06  78.6  78.9  77.6  78.6 2479500 MMM  
##  5 2006-01-09  78.5  79.8  78.5  79.0 1845600 MMM  
##  6 2006-01-10  79    79.0  78.1  78.5 1919900 MMM  
##  7 2006-01-11  78.4  78.7  77.8  78.4 1911900 MMM  
##  8 2006-01-12  78.2  78.2  77.2  77.7 2121100 MMM  
##  9 2006-01-13  77.0  78.2  77.0  77.5 1925300 MMM  
## 10 2006-01-17  77.1  77.6  77    77.1 2073400 MMM
glimpse(complete_stock_tbl)
## Rows: 93,612
## Columns: 7
## $ Date   <date> 2006-01-03, 2006-01-04, 2006-01-05, 2006-01-06, 2006-01-09, 20…
## $ Open   <dbl> 77.76, 79.49, 78.41, 78.64, 78.50, 79.00, 78.44, 78.20, 76.95, …
## $ High   <dbl> 79.35, 79.49, 78.65, 78.90, 79.83, 79.01, 78.66, 78.23, 78.20, …
## $ Low    <dbl> 77.24, 78.25, 77.56, 77.64, 78.46, 78.08, 77.84, 77.20, 76.95, …
## $ Close  <dbl> 79.11, 78.71, 77.99, 78.63, 79.02, 78.53, 78.37, 77.70, 77.50, …
## $ Volume <dbl> 3117200, 2558000, 2529500, 2479500, 1845600, 1919900, 1911900, …
## $ Name   <chr> "MMM", "MMM", "MMM", "MMM", "MMM", "MMM", "MMM", "MMM", "MMM", …
final_stock_tbl <- complete_stock_tbl %>%
  set_names(names(.) %>% str_to_lower()) %>%
  mutate(
    sector = case_when(
      name == "MMM" ~ "Business/Consumer Services",
      name == "AXP" ~ "Financial Services",
      name == "AAPL" ~ "Technology", 
      name == "BA" ~ "Industrial Goods",
      name == "CAT" ~ "Industrial Goods", 
      name == "CVX" ~ "Companies on the Energy Service",
      name == "CSCO" ~ "Technology",
      name == "KO" ~ "Consumer Goods", 
      name == "DIS" ~ "Media/Entertainment",
      name == "XOM" ~ "Companies on the Energy Service",
      name == "GE" ~ "Business/Consumer Services",
      name == "GS" ~ "Financial Services",
      name == "HD" ~ "Retail/Wholesale",
      name == "IBM" ~ "Business/Consumer Services",
      name == "INTC" ~ "Technology",
      name == "JNJ" ~ "Health Care/Life Sciences",
      name == "JPM" ~ "Financial Services",
      name == "MCD" ~ "Leisure/Arts/Hospitality",
      name == "MRK" ~ "Health Care/Life Sciences",
      name == "MSFT" ~ "Technology",
      name == "NKE" ~ "Consumer Goods",
      name == "PFE" ~ "Health Care/Life Sciences",
      name == "PG" ~ "Consumer Goods",
      name == "TRV" ~ "Financial Services",
      name == "UTX" ~ "Technology", 
      name == "UNH" ~ "Financial Services", 
      name == "VZ" ~ "Telecommunication Services", 
      name == "WMT" ~ "Retail/Wholesale",
      name == "GOOGL" ~ "Technology",
      name == "AMZN" ~ "Retail/Wholesale",
      TRUE ~ "Technology"
      
    )
  )
stocks_by_sector <- final_stock_tbl %>%
  distinct(sector, name) %>%
  group_by(sector) %>%
  summarise(
    count = n()
  ) %>%
  ungroup() %>%
  arrange(desc(count))

# Treemap with Tidyquant colors

treemap_by_sector <- stocks_by_sector %>%
  hctreemap2(group_vars = "sector",
             size_var = "count",
             layoutAlgorithm = "squarified") %>%
  hc_plotOptions(treemap = list(colorByPoint = TRUE)) %>%
    hc_colors(c("#2C3E50", "#E31A1C", "#18BC9C", "#CCBE93", 
                "#A6CEE3", "#1F78B4", "#B2DF8A", "#FB9A99",
                "#FDBF6F", "#FF7F00", "#6A3D9A" )) %>%
    hc_colorAxis(dataClasses = color_classes(stocks_by_sector$sector)) %>%    
    hc_legend(enabled = FALSE) %>% 
  hc_title(text = "Number of Companies by Sector") %>% 
  hc_subtitle(text = "Dow Jones Index") %>% 
    hc_tooltip(pointFormat = "<b>{point.name}</b>:<br>
             Count: {point.value:,.0f}")
  
# Return the plot
treemap_by_sector

0.2 Companies to Analyze

The following companies will be analyzed: Apple, Caterpillar, Google, Exxon, Goldman Sachs, and Walmart.

0.3 Explaining Loess

As shown in the plots below, some companies are less volatile than others, and some have a more pronounced upward trend line than others. LOESS stands for Locally Weighted Scatterplot Smoothing: think of it as a regression within a specific “window” (range of time) in which observations closest to a “focal point” are given more weight than “farther” observations. Within each window, a weighted least squares fit produces the best-fitting line.

These are the most important points for understanding how LOESS works:

Formula for assigning weights (tricube weight function), where:

  • k: the number of observations within each window
  • xi: the focal point
  • xk: a neighboring point
  • di: the distance from the focal point to the farthest (kth) neighboring point in the window

\(\large w(x_k) = \left(1 - \left|\frac{x_i - x_k}{d_i}\right|^3\right)^3\)

After that, a weighted linear regression is applied within the window to find the line that reduces the error the most. This is repeated for every point in the actual time series.
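
To make the weighting step concrete, here is a minimal sketch of the tricube function above (hypothetical values, not part of the original analysis); in practice the LOESS smoother behind plot_time_series handles this for us.

# A minimal sketch of the tricube weighting described above (hypothetical values)
tricube_weights <- function(x_focal, x_neighbors) {
  d     <- abs(x_focal - x_neighbors)   # distances from the focal point to its neighbors
  d_max <- max(d)                       # distance to the farthest neighbor in the window
  (1 - (d / d_max)^3)^3                 # tricube weights: 1 at the focal point, 0 at the edge
}

# Example: weights for a focal point at x = 5 with neighbors 1 through 10
tricube_weights(5, 1:10)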

# Function to get the average price by week, month, and quarter
# If we aggregate by day, mean(close) simply returns the actual values,
# since there is a single closing price per day

convert_date_ts <- function(data, unit = "day") {
  
  new_data <- data %>%
    mutate(date = floor_date(date, unit = unit)) %>%
    group_by(date, name) %>%
    summarise(
      close = mean(close)
    ) %>%
    ungroup() 
  
  return(new_data)
}

0.3.1 Day

# Pick by stock (6 stocks from different industries)

filtered_final_stock_tbl <- final_stock_tbl %>%
  filter(name %in% c("AAPL", "GS", "GOOGL", "XOM", "CAT", "WMT")) %>%
  mutate(
    date = ymd(date)
  )

# Daily
filtered_final_stock_tbl %>% 
  convert_date_ts() %>% 
  group_by(name) %>% 
  plot_time_series(
    .date_var = date,
    .value = close,
    .facet_ncol = 2,
    .smooth_color = "#18BC9C",
    .smooth_size = 0.5
  )

0.3.2 Week

filtered_final_stock_tbl %>% 
  convert_date_ts(unit = "week") %>% 
  group_by(name) %>% 
  plot_time_series(
    .date_var = date,
    .value = close,
    .facet_ncol = 2,
    .smooth_color = "#18BC9C",
    .smooth_size = 0.5
  )

0.3.3 Month

filtered_final_stock_tbl %>%
  convert_date_ts(unit = "month") %>%
  group_by(name) %>%
  plot_time_series(
    .date_var = date,
    .value = close,
    .facet_ncol = 2,
    .smooth_color = "#18BC9C",
    .smooth_size = 0.5
  )

0.3.4 Quarter

filtered_final_stock_tbl %>%
  convert_date_ts(unit = "quarter") %>%
  group_by(name) %>%
  plot_time_series(
    .date_var = date,
    .value = close, 
    .facet_ncol = 2, 
    .smooth_color = "#18BC9C",
    .smooth_size = 0.5
  )

0.3.5 Year

filtered_final_stock_tbl %>%
  convert_date_ts(unit = "year") %>%
  group_by(name) %>%
  plot_time_series(
    .date_var = date,
    .value = close,
    .facet_ncol = 2, 
    .smooth_color = "#18BC9C",
    .smooth_size = 0.5
  )

0.4 Determining missing values with Data Explorer

filtered_final_stock_tbl %>%
  plot_missing(
    ggtheme = theme_calc(),
    title = "Percent Missing Values by Column"
  )

0.5 Which stocks grew the most percent wise on average

Note: As expected, 2007 and 2008 show the highest volatility, due to the financial crisis that occurred during those years, driven mainly by uncertainty in the markets.

# Daily stock return by company 
# Using Tidyquant package

daily_stock_return <- filtered_final_stock_tbl %>%
  mutate(
    name = name %>% as_factor()
  ) %>%
  group_by(name) %>%
  tq_transmute(
    select = close, 
    mutate_fun = periodReturn,
    period = "daily", 
    col_rename = "stock_return"
  )

# Visualization with ggplot

daily_stock_return %>%
  ggplot(aes(x = date, y = stock_return)) + 
  geom_line(color = palette_light()[[1]]) + 
  facet_wrap(~name) + 
  theme_calc() +
  labs(
    title = "Stock Returns",
    subtitle = "by Company",
    y= "returns"
  ) +
  theme(plot.title = element_text(hjust = 0.5))

return_volatility_tbl <- function(data, unit="daily", ...){
  
  vars_col <- quos(...)
  
  tbl_return_volatility <- data %>% 
    mutate(
    name = name %>% as_factor()
  ) %>% 
  group_by(name) %>% 
  tq_transmute(
    select = close,
    mutate_fun = periodReturn,
    period = unit,
    col_rename = "stock_return"
  ) %>% 
    mutate(
    company = case_when(
      name == "AAPL" ~ "Apple",
      name == "GOOGL" ~ "Google",
      name == "XOM" ~ "Exxon",
      name == "GS" ~ "Goldman Sachs",
      name == "WMT" ~ "Walmart",
      TRUE ~ "Caterpillar"
    )
  ) %>% 
    group_by(!!!vars_col) %>% 
      summarise(
    `percent return` = mean(stock_return),
    `standard deviation` = sd(stock_return)
  ) %>% 
  ungroup() %>% 
  arrange(desc(`percent return`)) %>% gt() %>% 
  data_color(
    columns = vars(`standard deviation`),
    colors = scales::col_numeric(
      palette = c("#ffffff", "#f2fbd2", "#FA6047", "#F25555", "#FA1B1B"), 
      domain = NULL
    )) %>% 
  data_color(
    columns = vars(`percent return`),
    colors = scales::col_numeric(
      palette = c("#ffffff", "#f2fbd2", "#c9ecb4", "#93d3ab", "#35b0ab"),
      domain = NULL
    )
  ) %>% fmt_percent(
    columns = vars(`percent return`), 
    decimals = 4
  ) %>% 
  fmt_number(
    columns = vars(`standard deviation`),
    decimals = 4
  ) %>% 
    tab_header(
    title = md("**2006 - 2017 Measure of Return and Volatility**"),
    subtitle = "An overview of Risk and Return"
  )
  
  
  return(tbl_return_volatility)
    
}

0.5.1 Daily

filtered_final_stock_tbl %>%
  return_volatility_tbl(unit = "daily", company, name)

**2006 - 2017 Measure of Return and Volatility**
An overview of Risk and Return

| company       | name  | percent return | standard deviation |
|---------------|-------|----------------|--------------------|
| Apple         | AAPL  | 0.1124%        | 0.0204             |
| Google        | GOOGL | 0.0691%        | 0.0184             |
| Caterpillar   | CAT   | 0.0535%        | 0.0201             |
| Goldman Sachs | GS    | 0.0509%        | 0.0239             |
| Walmart       | WMT   | 0.0326%        | 0.0122             |
| Exxon         | XOM   | 0.0233%        | 0.0151             |

0.5.2 Weekly

filtered_final_stock_tbl %>%
  return_volatility_tbl(unit = "weekly", company, name)

**2006 - 2017 Measure of Return and Volatility**
An overview of Risk and Return

| company       | name  | percent return | standard deviation |
|---------------|-------|----------------|--------------------|
| Apple         | AAPL  | 0.5444%        | 0.0451             |
| Google        | GOOGL | 0.3329%        | 0.0404             |
| Caterpillar   | CAT   | 0.2638%        | 0.0455             |
| Goldman Sachs | GS    | 0.2426%        | 0.0525             |
| Walmart       | WMT   | 0.1545%        | 0.0257             |
| Exxon         | XOM   | 0.0960%        | 0.0277             |

0.5.3 Monthly

filtered_final_stock_tbl %>%
  return_volatility_tbl(unit = "monthly", company, name)

**2006 - 2017 Measure of Return and Volatility**
An overview of Risk and Return

| company       | name  | percent return | standard deviation |
|---------------|-------|----------------|--------------------|
| Apple         | AAPL  | 2.3579%        | 0.0907             |
| Google        | GOOGL | 1.4307%        | 0.0824             |
| Caterpillar   | CAT   | 1.1438%        | 0.0935             |
| Goldman Sachs | GS    | 0.8721%        | 0.0892             |
| Walmart       | WMT   | 0.6391%        | 0.0473             |
| Exxon         | XOM   | 0.3549%        | 0.0462             |

0.5.4 Yearly

filtered_final_stock_tbl %>%
  return_volatility_tbl(unit = "yearly", company, name)

**2006 - 2017 Measure of Return and Volatility**
An overview of Risk and Return

| company       | name  | percent return | standard deviation |
|---------------|-------|----------------|--------------------|
| Apple         | AAPL  | 36.8065%       | 0.5619             |
| Google        | GOOGL | 20.8735%       | 0.4000             |
| Goldman Sachs | GS    | 14.7381%       | 0.4343             |
| Caterpillar   | CAT   | 13.0254%       | 0.3249             |
| Walmart       | WMT   | 7.7882%        | 0.1674             |
| Exxon         | XOM   | 4.1885%        | 0.1624             |

0.6 Exploring Seasonality and Pattern

To better understand seasonality patterns, it is critical to see how the target variable behaves at certain periods of time. In this case we will analyze how the closing price of a given company behaves across different days of the week, months, quarters, and years. This can help us understand whether other external factors influence the price of the stock.

Summary:

  • Different periods in time: We will analyze the distribution across weekdays, weeks, months, quarters, and years.
  • Exxon (XOM): For this case we will look at how the Exxon stock behaves, examining the distribution of the closing price at different points in time.
  • Weekdays: During weekdays we don't see an obvious difference in the distribution of the closing price.
  • Months: June and December tend to have a slightly higher closing price.
  • Quarters: The second and fourth quarters have a slightly higher median closing price.
  • Year: 2010 had the worst distribution of the closing price for the company (probably an external factor impacted the sector), while 2014 was the year in which the distribution of the closing price was the highest.

# Let's pick Exxon for seasonality 

filtered_final_stock_tbl %>%
  filter(name == "XOM") %>%
  plot_seasonal_diagnostics(
    .date_var = date,
    .value = close, 
    .interactive = FALSE
  )

0.6.1 Non-Interactive STL

# Function to create seasonal decomposition with loess for the stocks

plot_seasonal_decomposition <- function(data, interactive=FALSE){
  
  stl_plot <- data %>% 
    group_by(name) %>% 
    plot_stl_diagnostics(
      .date_var = date,
      .value = log1p(close),
      .interactive = interactive
    )
  
  return(stl_plot)
}


# Visualization Not interactive
filtered_final_stock_tbl %>% 
  filter(name %in% c("AAPL", "XOM", "GS")) %>% 
  group_by(name) %>% 
  plot_seasonal_decomposition()

0.6.2 Interactive STL

filtered_final_stock_tbl %>%
  filter(name %in% c("AAPL", "XOM", "GS")) %>%
  group_by(name) %>%
  plot_seasonal_decomposition(interactive = TRUE)

0.7 Outlier Detection

Anomaly detection is mainly used to find events that were not expected in our time series. It can also be that a mistake in the data caused an outlier. Detecting outliers gives us the possibility of investigating further whether unexpected events actually drove the time series to behave in an anomalous manner.

filtered_final_stock_tbl %>%
  group_by(name) %>%
  plot_anomaly_diagnostics(
    .date_var = date,
    .value = close,
    .facet_ncol =2,
    .interactive = FALSE,
    .title = "Anomaly Diagnostics Dow Jones",
    .anom_color = "#FB3029",
    .max_anomalies = 0.07,
    .alpha = 0.05
  )

0.7.1 Investigate Anomaly of XOM (Exxon)

As we have seen previously, Exxon belongs to the energy industry, more specifically the oil industry. The question we will answer is: what events occurred that caused a large drop in the stock price? Below we look at a time series of the West Texas Intermediate (WTI) to see whether it behaves similarly to the XOM stock. For those who don't know, WTI is a benchmark for oil pricing in the markets.

Summary:

  • Outlier #1: Caused by economic growth and demand in the oil industry, resulting in profit gains
  • Outlier #2: Caused by the 2007-2008 financial crisis
  • Outlier #3: Caused by lower-than-expected results in the market (a loss in profit) around 2010
filtered_final_stock_tbl %>%
  filter(name == "XOM") %>%
  plot_anomaly_diagnostics(
    .date_var = date,
    .value = close,
    .interactive = TRUE,
    .title = "Anomaly Diagnostics Exxon (XOM)",
    .anom_color = "#FB3029",
    .max_anomalies = 0.05,
    .alpha = 0.05
  )
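
As a minimal sketch (not part of the original output), the WTI series mentioned above could be pulled with tidyquant from FRED, assuming the series ID DCOILWTICO, and plotted with timetk:

# Sketch: pull the WTI spot price from FRED with tidyquant (series ID is an assumption)
wti_tbl <- tq_get(
  "DCOILWTICO",                  # FRED: Crude Oil Prices, West Texas Intermediate (Cushing)
  get  = "economic.data",
  from = "2006-01-01",
  to   = "2018-01-01"
)

wti_tbl %>%
  plot_time_series(
    .date_var = date,
    .value    = price,           # tq_get() returns the FRED series in the `price` column
    .smooth   = FALSE,
    .title    = "West Texas Intermediate (WTI) Spot Price"
  )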

1 Statistical Methods

Definition Lists:

  • Autocorrelation (ACF): the correlation of a series with its own lagged values.
  • Partial autocorrelation (PACF): the correlation of a series with a given lag after the effects of shorter lags have been removed.
  • Cross correlation (CCF): the correlation between one series and the lagged values of another series.

1.0.1 Autocorrelation (ACF)

filtered_final_stock_tbl %>%
  group_by(name) %>%
  summarise_by_time(
    .date_var = date,
    .value = mean(close, na.rm = TRUE),
    .by = "month"
  ) %>%
  tk_acf_diagnostics(
    .date_var = date,
    .value = log1p(.value)
  ) %>%
  ggplot(aes(x=lag, y=ACF, color=name, group=name)) + 
  # Add horizontal line at y = 0 
  geom_hline(yintercept = 0) + 
  # Plot autocorrelation 
  geom_point(size=2) + 
  geom_segment(aes(xend=lag, yend =0), size =1) + 
  # Add cutoffs 
  geom_line(aes(y = .white_noise_upper), color = "black", linetype = 2) + 
  geom_line(aes(y =.white_noise_lower), color = "black", linetype =2 ) +
  # Add facets
  facet_wrap(~ name, ncol = 3) +
  # Aesthetics 
  expand_limits(y = c(-1,1)) + 
  scale_color_stata() +
  theme_calc() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.5)
  ) + 
  labs(
    title = "AutoCorrelation (ACF)",
    subtitle = "Dow Jones 30 Index"
  )

1.0.2 Partial Auto Correlation (PACF)

filtered_final_stock_tbl %>%
    group_by(name) %>%
  summarise_by_time(
    .date_var = date,
    .value = mean(close, na.rm = TRUE),
    .by="month"
  ) %>% 
  tk_acf_diagnostics(
    .date_var = date,
    .value = log1p(.value)
  ) %>% 
      ggplot(aes(x = lag, y = PACF, color = name, group = name)) +
    # Add horizontal line at y = 0
    geom_hline(yintercept = 0) +
    # Plot autocorrelations
    geom_point(size = 2) +
    geom_segment(aes(xend = lag, yend = 0), size = 1) +
    # Add cutoffs
    geom_line(aes(y = .white_noise_upper), color = "black", linetype = 2) +
    geom_line(aes(y = .white_noise_lower), color = "black", linetype = 2) +
    # Add facets
    facet_wrap(~ name, ncol = 3) +
    # Aesthetics
    expand_limits(y = c(-1, 1)) +
    scale_color_stata() +
    theme_calc() +
    theme(
        legend.position = "none",
        axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5)
    ) + labs(
      title = "Partial AutoCorrelation (PACF)",
      subtitle = "Dow Jones 30 Index"
    )

1.0.3 Cross Correlation (CCF)

Cross correlation (CCF) measures the relationship between one time series and the lagged values of another. Here we compare the monthly average closing prices of two technology stocks, AAPL and GOOGL.

tech_stocks_tbl <- filtered_final_stock_tbl %>%
  filter(sector == "Technology") %>%
  select(date, name, close) %>%
  pivot_wider(names_from = name, values_from = close) %>%
  summarise_by_time(
    .date_var = date,
    .by = "month",
    across(AAPL:GOOGL, .fns=mean)
  )

tech_stocks_tbl %>%
  tk_acf_diagnostics(
    date, 
    AAPL,
    .ccf_vars = GOOGL
  )  %>% 
  select(
    lag, CCF_GOOGL
  ) %>%
  slice(1:10) %>%
  gt() %>%
  data_color(
    columns = vars(CCF_GOOGL) , 
    colors = scales::col_numeric(
      palette = c("#ffffff", "#f2fbd2", "#FA6047", "#F25555", "#FA1B1B"), 
      domain = NULL
    ))

| lag | CCF_GOOGL |
|-----|-----------|
| 0   | 0.9137803 |
| 1   | 0.8816792 |
| 2   | 0.8501216 |
| 3   | 0.8231274 |
| 4   | 0.8001255 |
| 5   | 0.7770242 |
| 6   | 0.7513068 |
| 7   | 0.7243054 |
| 8   | 0.6970645 |
| 9   | 0.6747878 |

Summary:

  • CCF on AAPL close price: The closing price of AAPL is the target variable and the closing price of GOOGL is the independent (lagged) variable.
  • High correlation at lag 0: When the closing price of AAPL is high or increasing, the closing price of GOOGL tends to be high on that same day (lag 0).
#Plot

tech_stocks_tbl %>%
  plot_acf_diagnostics(
    date,
    AAPL,
    .ccf_vars = GOOGL,
    .show_ccf_vars_only = TRUE,
    .interactive = TRUE,
    .line_color = "black",
    .point_color = palette_light()[[2]],
    .line_size = 1.5,
    .title = "Cross Correlation of Technology Stocks"
  )

1.1 Simple Moving Averages

Moving averages help us smooth the time series and better understand where the trend is heading. Believe it or not, moving averages are used a lot for forecasting, although they are not the most optimal technique for it, especially since they do not handle the impact of outliers very well. Moreover, moving averages also help us detect seasonality in our time series.

Most common moving averages in the stock market:

  • 15-day MA
  • 30-day MA
  • 100-day MA
  • 200-day MA

Note: Notice how the longer the moving-average window, the smoother the line. Usually we use moving averages to detect trends in our time series data.

gs_ma_sample <- filtered_final_stock_tbl %>%
  select(date, close, name) %>%
  filter(name == "GS") %>%
  filter_by_time(.date_var = date, .start_date = "2015", .end_date = "2017") %>%
  mutate(
    adjusted_15_mm_close = slidify_vec(
      .x = close,
      .period = 15,
      .f = mean, 
      na.rm = TRUE,
      .align = "center",
      .partial = TRUE
    ),
    adjusted_30_mm_close = slidify_vec(
      .x = close,
      .period = 30,
      .f = mean, 
      na.rm = TRUE,
      .align = "center",
      .partial = TRUE
    )
  ) %>%
  pivot_longer(
    contains("close"), names_repair = "unique") %>%
  rename(name = name...2, 
         metric = name...3
  )
    

gs_ma_sample %>%
  ggplot(aes(x=date, y=value)) + 
  geom_line(aes(color = metric), size = 1) +
  theme_calc() +
  theme(legend.position = "bottom", plot.title = element_text(hjust = 0.5)) + 
  guides(color = guide_legend(nrow = 2, byrow = TRUE)) + 
  labs(
    title = "Moving Average (MA) as a Metric",
    subtitle = str_glue("GS Stock 2015-2017"),
    y = "Closing Price") +
  scale_color_stata() +
  scale_x_date(expand = c(0,0))

Note: In this case I used a “rolling” median since I want to avoid the effect of outliers; moving averages are more prone to being affected by outliers. Nevertheless, you can use a moving average instead by changing the function from median to mean.

mn_final_tbl <- filtered_final_stock_tbl %>%
  select(date, close, name) %>%
  mutate(
    adjusted_15_mm_close = slidify_vec(
      .x = close, 
      .period = 15, 
      .f = median,
      na.rm = TRUE,
      .align = "center",
      .partial = TRUE), 
    adjusted_30_mm_close = slidify_vec(
      .x = close, 
      .period = 30, 
      .f = median,
      na.rm = TRUE,
      .align = "center",
      .partial = TRUE
    ),
    adjusted_100_mm_close = slidify_vec(
      .x = close,
      .period = 100, 
      .f = median,
      na.rm = TRUE,
      .align = "center",
      .partial = TRUE),
    adjusted_300_mm_close = slidify_vec(
      .x = close, 
      .period = 300,
      .f = median, 
      na.rm = TRUE,
      .align = "center",
      .partial = TRUE
    )
    ) %>%
  pivot_longer(contains("close"), names_repair = "unique") %>%
  rename( 
    name = name...2,
    metric = name...3
    )

mn_final_tbl
## # A tibble: 90,590 × 4
##    date       name  metric                value
##    <date>     <chr> <chr>                 <dbl>
##  1 2006-01-03 AAPL  close                 10.7 
##  2 2006-01-03 AAPL  adjusted_15_mm_close  10.9 
##  3 2006-01-03 AAPL  adjusted_30_mm_close  11   
##  4 2006-01-03 AAPL  adjusted_100_mm_close 10.2 
##  5 2006-01-03 AAPL  adjusted_300_mm_close  9.38
##  6 2006-01-04 AAPL  close                 10.7 
##  7 2006-01-04 AAPL  adjusted_15_mm_close  10.9 
##  8 2006-01-04 AAPL  adjusted_30_mm_close  10.9 
##  9 2006-01-04 AAPL  adjusted_100_mm_close 10.2 
## 10 2006-01-04 AAPL  adjusted_300_mm_close  9.38
## # ℹ 90,580 more rows

1.1.1 Visualization

mn_final_tbl %>%
  ggplot(aes(x=date, y =value, color = metric)) +
  geom_line(size=1, alpha = 0.8) + 
  facet_wrap(~name, scales = "free_y") + 
  theme_calc() + 
  scale_color_stata() +
  labs(
    title = "Moving Medians",
    subtitle = "Dow Jones 30",
    y = "price"
  ) + 
  theme(legend.position = "bottom", plot.title = element_text(hjust = 0.5), 
        legend.text = element_text(size = 10)) +
  guides(color = guide_legend(nrow = 2, byrow = TRUE))

1.1.2 Moving Average in Forecasting

The goal of this section is to showcase the scenarios in which moving averages can be useful for forecasting and those in which they are not.

Summary:
  • Easy to understand: Moving averages are easy to understand, so few technicalities are needed to grasp the concept.
  • Expected volatility: When volatility is expected, you run the risk of large residuals if you forecast with moving averages.
  • Low volatility: Moving averages are useful during periods of low volatility, as they are able to capture the short-term trend.
gs_filtered_data <- filtered_final_stock_tbl %>%
  filter(name == "GS") %>%
  filter_by_time(
    .start_date = "2009", 
    .end_date = "2012"
  ) %>%
  select(date, name, close)

ma_60day_forecasting <- gs_filtered_data %>% 
  mutate(
    mavg_60_days = slidify_vec(close, .f = ~ mean(.x, na.rm = TRUE), .period = 60, .align = "right")) %>% 
  bind_rows(
    future_frame(., .length_out = 60)
  ) %>% 
  # fill(): carry the last moving-average value forward into the future frame
  fill(
    mavg_60_days,
    .direction = "down"
  )

ma_60day_forecasting
## # A tibble: 1,066 × 4
##    date       name  close mavg_60_days
##    <date>     <chr> <dbl>        <dbl>
##  1 2009-01-02 GS     86.8           NA
##  2 2009-01-05 GS     88.8           NA
##  3 2009-01-06 GS     88.7           NA
##  4 2009-01-07 GS     84.5           NA
##  5 2009-01-08 GS     85.4           NA
##  6 2009-01-09 GS     83.9           NA
##  7 2009-01-12 GS     77.7           NA
##  8 2009-01-13 GS     77.9           NA
##  9 2009-01-14 GS     75.7           NA
## 10 2009-01-15 GS     73.8           NA
## # ℹ 1,056 more rows

2 Forecasting plot

ma_60day_forecasting %>% pivot_longer(close:mavg_60_days,names_repair="unique") %>%
  rename(
    "type" = "name...3"
  ) %>% 
  ggplot(aes(x=date, y=value)) + geom_line(aes(color=type), size=1) +
  theme_calc() + scale_color_stata() +
  labs(
    title = "60 day Moving Average",
    subtitle = "MA as a Forecasting Technique"
  ) + theme(plot.title = element_text(hjust=0.5))

3 Residuals of Forecasting with Moving Average

In this scenario we compare the 60-day moving average, carried forward as a forecast, against the actual closing prices for the first two months of 2013 to see how large the residuals are.

actual_60days <- filtered_final_stock_tbl %>% 
  filter(name == "GS") %>% 
  filter_by_time(
    .start_date = "2013",
    .end_date = "2013-03-01"
  ) %>% 
  select(date, name, close)



plot_series <- ma_60day_forecasting %>% 
  filter_by_time(
    .start_date = "2013-01-02", .end_date = "2013-03-01"
  ) %>% 
  left_join(actual_60days, by="date") %>% 
  select(date, mavg_60_days, close.y) %>% 
  rename(
    "actual_close_price" = "close.y"
  ) %>% 
  pivot_longer(mavg_60_days:actual_close_price) %>% 
  # Remove weekends from moving average
  drop_na() %>% 
  ggplot(aes(x=date, value)) + geom_line(aes(color=name), size=1) + geom_point(aes(color=name), size=2) + theme_calc() + scale_color_stata() + 
  labs(
    title = "Residuals from 60 day Moving average", 
    subtitle = "GS stock Moving Average"
  ) + theme(legend.position = "bottom") + guides(color = guide_legend(nrow=1, byrow=TRUE))


plot_series

## Rolling Correlation

Another way of finding a relationship between two time series is through rolling correlations: we compute the correlation between the two series over a moving window at different points in time and determine whether the relationship between the two is significant, as sketched below.
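
As a minimal sketch of the idea (reusing the monthly tech_stocks_tbl built in the cross-correlation section; the 6-month window is an arbitrary choice):

# Build a rolling 6-month correlation function with timetk::slidify()
rolling_cor_6m <- slidify(
  ~ cor(.x, .y),        # correlation over each window
  .period  = 6,         # 6-month window (arbitrary choice)
  .align   = "right",
  .partial = FALSE
)

tech_stocks_tbl %>%
  mutate(rolling_cor = rolling_cor_6m(AAPL, GOOGL)) %>%
  filter(!is.na(rolling_cor)) %>%
  plot_time_series(
    .date_var = date,
    .value    = rolling_cor,
    .smooth   = FALSE,
    .title    = "6-Month Rolling Correlation: AAPL vs GOOGL"
  )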



3.1 Linear Regression and the use of Log Transformations

The main reason we use log transformations in time series analysis is to reduce the impact of extreme outliers, thereby stabilizing the variance across the time series. Log transformations are especially helpful in models like linear regression, where an outlier can have a significant impact on the squared error. Although in this case we don't have “significant” outliers, log transformations can still help mitigate their impact.

Log Transformation Formula:

\(\large y = \log_b(x)\)
\(\large b^y = x\)

Summary:

  • Log1p function: log1p(x) is the same as log(x + 1). We use it to avoid problems with values equal to zero (not an issue in this data).
  • Feature importance: Notice how most of the month terms are statistically significant and improve the adjusted R-squared, meaning the month has a significant impact on where the closing price of a stock is heading. This leads to the question: is there some sort of seasonality?
  • Getting from log values back to actual values: To transform back to the actual values we use the expm1 function, which gives us the original values (see the quick check below). We definitely want to know this whenever we are forecasting!
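
As a quick sanity check of that round trip (illustrative numbers, not taken from the data):

x     <- c(85.5, 150.25, 243.5)   # hypothetical closing prices
log_x <- log1p(x)                 # same as log(x + 1)
all.equal(expm1(log_x), x)        # TRUE: expm1() exactly inverts log1p()
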
# We will pick GS as it is one of the ones with the highest volatility

# GS linear regression model using log transformations
log_gs_lm_plot <- filtered_final_stock_tbl %>% 
  filter(name == "GS") %>% 
  filter_by_time(
    .start_date = "2016-01-01",
    .end_date = "2017-01-01"
  ) %>% 
  plot_time_series_regression(
    .date_var = date,
    .formula = log1p(close) ~ as.numeric(date) + 
      wday(date, label = TRUE) + 
      lubridate::month(date, label = TRUE),
    .interactive=FALSE,
    .show_summary = TRUE,
    .title = "Log transformations in Linear Regression Models"
  )
## 
## Call:
## stats::lm(formula = .formula, data = df)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.114757 -0.018312  0.001485  0.020826  0.105742 
## 
## Coefficients:
##                                           Estimate Std. Error t value Pr(>|t|)
## (Intercept)                             -9.3685671  4.3585702  -2.149  0.03262
## as.numeric(date)                         0.0008535  0.0002566   3.326  0.00102
## wday(date, label = TRUE).L              -0.0018868  0.0050972  -0.370  0.71159
## wday(date, label = TRUE).Q               0.0004393  0.0050410   0.087  0.93063
## wday(date, label = TRUE).C              -0.0015788  0.0049871  -0.317  0.75184
## wday(date, label = TRUE)^4               0.0031286  0.0049447   0.633  0.52753
## lubridate::month(date, label = TRUE).L   0.0212167  0.0939807   0.226  0.82159
## lubridate::month(date, label = TRUE).Q   0.2516840  0.0078410  32.098  < 2e-16
## lubridate::month(date, label = TRUE).C   0.1107287  0.0078093  14.179  < 2e-16
## lubridate::month(date, label = TRUE)^4   0.0581218  0.0078034   7.448 1.79e-12
## lubridate::month(date, label = TRUE)^5  -0.0174280  0.0077749  -2.242  0.02592
## lubridate::month(date, label = TRUE)^6   0.0410392  0.0078056   5.258 3.28e-07
## lubridate::month(date, label = TRUE)^7  -0.0437489  0.0077170  -5.669 4.19e-08
## lubridate::month(date, label = TRUE)^8  -0.0437221  0.0076994  -5.679 3.99e-08
## lubridate::month(date, label = TRUE)^9   0.0105762  0.0077319   1.368  0.17266
## lubridate::month(date, label = TRUE)^10  0.0131328  0.0076836   1.709  0.08874
## lubridate::month(date, label = TRUE)^11 -0.0166546  0.0077086  -2.161  0.03174
##                                            
## (Intercept)                             *  
## as.numeric(date)                        ** 
## wday(date, label = TRUE).L                 
## wday(date, label = TRUE).Q                 
## wday(date, label = TRUE).C                 
## wday(date, label = TRUE)^4                 
## lubridate::month(date, label = TRUE).L     
## lubridate::month(date, label = TRUE).Q  ***
## lubridate::month(date, label = TRUE).C  ***
## lubridate::month(date, label = TRUE)^4  ***
## lubridate::month(date, label = TRUE)^5  *  
## lubridate::month(date, label = TRUE)^6  ***
## lubridate::month(date, label = TRUE)^7  ***
## lubridate::month(date, label = TRUE)^8  ***
## lubridate::month(date, label = TRUE)^9     
## lubridate::month(date, label = TRUE)^10 .  
## lubridate::month(date, label = TRUE)^11 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.03551 on 235 degrees of freedom
## Multiple R-squared:  0.933,  Adjusted R-squared:  0.9285 
## F-statistic: 204.6 on 16 and 235 DF,  p-value: < 2.2e-16
log_gs_lm_plot

Transforming back to the original values using the expm1 function:

log_transformed_back_plot <- filtered_final_stock_tbl %>% 
  filter(name == "GS") %>%
  filter_by_time(
    .start_date = "2016-01-01",
    .end_date = "2017-01-01"
  ) %>%
  plot_time_series_regression(
    .date_var = date, 
    .formula = expm1(log1p(close)) ~ as.numeric(date) + 
      wday(date, label = TRUE) +
      lubridate::month(date, label = TRUE),
    .interactive = TRUE, 
    .show_summary = TRUE, 
    .title = "Transforming to actual close price using expm1 function"
  )
## 
## Call:
## stats::lm(formula = .formula, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -22.0393  -3.0182   0.3419   3.3421  18.1935 
## 
## Coefficients:
##                                           Estimate Std. Error t value Pr(>|t|)
## (Intercept)                             -2.804e+03  7.652e+02  -3.664 0.000306
## as.numeric(date)                         1.751e-01  4.505e-02   3.885 0.000133
## wday(date, label = TRUE).L              -3.287e-01  8.948e-01  -0.367 0.713718
## wday(date, label = TRUE).Q               3.947e-02  8.850e-01   0.045 0.964464
## wday(date, label = TRUE).C              -2.123e-01  8.755e-01  -0.242 0.808638
## wday(date, label = TRUE)^4               5.540e-01  8.681e-01   0.638 0.523987
## lubridate::month(date, label = TRUE).L  -2.077e+00  1.650e+01  -0.126 0.899952
## lubridate::month(date, label = TRUE).Q   4.869e+01  1.377e+00  35.375  < 2e-16
## lubridate::month(date, label = TRUE).C   2.412e+01  1.371e+00  17.596  < 2e-16
## lubridate::month(date, label = TRUE)^4   1.309e+01  1.370e+00   9.555  < 2e-16
## lubridate::month(date, label = TRUE)^5  -1.112e+00  1.365e+00  -0.814 0.416229
## lubridate::month(date, label = TRUE)^6   6.582e+00  1.370e+00   4.803 2.78e-06
## lubridate::month(date, label = TRUE)^7  -7.165e+00  1.355e+00  -5.289 2.82e-07
## lubridate::month(date, label = TRUE)^8  -7.312e+00  1.352e+00  -5.410 1.55e-07
## lubridate::month(date, label = TRUE)^9   1.411e+00  1.357e+00   1.040 0.299594
## lubridate::month(date, label = TRUE)^10  1.905e+00  1.349e+00   1.412 0.159213
## lubridate::month(date, label = TRUE)^11 -2.533e+00  1.353e+00  -1.872 0.062459
##                                            
## (Intercept)                             ***
## as.numeric(date)                        ***
## wday(date, label = TRUE).L                 
## wday(date, label = TRUE).Q                 
## wday(date, label = TRUE).C                 
## wday(date, label = TRUE)^4                 
## lubridate::month(date, label = TRUE).L     
## lubridate::month(date, label = TRUE).Q  ***
## lubridate::month(date, label = TRUE).C  ***
## lubridate::month(date, label = TRUE)^4  ***
## lubridate::month(date, label = TRUE)^5     
## lubridate::month(date, label = TRUE)^6  ***
## lubridate::month(date, label = TRUE)^7  ***
## lubridate::month(date, label = TRUE)^8  ***
## lubridate::month(date, label = TRUE)^9     
## lubridate::month(date, label = TRUE)^10    
## lubridate::month(date, label = TRUE)^11 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.234 on 235 degrees of freedom
## Multiple R-squared:  0.9425, Adjusted R-squared:  0.9386 
## F-statistic: 240.6 on 16 and 235 DF,  p-value: < 2.2e-16
log_transformed_back_plot

4 Splitting Time Series for Forecasting:

When we talk about implementing machine learning models, we usually talk about splitting the dataset into training and testing sets. With time series, the split must preserve temporal order: the most recent observations (here, the last three years) are held out as the test set.

# Test is 25% of our data

split_df <- filtered_final_stock_tbl %>%
  filter(name == "GS") %>%
  time_series_split(
    date_var = date, 
    assess = "3 year",
    cumulative = TRUE
  )

splitted_data <- split_df %>%
  tk_time_series_cv_plan()

splitted_data %>% 
  ggplot() + geom_line(aes(x=date, y=close, color=.key), size=1) +
  theme_calc() + scale_color_stata() +
  geom_rect(data=data.frame(xmin=as.Date(c("2015-01-02")),
                               xmax=as.Date(c("2017-12-29")),
                               ymin=-Inf,
                               ymax=Inf),
               aes(xmin=xmin,xmax=xmax,ymin=ymin,ymax=ymax),
               fill=palette_light()[2],alpha=0.3) + 
  geom_label(
    label = "Testing Set",
    vjust = 0.5,
    aes(x=as.Date(c("2016-07-02")),
        y = 100)) + 
       geom_rect(data=data.frame(xmin=as.Date(c("2006-01-03")),
                               xmax=as.Date(c("2014-12-31")),
                               ymin=-Inf,
                               ymax=Inf),
               aes(xmin=xmin,xmax=xmax,ymin=ymin,ymax=ymax),
               fill=palette_light()[3],alpha=0.3) + scale_x_date(expand = c(0,0)) + 
    geom_label(
    label = "Training Set",
    vjust = 0.5,
    aes(x=as.Date(c("2012-07-02")),
        y = 220)) + 
  labs(
    title = "Splitting Time Series into Training and Testing Sets",
    subtitle = "Dow Jones 30"
  ) + theme(plot.title = element_text(hjust=0.5), legend.position = "bottom") + 
  guides(color = guide_legend(nrow=1, byrow=TRUE)) + 
  scale_y_continuous(labels = scales::dollar)

## Autoregressive “Integrated” Moving Average (ARIMA)

Most of us have heard of the concept of an Autoregressive Integrated Moving Average (ARIMA). Before understanding ARIMA we should answer the following questions: