# Core libraries 

library(tidyverse)
library(timetk)
library(tidyquant)

# Explorations
library(DataExplorer)

# Visualization 
library(plotly)
library(highcharter)
library(gt)
library(ggthemes)

0.1 Classifying Companies by Sector

complete_stock_tbl <- read_csv("/Users/nguyenbuiminh/Desktop/coding_python/ml/time_series/all_stocks_2006-01-01_to_2018-01-01.csv")

complete_stock_tbl %>% head(10)
## # A tibble: 10 × 7
##    Date        Open  High   Low Close  Volume Name 
##    <date>     <dbl> <dbl> <dbl> <dbl>   <dbl> <chr>
##  1 2006-01-03  77.8  79.4  77.2  79.1 3117200 MMM  
##  2 2006-01-04  79.5  79.5  78.2  78.7 2558000 MMM  
##  3 2006-01-05  78.4  78.6  77.6  78.0 2529500 MMM  
##  4 2006-01-06  78.6  78.9  77.6  78.6 2479500 MMM  
##  5 2006-01-09  78.5  79.8  78.5  79.0 1845600 MMM  
##  6 2006-01-10  79    79.0  78.1  78.5 1919900 MMM  
##  7 2006-01-11  78.4  78.7  77.8  78.4 1911900 MMM  
##  8 2006-01-12  78.2  78.2  77.2  77.7 2121100 MMM  
##  9 2006-01-13  77.0  78.2  77.0  77.5 1925300 MMM  
## 10 2006-01-17  77.1  77.6  77    77.1 2073400 MMM
glimpse(complete_stock_tbl)
## Rows: 93,612
## Columns: 7
## $ Date   <date> 2006-01-03, 2006-01-04, 2006-01-05, 2006-01-06, 2006-01-09, 20…
## $ Open   <dbl> 77.76, 79.49, 78.41, 78.64, 78.50, 79.00, 78.44, 78.20, 76.95, …
## $ High   <dbl> 79.35, 79.49, 78.65, 78.90, 79.83, 79.01, 78.66, 78.23, 78.20, …
## $ Low    <dbl> 77.24, 78.25, 77.56, 77.64, 78.46, 78.08, 77.84, 77.20, 76.95, …
## $ Close  <dbl> 79.11, 78.71, 77.99, 78.63, 79.02, 78.53, 78.37, 77.70, 77.50, …
## $ Volume <dbl> 3117200, 2558000, 2529500, 2479500, 1845600, 1919900, 1911900, …
## $ Name   <chr> "MMM", "MMM", "MMM", "MMM", "MMM", "MMM", "MMM", "MMM", "MMM", …
final_stock_tbl <- complete_stock_tbl %>%
  set_names(names(.) %>% str_to_lower()) %>%
  mutate(
    sector = case_when(
      name == "MMM" ~ "Business/Consumer Services",
      name == "AXP" ~ "Financial Services",
      name == "AAPL" ~ "Technology", 
      name == "BA" ~ "Industrial Goods",
      name == "CAT" ~ "Industrial Goods", 
      name == "CVX" ~ "Companies on the Energy Service",
      name == "CSCO" ~ "Technology",
      name == "KO" ~ "Consumer Goods", 
      name == "DIS" ~ "Media/Entertainment",
      name == "XOM" ~ "Companies on the Energy Service",
      name == "GE" ~ "Business/Consumer Services",
      name == "GS" ~ "Financial Services",
      name == "HD" ~ "Retail/Wholesale",
      name == "IBM" ~ "Business/Consumer Services",
      name == "INTC" ~ "Technology",
      name == "JNJ" ~ "Health Care/Life Sciences",
      name == "JPM" ~ "Financial Services",
      name == "MCD" ~ "Leisure/Arts/Hospitality",
      name == "MRK" ~ "Health Care/Life Sciences",
      name == "MSFT" ~ "Technology",
      name == "NKE" ~ "Consumer Goods",
      name == "PFE" ~ "Health Care/Life Sciences",
      name == "PG" ~ "Consumer Goods",
      name == "TRV" ~ "Financial Services",
      name == "UTX" ~ "Technology", 
      name == "UNH" ~ "Financial Services", 
      name == "VZ" ~ "Telecommunication Services", 
      name == "WMT" ~ "Retail/Wholesale",
      name == "GOOGL" ~ "Technology",
      name == "AMZN" ~ "Retail/Wholesale",
      TRUE ~ "Technology"
      
    )
  )
stocks_by_sector <- final_stock_tbl %>%
  distinct(sector, name) %>%
  group_by(sector) %>%
  summarise(
    count = n()
  ) %>%
  ungroup() %>%
  arrange(desc(count))

# Treemap with Tidyquant colors

treemap_by_sector <- stocks_by_sector %>%
  hctreemap2(group_vars = "sector",
             size_var = "count",
             layoutAlgorithm = "squarified") %>%
  hc_plotOptions(treemap = list(colorByPoint = TRUE)) %>%
    hc_colors(c("#2C3E50", "#E31A1C", "#18BC9C", "#CCBE93", 
                "#A6CEE3", "#1F78B4", "#B2DF8A", "#FB9A99",
                "#FDBF6F", "#FF7F00", "#6A3D9A" )) %>%
    hc_colorAxis(dataClasses = color_classes(stocks_by_sector$sector)) %>%    
    hc_legend(enabled = FALSE) %>% 
  hc_title(text = "Number of Companies by Sector") %>% 
  hc_subtitle(text = "Dow Jones Index") %>% 
    hc_tooltip(pointFormat = "<b>{point.name}</b>:<br>
             Count: {point.value:,.0f}")
  
# Return the plot
treemap_by_sector

0.2 Companies to Analyze

The following companies will be analyzed: Apple, Caterpillar, Google, Exxon, Goldman Sachs, and Walmart.

0.3 Explaining Loess

As shown in the plots below, some companies are less volatile than others, and some have a more pronounced upward trend line than others. LOESS stands for Locally Weighted Scatterplot Smoothing: think of it as a regression within a specific “window” (range of time) in which observations closest to a “focal point” are given more weight than “farther” observations. Within each window, a weighted least squares fit produces the best-fitting line.

These are the most important points for understanding how LOESS works:

Formula for assigning weights (tricube weight function), where:

  • k: the number of observations within each window
  • xi: the focal point
  • xk: a neighboring point
  • di: the distance from the focal point to the farthest (kth) neighboring point in the window

\(\large w(x_k) = \left(1 - \left|\frac{x_i - x_k}{d_i}\right|^3\right)^3\)

After that, a weighted linear regression is applied within the window to find the line that reduces the error the most. This is repeated for every point in the actual time series.
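
To make the weighting step concrete, here is a minimal sketch of the tricube function above (hypothetical values, not part of the original analysis); in practice the LOESS smoother behind plot_time_series handles this for us.

# A minimal sketch of the tricube weighting described above (hypothetical values)
tricube_weights <- function(x_focal, x_neighbors) {
  d     <- abs(x_focal - x_neighbors)   # distances from the focal point to its neighbors
  d_max <- max(d)                       # distance to the farthest neighbor in the window
  (1 - (d / d_max)^3)^3                 # tricube weights: 1 at the focal point, 0 at the edge
}

# Example: weights for a focal point at x = 5 with neighbors 1 through 10
tricube_weights(5, 1:10)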

# Function to get the average price by week, month, and quarter
# If we aggregate by day, mean(close) simply returns the actual values,
# since there is a single closing price per day

convert_date_ts <- function(data, unit = "day") {
  
  new_data <- data %>%
    mutate(date = floor_date(date, unit = unit)) %>%
    group_by(date, name) %>%
    summarise(
      close = mean(close)
    ) %>%
    ungroup() 
  
  return(new_data)
}

0.3.1 Day

# Pick by stock (6 stocks from different industries)

filtered_final_stock_tbl <- final_stock_tbl %>%
  filter(name %in% c("AAPL", "GS", "GOOGL", "XOM", "CAT", "WMT")) %>%
  mutate(
    date = ymd(date)
  )

# Daily
filtered_final_stock_tbl %>% 
  convert_date_ts() %>% 
  group_by(name) %>% 
  plot_time_series(
    .date_var = date,
    .value = close,
    .facet_ncol = 2,
    .smooth_color = "#18BC9C",
    .smooth_size = 0.5
  )

0.3.2 Week

filtered_final_stock_tbl %>% 
  convert_date_ts(unit = "week") %>% 
  group_by(name) %>% 
  plot_time_series(
    .date_var = date,
    .value = close,
    .facet_ncol = 2,
    .smooth_color = "#18BC9C",
    .smooth_size = 0.5
  )

0.3.3 Month

filtered_final_stock_tbl %>%
  convert_date_ts(unit = "month") %>%
  group_by(name) %>%
  plot_time_series(
    .date_var = date,
    .value = close,
    .facet_ncol = 2,
    .smooth_color = "#18BC9C",
    .smooth_size = 0.5
  )

0.3.4 Quarter

filtered_final_stock_tbl %>%
  convert_date_ts(unit = "quarter") %>%
  group_by(name) %>%
  plot_time_series(
    .date_var = date,
    .value = close, 
    .facet_ncol = 2, 
    .smooth_color = "#18BC9C",
    .smooth_size = 0.5
  )

0.3.5 Year

filtered_final_stock_tbl %>%
  convert_date_ts(unit = "year") %>%
  group_by(name) %>%
  plot_time_series(
    .date_var = date,
    .value = close,
    .facet_ncol = 2, 
    .smooth_color = "#18BC9C",
    .smooth_size = 0.5
  )

0.4 Determining missing values with Data Explorer

filtered_final_stock_tbl %>%
  plot_missing(
    ggtheme = theme_calc(),
    title = "Percent Missing Values by Column"
  )

0.5 Which stocks grew the most percent wise on average

Note: As expected, 2007 and 2008 show the highest volatility, due to the financial crisis that occurred during those years, driven mainly by uncertainty in the markets.

# Daily stock return by company 
# Using Tidyquant package

daily_stock_return <- filtered_final_stock_tbl %>%
  mutate(
    name = name %>% as_factor()
  ) %>%
  group_by(name) %>%
  tq_transmute(
    select = close, 
    mutate_fun = periodReturn,
    period = "daily", 
    col_rename = "stock_return"
  )

# Visualization with ggplot

daily_stock_return %>%
  ggplot(aes(x = date, y = stock_return)) + 
  geom_line(color = palette_light()[[1]]) + 
  facet_wrap(~name) + 
  theme_calc() +
  labs(
    title = "Stock Returns",
    subtitle = "by Company",
    y= "returns"
  ) +
  theme(plot.title = element_text(hjust = 0.5))

return_volatility_tbl <- function(data, unit="daily", ...){
  
  vars_col <- quos(...)
  
  tbl_return_volatility <- data %>% 
    mutate(
    name = name %>% as_factor()
  ) %>% 
  group_by(name) %>% 
  tq_transmute(
    select = close,
    mutate_fun = periodReturn,
    period = unit,
    col_rename = "stock_return"
  ) %>% 
    mutate(
    company = case_when(
      name == "AAPL" ~ "Apple",
      name == "GOOGL" ~ "Google",
      name == "XOM" ~ "Exxon",
      name == "GS" ~ "Goldman Sachs",
      name == "WMT" ~ "Walmart",
      TRUE ~ "Caterpillar"
    )
  ) %>% 
    group_by(!!!vars_col) %>% 
      summarise(
    `percent return` = mean(stock_return),
    `standard deviation` = sd(stock_return)
  ) %>% 
  ungroup() %>% 
  arrange(desc(`percent return`)) %>% gt() %>% 
  data_color(
    columns = vars(`standard deviation`),
    colors = scales::col_numeric(
      palette = c("#ffffff", "#f2fbd2", "#FA6047", "#F25555", "#FA1B1B"), 
      domain = NULL
    )) %>% 
  data_color(
    columns = vars(`percent return`),
    colors = scales::col_numeric(
      palette = c("#ffffff", "#f2fbd2", "#c9ecb4", "#93d3ab", "#35b0ab"),
      domain = NULL
    )
  ) %>% fmt_percent(
    columns = vars(`percent return`), 
    decimals = 4
  ) %>% 
  fmt_number(
    columns = vars(`standard deviation`),
    decimals = 4
  ) %>% 
    tab_header(
    title = md("**2006 - 2017 Measure of Return and Volatility**"),
    subtitle = "An overview of Risk and Return"
  )
  
  
  return(tbl_return_volatility)
    
}

0.5.1 Daily

filtered_final_stock_tbl %>%
  return_volatility_tbl(unit = "daily", company, name)

**2006 - 2017 Measure of Return and Volatility**
An overview of Risk and Return

| company       | name  | percent return | standard deviation |
|---------------|-------|----------------|--------------------|
| Apple         | AAPL  | 0.1124%        | 0.0204             |
| Google        | GOOGL | 0.0691%        | 0.0184             |
| Caterpillar   | CAT   | 0.0535%        | 0.0201             |
| Goldman Sachs | GS    | 0.0509%        | 0.0239             |
| Walmart       | WMT   | 0.0326%        | 0.0122             |
| Exxon         | XOM   | 0.0233%        | 0.0151             |

0.5.2 Weekly

filtered_final_stock_tbl %>%
  return_volatility_tbl(unit = "weekly", company, name)

**2006 - 2017 Measure of Return and Volatility**
An overview of Risk and Return

| company       | name  | percent return | standard deviation |
|---------------|-------|----------------|--------------------|
| Apple         | AAPL  | 0.5444%        | 0.0451             |
| Google        | GOOGL | 0.3329%        | 0.0404             |
| Caterpillar   | CAT   | 0.2638%        | 0.0455             |
| Goldman Sachs | GS    | 0.2426%        | 0.0525             |
| Walmart       | WMT   | 0.1545%        | 0.0257             |
| Exxon         | XOM   | 0.0960%        | 0.0277             |

0.5.3 Monthly

filtered_final_stock_tbl %>%
  return_volatility_tbl(unit = "monthly", company, name)

**2006 - 2017 Measure of Return and Volatility**
An overview of Risk and Return

| company       | name  | percent return | standard deviation |
|---------------|-------|----------------|--------------------|
| Apple         | AAPL  | 2.3579%        | 0.0907             |
| Google        | GOOGL | 1.4307%        | 0.0824             |
| Caterpillar   | CAT   | 1.1438%        | 0.0935             |
| Goldman Sachs | GS    | 0.8721%        | 0.0892             |
| Walmart       | WMT   | 0.6391%        | 0.0473             |
| Exxon         | XOM   | 0.3549%        | 0.0462             |

0.5.4 Yearly

filtered_final_stock_tbl %>%
  return_volatility_tbl(unit = "yearly", company, name)

**2006 - 2017 Measure of Return and Volatility**
An overview of Risk and Return

| company       | name  | percent return | standard deviation |
|---------------|-------|----------------|--------------------|
| Apple         | AAPL  | 36.8065%       | 0.5619             |
| Google        | GOOGL | 20.8735%       | 0.4000             |
| Goldman Sachs | GS    | 14.7381%       | 0.4343             |
| Caterpillar   | CAT   | 13.0254%       | 0.3249             |
| Walmart       | WMT   | 7.7882%        | 0.1674             |
| Exxon         | XOM   | 4.1885%        | 0.1624             |

0.6 Exploring Seasonality and Pattern

To better understand seasonality patterns, it is critical to see how the target variable behaves at certain periods of time. In this case we will analyze how the closing price of a given company behaves across different days of the week, months, quarters, and years. This can help us understand whether other external factors influence the price of the stock.

Summary:

  • Different periods in time: We will analyze the distribution across weekdays, weeks, months, quarters, and years.
  • Exxon (XOM): For this case we will look at how the Exxon stock behaves, examining the distribution of the closing price at different points in time.
  • Weekdays: During weekdays we don't see an obvious difference in the distribution of the closing price.
  • Months: June and December tend to have a slightly higher closing price.
  • Quarters: The second and fourth quarters have a slightly higher median closing price.
  • Year: 2010 had the worst distribution of the closing price for the company (probably an external factor impacted the sector), while 2014 was the year in which the distribution of the closing price was the highest.

# Let's pick Exxon for seasonality 

filtered_final_stock_tbl %>%
  filter(name == "XOM") %>%
  plot_seasonal_diagnostics(
    .date_var = date,
    .value = close, 
    .interactive = FALSE
  )

0.6.1 Non-Interactive STL

# Function to create seasonal decomposition with loess for the stocks

plot_seasonal_decomposition <- function(data, interactive=FALSE){
  
  stl_plot <- data %>% 
    group_by(name) %>% 
    plot_stl_diagnostics(
      .date_var = date,
      .value = log1p(close),
      .interactive = interactive
    )
  
  return(stl_plot)
}


# Visualization Not interactive
filtered_final_stock_tbl %>% 
  filter(name %in% c("AAPL", "XOM", "GS")) %>% 
  group_by(name) %>% 
  plot_seasonal_decomposition()

0.6.2 Interactive STL

filtered_final_stock_tbl %>%
  filter(name %in% c("AAPL", "XOM", "GS")) %>%
  group_by(name) %>%
  plot_seasonal_decomposition(interactive = TRUE)

0.7 Outlier Detection

Anomaly detection is mainly used to find events that were not expected in our time series. It can also be that a mistake in the data caused an outlier. Detecting outliers gives us the possibility of investigating further whether unexpected events actually drove the time series to behave in an anomalous manner.

filtered_final_stock_tbl %>%
  group_by(name) %>%
  plot_anomaly_diagnostics(
    .date_var = date,
    .value = close,
    .facet_ncol =2,
    .interactive = FALSE,
    .title = "Anomaly Diagnostics Dow Jones",
    .anom_color = "#FB3029",
    .max_anomalies = 0.07,
    .alpha = 0.05
  )

0.7.1 Investigate Anomaly of XOM (Exxon)

As we have seen previously, Exxon belongs to the energy industry, more specifically the oil industry. The question we will answer is: what events occurred that caused a large drop in the stock price? Below we look at a time series of the West Texas Intermediate (WTI) to see whether it behaves similarly to the XOM stock. For those who don't know, WTI is a benchmark for oil pricing in the markets.

Summary:

  • Outlier #1: Caused by economic growth and demand in the oil industry, resulting in profit gains
  • Outlier #2: Caused by the 2007-2008 financial crisis
  • Outlier #3: Caused by lower-than-expected results in the market (a loss in profit) around 2010
filtered_final_stock_tbl %>%
  filter(name == "XOM") %>%
  plot_anomaly_diagnostics(
    .date_var = date,
    .value = close,
    .interactive = TRUE,
    .title = "Anomaly Diagnostics Exxon (XOM)",
    .anom_color = "#FB3029",
    .max_anomalies = 0.05,
    .alpha = 0.05
  )
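
As a minimal sketch (not part of the original output), the WTI series mentioned above could be pulled with tidyquant from FRED, assuming the series ID DCOILWTICO, and plotted with timetk:

# Sketch: pull the WTI spot price from FRED with tidyquant (series ID is an assumption)
wti_tbl <- tq_get(
  "DCOILWTICO",                  # FRED: Crude Oil Prices, West Texas Intermediate (Cushing)
  get  = "economic.data",
  from = "2006-01-01",
  to   = "2018-01-01"
)

wti_tbl %>%
  plot_time_series(
    .date_var = date,
    .value    = price,           # tq_get() returns the FRED series in the `price` column
    .smooth   = FALSE,
    .title    = "West Texas Intermediate (WTI) Spot Price"
  )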

1 Statistical Methods

Definition Lists:

  • Autocorrelation (ACF): the correlation of a series with its own lagged values.
  • Partial autocorrelation (PACF): the correlation of a series with a given lag after the effects of shorter lags have been removed.
  • Cross correlation (CCF): the correlation between one series and the lagged values of another series.

1.0.1 Autocorrelation (ACF)

filtered_final_stock_tbl %>%
  group_by(name) %>%
  summarise_by_time(
    .date_var = date,
    .value = mean(close, na.rm = TRUE),
    .by = "month"
  ) %>%
  tk_acf_diagnostics(
    .date_var = date,
    .value = log1p(.value)
  ) %>%
  ggplot(aes(x=lag, y=ACF, color=name, group=name)) + 
  # Add horizontal line at y = 0 
  geom_hline(yintercept = 0) + 
  # Plot autocorrelation 
  geom_point(size=2) + 
  geom_segment(aes(xend=lag, yend =0), size =1) + 
  # Add cutoffs 
  geom_line(aes(y = .white_noise_upper), color = "black", linetype = 2) + 
  geom_line(aes(y =.white_noise_lower), color = "black", linetype =2 ) +
  # Add facets
  facet_wrap(~ name, ncol = 3) +
  # Aesthetics 
  expand_limits(y = c(-1,1)) + 
  scale_color_stata() +
  theme_calc() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.5)
  ) + 
  labs(
    title = "AutoCorrelation (ACF)",
    subtitle = "Dow Jones 30 Index"
  )

1.0.2 Partial Auto Correlation (PACF)

filtered_final_stock_tbl %>%
    group_by(name) %>%
  summarise_by_time(
    .date_var = date,
    .value = mean(close, na.rm = TRUE),
    .by="month"
  ) %>% 
  tk_acf_diagnostics(
    .date_var = date,
    .value = log1p(.value)
  ) %>% 
      ggplot(aes(x = lag, y = PACF, color = name, group = name)) +
    # Add horizontal line at y = 0
    geom_hline(yintercept = 0) +
    # Plot autocorrelations
    geom_point(size = 2) +
    geom_segment(aes(xend = lag, yend = 0), size = 1) +
    # Add cutoffs
    geom_line(aes(y = .white_noise_upper), color = "black", linetype = 2) +
    geom_line(aes(y = .white_noise_lower), color = "black", linetype = 2) +
    # Add facets
    facet_wrap(~ name, ncol = 3) +
    # Aesthetics
    expand_limits(y = c(-1, 1)) +
    scale_color_stata() +
    theme_calc() +
    theme(
        legend.position = "none",
        axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5)
    ) + labs(
      title = "Partial AutoCorrelation (PACF)",
      subtitle = "Dow Jones 30 Index"
    )

1.0.3 Cross Correlation (CCF)

Cross correlation (CCF) measures the relationship between one time series and the lagged values of another. Here we compare the monthly average closing prices of two technology stocks, AAPL and GOOGL.

tech_stocks_tbl <- filtered_final_stock_tbl %>%
  filter(sector == "Technology") %>%
  select(date, name, close) %>%
  pivot_wider(names_from = name, values_from = close) %>%
  summarise_by_time(
    .date_var = date,
    .by = "month",
    across(AAPL:GOOGL, .fns=mean)
  )

tech_stocks_tbl %>%
  tk_acf_diagnostics(
    date, 
    AAPL,
    .ccf_vars = GOOGL
  )  %>% 
  select(
    lag, CCF_GOOGL
  ) %>%
  slice(1:10) %>%
  gt() %>%
  data_color(
    columns = vars(CCF_GOOGL) , 
    colors = scales::col_numeric(
      palette = c("#ffffff", "#f2fbd2", "#FA6047", "#F25555", "#FA1B1B"), 
      domain = NULL
    ))

| lag | CCF_GOOGL |
|-----|-----------|
| 0   | 0.9137803 |
| 1   | 0.8816792 |
| 2   | 0.8501216 |
| 3   | 0.8231274 |
| 4   | 0.8001255 |
| 5   | 0.7770242 |
| 6   | 0.7513068 |
| 7   | 0.7243054 |
| 8   | 0.6970645 |
| 9   | 0.6747878 |

Summary:

  • CCF on AAPL close price: The closing price of AAPL is the target variable and the closing price of GOOGL is the independent (lagged) variable.
  • High correlation at lag 0: When the closing price of AAPL is high or increasing, the closing price of GOOGL tends to be high on that same day (lag 0).
#Plot

tech_stocks_tbl %>%
  plot_acf_diagnostics(
    date,
    AAPL,
    .ccf_vars = GOOGL,
    .show_ccf_vars_only = TRUE,
    .interactive = TRUE,
    .line_color = "black",
    .point_color = palette_light()[[2]],
    .line_size = 1.5,
    .title = "Cross Correlation of Technology Stocks"
  )

1.1 Simple Moving Averages

Moving averages help us smooth the time series and better understand where the trend is heading. Believe it or not, moving averages are used a lot for forecasting, although they are not the most optimal technique for it, especially since they do not handle the impact of outliers very well. Moreover, moving averages also help us detect seasonality in our time series.

Most common moving averages in the stock market:

  • 15-day MA
  • 30-day MA
  • 100-day MA
  • 200-day MA

Note: Notice how the longer the moving-average window, the smoother the line. Usually we use moving averages to detect trends in our time series data.

gs_ma_sample <- filtered_final_stock_tbl %>%
  select(date, close, name) %>%
  filter(name == "GS") %>%
  filter_by_time(.date_var = date, .start_date = "2015", .end_date = "2017") %>%
  mutate(
    adjusted_15_mm_close = slidify_vec(
      .x = close,
      .period = 15,
      .f = mean, 
      na.rm = TRUE,
      .align = "center",
      .partial = TRUE
    ),
    adjusted_30_mm_close = slidify_vec(
      .x = close,
      .period = 30,
      .f = mean, 
      na.rm = TRUE,
      .align = "center",
      .partial = TRUE
    )
  ) %>%
  pivot_longer(
    contains("close"), names_repair = "unique") %>%
  rename(name = name...2, 
         metric = name...3
  )
    

gs_ma_sample %>%
  ggplot(aes(x=date, y=value)) + 
  geom_line(aes(color = metric), size = 1) +
  theme_calc() +
  theme(legend.position = "bottom", plot.title = element_text(hjust = 0.5)) + 
  guides(color = guide_legend(nrow = 2, byrow = TRUE)) + 
  labs(
    title = "Moving Average (MA) as a Metric",
    subtitle = str_glue("GS Stock 2015-2017"),
    y = "Closing Price") +
  scale_color_stata() +
  scale_x_date(expand = c(0,0))

Note: In this case I used a “rolling” median since I want to avoid the effect of outliers; moving averages are more prone to being affected by outliers. Nevertheless, you can use a moving average instead by changing the function from median to mean.

mn_final_tbl <- filtered_final_stock_tbl %>%
  select(date, close, name) %>%
  mutate(
    adjusted_15_mm_close = slidify_vec(
      .x = close, 
      .period = 15, 
      .f = median,
      na.rm = TRUE,
      .align = "center",
      .partial = TRUE), 
    adjusted_30_mm_close = slidify_vec(
      .x = close, 
      .period = 30, 
      .f = median,
      na.rm = TRUE,
      .align = "center",
      .partial = TRUE
    ),
    adjusted_100_mm_close = slidify_vec(
      .x = close,
      .period = 100, 
      .f = median,
      na.rm = TRUE,
      .align = "center",
      .partial = TRUE),
    adjusted_300_mm_close = slidify_vec(
      .x = close, 
      .period = 300,
      .f = median, 
      na.rm = TRUE,
      .align = "center",
      .partial = TRUE
    )
    ) %>%
  pivot_longer(contains("close"), names_repair = "unique") %>%
  rename( 
    name = name...2,
    metric = name...3
    )

mn_final_tbl
## # A tibble: 90,590 × 4
##    date       name  metric                value
##    <date>     <chr> <chr>                 <dbl>
##  1 2006-01-03 AAPL  close                 10.7 
##  2 2006-01-03 AAPL  adjusted_15_mm_close  10.9 
##  3 2006-01-03 AAPL  adjusted_30_mm_close  11   
##  4 2006-01-03 AAPL  adjusted_100_mm_close 10.2 
##  5 2006-01-03 AAPL  adjusted_300_mm_close  9.38
##  6 2006-01-04 AAPL  close                 10.7 
##  7 2006-01-04 AAPL  adjusted_15_mm_close  10.9 
##  8 2006-01-04 AAPL  adjusted_30_mm_close  10.9 
##  9 2006-01-04 AAPL  adjusted_100_mm_close 10.2 
## 10 2006-01-04 AAPL  adjusted_300_mm_close  9.38
## # ℹ 90,580 more rows

1.1.1 Visualization

mn_final_tbl %>%
  ggplot(aes(x=date, y =value, color = metric)) +
  geom_line(size=1, alpha = 0.8) + 
  facet_wrap(~name, scales = "free_y") + 
  theme_calc() + 
  scale_color_stata() +
  labs(
    title = "Moving Medians",
    subtitle = "Dow Jones 30",
    y = "price"
  ) + 
  theme(legend.position = "bottom", plot.title = element_text(hjust = 0.5), 
        legend.text = element_text(size = 10)) +
  guides(color = guide_legend(nrow = 2, byrow = TRUE))

1.1.2 Moving Average in Forecasting

The goal of this section is to showcase the scenarios in which moving averages can be useful for forecasting and those in which they are not.

Summary:
  • Easy to understand: Moving averages are easy to understand, so few technicalities are needed to grasp the concept.
  • Expected volatility: When volatility is expected, you run the risk of large residuals if you forecast with moving averages.
  • Low volatility: Moving averages are useful during periods of low volatility, as they are able to capture the short-term trend.
gs_filtered_data <- filtered_final_stock_tbl %>%
  filter(name == "GS") %>%
  filter_by_time(
    .start_date = "2009", 
    .end_date = "2012"
  ) %>%
  select(date, name, close)

ma_60day_forecasting <- gs_filtered_data %>% 
  mutate(
    mavg_60_days = slidify_vec(close, .f = ~ mean(.x, na.rm = TRUE), .period = 60, .align = "right")) %>% 
  bind_rows(
    future_frame(., .length_out = 60)
  ) %>% 
  # fill(): carry the last moving-average value forward into the future frame
  fill(
    mavg_60_days,
    .direction = "down"
  )

ma_60day_forecasting
## # A tibble: 1,066 × 4
##    date       name  close mavg_60_days
##    <date>     <chr> <dbl>        <dbl>
##  1 2009-01-02 GS     86.8           NA
##  2 2009-01-05 GS     88.8           NA
##  3 2009-01-06 GS     88.7           NA
##  4 2009-01-07 GS     84.5           NA
##  5 2009-01-08 GS     85.4           NA
##  6 2009-01-09 GS     83.9           NA
##  7 2009-01-12 GS     77.7           NA
##  8 2009-01-13 GS     77.9           NA
##  9 2009-01-14 GS     75.7           NA
## 10 2009-01-15 GS     73.8           NA
## # ℹ 1,056 more rows

2 Forecasting plot

ma_60day_forecasting %>% pivot_longer(close:mavg_60_days,names_repair="unique") %>%
  rename(
    "type" = "name...3"
  ) %>% 
  ggplot(aes(x=date, y=value)) + geom_line(aes(color=type), size=1) +
  theme_calc() + scale_color_stata() +
  labs(
    title = "60 day Moving Average",
    subtitle = "MA as a Forecasting Technique"
  ) + theme(plot.title = element_text(hjust=0.5))

3 Residuals of Forecasting with Moving Average

In this scenario we compare the 60-day moving average, carried forward as a forecast, against the actual closing prices for the first two months of 2013 to see how large the residuals are.

actual_60days <- filtered_final_stock_tbl %>% 
  filter(name == "GS") %>% 
  filter_by_time(
    .start_date = "2013",
    .end_date = "2013-03-01"
  ) %>% 
  select(date, name, close)



plot_series <- ma_60day_forecasting %>% 
  filter_by_time(
    .start_date = "2013-01-02", .end_date = "2013-03-01"
  ) %>% 
  left_join(actual_60days, by="date") %>% 
  select(date, mavg_60_days, close.y) %>% 
  rename(
    "actual_close_price" = "close.y"
  ) %>% 
  pivot_longer(mavg_60_days:actual_close_price) %>% 
  # Remove weekends from moving average
  drop_na() %>% 
  ggplot(aes(x=date, value)) + geom_line(aes(color=name), size=1) + geom_point(aes(color=name), size=2) + theme_calc() + scale_color_stata() + 
  labs(
    title = "Residuals from 60 day Moving average", 
    subtitle = "GS stock Moving Average"
  ) + theme(legend.position = "bottom") + guides(color = guide_legend(nrow=1, byrow=TRUE))


plot_series

## Rolling Correlation

Another way of finding a relationship between two time series is through rolling correlations: we compute the correlation between the two series over a moving window at different points in time and determine whether the relationship between the two is significant, as sketched below.
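
As a minimal sketch of the idea (reusing the monthly tech_stocks_tbl built in the cross-correlation section; the 6-month window is an arbitrary choice):

# Build a rolling 6-month correlation function with timetk::slidify()
rolling_cor_6m <- slidify(
  ~ cor(.x, .y),        # correlation over each window
  .period  = 6,         # 6-month window (arbitrary choice)
  .align   = "right",
  .partial = FALSE
)

tech_stocks_tbl %>%
  mutate(rolling_cor = rolling_cor_6m(AAPL, GOOGL)) %>%
  filter(!is.na(rolling_cor)) %>%
  plot_time_series(
    .date_var = date,
    .value    = rolling_cor,
    .smooth   = FALSE,
    .title    = "6-Month Rolling Correlation: AAPL vs GOOGL"
  )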



3.1 Linear Regression and the use of Log Transformations

The main reason we use log transformations in time series analysis is to reduce the impact of extreme outliers, thereby stabilizing the variance across the time series. Log transformations are especially helpful in models like linear regression, where an outlier can have a significant impact on the squared error. Although in this case we don't have “significant” outliers, log transformations can still help mitigate their impact.

Log Transformation Formula:

\(\large y = \log_b(x)\)
\(\large b^y = x\)

Summary:

  • Log1p function: log1p(x) is the same as log(x + 1). We use it to avoid problems with values equal to zero (not an issue in this data).
  • Feature importance: Notice how most of the month terms are statistically significant and improve the adjusted R-squared, meaning the month has a significant impact on where the closing price of a stock is heading. This leads to the question: is there some sort of seasonality?
  • Getting from log values back to actual values: To transform back to the actual values we use the expm1 function, which gives us the original values (see the quick check below). We definitely want to know this whenever we are forecasting!
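
As a quick sanity check of that round trip (illustrative numbers, not taken from the data):

x     <- c(85.5, 150.25, 243.5)   # hypothetical closing prices
log_x <- log1p(x)                 # same as log(x + 1)
all.equal(expm1(log_x), x)        # TRUE: expm1() exactly inverts log1p()
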
# We will pick GS as it is one of the ones with the highest volatility

# GS linear regression model using log transformations
log_gs_lm_plot <- filtered_final_stock_tbl %>% 
  filter(name == "GS") %>% 
  filter_by_time(
    .start_date = "2016-01-01",
    .end_date = "2017-01-01"
  ) %>% 
  plot_time_series_regression(
    .date_var = date,
    .formula = log1p(close) ~ as.numeric(date) + 
      wday(date, label = TRUE) + 
      lubridate::month(date, label = TRUE),
    .interactive=FALSE,
    .show_summary = TRUE,
    .title = "Log transformations in Linear Regression Models"
  )
## 
## Call:
## stats::lm(formula = .formula, data = df)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.114757 -0.018312  0.001485  0.020826  0.105742 
## 
## Coefficients:
##                                           Estimate Std. Error t value Pr(>|t|)
## (Intercept)                             -9.3685671  4.3585702  -2.149  0.03262
## as.numeric(date)                         0.0008535  0.0002566   3.326  0.00102
## wday(date, label = TRUE).L              -0.0018868  0.0050972  -0.370  0.71159
## wday(date, label = TRUE).Q               0.0004393  0.0050410   0.087  0.93063
## wday(date, label = TRUE).C              -0.0015788  0.0049871  -0.317  0.75184
## wday(date, label = TRUE)^4               0.0031286  0.0049447   0.633  0.52753
## lubridate::month(date, label = TRUE).L   0.0212167  0.0939807   0.226  0.82159
## lubridate::month(date, label = TRUE).Q   0.2516840  0.0078410  32.098  < 2e-16
## lubridate::month(date, label = TRUE).C   0.1107287  0.0078093  14.179  < 2e-16
## lubridate::month(date, label = TRUE)^4   0.0581218  0.0078034   7.448 1.79e-12
## lubridate::month(date, label = TRUE)^5  -0.0174280  0.0077749  -2.242  0.02592
## lubridate::month(date, label = TRUE)^6   0.0410392  0.0078056   5.258 3.28e-07
## lubridate::month(date, label = TRUE)^7  -0.0437489  0.0077170  -5.669 4.19e-08
## lubridate::month(date, label = TRUE)^8  -0.0437221  0.0076994  -5.679 3.99e-08
## lubridate::month(date, label = TRUE)^9   0.0105762  0.0077319   1.368  0.17266
## lubridate::month(date, label = TRUE)^10  0.0131328  0.0076836   1.709  0.08874
## lubridate::month(date, label = TRUE)^11 -0.0166546  0.0077086  -2.161  0.03174
##                                            
## (Intercept)                             *  
## as.numeric(date)                        ** 
## wday(date, label = TRUE).L                 
## wday(date, label = TRUE).Q                 
## wday(date, label = TRUE).C                 
## wday(date, label = TRUE)^4                 
## lubridate::month(date, label = TRUE).L     
## lubridate::month(date, label = TRUE).Q  ***
## lubridate::month(date, label = TRUE).C  ***
## lubridate::month(date, label = TRUE)^4  ***
## lubridate::month(date, label = TRUE)^5  *  
## lubridate::month(date, label = TRUE)^6  ***
## lubridate::month(date, label = TRUE)^7  ***
## lubridate::month(date, label = TRUE)^8  ***
## lubridate::month(date, label = TRUE)^9     
## lubridate::month(date, label = TRUE)^10 .  
## lubridate::month(date, label = TRUE)^11 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.03551 on 235 degrees of freedom
## Multiple R-squared:  0.933,  Adjusted R-squared:  0.9285 
## F-statistic: 204.6 on 16 and 235 DF,  p-value: < 2.2e-16
log_gs_lm_plot

Transforming back to the original values using the expm1 function:

log_transformed_back_plot <- filtered_final_stock_tbl %>% 
  filter(name == "GS") %>%
  filter_by_time(
    .start_date = "2016-01-01",
    .end_date = "2017-01-01"
  ) %>%
  plot_time_series_regression(
    .date_var = date, 
    .formula = expm1(log1p(close)) ~ as.numeric(date) + 
      wday(date, label = TRUE) +
      lubridate::month(date, label = TRUE),
    .interactive = TRUE, 
    .show_summary = TRUE, 
    .title = "Transforming to actual close price using expm1 function"
  )
## 
## Call:
## stats::lm(formula = .formula, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -22.0393  -3.0182   0.3419   3.3421  18.1935 
## 
## Coefficients:
##                                           Estimate Std. Error t value Pr(>|t|)
## (Intercept)                             -2.804e+03  7.652e+02  -3.664 0.000306
## as.numeric(date)                         1.751e-01  4.505e-02   3.885 0.000133
## wday(date, label = TRUE).L              -3.287e-01  8.948e-01  -0.367 0.713718
## wday(date, label = TRUE).Q               3.947e-02  8.850e-01   0.045 0.964464
## wday(date, label = TRUE).C              -2.123e-01  8.755e-01  -0.242 0.808638
## wday(date, label = TRUE)^4               5.540e-01  8.681e-01   0.638 0.523987
## lubridate::month(date, label = TRUE).L  -2.077e+00  1.650e+01  -0.126 0.899952
## lubridate::month(date, label = TRUE).Q   4.869e+01  1.377e+00  35.375  < 2e-16
## lubridate::month(date, label = TRUE).C   2.412e+01  1.371e+00  17.596  < 2e-16
## lubridate::month(date, label = TRUE)^4   1.309e+01  1.370e+00   9.555  < 2e-16
## lubridate::month(date, label = TRUE)^5  -1.112e+00  1.365e+00  -0.814 0.416229
## lubridate::month(date, label = TRUE)^6   6.582e+00  1.370e+00   4.803 2.78e-06
## lubridate::month(date, label = TRUE)^7  -7.165e+00  1.355e+00  -5.289 2.82e-07
## lubridate::month(date, label = TRUE)^8  -7.312e+00  1.352e+00  -5.410 1.55e-07
## lubridate::month(date, label = TRUE)^9   1.411e+00  1.357e+00   1.040 0.299594
## lubridate::month(date, label = TRUE)^10  1.905e+00  1.349e+00   1.412 0.159213
## lubridate::month(date, label = TRUE)^11 -2.533e+00  1.353e+00  -1.872 0.062459
##                                            
## (Intercept)                             ***
## as.numeric(date)                        ***
## wday(date, label = TRUE).L                 
## wday(date, label = TRUE).Q                 
## wday(date, label = TRUE).C                 
## wday(date, label = TRUE)^4                 
## lubridate::month(date, label = TRUE).L     
## lubridate::month(date, label = TRUE).Q  ***
## lubridate::month(date, label = TRUE).C  ***
## lubridate::month(date, label = TRUE)^4  ***
## lubridate::month(date, label = TRUE)^5     
## lubridate::month(date, label = TRUE)^6  ***
## lubridate::month(date, label = TRUE)^7  ***
## lubridate::month(date, label = TRUE)^8  ***
## lubridate::month(date, label = TRUE)^9     
## lubridate::month(date, label = TRUE)^10    
## lubridate::month(date, label = TRUE)^11 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.234 on 235 degrees of freedom
## Multiple R-squared:  0.9425, Adjusted R-squared:  0.9386 
## F-statistic: 240.6 on 16 and 235 DF,  p-value: < 2.2e-16
log_transformed_back_plot

4 Splitting Time Series for Forecasting:

When we talk about implementing machine learning models, we usually talk about splitting the dataset into training and testing sets. With time series, the split must preserve temporal order: the most recent observations (here, the last three years) are held out as the test set.

# Test is 25% of our data

split_df <- filtered_final_stock_tbl %>%
  filter(name == "GS") %>%
  time_series_split(
    date_var = date, 
    assess = "3 year",
    cumulative = TRUE
  )

splitted_data <- split_df %>%
  tk_time_series_cv_plan()

splitted_data %>% 
  ggplot() + geom_line(aes(x=date, y=close, color=.key), size=1) +
  theme_calc() + scale_color_stata() +
  geom_rect(data=data.frame(xmin=as.Date(c("2015-01-02")),
                               xmax=as.Date(c("2017-12-29")),
                               ymin=-Inf,
                               ymax=Inf),
               aes(xmin=xmin,xmax=xmax,ymin=ymin,ymax=ymax),
               fill=palette_light()[2],alpha=0.3) + 
  geom_label(
    label = "Testing Set",
    vjust = 0.5,
    aes(x=as.Date(c("2016-07-02")),
        y = 100)) + 
       geom_rect(data=data.frame(xmin=as.Date(c("2006-01-03")),
                               xmax=as.Date(c("2014-12-31")),
                               ymin=-Inf,
                               ymax=Inf),
               aes(xmin=xmin,xmax=xmax,ymin=ymin,ymax=ymax),
               fill=palette_light()[3],alpha=0.3) + scale_x_date(expand = c(0,0)) + 
    geom_label(
    label = "Training Set",
    vjust = 0.5,
    aes(x=as.Date(c("2012-07-02")),
        y = 220)) + 
  labs(
    title = "Splitting Time Series into Training and Testing Sets",
    subtitle = "Dow Jones 30"
  ) + theme(plot.title = element_text(hjust=0.5), legend.position = "bottom") + 
  guides(color = guide_legend(nrow=1, byrow=TRUE)) + 
  scale_y_continuous(labels = scales::dollar)

## Autoregressive “Integrated” Moving Average (ARIMA)

Most of us have heard of the concept of an Autoregressive Integrated Moving Average (ARIMA). Before understanding ARIMA we should answer the following questions: