Libraries

- R

library(reticulate)
library(readxl)
library(dplyr)
library(seasonal)
library(ggplot2)
library(forecast)
library(tidyverse)
library(tsibble)
library(feasts)
library(lubridate)
library(tsibbledata)
library(fable)
library(zoo)
library(fst)
library(caret)
library(janitor)
library(fPortfolio)
library(patchwork)

- Python

import time
import datetime
import requests
import pandas as pd
from pandas.tseries.offsets import BDay
from bs4 import BeautifulSoup

Data

- Reading

#
# Python
# this is the only python used. 
#
def data_receive(ticker):
  period1 = int(time.mktime(datetime.datetime(2020, 1, 1, 00, 00).timetuple())) # start date

  period2_d = datetime.datetime.today() # weekday
  period2 = int(datetime.datetime.utcnow().timestamp()) # current date
  
  if datetime.date.weekday(period2_d) == 5: # Saturday               #
    period2 = period2 - datetime.timedelta(days = 1).total_seconds() # if/else statement   
  elif datetime.date.weekday(period2_d) == 6: # Sunday               # changes 'period2' to the 
    period2 = period2 - datetime.timedelta(days = 2).total_seconds() # most recent business day.
  else:                                                              #
    period2 = int(datetime.datetime.utcnow().timestamp());           #
    
  interval = '1d' # 1wk, 1d, 1m
  query_string = f'https://query1.finance.yahoo.com/v7/finance/download/{ticker}?period1={period1}&period2={period2}&interval={interval}&events=history&includeAdjustedClose=true'
  #
  # included in 'query_string' link are: "ticker," "period1," "period2," and "interval"
  # which are all specified above. 
  
  
  df = pd.read_csv(query_string)
  return df

- Prep

#
# my.prep function takes data called from python script and formats it to be 
# compatible with model. 
#
my.prep <- function(data){
  dup <- tail(data,2)
  if(dup[1,1] == dup[2,1]){
    data <- slice(data,1:n()-1)
  }
  data$Week = yearweek(as.Date(data$Date))
  data$Wk = as.Date(data$Week, format = "%Y/%U")
  data$day = as.POSIXlt(data$Date)$wday
  
  ts <- data %>%
    group_by(Wk) %>%
    summarize(Close = (sum(Close) / length(day))) %>%
    as_tsibble(index = Wk)
  
  return(ts)
}


AAPL <- py$data_receive("AAPL") # calling py function to scrape data
head(AAPL)

        Date    Open    High     Low   Close Adj Close    Volume
1 2020-01-02 74.0600 75.1500 73.7975 75.0875  73.89433 135480400
2 2020-01-03 74.2875 75.1450 74.1250 74.3575  73.17593 146322800
3 2020-01-06 73.4475 74.9900 73.1875 74.9500  73.75899 118387200
4 2020-01-07 74.9600 75.2250 74.3700 74.5975  73.41212 108872000
5 2020-01-08 74.2900 76.1100 74.2900 75.7975  74.59303 132079200
6 2020-01-09 76.8100 77.6075 76.5500 77.4075  76.17745 170108400

AAPL <- my.prep(AAPL) # calling 'my.prep' to prepare data
head(AAPL)

# A tsibble: 6 x 2 [7D]
  Wk         Close
  <date>     <dbl>
1 2019-12-30  74.7
2 2020-01-06  76.1
3 2020-01-13  78.7
4 2020-01-20  79.5
5 2020-01-27  79.2
6 2020-02-03  79.7

s.test <- ts(AAPL$Close, frequency = 53, start=2020)

- Explore

#

acf <- AAPL %>% ACF(Close) %>%
  autoplot() + labs(subtitle = "Apple closing stock price")
# ACF plot is slow to drop to 0 which is the case for non-stationary data. 

acfD <- AAPL %>% ACF(difference(Close)) %>%
  autoplot() + labs(subtitle = "Changes in Apple closing stock price")

acf + acfD

AAPL %>%
  mutate(diff_close = difference(Close)) %>%
  features(diff_close, ljung_box, lag = 10)

# A tibble: 1 x 2
  lb_stat lb_pvalue
    <dbl>     <dbl>
1    15.0     0.131

# Kwiatkowski-Phillips-Schmidt-Shin (KPSS) test
#
# Ho: the data is stationary
# 
AAPL %>%
  features(Close, unitroot_kpss)

# A tibble: 1 x 2
  kpss_stat kpss_pvalue
      <dbl>       <dbl>
1      2.27        0.01

# small p-values suggest that differencing is required. 
#
# p = 0.01, it is less than 0.01
# p = 0.1, it is greater than 0.1
#
# Here, the test statistic (2.27) is > than the 1% critical value, so 
# the p-value is less than 0.01, so we reject the null. 
#
# The data is not stationary. 

# can difference the data, seen here. 
AAPL %>%
  mutate(diff_close = difference(Close)) %>%
  features(diff_close, unitroot_kpss)

# A tibble: 1 x 2
  kpss_stat kpss_pvalue
      <dbl>       <dbl>
1    0.0365         0.1

Models

#

# Non-seasonal comparison
AAPL %>%
  stretch_tsibble(.init = 10) %>%
  model(
    ETS(Close),
    ARIMA(Close)
  ) %>%
  forecast(h = 1) %>%
  accuracy(AAPL) %>%
  select(.model, RMSE:MAPE)

# A tibble: 2 x 5
  .model        RMSE   MAE   MPE  MAPE
  <chr>        <dbl> <dbl> <dbl> <dbl>
1 ARIMA(Close)  4.61  3.51 0.300  2.96
2 ETS(Close)    4.51  3.37 0.668  2.76

# ETS better on all metrics except MPE. It is chosen as the more accurate. 



# RMSE - Root Mean Square Error
# standard deviation of the residuals. lower value - better accuracy

# MAE - Mean Absolute Error
# measure of errors between paired observations expressing the same phenomenon. 
# lower value - more accurate

# MPE - Mean Percentage Error
# computed average of percentage errors by which forecasts differ from actual values. 

# MAPE - Mean Absolute Percent Error
# the average absolute percent error for each time period minus actual values divided
# by actual values. lower value - more accurate



AAPL %>% 
  model(ETS(Close)) %>%
  forecast(h = "3 months") %>%
  autoplot(AAPL)

# Seasonal data comparisons
#
train <- AAPL %>% filter_index(. ~"2021-08-03")

# == 
fit_arima <- train %>% model(ARIMA(Close))
report(fit_arima)

Series: Close 
Model: ARIMA(0,1,1) w/ drift 

Coefficients:
         ma1  constant
      0.2043    0.8645
s.e.  0.1050    0.5099

sigma^2 estimated as 15.31:  log likelihood=-230.01
AIC=466.03   AICc=466.33   BIC=473.28

fit_arima %>% gg_tsresiduals(lag_max = 20)

augment(fit_arima) %>%
  features(.innov, ljung_box, lag = 20, dof = 6)

# A tibble: 1 x 3
  .model       lb_stat lb_pvalue
  <chr>          <dbl>     <dbl>
1 ARIMA(Close)    21.6    0.0864

# The residuals on the model exhibit slight serial-correlation. 
# Significant at the 0.1 level but not at 0.05
#

# == 
fit_ets <- train %>% model(ETS(Close))
report(fit_ets)

Series: Close 
Model: ETS(A,N,N) 
  Smoothing parameters:
    alpha = 0.9998999 

  Initial states:
     l[0]
 74.72691

  sigma^2:  16.5474

     AIC     AICc      BIC 
611.8876 612.1876 619.1800

fit_ets %>% gg_tsresiduals(lag_max = 20)

augment(fit_ets) %>%
  features(.innov, ljung_box, lag = 20, dof = 6)

# A tibble: 1 x 3
  .model     lb_stat lb_pvalue
  <chr>        <dbl>     <dbl>
1 ETS(Close)    25.4    0.0310

- Evaluation and Forecasts

# Generate forecasts and compare accuracy over the test set
bind_rows(
  fit_arima %>% accuracy(),
  fit_ets %>% accuracy(),
  fit_arima %>% forecast(h=10) %>% accuracy(AAPL),
  fit_ets %>% forecast(h=10) %>% accuracy(AAPL)
) %>% 
  select(-ME, -MPE, -ACF1)

# A tibble: 4 x 7
  .model       .type     RMSE   MAE  MAPE  MASE RMSSE
  <chr>        <chr>    <dbl> <dbl> <dbl> <dbl> <dbl>
1 ARIMA(Close) Training  3.84  2.76  2.66 0.903 0.950
2 ETS(Close)   Training  4.02  3.02  2.90 0.988 0.994
3 ARIMA(Close) Test      7.10  5.30  3.67 1.73  1.76 
4 ETS(Close)   Test      3.99  3.38  2.28 1.10  0.986

# The ARIMA model performs better on the training data, but the 
# ETS model performs better on the test data. 

AAPL %>% 
  model(ETS(Close)) %>%
  forecast(h="6 months") %>%
  autoplot(AAPL) + 
  labs(title = "Apple stock closing price (ETS)",
       y = "Price (USD)")

f1 <- forecast(
  s.test,
  method = c("ets"), 
  etsmodel = "ANN",
  h = 24
)
plot(f1)

AAPL %>% 
  model(ARIMA(Close)) %>%
  forecast(h="6 months") %>%
  autoplot(AAPL) + 
  labs(title = "Apple stock closing price (ARIMA)",
       y = "Price (USD)")

f2 <- forecast(
  s.test,
  method = c("arima"),
  h = 24
)
plot(f2)

Auto.Arima and ETS Model comparison

Silas Selfe

2022-04-06

Libraries

- R

- Python

Data

- Reading

- Prep

- Explore

Models

- Evaluation and Forecasts