library(reticulate)
library(readxl)
library(dplyr)
library(seasonal)
library(ggplot2)
library(forecast)
library(tidyverse)
library(tsibble)
library(feasts)
library(lubridate)
library(tsibbledata)
library(fable)
library(zoo)
library(fst)
library(caret)
library(janitor)
library(fPortfolio)
library(patchwork)
# Python imports for the data_receive() helper defined below
import time
import datetime
import requests
import pandas as pd
from pandas.tseries.offsets import BDay
from bs4 import BeautifulSoup
#
# Python
# data_receive() downloads the daily price history for `ticker` from Yahoo
# Finance; this is the only Python used.
#
def data_receive(ticker):
    period1 = int(time.mktime(datetime.datetime(2020, 1, 1, 0, 0).timetuple()))  # start date: 2020-01-01
    period2_d = datetime.datetime.today()                                         # today, used to check the weekday
    period2 = int(datetime.datetime.utcnow().timestamp())                         # end date: now
    # Roll 'period2' back to the most recent business day when today is a weekend.
    if period2_d.weekday() == 5:    # Saturday
        period2 = period2 - int(datetime.timedelta(days=1).total_seconds())
    elif period2_d.weekday() == 6:  # Sunday
        period2 = period2 - int(datetime.timedelta(days=2).total_seconds())
    interval = '1d'  # daily bars ('1wk' and '1mo' are also accepted)
    # 'ticker', 'period1', 'period2', and 'interval' are interpolated into the download URL.
    query_string = f'https://query1.finance.yahoo.com/v7/finance/download/{ticker}?period1={period1}&period2={period2}&interval={interval}&events=history&includeAdjustedClose=true'
    df = pd.read_csv(query_string)
    return df
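#
# For comparison, a minimal pure-R sketch of the same download (same Yahoo
# Finance endpoint as the Python function above; the helper name yahoo_csv()
# and its defaults are made up for this illustration, not part of the original
# workflow):
yahoo_csv <- function(ticker,
                      from = as.Date("2020-01-01"),
                      to   = Sys.Date()) {
  p1  <- as.integer(as.POSIXct(from, tz = "UTC"))   # start date as Unix time
  p2  <- as.integer(as.POSIXct(to,   tz = "UTC"))   # end date as Unix time
  url <- paste0("https://query1.finance.yahoo.com/v7/finance/download/", ticker,
                "?period1=", p1, "&period2=", p2,
                "&interval=1d&events=history&includeAdjustedClose=true")
  read.csv(url)                                     # same columns as data_receive()
}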
#
# my.prep() takes the daily data returned by the Python function and aggregates
# it to a weekly tsibble of mean closing prices, compatible with the models below.
#
my.prep <- function(data){
  # drop the last row if Yahoo returned the current day twice
  dup <- tail(data, 2)
  if(dup[1, 1] == dup[2, 1]){
    data <- slice(data, 1:(n() - 1))
  }
  data$Week <- yearweek(as.Date(data$Date))   # week of each observation (weeks start Monday by default)
  data$Wk   <- as.Date(data$Week)             # date of the first day of that week
  data$day  <- as.POSIXlt(data$Date)$wday     # weekday (0 = Sunday)
  ts <- data %>%
    group_by(Wk) %>%
    summarize(Close = sum(Close) / length(day)) %>%   # mean close over the trading days in each week
    as_tsibble(index = Wk)
  return(ts)
}
AAPL <- py$data_receive("AAPL") # reticulate exposes the Python function to R through the py object
head(AAPL)
        Date    Open    High     Low   Close Adj Close    Volume
1 2020-01-02 74.0600 75.1500 73.7975 75.0875  73.89433 135480400
2 2020-01-03 74.2875 75.1450 74.1250 74.3575  73.17593 146322800
3 2020-01-06 73.4475 74.9900 73.1875 74.9500  73.75899 118387200
4 2020-01-07 74.9600 75.2250 74.3700 74.5975  73.41212 108872000
5 2020-01-08 74.2900 76.1100 74.2900 75.7975  74.59303 132079200
6 2020-01-09 76.8100 77.6075 76.5500 77.4075  76.17745 170108400
AAPL <- my.prep(AAPL) # calling 'my.prep' to prepare data
head(AAPL)
# A tsibble: 6 x 2 [7D]
  Wk         Close
  <date>     <dbl>
1 2019-12-30  74.7
2 2020-01-06  76.1
3 2020-01-13  78.7
4 2020-01-20  79.5
5 2020-01-27  79.2
6 2020-02-03  79.7
s.test <- ts(AAPL$Close, frequency = 53, start = 2020) # base-R ts (53 weeks per year) used with the forecast package at the end
#
acf <- AAPL %>% ACF(Close) %>%
  autoplot() + labs(subtitle = "Apple closing stock price")
# The ACF decays very slowly toward zero, which is characteristic of
# non-stationary data.
acfD <- AAPL %>% ACF(difference(Close)) %>%
  autoplot() + labs(subtitle = "Changes in Apple closing stock price")
acf + acfD   # patchwork: show the two ACF plots side by side
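# As an additional (optional) check, feasts::gg_tsdisplay() shows the differenced
# series together with its ACF and PACF in a single figure. This is a sketch, not
# part of the original analysis:
AAPL %>%
  gg_tsdisplay(difference(Close), plot_type = "partial")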
AAPL %>%
  mutate(diff_close = difference(Close)) %>%
  features(diff_close, ljung_box, lag = 10)
# A tibble: 1 x 2
  lb_stat lb_pvalue
    <dbl>     <dbl>
1    15.0     0.131
# The p-value (0.131) is well above 0.05, so the differenced closing prices are
# consistent with white noise.
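# The same test can be cross-checked with base R's Box.test() on the differenced
# closing prices (a sketch, not part of the original workflow):
Box.test(diff(AAPL$Close), lag = 10, type = "Ljung-Box")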
# Kwiatkowski-Phillips-Schmidt-Shin (KPSS) test
#
# H0: the data is stationary
#
AAPL %>%
  features(Close, unitroot_kpss)
# A tibble: 1 x 2
  kpss_stat kpss_pvalue
      <dbl>       <dbl>
1      2.27        0.01
# Small p-values suggest that differencing is required. Note that unitroot_kpss
# truncates the reported p-value: 0.01 means p < 0.01 and 0.1 means p > 0.1.
#
# Here the test statistic (2.27) exceeds the 1% critical value, so the p-value
# is below 0.01 and we reject the null: the data is not stationary.
#
# Differencing the series once and re-running the test:
AAPL %>%
  mutate(diff_close = difference(Close)) %>%
  features(diff_close, unitroot_kpss)
# A tibble: 1 x 2
  kpss_stat kpss_pvalue
      <dbl>       <dbl>
1    0.0365         0.1
# The reported p-value of 0.1 means p > 0.1: stationarity is not rejected, so a
# single difference is sufficient.
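# The required order of differencing can also be selected automatically:
# unitroot_ndiffs applies the KPSS test to successively differenced series
# (a sketch; for this data it should return 1):
AAPL %>%
  features(Close, unitroot_ndiffs)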
#
# Non-seasonal comparison of ETS and ARIMA via time-series cross-validation:
# stretch_tsibble() builds an expanding series of training windows (starting at
# 10 observations) and each model produces one-step-ahead forecasts.
AAPL %>%
  stretch_tsibble(.init = 10) %>%
  model(
    ETS(Close),
    ARIMA(Close)
  ) %>%
  forecast(h = 1) %>%
  accuracy(AAPL) %>%
  select(.model, RMSE:MAPE)
# A tibble: 2 x 5
  .model       RMSE   MAE   MPE  MAPE
  <chr>       <dbl> <dbl> <dbl> <dbl>
1 ARIMA(Close) 4.61  3.51 0.300  2.96
2 ETS(Close)   4.51  3.37 0.668  2.76
# ETS has lower RMSE, MAE, and MAPE (only MPE favours ARIMA), so it is taken as
# the more accurate model here.
# RMSE - Root Mean Squared Error:
#   square root of the mean squared forecast error; lower is more accurate.
# MAE - Mean Absolute Error:
#   mean of the absolute forecast errors; lower is more accurate.
# MPE - Mean Percentage Error:
#   mean of the signed percentage errors, so it mainly measures bias rather
#   than accuracy.
# MAPE - Mean Absolute Percentage Error:
#   mean of |actual - forecast| / |actual|, expressed as a percentage; lower is
#   more accurate. (See the sketch below for these formulas written out in R.)
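# A minimal sketch of the formulas above, using small made-up vectors `actual`
# and `fc` rather than data from this analysis:
actual <- c(100, 102, 101, 105)
fc     <- c(101, 101, 103, 104)
e      <- actual - fc
c(RMSE = sqrt(mean(e^2)),
  MAE  = mean(abs(e)),
  MPE  = mean(e / actual) * 100,
  MAPE = mean(abs(e / actual)) * 100)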
AAPL %>%
  model(ETS(Close)) %>%
  forecast(h = "3 months") %>%
  autoplot(AAPL)
# Seasonal data comparison: fit both models to a training set and evaluate them
# on a held-out test set.
#
train <- AAPL %>% filter_index(. ~ "2021-08-03")   # training data: observations through 2021-08-03
# ==
fit_arima <- train %>% model(ARIMA(Close))
report(fit_arima)
Series: Close
Model: ARIMA(0,1,1) w/ drift

Coefficients:
         ma1  constant
      0.2043    0.8645
s.e.  0.1050    0.5099

sigma^2 estimated as 15.31:  log likelihood=-230.01
AIC=466.03   AICc=466.33   BIC=473.28
fit_arima %>% gg_tsresiduals(lag_max = 20)
augment(fit_arima) %>%
  features(.innov, ljung_box, lag = 20, dof = 6)
# A tibble: 1 x 3
  .model       lb_stat lb_pvalue
  <chr>          <dbl>     <dbl>
1 ARIMA(Close)    21.6    0.0864
# The ARIMA residuals show slight serial correlation: significant at the 0.10
# level but not at 0.05.
#
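# The automatically selected ARIMA(0,1,1) with drift can also be written out
# explicitly in fable's formula syntax (a sketch; this should reproduce the same
# fit as ARIMA(Close) above):
train %>%
  model(ARIMA(Close ~ 1 + pdq(0, 1, 1) + PDQ(0, 0, 0))) %>%
  report()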
# ==
fit_ets <- train %>% model(ETS(Close))
report(fit_ets)
Series: Close
Model: ETS(A,N,N)
  Smoothing parameters:
    alpha = 0.9998999

  Initial states:
     l[0]
 74.72691

  sigma^2:  16.5474

     AIC     AICc      BIC
611.8876 612.1876 619.1800
fit_ets %>% gg_tsresiduals(lag_max = 20)
augment(fit_ets) %>%
  features(.innov, ljung_box, lag = 20, dof = 6)
# A tibble: 1 x 3
  .model     lb_stat lb_pvalue
  <chr>        <dbl>     <dbl>
1 ETS(Close)    25.4    0.0310
# The ETS residuals show autocorrelation that is significant at the 0.05 level.
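# ETS(A,N,N) is simple exponential smoothing with additive errors; the chosen
# specification can be written out explicitly (a sketch, expected to match the
# report above):
train %>%
  model(ETS(Close ~ error("A") + trend("N") + season("N"))) %>%
  report()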
# Generate forecasts and compare accuracy over the test set
bind_rows(
  fit_arima %>% accuracy(),
  fit_ets %>% accuracy(),
  fit_arima %>% forecast(h = 10) %>% accuracy(AAPL),
  fit_ets %>% forecast(h = 10) %>% accuracy(AAPL)
) %>%
  select(-ME, -MPE, -ACF1)
# A tibble: 4 x 7
  .model       .type     RMSE   MAE  MAPE  MASE RMSSE
  <chr>        <chr>    <dbl> <dbl> <dbl> <dbl> <dbl>
1 ARIMA(Close) Training  3.84  2.76  2.66 0.903 0.950
2 ETS(Close)   Training  4.02  3.02  2.90 0.988 0.994
3 ARIMA(Close) Test      7.10  5.30  3.67 1.73  1.76
4 ETS(Close)   Test      3.99  3.38  2.28 1.10  0.986
# The ARIMA model performs better on the training data, but the ETS model
# performs better on the held-out test data.
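# Point accuracy aside, the forecast distributions can be inspected directly;
# hilo() extracts prediction intervals from the fable (a sketch, not part of the
# original analysis):
fit_ets %>%
  forecast(h = 10) %>%
  hilo(level = 95)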
AAPL %>%
  model(ETS(Close)) %>%
  forecast(h = "6 months") %>%
  autoplot(AAPL) +
  labs(title = "Apple stock closing price (ETS)",
       y = "Price (USD)")
# An analogous ETS-based forecast from the forecast package, applied to the
# base-R ts object s.test (weekly data, frequency = 53):
f1 <- forecast(
  s.test,
  method = c("ets"),
  etsmodel = "ANN",
  h = 24
)
plot(f1)
AAPL %>%
  model(ARIMA(Close)) %>%
  forecast(h = "6 months") %>%
  autoplot(AAPL) +
  labs(title = "Apple stock closing price (ARIMA)",
       y = "Price (USD)")
# The corresponding forecast-package version with an ARIMA-based method:
f2 <- forecast(
  s.test,
  method = c("arima"),
  h = 24
)
plot(f2)