# Prepare needed packages
packages <- c("psych", "tidyr", "ggplot2",
"caret", "dplyr", "car", "fpp",
"forecast","ResourceSelection",
"ggpubr", "scales", "stringr",
"gridExtra", "pROC", "reshape2",
"corrplot","glmnet", "plyr",
"knitr", "purrr", "readxl", "seasonal",
"fpp2", "neuralnet", "urca", "vars")
for (i in 1:length(packages)) {
if (!packages[i] %in% rownames(installed.packages())) {
install.packages(packages[i])
}
library(packages[i], character.only = TRUE) # Loads package in
}
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
## Loading required package: lattice
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:psych':
##
## logit
## Loading required package: forecast
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Loading required package: fma
## Loading required package: expsmooth
## Loading required package: lmtest
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: tseries
## ResourceSelection 0.3-5 2019-07-22
##
## Attaching package: 'ggpubr'
## The following object is masked from 'package:forecast':
##
## gghistogram
##
## Attaching package: 'scales'
## The following objects are masked from 'package:psych':
##
## alpha, rescale
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
## corrplot 0.84 loaded
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-1
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following object is masked from 'package:ggpubr':
##
## mutate
## The following object is masked from 'package:fma':
##
## ozone
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
##
## Attaching package: 'purrr'
## The following object is masked from 'package:plyr':
##
## compact
## The following object is masked from 'package:scales':
##
## discard
## The following object is masked from 'package:car':
##
## some
## The following object is masked from 'package:caret':
##
## lift
##
## Attaching package: 'seasonal'
## The following object is masked from 'package:psych':
##
## outlier
##
## Attaching package: 'fpp2'
## The following objects are masked from 'package:fpp':
##
## ausair, ausbeer, austa, austourists, debitcards, departures,
## elecequip, euretail, guinearice, oil, sunspotarea, usmelec
##
## Attaching package: 'neuralnet'
## The following object is masked from 'package:dplyr':
##
## compute
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following objects are masked from 'package:fma':
##
## cement, housing, petrol
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: strucchange
## Loading required package: sandwich
##
## Attaching package: 'strucchange'
## The following object is masked from 'package:stringr':
##
## boundary
# Remove unused objects
rm(packages)
rm(i)
# Set work directory
setwd("~/Desktop/Predictive:Forecasting/Week6/HW")
# Load in data set
tsla <- read.csv("TSLA.csv", header = TRUE)
head(tsla)
## Date Open High Low Close Adj.Close Volume
## 1 2016-08-12 45.082 45.330 44.808 45.122 45.122 9067500
## 2 2016-08-15 45.204 45.900 44.986 45.118 45.118 10171500
## 3 2016-08-16 45.098 45.438 44.682 44.722 44.722 11335500
## 4 2016-08-17 44.866 44.966 44.560 44.648 44.648 8935500
## 5 2016-08-18 44.764 45.132 44.458 44.702 44.702 8572500
## 6 2016-08-19 44.708 45.034 44.506 45.000 45.000 8297500
tail(tsla)
## Date Open High Low Close Adj.Close Volume
## 1253 2021-08-04 711.00 724.90 708.93 710.92 710.92 17002600
## 1254 2021-08-05 716.00 720.95 711.41 714.63 714.63 12919600
## 1255 2021-08-06 711.90 716.33 697.63 699.10 699.10 15576200
## 1256 2021-08-09 710.17 719.03 705.13 713.76 713.76 14715300
## 1257 2021-08-10 713.99 716.59 701.88 709.99 709.99 13253000
## 1258 2021-08-11 712.71 715.18 704.21 707.82 707.82 9739200
# Use the first 1,007 rows as the training set
train <- tsla[1:1007,]
tsla.train <- ts(train$Adj.Close, end = c(2020, 7), frequency = 365)
autoplot(tsla.train)
# The remaining 251 rows form the test set
test <- tsla[1008:1258,]
tsla.test <- ts(test$Adj.Close, start = c(2020,8), frequency = 365)
autoplot(tsla.test)
autoplot(decompose(tsla.train))
acf(tsla.train)
pacf(tsla.train)
The training set was log-transformed to stabilize the variance. I then took first differences of the data until the series was stationary. In the end, the forecast presents an increasing trend.
log.train <- log(tsla.train)
adf.test(log.train )
## Warning in adf.test(log.train): p-value greater than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: log.train
## Dickey-Fuller = -0.28632, Lag order = 10, p-value = 0.99
## alternative hypothesis: stationary
Box.test(log.train , lag=25, type="Ljung-Box")
##
## Box-Ljung test
##
## data: log.train
## X-squared = 19451, df = 25, p-value < 2.2e-16
# Both tests indicate the log series is nonstationary, so test how many differences are needed
ndiffs(log.train)
## [1] 1
log.train.diff <- diff(log.train, difference=1)
adf.test(log.train.diff)
## Warning in adf.test(log.train.diff): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: log.train.diff
## Dickey-Fuller = -9.4051, Lag order = 10, p-value = 0.01
## alternative hypothesis: stationary
autoplot(log.train.diff)
acf(log.train.diff, lag.max = 30)
pacf(log.train.diff, lag.max = 30)
## Based on the ACF and PACF plots, both show a gradual decrease. The ACF also drops off sharply after the first lag, which may indicate that MA(1) and ARMA(1,1) models are potential candidates.
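To compare these two candidates directly, one could fit both on the differenced log series and check AICc; a minimal sketch (object names here are illustrative, output not shown; the first candidate is the same model fit as fit1 below):
# Sketch: fit both candidates and compare AICc (lower is preferred)
cand.ma1 <- Arima(log.train.diff, order = c(0, 1, 1)) # the MA(1) candidate
cand.arma <- Arima(log.train.diff, order = c(1, 1, 1)) # adds an AR(1) term
c(MA1 = cand.ma1$aicc, ARMA11 = cand.arma$aicc)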
# ARIMA(0,1,1)
fit1 <- Arima(log.train.diff, order=c(0,1,1))
fit1
## Series: log.train.diff
## ARIMA(0,1,1)
##
## Coefficients:
## ma1
## -0.9940
## s.e. 0.0042
##
## sigma^2 estimated as 0.001296: log likelihood=1913.07
## AIC=-3822.13 AICc=-3822.12 BIC=-3812.31
checkresiduals(fit1)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,1,1)
## Q* = 186.16, df = 200, p-value = 0.7502
##
## Model df: 1. Total lags used: 201
# Use auto.arima and compare to determine which model is best
auto.fit <- auto.arima(tsla.train, seasonal = FALSE) # Selects ARIMA(2,2,2)
checkresiduals(auto.fit)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,2,2)
## Q* = 324.72, df = 197, p-value = 2.635e-08
##
## Model df: 4. Total lags used: 201
# Forecast the next 60 days with ARIMA(0,1,1)
fcast <- forecast(fit1, h = 60)
autoplot(fcast)
# Forecast with ARIMA(2,2,2)
fcast2 <- forecast(auto.fit, h = 60)
autoplot(fcast2)
# Check accuracy of both models
accuracy(fcast, tsla.test)
## ME RMSE MAE MPE MAPE
## Training set 1.160263e-03 0.03596587 0.02393384 -Inf Inf
## Test set 4.162828e+02 417.56458284 416.28279741 99.99851 99.99851
## MASE ACF1 Theil's U
## Training set 6.322051e-01 -0.006379808 NA
## Test set 1.099598e+04 0.694180612 18.23464
accuracy(fcast2, tsla.test)
## ME RMSE MAE MPE MAPE MASE
## Training set 0.1510163 4.183066 2.110116 0.08449393 2.434633 0.05718079
## Test set 62.1326534 71.251603 62.450498 14.54358893 14.633301 1.69230927
## ACF1 Theil's U
## Training set 0.009418524 NA
## Test set 0.774388065 3.042335
## From the measurement metrics, the ARIMA(2,2,2) model is better in almost all aspects besides ME, so it is chosen as the best ARIMA model. Note that fit1 was fit on the log-differenced series, so its forecasts are not on the price scale of tsla.test; that scale mismatch is what drives its extreme test-set errors (MAPE near 100%). Note also that the ARIMA(2,2,2) residuals fail the Ljung-Box test (p < 0.001), while the ARIMA(0,1,1) residuals pass.
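For a like-for-like comparison, fit1's forecasts could be mapped back to the price scale by undoing the differencing and the log transform; a minimal sketch, with illustrative object names:
# Sketch: convert fcast's log-differenced forecasts back to the price scale
last.log <- as.numeric(tail(log.train, 1)) # last observed log price in training
price.fc <- exp(last.log + cumsum(fcast$mean)) # undo differencing, then the log
accuracy(as.numeric(price.fc), test$Adj.Close[1:60]) # errors now in dollars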
Since the data did not show clear seasonality, I applied the simple exponential smoothing (SES) method for the forecasts.
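For reference, the SES recursion is $\hat{y}_{t+1|t} = \alpha y_t + (1-\alpha)\hat{y}_{t|t-1}$, so a smaller $\alpha$ discounts past observations more slowly and yields smoother forecasts.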
# Use the first-differenced series to build an initial SES model
ses.tsla <- ses(log.train.diff, alpha = 0.2, h = 250)
autoplot(ses.tsla)
# Log-transform and difference the test set
log.test <- log(tsla.test)
log.test.diff <- diff(log.test)
accuracy(ses.tsla, log.test.diff)
## ME RMSE MAE MPE MAPE MASE
## Training set 0.0000873636 0.03772592 0.02560730 Inf Inf 0.6764089
## Test set -0.0111500527 0.04321839 0.03181607 36.14115 254.7471 0.8404118
## ACF1 Theil's U
## Training set -0.1096320 NA
## Test set -0.0765009 1.413549
# Loop through possible alpha values to identify the optimal one
alpha <- seq(.01, .99, by = .01)
RMSE <- numeric(length(alpha))
for (i in seq_along(alpha)) {
  fit <- ses(log.train.diff, alpha = alpha[i], h = 250)
  RMSE[i] <- accuracy(fit, log.test.diff)[2, 2] # test-set RMSE
}
summary(RMSE) # The minimum occurs at alpha = 0.05
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.04201 0.04493 0.06228 0.07000 0.09136 0.12575
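Rather than reading the minimum off the summary, the minimizing alpha can be extracted directly; a quick sketch (output not shown):
# Sketch: report the alpha that minimizes test-set RMSE
alpha[which.min(RMSE)] # the author reports 0.05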
# Refit with alpha = 0.05
ses.tsla.opt <- ses(log.train.diff, alpha = 0.05, h = 250)
accuracy(ses.tsla.opt, log.test.diff)
## ME RMSE MAE MPE MAPE MASE
## Training set 0.0002379214 0.03635988 0.02422615 Inf Inf 0.6399264
## Test set -0.0054138119 0.04210481 0.03051187 61.76043 180.9387 0.8059618
## ACF1 Theil's U
## Training set -0.02783951 NA
## Test set -0.07650090 1.092981
# Plotting results
autoplot(ses.tsla.opt) +
theme(legend.position = "bottom")
autoplot(log.test.diff) +
autolayer(ses.tsla.opt, alpha = .5) +
ggtitle("Predicted vs. actuals for the test data set")
## From the performance metrics, the models with alpha = 0.2 and alpha = 0.05 do not differ significantly. However, the alpha = 0.05 model's prediction intervals are much narrower than those of the previous model. In addition, the predicted-versus-actuals plot shows that most of the data points fall within the prediction intervals, indicating that the accuracy is relatively high.
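To quantify the claim about interval widths, the mean 80% interval widths of the two fits could be compared; a sketch:
# Sketch: mean width of the 80% prediction intervals for each alpha
c(alpha_0.20 = mean(ses.tsla$upper[, 1] - ses.tsla$lower[, 1]),
  alpha_0.05 = mean(ses.tsla.opt$upper[, 1] - ses.tsla.opt$lower[, 1]))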
Neural networks are designed to detect patterns in the input and to generate output that filters out noise. Therefore, I used this method to build a neural network autoregression model with nnetar().
set.seed(43)
model1 <- nnetar(tsla.train, lambda = NULL)
summary(model1)
## Length Class Mode
## x 1007 ts numeric
## m 1 -none- numeric
## p 1 -none- numeric
## P 1 -none- numeric
## scalex 2 -none- list
## size 1 -none- numeric
## subset 1007 -none- numeric
## model 20 nnetarmodels list
## nnetargs 0 -none- list
## fitted 1007 ts numeric
## residuals 1007 ts numeric
## lags 5 -none- numeric
## series 1 -none- character
## method 1 -none- character
## call 3 -none- call
# Forecast, computing prediction intervals
fc <- forecast(model1,PI= TRUE, h = 60)
autoplot(fc)
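The intervals here come from simulated future sample paths: forecast() on an nnetar fit accepts an npaths argument (default 1000) controlling how many paths are simulated, trading interval stability against run time. A sketch, with an illustrative object name:
# Sketch: more simulated paths give more stable prediction intervals
fc.more <- forecast(model1, PI = TRUE, h = 60, npaths = 5000)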
# Check the residuals distribution
autoplot(residuals(fc))
# Output accuracy metrics on the test set
accuracy(fc, tsla.test)
## ME RMSE MAE MPE MAPE MASE
## Training set -0.00832095 4.577903 2.590874 -0.183942 2.778522 0.07020857
## Test set 114.80020494 119.348157 114.800205 27.093322 27.093322 3.11090315
## ACF1 Theil's U
## Training set 0.0102998 NA
## Test set 0.6915651 5.129082
# Estimate a Box-Cox transformation parameter and fit an STL + ETS model
lambda <- BoxCox.lambda(tsla.train)
# Note: this reassigns model1, replacing the nnetar fit above
model1 <- stlf(tsla.train, method = "ets", lambda = lambda, h = 30)
summary(model1)
##
## Forecast method: STL + ETS(A,N,N)
##
## Model Information:
## ETS(A,N,N)
##
## Call:
## ets(y = na.interp(x), model = etsmodel, allow.multiplicative.trend = allow.multiplicative.trend)
##
## Smoothing parameters:
## alpha = 0.9999
##
## Initial states:
## l = 0.9789
##
## sigma: 4e-04
##
## AIC AICc BIC
## -8674.426 -8674.402 -8659.681
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.187116 4.781363 2.12733 0.08838744 2.196297 0.05764727
## ACF1
## Training set -0.02148084
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 2020.0192 205.2739 184.6942 231.0147 175.38614 247.4400
## 2020.0219 203.3595 175.9009 240.9760 164.16651 267.1335
## 2020.0247 197.8359 166.8107 243.0380 154.02396 276.4780
## 2020.0274 195.2352 161.0933 247.7400 147.44372 288.8629
## 2020.0301 197.8679 159.5517 260.4019 144.71673 312.7193
## 2020.0329 188.8407 150.9436 252.1450 136.44784 306.5423
## 2020.0356 191.3067 150.0769 263.7692 134.70810 329.9210
## 2020.0384 190.7958 147.5710 269.8294 131.76802 345.6143
## 2020.0411 175.9178 136.7465 246.5372 122.32719 313.0639
## 2020.0438 169.6989 131.4199 239.4397 117.40086 306.0121
## 2020.0466 171.1930 130.8636 247.4501 116.35319 323.8022
## 2020.0493 181.5827 135.3664 275.7130 119.29325 379.9849
## 2020.0521 181.7630 134.0722 282.1094 117.72106 398.5952
## 2020.0548 171.8901 127.4121 264.0721 112.06175 368.7558
## 2020.0575 159.6200 119.5164 240.2250 105.48645 327.8688
## 2020.0603 173.7016 126.1336 278.8652 110.16332 410.3873
## 2020.0630 180.2238 128.4262 302.0406 111.46691 470.3197
## 2020.0658 184.5950 129.5328 321.0738 111.86806 527.5335
## 2020.0685 185.3859 128.8652 330.2138 110.95704 563.0601
## 2020.0712 191.7090 130.8286 358.5541 111.99994 664.8382
## 2020.0740 191.1964 129.5753 364.5612 110.69002 701.0443
## 2020.0767 200.3975 132.6981 409.1041 112.56691 911.7257
## 2020.0795 168.4614 117.1078 300.0206 100.83543 511.4485
## 2020.0822 171.2082 117.6486 314.2789 100.93332 563.5751
## 2020.0849 164.2936 113.6293 296.4833 97.68274 516.4383
## 2020.0877 163.2667 112.4533 297.8486 96.54648 528.4208
## 2020.0904 159.4362 109.9812 289.7005 94.46887 510.4773
## 2020.0932 152.8994 106.2432 272.6115 91.46782 465.5644
## 2020.0959 149.1283 103.8573 264.3566 89.47784 447.3147
## 2020.0986 147.3528 102.4657 262.2196 88.23653 446.4404
autoplot(model1)
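For reference, the Box-Cox transformation selected via BoxCox.lambda is $w_t = (y_t^{\lambda} - 1)/\lambda$ for $\lambda \neq 0$ and $w_t = \log(y_t)$ when $\lambda = 0$; stlf() applies it before the decomposition and back-transforms the forecasts to the original scale.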
model2 <- ets(tsla.train, model ="ZZZ")
## Warning in ets(tsla.train, model = "ZZZ"): I can't handle data with frequency
## greater than 24. Seasonality will be ignored. Try stlf() if you need seasonal
## forecasts.
model2
## ETS(M,N,N)
##
## Call:
## ets(y = tsla.train, model = "ZZZ")
##
## Smoothing parameters:
## alpha = 0.9898
##
## Initial states:
## l = 45.0678
##
## sigma: 0.0362
##
## AIC AICc BIC
## 8743.468 8743.492 8758.212
model2.fc <- forecast(model2, h = 60)
autoplot(model2.fc)
## Check whether the model can be improved further
plot.ts(model1$residuals)
## The plot shows that the forecast errors have roughly constant variance over time, although the fluctuations from June 2018 to July 2019 may be larger in magnitude than at earlier dates.
## To check whether the forecast errors are normally distributed with mean zero, a histogram of the forecast errors is plotted.
plotForecastErrors <- function(forecasterrors)
{
  # Choose a bin size for the histogram of the forecast errors
  mybinsize <- IQR(forecasterrors)/4
  mysd <- sd(forecasterrors)
  mymin <- min(forecasterrors) - mysd*5
  mymax <- max(forecasterrors) + mysd*3
  # Generate normally distributed data with mean 0 and standard deviation mysd
  mynorm <- rnorm(10000, mean=0, sd=mysd)
  # Widen the range so the bins cover both series
  mymin2 <- min(mynorm)
  mymax2 <- max(mynorm)
  if (mymin2 < mymin) { mymin <- mymin2 }
  if (mymax2 > mymax) { mymax <- mymax2 }
  # Extend the top break by one bin so every value falls inside a bin
  mybins <- seq(mymin, mymax + mybinsize, mybinsize)
  # Make a blue histogram of the forecast errors
  hist(forecasterrors, col="blue", freq=FALSE, breaks=mybins)
  # freq=FALSE ensures the area under the histogram = 1
  myhist <- hist(mynorm, plot=FALSE, breaks=mybins)
  # Overlay the normal density as a red line on top of the histogram
  points(myhist$mids, myhist$density, type="l", col="red", lwd=2)
}
plotForecastErrors(model1$residuals)
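As an aside, checkresiduals() from the forecast package bundles a residual time plot, ACF plot, histogram, and Ljung-Box test into one call, and could serve as a ready-made version of this diagnostic:
# Sketch: one-call residual diagnostics for the STL + ETS fit
checkresiduals(model1)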
accuracy(model1, tsla.test)
## ME RMSE MAE MPE MAPE MASE
## Training set 0.187116 4.781363 2.12733 0.08838744 2.196297 0.05764727
## Test set 228.439702 234.218966 228.43970 55.38405685 55.384057 6.19035297
## ACF1 Theil's U
## Training set -0.02148084 NA
## Test set 0.74085168 7.910217
accuracy(model2.fc, tsla.test)
## ME RMSE MAE MPE MAPE MASE
## Training set 0.2663996 4.230519 2.07937 0.1282969 2.386321 0.05634763
## Test set 105.7056953 110.645865 105.70570 24.8926513 24.892651 2.86445639
## ACF1 Theil's U
## Training set -0.0264999 NA
## Test set 0.6941806 4.745873
## Model 2, ETS(M,N,N), gives better accuracy on all of the performance metrics.
# ARIMA(2,2,2)
accuracy(fcast2, tsla.test)
## ME RMSE MAE MPE MAPE MASE
## Training set 0.1510163 4.183066 2.110116 0.08449393 2.434633 0.05718079
## Test set 62.1326534 71.251603 62.450498 14.54358893 14.633301 1.69230927
## ACF1 Theil's U
## Training set 0.009418524 NA
## Test set 0.774388065 3.042335
# SES with alpha = 0.05
accuracy(ses.tsla.opt, log.test.diff)
## ME RMSE MAE MPE MAPE MASE
## Training set 0.0002379214 0.03635988 0.02422615 Inf Inf 0.6399264
## Test set -0.0054138119 0.04210481 0.03051187 61.76043 180.9387 0.8059618
## ACF1 Theil's U
## Training set -0.02783951 NA
## Test set -0.07650090 1.092981
# Neural Network
accuracy(fc, tsla.test)
## ME RMSE MAE MPE MAPE MASE
## Training set -0.00832095 4.577903 2.590874 -0.183942 2.778522 0.07020857
## Test set 114.80020494 119.348157 114.800205 27.093322 27.093322 3.11090315
## ACF1 Theil's U
## Training set 0.0102998 NA
## Test set 0.6915651 5.129082
# ETS(M,N,N)
accuracy(model2.fc, tsla.test)
## ME RMSE MAE MPE MAPE MASE
## Training set 0.2663996 4.230519 2.07937 0.1282969 2.386321 0.05634763
## Test set 105.7056953 110.645865 105.70570 24.8926513 24.892651 2.86445639
## ACF1 Theil's U
## Training set -0.0264999 NA
## Test set 0.6941806 4.745873
Comparing across these four models, the simple exponential smoothing (SES) model has the test-set ACF1 closest to zero, meaning its forecast errors are the least correlated with their previous values. Its test-set RMSE and MAE error metrics are also the lowest, although the SES model was evaluated on the log-differenced scale, so its error magnitudes are not directly comparable with the other three models, which were evaluated on the raw price scale. Based on its performance metrics, the SES model is the better choice.
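To make this comparison easier to read, the test-set rows can be collected into one table; a sketch (keeping in mind that the SES row is on the log-differenced scale):
# Sketch: gather the test-set metrics from all four models into one matrix
results <- rbind(
  ARIMA_2_2_2 = accuracy(fcast2, tsla.test)["Test set", ],
  SES_0.05 = accuracy(ses.tsla.opt, log.test.diff)["Test set", ],
  NNAR = accuracy(fc, tsla.test)["Test set", ],
  ETS_MNN = accuracy(model2.fc, tsla.test)["Test set", ]
)
round(results, 4)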