Time series Analysis on bitcoin prices data part2

Introduction

This is the second part of our time series analysis on bit coin data set. In this notebook we will explore the typical measure associated with time series analysis like autocorrelation, autocovariance, partial autocorrelation and fit some simple models which include ARIMA and exponential smoothing. Subsequently we will explore the order of AR and MA models analytically.

Loading the libraries

libraries <- c('forecast','astsa','tseries','ggplot2','xts','data.table','gridExtra','tidyquant','knitr','AnomalyDetection')
sapply(libraries, require, character = T)

##         forecast            astsa          tseries          ggplot2 
##             TRUE             TRUE             TRUE             TRUE 
##              xts       data.table        gridExtra        tidyquant 
##             TRUE             TRUE             TRUE             TRUE 
##            knitr AnomalyDetection 
##             TRUE             TRUE

setwd("G:\\BTP\\Data")
data1 = read.table('bitcoin_price_daily.csv', sep=',', header = T)
head(data1)

data1$Open <- as.numeric(data1$Open)
data1$High <- as.numeric(data1$High)
data1$Low <- as.numeric(data1$Low)
data1$Close <- as.numeric(data1$Close)

data1$Volume <- as.numeric(data1$Volume)
data1$Date <- as.character(data1$Date)

library(stringr)

x = str_replace_all(data1$Date, fixed(","),"")
x = str_replace_all(x, fixed(" "),"")
data1$Date = as.Date(x,"%B%d%Y")

Plotting the closing prices of bitcoin data.

ggplot(data1, aes(x=Date, y=Close)) + geom_line(col="blue") + theme_tq()

Decomposing the time series into trend and seasonality. The fluctuations increases as time increases, so we must take log transformations to supress these fluctuations and then use the decompostion.

price = ts(rev(data1$Close), frequency = 7)
log.price = log(price)
fit_1 = stl(log.price,"periodic", robust = TRUE)

Seperating out the trend and seasonality component

log.price.seasonal = fit_1$time.series[,"seasonal"]
log.price.trend = fit_1$time.series[,"trend"]
log.price.residuals = fit_1$time.series[,"remainder"]
log.price.expected = log.price.seasonal + log.price.trend
data.decomposed = data.frame(x=1:1655, seasonality = log.price.seasonal, trend=log.price.trend, residuals=log.price.residuals, expected=log.price.expected, time=rev(data1$Date))


p1 <- ggplot(data = data.decomposed,aes(x=x,y=exp(seasonality))) + geom_line() + ylab("Seasonality") + coord_cartesian(xlim=c(1600,1650))
p2 <- ggplot(data=data.decomposed, aes(x=x, y=exp(trend))) + geom_line(col='darkblue')  + ylab("Trend")
#p3 <- ggplot(data=data.decomposed, aes(x=x, y = residuals)) + geom_line()
#p4 <- ggplot(data=data.decomposed, aes(x=x, y=expected)) + geom_line()
grid.arrange(p1,p2, ncol=1)

## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.

** Residual analysis of the decomposition **

ggplot(data=data.decomposed,aes(x=x, y=residuals)) + geom_line(col="red")

## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.

There are a lot of big spikes, which indicates the present of outliers. Lets checkout the distribution of outliers.

ggplot(data=data.decomposed, aes(x=residuals)) + geom_histogram()

## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Checking the normality using qqplot

ggplot(data = data.decomposed, aes(sample=residuals)) + geom_qq()

## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.

Anomaly detection in bitcoin closing prices.

anomaly = AnomalyDetectionVec(rev(data1$Close), max_anoms=0.02, period=7, direction='both', only_last=FALSE, plot=TRUE)
anomaly$plot

It’s clear from all the above analysis that this time series in not stationary. Lets try to make it stationary using the differentiation.

diff.prices = diff(log.price)
diff.data = data.frame(x=1:length(diff.prices), prices=diff.prices)
ggplot(data=diff.data, aes(x=x, y=prices)) + geom_line(col="steelblue") + ylab("differenced series")

## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.

The above time series looks quite better than the previous one. Let’s decompose this series and check its residuals.

fit_2 = stl(diff.prices,"periodic", robust = TRUE)

diff.price.seasonal = fit_2$time.series[,"seasonal"]
diff.price.trend = fit_2$time.series[,"trend"]
diff.price.residuals = fit_2$time.series[,"remainder"]

diff.data.decomposed = data.frame(x=1:1654, seasonality = diff.price.seasonal, trend=diff.price.trend, residuals=diff.price.residuals)


p1 <- ggplot(data = diff.data.decomposed, aes(x=x,y=seasonality)) + geom_line() + ylab("Seasonality") + coord_cartesian(xlim=c(500,600))
p2 <- ggplot(data=data.decomposed, aes(x=x, y=trend)) + geom_line(col='darkblue')  + ylab("Trend")
#p3 <- ggplot(data=data.decomposed, aes(x=x, y = residuals)) + geom_line()
#p4 <- ggplot(data=data.decomposed, aes(x=x, y=expected)) + geom_line()
grid.arrange(p1,p2, ncol=1)

## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.

There is still some trend in the data so let’s have one more differencing.

dif.dif.prices = diff(diff.prices)
diff.diff.data = data.frame(x=1:1653, double_differenced_prices = dif.dif.prices)
ggplot(data = diff.diff.data, aes(x=x, y = double_differenced_prices)) + geom_line(col="steelblue")

## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.

fit_3 = stl(dif.dif.prices,"periodic", robust = TRUE)

diff.diff.price.seasonal = fit_3$time.series[,"seasonal"]
diff.diff.price.trend = fit_3$time.series[,"trend"]
diff.diff.price.residuals = fit_3$time.series[,"remainder"]

diff.diff.data.decomposed = data.frame(x=1:1653, seasonality = diff.diff.price.seasonal, trend=diff.diff.price.trend, residuals=diff.diff.price.residuals)


p1 <- ggplot(data = diff.diff.data.decomposed, aes(x=x,y=seasonality)) + geom_line() + ylab("Seasonality") + coord_cartesian(xlim=c(500,600))
p2 <- ggplot(data=diff.data.decomposed, aes(x=x, y=trend)) + geom_line(col='darkblue')  + ylab("Trend")
#p3 <- ggplot(data=data.decomposed, aes(x=x, y = residuals)) + geom_line()
#p4 <- ggplot(data=data.decomposed, aes(x=x, y=expected)) + geom_line()
grid.arrange(p1,p2, ncol=1)

## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.

** Residual analysis of double differenced series decomposition **.

ggplot(data=diff.diff.data.decomposed, aes(x=residuals, y=..density..)) + geom_histogram(col="blue",fill="steelblue",alpha=0.5) + geom_density(col="firebrick",lwd=0.9)

## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The double differencing of series remove the trend and our residual plot have nice gaussian distribution with zero mean.

** Estimating the order of model with autocorrelation and partial autocorrelation plots.**

autoplot(acf(dif.dif.prices,lag.max=30,plot=FALSE)) + ggtitle("Autocorrelation plot, d=2")

We observe that there is only one significant autocorrelation after lag 0. There are some more spikes but not very significant,hence we can chose the moving average parameter for our model from 0 to 2.

autoplot(pacf(dif.dif.prices, lag.max = 30,plot = FALSE))  + ggtitle("Partial autocorrelation plot, d=2")

There are many significant spikes from 7-8 in partial autocorrelation plot which suggest that the order of autoregressive process should be of 5-8. We have kept the quality metric like AIC, BIC and SSE for our model over a range of value of moving average and autoregressive order.

d=2
for(q in 0:2){
  for(p in 0:7){
    
    if(p+d+q<=7){
    model<-arima(x=data1$Close, order = c((p),d,(q)))
    pval<-Box.test(model$residuals, lag=log(length(model$residuals)))
    sse<-sum(model$residuals^2)
    cat(p,d,q,'AIC=', model$aic, ' SSE=',sse,' p-VALUE=', pval$p.value,'\n')
        }
      }
    }

## 0 2 0 AIC= 19443.85  SSE= 12410253  p-VALUE= 0 
## 1 2 0 AIC= 18926.99  SSE= 9065178  p-VALUE= 0 
## 2 2 0 AIC= 18721.05  SSE= 7992421  p-VALUE= 0 
## 3 2 0 AIC= 18669.5  SSE= 7737230  p-VALUE= 0 
## 4 2 0 AIC= 18595.35  SSE= 7388040  p-VALUE= 0 
## 5 2 0 AIC= 18527.71  SSE= 7082328  p-VALUE= 2.109424e-15 
## 0 2 1 AIC= 18297.91  SSE= 6182963  p-VALUE= 0.1145366 
## 1 2 1 AIC= 18299.52  SSE= 6181475  p-VALUE= 0.1280869 
## 2 2 1 AIC= 18300.91  SSE= 6179175  p-VALUE= 0.1404621 
## 3 2 1 AIC= 18301.95  SSE= 6175556  p-VALUE= 0.185476 
## 4 2 1 AIC= 18294.43  SSE= 6140105  p-VALUE= 0.9308315 
## 0 2 2 AIC= 18299.53  SSE= 6181533  p-VALUE= 0.1278044 
## 1 2 2 AIC= 18301.89  SSE= 6182855  p-VALUE= 0.1165964 
## 2 2 2 AIC= 18302.83  SSE= 6178895  p-VALUE= 0.1433831 
## 3 2 2 AIC= 18300.03  SSE= 6160926  p-VALUE= 0.4210778

Taking care of the parsimony principle, we do not want our model to have total number of parameters to be more than 7. Checking over the possible combinations of order of autoregressive, differencing and moving average, we chose the right order for our model as (4,2,1). Our choice is based on the lowest AIC of model. Also the p-value for this model is highest, which means that the residuals have no autocorrelation coefficients.

sarima.model = sarima(data1$Close, 4,2,1,0,0,0,details=F)
sarima.model

## $fit
## 
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D, 
##     Q), period = S), include.mean = !no.constant, optim.control = list(trace = trc, 
##     REPORT = 1, reltol = tol))
## 
## Coefficients:
##           ar1     ar2     ar3      ar4      ma1
##       -0.0153  0.0206  0.0216  -0.0781  -0.9870
## s.e.   0.0248  0.0253  0.0253   0.0253   0.0045
## 
## sigma^2 estimated as 3714:  log likelihood = -9141.21,  aic = 18294.43
## 
## $degrees_of_freedom
## [1] 1648
## 
## $ttable
##     Estimate     SE   t.value p.value
## ar1  -0.0153 0.0248   -0.6149  0.5387
## ar2   0.0206 0.0253    0.8123  0.4167
## ar3   0.0216 0.0253    0.8541  0.3932
## ar4  -0.0781 0.0253   -3.0911  0.0020
## ma1  -0.9870 0.0045 -218.9446  0.0000
## 
## $AIC
## [1] 9.22603
## 
## $AICc
## [1] 9.227269
## 
## $BIC
## [1] 8.242379

fit.residuals = data.frame(x=1:nrow(data1), res = resid(sarima.model$fit))
ggplot(data=fit.residuals, aes(x=x, y=res)) + geom_line(col="firebrick") + ggtitle("Residuals of the fitted model")+ylab("residuals")

## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.

QQplot for the fitted residuals.

ggplot(data=fit.residuals, aes(sample=res)) + geom_qq()

## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.

** Analyse correlation among residuals with autocorrelation plot**

p1 <- autoplot(acf(fit.residuals$res,plot=FALSE)) + ggtitle("Autocorrelation plot for residuals")
p2 <- autoplot(pacf(fit.residuals$res, plot=F)) + ggtitle("partial autocorrelation plot for residuals")

grid.arrange(p1,p2,ncol=2)

Our Lunj-Box test statistics p-vlaues are quite higher which shows that there are no significant correlation coefficients. However, there are some spikes in the acf and pacf plot which appears to be random.

R Notebook

Time series Analysis on bitcoin prices data part2

Introduction