library(tidyverse)
library(lubridate)
library(fpp2)
library(astsa)
library(plotly)
windowsFonts(Georgia = windowsFont("Georgia"))
## Loading the data
bitcoin = read_csv('Bitcoin Historical Data - Investing.com (2).csv')
names(bitcoin)[7] = 'Change'
## The tibble
bit_df = bitcoin %>%
mutate(Date = dmy(Date)) %>%
mutate(Vol. = as.numeric(str_sub(Vol., end = -2))*1000,
Change = as.numeric(str_sub(Change, end = -2))) %>%
arrange(Date)
## Original plot theme
my_theme = theme(panel.grid = element_line(color = '#e6e6e6'),
panel.background = element_rect(fill = 'white'),
plot.title = element_text(hjust = .5, size = 28, colour = '#ffa500'),
text = element_text(family = 'Georgia'),
axis.text = element_text(size = 10),
axis.title = element_text(size = 18, family = 'Georgia', face = 'bold'),
axis.line = element_line(colour = '#737373', size = 1),
strip.background = element_rect(colour = "black", fill = "white"),
strip.text = element_text(face = 'bold')) ggplotly(ggplot(bit_df, aes(Date, Price)) + geom_line(col = '#ffa500') +
labs(title = 'Bitcoin', x = '') +
scale_y_continuous(breaks = c(0, 5000, 10000, 15000),
labels = c('$0', '$5,000', '$10,000', '$15,000')) + my_theme)Date Range: 08/28/2011 - 10/05/2019
ggplotly(bit_df %>%
filter(Date > as.Date('2017-01-01')) %>% ggplot(aes(Date, Price)) + geom_line(col = '#ffa500') +
labs(title = 'Bitcoin', x = '') +
scale_y_continuous(breaks = c(0, 5000, 10000, 15000),
labels = c('$0', '$5,000', '$10,000', '$15,000')) + my_theme)Date Range: 01/01/2017 - 10/05/2019
ggAcf(bit_ts, lag.max = 200) + my_theme + labs(title = 'ACF' , y = 'Correlation')
ggPacf(bit_ts, lag.max = 200) + my_theme + labs(title = 'PACF', y = '')ggAcf(diff(bit_ts), lag.max = 200) + my_theme + labs(title = 'ACF' , y = 'Correlation')
ggPacf(diff(bit_ts), lag.max = 200) + my_theme + labs(title = 'PACF', y = '')
BoxCox.lambda(). When used on our data we get the following value for λ:## [1] -0.09363108
BoxCox(), we can transform the entire data into something that hopefully has a constant variance.## Original Price
cut_bit_df[-1,] %>%
mutate(Price = diff(cut_bit_df$Price)) %>%
ggplot(aes(Date, Price)) + geom_line(col = '#ffa500') + my_theme +
labs(x = '', title = 'Original Price', y = 'Difference')
## Transformed Price
cut_bit_df[-1,] %>%
mutate(Price = diff(BoxCox(cut_bit_df$Price, lambda = BoxCox.lambda(cut_bit_df$Price)))) %>%
ggplot(aes(Date, Price)) + geom_line(col = '#ffa500') + my_theme +
labs(x = '', title = 'Transformed Price', y = '')bit_ts_tran = BoxCox(bit_ts, lambda = BoxCox.lambda(bit_ts))
ggAcf(diff(bit_ts_tran), lag.max = 200) + my_theme + labs(title = 'ACF' , y = 'Correlation')
ggPacf(diff(bit_ts_tran), lag.max = 200) + my_theme + labs(title = 'PACF', y = '')auto.arima() function comes in very handy. When applied to data, it can tell you what ARIMA model is best suited by minimizing AIC and BIC values. When this function is applied to our transformed data, here is the result:## Series: bit_ts_tran
## ARIMA(3,1,2) with drift
##
## Coefficients:
## ar1 ar2 ar3 ma1 ma2 drift
## 0.0328 -0.9080 -0.0220 -0.0528 0.9467 1e-03
## s.e. 0.0456 0.0824 0.0329 0.0331 0.0657 6e-04
##
## sigma^2 estimated as 0.00039: log likelihood=2523.74
## AIC=-5033.48 AICc=-5033.36 BIC=-4999.08
checkresiduals() is another useful function that will show us whether or not it’s a good fit. Let’s fit an ARIMA(3, 1, 2) model and see how it looks.p-value. The p-value obtained for this model is about 6.1%. The typical threshold for a bad model is a value of 5% or lower. Albeit our p-value is above this standard, it is still pretty low.##
## Ljung-Box test
##
## data: Residuals from ARIMA(3,1,2) with drift
## Q* = 8.9981, df = 4, p-value = 0.06115
##
## Model df: 6. Total lags used: 10
cut2_bit_df = cut_bit_df %>%
filter(Date >= ymd('2019-01-01'))
ggplotly(cut2_bit_df %>%
mutate(Price = BoxCox(cut2_bit_df$Price, lambda = BoxCox.lambda(cut2_bit_df$Price))) %>%
ggplot(aes(Date, Price)) + geom_line(col = '#ffa500') +
labs(title = 'Bitcoin', x = '', y = 'Price (Transformed)') + my_theme)bit_ts2 = bit_df %>%
filter(Date >= as.Date('2019-01-01')) %>%
arrange(Date) %>%
select(Price) %>%
as.matrix() %>%
ts()
bit_ts_tran2 = BoxCox(bit_ts2, lambda = BoxCox.lambda(bit_ts2))
ggAcf(bit_ts_tran2, lag.max = 200) + my_theme + labs(title = 'ACF' , y = 'Correlation')
ggPacf(bit_ts_tran2, lag.max = 200) + my_theme + labs(title = 'PACF', y = '')ggplotly(cut2_bit_df[-1,] %>%
mutate(Price = diff(BoxCox(cut2_bit_df$Price, lambda = BoxCox.lambda(cut2_bit_df$Price)))) %>%
ggplot(aes(Date, Price)) + geom_line(col = '#ffa500') + my_theme +
labs(x = '', title = 'Transformed Price', y = 'Difference'))bit_ts2 = bit_df %>%
filter(Date >= as.Date('2019-01-01')) %>%
arrange(Date) %>%
select(Price) %>%
as.matrix() %>%
ts()
bit_ts_tran2 = BoxCox(bit_ts2, lambda = BoxCox.lambda(bit_ts2))
ggAcf(diff(bit_ts_tran2), lag.max = 200) + my_theme + labs(title = 'ACF' , y = 'Correlation')
ggPacf(diff(bit_ts_tran2), lag.max = 200) + my_theme + labs(title = 'PACF', y = '')auto.arima() function again to see which model is recommended.## Series: bit_ts_tran2
## ARIMA(0,1,0)
##
## sigma^2 estimated as 8.162e-05: log likelihood=910.85
## AIC=-1819.7 AICc=-1819.69 BIC=-1816.08
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,1,0)
## Q* = 15.771, df = 10, p-value = 0.1064
##
## Model df: 0. Total lags used: 10
p-value looks a little better than before as well. This is a great sign because a Random Walk is much simpler than an ARIMA(3, 1, 2), and it is always suggested that a model should be as simple as possible.bit_ts2017 = bit_df %>%
filter(Date >= as.Date('2017-01-01') & Date <= as.Date('2018-01-01')) %>%
arrange(Date) %>%
select(Price) %>%
as.matrix() %>%
ts()
bit_ts2017 %>%
BoxCox(lambda = BoxCox.lambda(bit_ts2017)) %>%
Arima(order = c(0,1,0), include.drift = T) %>%
checkresiduals()##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,1,0) with drift
## Q* = 5.6861, df = 9, p-value = 0.7709
##
## Model df: 1. Total lags used: 10
bit_ts2018 = bit_df %>%
filter(Date >= as.Date('2018-01-01') & Date <= as.Date('2019-01-01')) %>%
arrange(Date) %>%
select(Price) %>%
as.matrix() %>%
ts()
bit_ts2018 %>%
BoxCox(lambda = BoxCox.lambda(bit_ts2018)) %>%
Arima(order = c(0,1,0), include.drift = T) %>%
checkresiduals()##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,1,0) with drift
## Q* = 11.997, df = 9, p-value = 0.2135
##
## Model df: 1. Total lags used: 10
bit_ts2019 = bit_df %>%
filter(Date >= as.Date('2017-01-01')) %>%
arrange(Date) %>%
select(Price) %>%
as.matrix() %>%
ts()
bit_ts2019 %>%
BoxCox(lambda = BoxCox.lambda(bit_ts2019)) %>%
Arima(order = c(0,1,0), include.drift = T) %>%
checkresiduals()##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,1,0) with drift
## Q* = 15.445, df = 9, p-value = 0.07943
##
## Model df: 1. Total lags used: 10
auto.arima() that Bitcoin was following an ARIMA(3, 1, 2) model, but a Random Walk was consistently present for every year following 2017. To be specific, a Random Walk with a drift. Let’s take another look at this model:## Series: bit_ts_tran
## ARIMA(0,1,0) with drift
##
## Coefficients:
## drift
## 1e-03
## s.e. 6e-04
##
## sigma^2 estimated as 0.0003913: log likelihood=2519.61
## AIC=-5035.22 AICc=-5035.21 BIC=-5025.39
##
## Training set error measures:
## ME RMSE MAE MPE MAPE
## Training set 5.059098e-06 0.01976218 0.01359546 0.0002911483 0.2328272
## MASE ACF1
## Training set 0.9946843 -0.02040916
err = residuals(Arima(bit_ts_tran, order = c(0,1,0), include.drift = T))
cat('Standard Deviation = ', sd(err))## Standard Deviation = 0.019772
## Mean = 5.059098e-06
invers_BoxCox = function(ts_data, lambda){
original_ts = (ts_data * lambda + 1) ** (1/lambda)
return(original_ts)
}
invers_BoxCox(sd(err), BoxCox.lambda(bit_ts))## [1] 1.019987
bitcoin2 = read_csv('Bitcoin Historical Data - Investing.com.csv')
names(bitcoin2)[7] = 'Change'
bit_df2 = bitcoin2 %>%
mutate(Date = mdy(Date)) %>%
filter(Date > as.Date('2017-01-01')) %>%
mutate(Vol. = as.numeric(str_sub(Vol., end = -2))*1000,
Change = as.numeric(str_sub(Change, end = -2))) %>%
arrange(Date)
ggplotly(ggplot(bit_df2, aes(Date, Price)) + geom_line(col = '#ffa500') +
labs(title = 'Bitcoin', x = '') +
scale_y_continuous(breaks = c(7500, 8000, 8500, 9000),
labels = c('$7,500', '$8,000', '$8,500', '$9,000')) + my_theme)Date Range: 08/28/2011 - 11/14/2019
## h is the the length you want the prediction to be in units of days
fit_model = function(bitcoin_data, h){
bitcoin_df = bitcoin_data %>%
mutate(Date = dmy(Date)) %>%
filter(Date >= as.Date('2017-01-01')) %>%
arrange(Date)
time_series = bitcoin_df %>%
select(Price) %>%
ts()
predictions = time_series %>%
BoxCox(lambda = BoxCox.lambda(time_series)) %>%
auto.arima() %>%
forecast(h)
forecast_df = cbind(data.frame(predictions[4]),
data.frame(predictions[5]),
data.frame(predictions[6]))
the_forecast = invers_BoxCox(forecast_df, lambda = BoxCox.lambda(time_series))
the_forecast = the_forecast %>%
mutate(Date = tail(bitcoin_df$Date, h) + h) %>%
as_tibble()
return(the_forecast)
}ggplotly(fit_model(bitcoin, 100) %>%
ggplot(aes(x = Date, y = mean)) + geom_line(col = '#ff2500') +
geom_ribbon(aes(ymin = lower.80., ymax = upper.80.), alpha = .3, fill = '#ffc04c') +
geom_ribbon(aes(ymin = lower.95., ymax = upper.95.), alpha = .3, fill = '#ffe4b2') +
geom_line(data = bit_df2, aes(Date, Price)) +
geom_line(data = filter(bit_df, Date >= as.Date('2015-01-01')), aes(Date, Price), col = '#ffa500') + my_theme +
labs(title = 'Bitcoin Prediction of 100 Days', y = 'Price', x = '') +
scale_y_continuous(breaks = c(0, 5000, 10000, 15000, 20000),
labels = c('$0', '$5,000', '$10,000', '$15,000', '$20,000')))A work by Timothy Sumner