1. Доступ к данным Bitcoin из сайта https://coinmarketcap.com/

# load libraries
library(rvest)
library(tidyverse)
url <-read_html("https://coinmarketcap.com/currencies/bitcoin/historical-data/?start=20130428&end=20190328")
data <- url %>% html_table()
df <- data[[1]]
df <- as_tibble(df)
colnames(df) <- c("date", "open", "high", "low", "close", "volume", "market_cap")
df$date <- as.Date(df$date, "%b %d,%Y")
knitr::kable(head(df))
date open high low close volume market_cap
2019-03-28 4087.58 4094.90 4040.27 4069.11 9,353,915,899 71,678,998,915
2019-03-27 3984.24 4087.07 3977.81 4087.07 10,897,131,934 71,987,847,571
2019-03-26 3969.23 3985.08 3944.75 3985.08 10,707,678,815 70,184,147,203
2019-03-25 4024.11 4038.84 3934.03 3963.07 10,359,818,883 69,789,872,373
2019-03-24 4035.16 4040.70 4006.19 4022.17 9,144,851,065 70,823,042,992
2019-03-23 4022.71 4049.88 4015.96 4035.83 9,578,850,549 71,056,017,910

2. Визуализация данных

# Plot using ggplot2
library(ggplot2)
ggplot(data = df, aes(x = date, y = close)) + geom_line(colour = "blue", size = 1) +
  labs(title = "Historical data for Bitcoin",
     subtitle = "Accessing Bitcoin Data for the Forvision project",
     caption = "sources: Bitcoin (BTC); https://coinmarketcap.com",
     x = "Date",
     y = "Close")

# Plot using dygraphs
# Covert data to xts object
df1 <- xts::xts(df$close, order.by=df$date)
names(df1) <- "Close"
library(dygraphs)
dygraph(df1,main = "Historical data for Bitcoin") %>%
        dyRangeSelector(height = 20)
# Last 365 days
df2 <- df[1:365, ]
library(ggplot2)
ggplot(data = df2, aes(x = date, y = close)) + geom_line(colour = "blue", size = 1) +
  labs(title = "Historical data of Bitcoin for last 365 days",
       subtitle = "Accessing Bitcoin Data for the Forvision project",
       caption = "sources: Bitcoin (BTC); https://coinmarketcap.com",
       x = "Date",
       y = "Close")

3. Подготовка данных для прогнозирования

  • Точка origin = “2019-01-01”
  • C точки origin сделать прогноз на 60 дней
library(dplyr)
train_df <- filter(df, date <= "2019-01-01")
test_df <- filter(df, date > "2019-01-01")
knitr::kable(head(train_df))
date open high low close volume market_cap
2019-01-01 3746.71 3850.91 3707.23 3843.52 4,324,200,990 67,098,634,181
2018-12-31 3866.84 3868.74 3725.87 3742.70 4,661,840,806 65,331,499,158
2018-12-30 3822.38 3901.91 3797.22 3865.95 4,770,578,575 67,475,512,827
2018-12-29 3932.49 3963.76 3820.41 3820.41 4,991,655,917 66,672,244,158
2018-12-28 3653.13 3956.14 3642.63 3923.92 5,631,554,348 68,471,837,969
2018-12-27 3854.69 3874.42 3645.45 3654.83 5,130,222,366 63,768,757,101
test_df <- test_df[(nrow(test_df)-60 + 1):nrow(test_df), ]
knitr::kable(tail(test_df))
date open high low close volume market_cap
2019-01-07 4078.59 4092.61 4020.89 4025.25 5,228,625,637 70,316,305,580
2019-01-06 3836.52 4093.30 3826.51 4076.63 5,597,027,440 71,206,795,853
2019-01-05 3851.97 3904.90 3836.90 3845.19 5,137,609,824 67,157,570,935
2019-01-04 3832.04 3865.93 3783.85 3857.72 4,847,965,467 67,368,333,500
2019-01-03 3931.05 3935.69 3826.22 3836.74 4,530,215,219 66,994,920,903
2019-01-02 3849.22 3947.98 3817.41 3943.41 5,244,856,836 68,849,856,732
# covert data to time series 
train_ts <- ts(data = rev(train_df$close), end=c(2019, 1), frequency = 365)
test_ts <- ts(data = rev(test_df$close), start=c(2019, 2), frequency = 365)
plot(train_ts)

# Seasonal decomposition
fit <- stl(train_ts, s.window="period")
plot(fit)

# Test stationary
library(tseries)
adf.test(train_df$close, alternative="stationary", k=0)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  train_df$close
## Dickey-Fuller = -2.2373, Lag order = 0, p-value = 0.4778
## alternative hypothesis: stationary

big p-value -> data is not stationary

adf.test(diff(log(train_df$close)), alternative="stationary", k=0)
## Warning in adf.test(diff(log(train_df$close)), alternative =
## "stationary", : p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff(log(train_df$close))
## Dickey-Fuller = -45.264, Lag order = 0, p-value = 0.01
## alternative hypothesis: stationary
plot(diff(log(train_ts)))

Small p-values -> diff(log(data)) is stationary

4 Построение auto.arima модели и прогнозирование

library(forecast)
fit <- auto.arima(log(train_ts), d = 1, ic = "aic", trace = TRUE)
## 
##  Fitting models using approximations to speed things up...
## 
##  ARIMA(2,1,2)(1,0,1)[365] with drift         : Inf
##  ARIMA(0,1,0)             with drift         : -7073.575
##  ARIMA(1,1,0)(1,0,0)[365] with drift         : Inf
##  ARIMA(0,1,1)(0,0,1)[365] with drift         : Inf
##  ARIMA(0,1,0)                                : -7072.756
##  ARIMA(0,1,0)(1,0,0)[365] with drift         : Inf
##  ARIMA(0,1,0)(0,0,1)[365] with drift         : Inf
##  ARIMA(0,1,0)(1,0,1)[365] with drift         : Inf
##  ARIMA(1,1,0)             with drift         : -7073.355
##  ARIMA(0,1,1)             with drift         : -7071.62
##  ARIMA(1,1,1)             with drift         : -7071.534
## 
##  Now re-fitting the best model(s) without approximations...
## 
##  ARIMA(0,1,0)             with drift         : -7079.827
## 
##  Best model: ARIMA(0,1,0)             with drift
# forecast, horizon = 60
forecast_fit <- forecast(fit, h = 60)
forecast <- exp(forecast_fit$mean)
forecast_data <- data.frame(date = rev(test_df$date), value = rev(test_df$close), forecast)
knitr::kable(forecast_data)
date value forecast
2019-01-02 3943.41 3849.742
2019-01-03 3836.74 3855.974
2019-01-04 3857.72 3862.216
2019-01-05 3845.19 3868.469
2019-01-06 4076.63 3874.731
2019-01-07 4025.25 3881.003
2019-01-08 4030.85 3887.286
2019-01-09 4035.30 3893.579
2019-01-10 3678.92 3899.882
2019-01-11 3687.37 3906.195
2019-01-12 3661.30 3912.519
2019-01-13 3552.95 3918.852
2019-01-14 3706.05 3925.196
2019-01-15 3630.68 3931.551
2019-01-16 3655.01 3937.915
2019-01-17 3678.56 3944.290
2019-01-18 3657.84 3950.675
2019-01-19 3728.57 3957.071
2019-01-20 3601.01 3963.476
2019-01-21 3576.03 3969.893
2019-01-22 3604.58 3976.319
2019-01-23 3585.12 3982.756
2019-01-24 3600.87 3989.204
2019-01-25 3599.77 3995.661
2019-01-26 3602.46 4002.130
2019-01-27 3583.97 4008.608
2019-01-28 3470.45 4015.098
2019-01-29 3448.12 4021.597
2019-01-30 3486.18 4028.108
2019-01-31 3457.79 4034.629
2019-02-01 3487.95 4041.160
2019-02-02 3521.06 4047.702
2019-02-03 3464.01 4054.254
2019-02-04 3459.15 4060.818
2019-02-05 3466.36 4067.391
2019-02-06 3413.77 4073.976
2019-02-07 3399.47 4080.571
2019-02-08 3666.78 4087.177
2019-02-09 3671.20 4093.793
2019-02-10 3690.19 4100.420
2019-02-11 3648.43 4107.058
2019-02-12 3653.53 4113.707
2019-02-13 3632.07 4120.366
2019-02-14 3616.88 4127.036
2019-02-15 3620.81 4133.717
2019-02-16 3629.79 4140.409
2019-02-17 3673.84 4147.112
2019-02-18 3915.71 4153.825
2019-02-19 3947.09 4160.549
2019-02-20 3999.82 4167.285
2019-02-21 3954.12 4174.031
2019-02-22 4005.53 4180.788
2019-02-23 4142.53 4187.556
2019-02-24 3810.43 4194.335
2019-02-25 3882.70 4201.125
2019-02-26 3854.36 4207.926
2019-02-27 3851.05 4214.737
2019-02-28 3854.79 4221.560
2019-03-01 3859.58 4228.394
2019-03-02 3864.42 4235.239
accuracy(forecast_data$value, forecast_data$forecast)
##                ME     RMSE      MAE      MPE     MAPE     ACF1 Theil's U
## Test set 330.1088 391.3159 354.2788 8.108618 8.731735 0.858627  60.05321
ggplot() + geom_line(data = forecast_data, aes(x = date, y = value, colour = "value"), size = 1) +
  geom_line(data = forecast_data, aes(date, forecast, colour = "forecast"), size = 1) +
  geom_line(data = train_df[1:100,], aes(date, close, colour = "value"), size = 1) + 
  ggtitle("60-days ahead bitcoin forecast from origin '01/01/2019'")