# load libraries
library(rvest)
library(tidyverse)
url <-read_html("https://coinmarketcap.com/currencies/bitcoin/historical-data/?start=20130428&end=20190328")
data <- url %>% html_table()
df <- data[[1]]
df <- as_tibble(df)
colnames(df) <- c("date", "open", "high", "low", "close", "volume", "market_cap")
df$date <- as.Date(df$date, "%b %d,%Y")
knitr::kable(head(df))
| 2019-03-28 |
4087.58 |
4094.90 |
4040.27 |
4069.11 |
9,353,915,899 |
71,678,998,915 |
| 2019-03-27 |
3984.24 |
4087.07 |
3977.81 |
4087.07 |
10,897,131,934 |
71,987,847,571 |
| 2019-03-26 |
3969.23 |
3985.08 |
3944.75 |
3985.08 |
10,707,678,815 |
70,184,147,203 |
| 2019-03-25 |
4024.11 |
4038.84 |
3934.03 |
3963.07 |
10,359,818,883 |
69,789,872,373 |
| 2019-03-24 |
4035.16 |
4040.70 |
4006.19 |
4022.17 |
9,144,851,065 |
70,823,042,992 |
| 2019-03-23 |
4022.71 |
4049.88 |
4015.96 |
4035.83 |
9,578,850,549 |
71,056,017,910 |
2. Визуализация данных
# Plot using ggplot2
library(ggplot2)
ggplot(data = df, aes(x = date, y = close)) + geom_line(colour = "blue", size = 1) +
labs(title = "Historical data for Bitcoin",
subtitle = "Accessing Bitcoin Data for the Forvision project",
caption = "sources: Bitcoin (BTC); https://coinmarketcap.com",
x = "Date",
y = "Close")

# Plot using dygraphs
# Covert data to xts object
df1 <- xts::xts(df$close, order.by=df$date)
names(df1) <- "Close"
library(dygraphs)
dygraph(df1,main = "Historical data for Bitcoin") %>%
dyRangeSelector(height = 20)
# Last 365 days
df2 <- df[1:365, ]
library(ggplot2)
ggplot(data = df2, aes(x = date, y = close)) + geom_line(colour = "blue", size = 1) +
labs(title = "Historical data of Bitcoin for last 365 days",
subtitle = "Accessing Bitcoin Data for the Forvision project",
caption = "sources: Bitcoin (BTC); https://coinmarketcap.com",
x = "Date",
y = "Close")

3. Подготовка данных для прогнозирования
- Точка origin = “2019-01-01”
- C точки origin сделать прогноз на 60 дней
library(dplyr)
train_df <- filter(df, date <= "2019-01-01")
test_df <- filter(df, date > "2019-01-01")
knitr::kable(head(train_df))
| 2019-01-01 |
3746.71 |
3850.91 |
3707.23 |
3843.52 |
4,324,200,990 |
67,098,634,181 |
| 2018-12-31 |
3866.84 |
3868.74 |
3725.87 |
3742.70 |
4,661,840,806 |
65,331,499,158 |
| 2018-12-30 |
3822.38 |
3901.91 |
3797.22 |
3865.95 |
4,770,578,575 |
67,475,512,827 |
| 2018-12-29 |
3932.49 |
3963.76 |
3820.41 |
3820.41 |
4,991,655,917 |
66,672,244,158 |
| 2018-12-28 |
3653.13 |
3956.14 |
3642.63 |
3923.92 |
5,631,554,348 |
68,471,837,969 |
| 2018-12-27 |
3854.69 |
3874.42 |
3645.45 |
3654.83 |
5,130,222,366 |
63,768,757,101 |
test_df <- test_df[(nrow(test_df)-60 + 1):nrow(test_df), ]
knitr::kable(tail(test_df))
| 2019-01-07 |
4078.59 |
4092.61 |
4020.89 |
4025.25 |
5,228,625,637 |
70,316,305,580 |
| 2019-01-06 |
3836.52 |
4093.30 |
3826.51 |
4076.63 |
5,597,027,440 |
71,206,795,853 |
| 2019-01-05 |
3851.97 |
3904.90 |
3836.90 |
3845.19 |
5,137,609,824 |
67,157,570,935 |
| 2019-01-04 |
3832.04 |
3865.93 |
3783.85 |
3857.72 |
4,847,965,467 |
67,368,333,500 |
| 2019-01-03 |
3931.05 |
3935.69 |
3826.22 |
3836.74 |
4,530,215,219 |
66,994,920,903 |
| 2019-01-02 |
3849.22 |
3947.98 |
3817.41 |
3943.41 |
5,244,856,836 |
68,849,856,732 |
# covert data to time series
train_ts <- ts(data = rev(train_df$close), end=c(2019, 1), frequency = 365)
test_ts <- ts(data = rev(test_df$close), start=c(2019, 2), frequency = 365)
plot(train_ts)

# Seasonal decomposition
fit <- stl(train_ts, s.window="period")
plot(fit)

# Test stationary
library(tseries)
adf.test(train_df$close, alternative="stationary", k=0)
##
## Augmented Dickey-Fuller Test
##
## data: train_df$close
## Dickey-Fuller = -2.2373, Lag order = 0, p-value = 0.4778
## alternative hypothesis: stationary
big p-value -> data is not stationary
adf.test(diff(log(train_df$close)), alternative="stationary", k=0)
## Warning in adf.test(diff(log(train_df$close)), alternative =
## "stationary", : p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: diff(log(train_df$close))
## Dickey-Fuller = -45.264, Lag order = 0, p-value = 0.01
## alternative hypothesis: stationary
plot(diff(log(train_ts)))
Small p-values -> diff(log(data)) is stationary
4 Построение auto.arima модели и прогнозирование
library(forecast)
fit <- auto.arima(log(train_ts), d = 1, ic = "aic", trace = TRUE)
##
## Fitting models using approximations to speed things up...
##
## ARIMA(2,1,2)(1,0,1)[365] with drift : Inf
## ARIMA(0,1,0) with drift : -7073.575
## ARIMA(1,1,0)(1,0,0)[365] with drift : Inf
## ARIMA(0,1,1)(0,0,1)[365] with drift : Inf
## ARIMA(0,1,0) : -7072.756
## ARIMA(0,1,0)(1,0,0)[365] with drift : Inf
## ARIMA(0,1,0)(0,0,1)[365] with drift : Inf
## ARIMA(0,1,0)(1,0,1)[365] with drift : Inf
## ARIMA(1,1,0) with drift : -7073.355
## ARIMA(0,1,1) with drift : -7071.62
## ARIMA(1,1,1) with drift : -7071.534
##
## Now re-fitting the best model(s) without approximations...
##
## ARIMA(0,1,0) with drift : -7079.827
##
## Best model: ARIMA(0,1,0) with drift
# forecast, horizon = 60
forecast_fit <- forecast(fit, h = 60)
forecast <- exp(forecast_fit$mean)
forecast_data <- data.frame(date = rev(test_df$date), value = rev(test_df$close), forecast)
knitr::kable(forecast_data)
| 2019-01-02 |
3943.41 |
3849.742 |
| 2019-01-03 |
3836.74 |
3855.974 |
| 2019-01-04 |
3857.72 |
3862.216 |
| 2019-01-05 |
3845.19 |
3868.469 |
| 2019-01-06 |
4076.63 |
3874.731 |
| 2019-01-07 |
4025.25 |
3881.003 |
| 2019-01-08 |
4030.85 |
3887.286 |
| 2019-01-09 |
4035.30 |
3893.579 |
| 2019-01-10 |
3678.92 |
3899.882 |
| 2019-01-11 |
3687.37 |
3906.195 |
| 2019-01-12 |
3661.30 |
3912.519 |
| 2019-01-13 |
3552.95 |
3918.852 |
| 2019-01-14 |
3706.05 |
3925.196 |
| 2019-01-15 |
3630.68 |
3931.551 |
| 2019-01-16 |
3655.01 |
3937.915 |
| 2019-01-17 |
3678.56 |
3944.290 |
| 2019-01-18 |
3657.84 |
3950.675 |
| 2019-01-19 |
3728.57 |
3957.071 |
| 2019-01-20 |
3601.01 |
3963.476 |
| 2019-01-21 |
3576.03 |
3969.893 |
| 2019-01-22 |
3604.58 |
3976.319 |
| 2019-01-23 |
3585.12 |
3982.756 |
| 2019-01-24 |
3600.87 |
3989.204 |
| 2019-01-25 |
3599.77 |
3995.661 |
| 2019-01-26 |
3602.46 |
4002.130 |
| 2019-01-27 |
3583.97 |
4008.608 |
| 2019-01-28 |
3470.45 |
4015.098 |
| 2019-01-29 |
3448.12 |
4021.597 |
| 2019-01-30 |
3486.18 |
4028.108 |
| 2019-01-31 |
3457.79 |
4034.629 |
| 2019-02-01 |
3487.95 |
4041.160 |
| 2019-02-02 |
3521.06 |
4047.702 |
| 2019-02-03 |
3464.01 |
4054.254 |
| 2019-02-04 |
3459.15 |
4060.818 |
| 2019-02-05 |
3466.36 |
4067.391 |
| 2019-02-06 |
3413.77 |
4073.976 |
| 2019-02-07 |
3399.47 |
4080.571 |
| 2019-02-08 |
3666.78 |
4087.177 |
| 2019-02-09 |
3671.20 |
4093.793 |
| 2019-02-10 |
3690.19 |
4100.420 |
| 2019-02-11 |
3648.43 |
4107.058 |
| 2019-02-12 |
3653.53 |
4113.707 |
| 2019-02-13 |
3632.07 |
4120.366 |
| 2019-02-14 |
3616.88 |
4127.036 |
| 2019-02-15 |
3620.81 |
4133.717 |
| 2019-02-16 |
3629.79 |
4140.409 |
| 2019-02-17 |
3673.84 |
4147.112 |
| 2019-02-18 |
3915.71 |
4153.825 |
| 2019-02-19 |
3947.09 |
4160.549 |
| 2019-02-20 |
3999.82 |
4167.285 |
| 2019-02-21 |
3954.12 |
4174.031 |
| 2019-02-22 |
4005.53 |
4180.788 |
| 2019-02-23 |
4142.53 |
4187.556 |
| 2019-02-24 |
3810.43 |
4194.335 |
| 2019-02-25 |
3882.70 |
4201.125 |
| 2019-02-26 |
3854.36 |
4207.926 |
| 2019-02-27 |
3851.05 |
4214.737 |
| 2019-02-28 |
3854.79 |
4221.560 |
| 2019-03-01 |
3859.58 |
4228.394 |
| 2019-03-02 |
3864.42 |
4235.239 |
accuracy(forecast_data$value, forecast_data$forecast)
## ME RMSE MAE MPE MAPE ACF1 Theil's U
## Test set 330.1088 391.3159 354.2788 8.108618 8.731735 0.858627 60.05321
ggplot() + geom_line(data = forecast_data, aes(x = date, y = value, colour = "value"), size = 1) +
geom_line(data = forecast_data, aes(date, forecast, colour = "forecast"), size = 1) +
geom_line(data = train_df[1:100,], aes(date, close, colour = "value"), size = 1) +
ggtitle("60-days ahead bitcoin forecast from origin '01/01/2019'")
