This is an attempt to model the BTC price using previous days data.
We use the Quandl API to retrieve historical BTC prices, filter >= 2014
# Bitcoin to EUR
btc <- Quandl("BCHARTS/BTCDEEUR")
btc <- subset(btc, Date>='2014-01-01')
dim(btc)
## [1] 1561 8
# Ethereum to EUR
# btc <- Quandl("BITFINEX/ETHUSD")
# btc$`Weighted Price`<-btc$Mid
the daily weighted BTC price
a simple correlationchart, showing the high degree of correlation between intra-day data
historical data via lag
ds <- btc %>%
mutate(Price = `Weighted Price`) %>%
dplyr::select(Date, Price) %>%
arrange(Date) %>%
mutate(Grow_0 = Price > lag(Price, 1),
Grow_1 = lag(Price, 1)>lag(Price, 2),
Grow_2 = lag(Price, 2)>lag(Price, 3),
Grow_3 = lag(Price, 3)>lag(Price, 4),
Grow_4 = lag(Price, 4)>lag(Price, 5),
Grow_5 = lag(Price, 5)>lag(Price, 6),
Grow_6 = lag(Price, 6)>lag(Price, 7),
Grow_5 = lag(Price, 7)>lag(Price, 8),
Grow_1w = lag(Price, 1)>lag(Price, 8),
Grow_2w = lag(Price, 1)>lag(Price, 15))
rownames(ds) <- ds$Date
ds <- ds %>%
dplyr::select(-Date, - Price)
ds <- ds[complete.cases(ds),]
head(ds)
## Grow_0 Grow_1 Grow_2 Grow_3 Grow_4 Grow_5 Grow_6 Grow_1w
## 2014-01-16 FALSE TRUE FALSE FALSE FALSE TRUE FALSE TRUE
## 2014-01-17 FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE
## 2014-01-18 TRUE FALSE FALSE TRUE FALSE TRUE FALSE FALSE
## 2014-01-19 TRUE TRUE FALSE FALSE TRUE FALSE FALSE FALSE
## 2014-01-20 TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
## 2014-01-21 FALSE TRUE TRUE TRUE FALSE FALSE TRUE TRUE
## Grow_2w
## 2014-01-16 TRUE
## 2014-01-17 TRUE
## 2014-01-18 TRUE
## 2014-01-19 FALSE
## 2014-01-20 FALSE
## 2014-01-21 FALSE
x = ds[, -1]
y = as.factor(ds$Grow_0)
a logistic regression model to see how today’s trend can be predicted based on historical data
Tools: 4 popular ML method x 30 results ( 3 repeats of 10-fold cross validation)
Methods: RF, lvw, svm, gbm
control = trainControl(method='cv',number=10, repeats=3)
set.seed(123)
model.rf = caret::train(x,y, method='rf' ,trControl=control, metric="Accuracy")
model.lvq = caret::train(x,y, method='lvq',trControl=control, metric="Accuracy")
## Loading required package: class
model.svm = caret::train(x,y, method='svmLinearWeights',trControl=control, metric="Accuracy")
# model.gbm = caret::train(x,y, method='gbm',trControl=control, metric="Accuracy", verbose=FALSE)
# model.wsrf = caret::train(x,y, method='glmnet',trControl=control, metric="Accuracy", verbose=FALSE)
# collect resamples
results <- resamples(list(RF = model.rf, LVQ=model.lvq, SVM=model.svm))
# summarize the distributions
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: RF, LVQ, SVM
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## RF 0.5454545 0.5737013 0.6051739 0.5976005 0.6213786 0.6322581 0
## LVQ 0.4870130 0.5202556 0.5630917 0.5543283 0.5845533 0.6168831 0
## SVM 0.5161290 0.5476854 0.5760369 0.5750733 0.5961039 0.6428571 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## RF 0.06634332 0.13623773 0.1925368 0.17573401 0.2208218 0.2490438 0
## LVQ -0.05479452 0.03299826 0.1259597 0.09997802 0.1668867 0.2360854 0
## SVM 0.01923564 0.08381188 0.1503385 0.14460743 0.1929021 0.2820817 0
All methods perform relatively well in terms of Accuracy, but Gradient Boosting Machine has the lowest median accuracy, same the lowest low Kappa. SVM has also the widest range:
# boxplots of results
bwplot(results)
# dot plots of results
dotplot(results)
Kappa = the amount of agreement correct by the agreement expected by chance. In other words, how much better, the classifier is than what would be expected by random chance.
# boxplots of results
importance.variables <- varImp(model.rf, scale=FALSE)
plot(importance.variables)
model.rf
## Random Forest
##
## 1546 samples
## 8 predictor
## 2 classes: 'FALSE', 'TRUE'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1391, 1392, 1391, 1391, 1392, 1392, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.5976005 0.1757340
## 5 0.5930967 0.1669685
## 8 0.5782246 0.1357573
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
table(ds$Grow_0, predict(model.rf))
##
## FALSE TRUE
## FALSE 384 329
## TRUE 215 618
ds <- btc %>%
mutate(Price = `Weighted Price`) %>%
dplyr::select(Date, Price) %>%
arrange(Date) %>%
mutate(Grow_0 = Price - lag(Price, 1),
Grow_1 = lag(Price, 1) - lag(Price, 2),
Grow_2 = lag(Price, 2) - lag(Price, 3),
Grow_3 = lag(Price, 3) - lag(Price, 4),
Grow_4 = lag(Price, 4) - lag(Price, 5),
Grow_5 = lag(Price, 5) - lag(Price, 6),
Grow_6 = lag(Price, 6) - lag(Price, 7),
Grow_5 = lag(Price, 7) - lag(Price, 8),
Grow_1w = lag(Price, 1)- lag(Price, 8),
Grow_2w = lag(Price, 1)- lag(Price, 15))
rownames(ds) <- ds$Date
ds <- ds %>%
dplyr::select(-Date, - Price)
summary(lm(ds$Grow_0 ~ ., data=ds))
##
## Call:
## lm(formula = ds$Grow_0 ~ ., data = ds)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2087.28 -7.27 -2.30 4.81 2147.22
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.755022 4.870120 0.566 0.571681
## Grow_1 0.129440 0.036232 3.573 0.000364 ***
## Grow_2 -0.044416 0.036121 -1.230 0.219023
## Grow_3 0.003266 0.035329 0.092 0.926353
## Grow_4 0.015525 0.038441 0.404 0.686361
## Grow_5 -0.016985 0.035182 -0.483 0.629314
## Grow_6 0.047966 0.038435 1.248 0.212221
## Grow_1w -0.028412 0.027612 -1.029 0.303665
## Grow_2w 0.012896 0.009214 1.400 0.161847
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 191.2 on 1537 degrees of freedom
## (15 observations deleted due to missingness)
## Multiple R-squared: 0.01843, Adjusted R-squared: 0.01332
## F-statistic: 3.607 on 8 and 1537 DF, p-value: 0.0003666
ds <- ds[complete.cases(ds),]
head(ds)
## Grow_0 Grow_1 Grow_2 Grow_3 Grow_4
## 2014-01-16 -3.381252 9.994949 -1.339137 -17.363274 -18.135906
## 2014-01-17 -18.349781 -3.381252 9.994949 -1.339137 -17.363274
## 2014-01-18 5.325823 -18.349781 -3.381252 9.994949 -1.339137
## 2014-01-19 8.485118 5.325823 -18.349781 -3.381252 9.994949
## 2014-01-20 4.655515 8.485118 5.325823 -18.349781 -3.381252
## 2014-01-21 -1.236676 4.655515 8.485118 5.325823 -18.349781
## Grow_5 Grow_6 Grow_1w Grow_2w
## 2014-01-16 1.182519 -1.248237 1.348046 81.530415
## 2014-01-17 -1.248237 28.257132 -3.215724 59.681852
## 2014-01-18 28.257132 -18.135906 -20.317268 3.330761
## 2014-01-19 -18.135906 -17.363274 -43.248577 -1.807973
## 2014-01-20 -17.363274 -1.339137 -16.627553 -42.469236
## 2014-01-21 -1.339137 9.994949 5.391235 -84.459317
x = ds[, -1]
y = as.factor(ds$Grow_0)
control = trainControl(method='cv',number=10, repeats=3)
lmCVFit<-train(Grow_0 ~ ., data = ds, method = "lm", trControl = control, metric="Rsquared")
summary(lmCVFit)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2087.28 -7.27 -2.30 4.81 2147.22
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.755022 4.870120 0.566 0.571681
## Grow_1 0.129440 0.036232 3.573 0.000364 ***
## Grow_2 -0.044416 0.036121 -1.230 0.219023
## Grow_3 0.003266 0.035329 0.092 0.926353
## Grow_4 0.015525 0.038441 0.404 0.686361
## Grow_5 -0.016985 0.035182 -0.483 0.629314
## Grow_6 0.047966 0.038435 1.248 0.212221
## Grow_1w -0.028412 0.027612 -1.029 0.303665
## Grow_2w 0.012896 0.009214 1.400 0.161847
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 191.2 on 1537 degrees of freedom
## Multiple R-squared: 0.01843, Adjusted R-squared: 0.01332
## F-statistic: 3.607 on 8 and 1537 DF, p-value: 0.0003666
experiment with facebook’s prophet package
## Not run:
library(prophet)
## Loading required package: Rcpp
#history <- data.frame(ds = seq(as.Date('2015-01-01'), as.Date('2016-01-01'), by = 'd'),y = sin(1:366/200) +rnorm(366)/10)
history <- data.frame(ds = btc$Date,y = btc$`Weighted Price`)
history <- history[seq(dim(history)[1],1),]
head(history)
## ds y
## 1561 2014-01-01 535.5875
## 1560 2014-01-02 554.0548
## 1559 2014-01-03 592.0562
## 1558 2014-01-04 602.5207
## 1557 2014-01-05 651.6671
## 1556 2014-01-06 698.3127
m <- prophet(history)
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
## Initial log joint probability = -22.2709
## Optimization terminated normally:
## Convergence detected: relative gradient magnitude is below tolerance
future <- make_future_dataframe(m, periods = 365)
forecast <- predict(m, future)
plot(m, forecast)
## End(Not run)