BTC Modeling

This is an attempt to model the BTC price using previous days data.

Retrieving BTC data

We use the Quandl API to retrieve historical BTC prices, filter >= 2014

# Bitcoin to EUR
btc <- Quandl("BCHARTS/BTCDEEUR")
btc <- subset(btc, Date>='2014-01-01')
dim(btc)
## [1] 1561    8
# Ethereum to EUR
# btc <- Quandl("BITFINEX/ETHUSD")
# btc$`Weighted Price`<-btc$Mid

Dataset correlation coefficients

a simple correlationchart, showing the high degree of correlation between intra-day data

Data Preparation

Preparing prediction using historical data

historical data via lag

ds <- btc %>%
    mutate(Price = `Weighted Price`) %>%
    dplyr::select(Date, Price) %>%
    arrange(Date) %>%
    mutate(Grow_0 = Price > lag(Price, 1), 
           Grow_1 = lag(Price, 1)>lag(Price, 2), 
           Grow_2 = lag(Price, 2)>lag(Price, 3), 
           Grow_3 = lag(Price, 3)>lag(Price, 4), 
           Grow_4 = lag(Price, 4)>lag(Price, 5), 
           Grow_5 = lag(Price, 5)>lag(Price, 6), 
           Grow_6 = lag(Price, 6)>lag(Price, 7), 
           Grow_5 = lag(Price, 7)>lag(Price, 8), 
           Grow_1w = lag(Price, 1)>lag(Price, 8), 
           Grow_2w = lag(Price, 1)>lag(Price, 15))
rownames(ds) <- ds$Date
ds <- ds %>%
    dplyr::select(-Date, - Price)

  ds <- ds[complete.cases(ds),]
  head(ds)
##            Grow_0 Grow_1 Grow_2 Grow_3 Grow_4 Grow_5 Grow_6 Grow_1w
## 2014-01-16  FALSE   TRUE  FALSE  FALSE  FALSE   TRUE  FALSE    TRUE
## 2014-01-17  FALSE  FALSE   TRUE  FALSE  FALSE  FALSE   TRUE   FALSE
## 2014-01-18   TRUE  FALSE  FALSE   TRUE  FALSE   TRUE  FALSE   FALSE
## 2014-01-19   TRUE   TRUE  FALSE  FALSE   TRUE  FALSE  FALSE   FALSE
## 2014-01-20   TRUE   TRUE   TRUE  FALSE  FALSE  FALSE  FALSE   FALSE
## 2014-01-21  FALSE   TRUE   TRUE   TRUE  FALSE  FALSE   TRUE    TRUE
##            Grow_2w
## 2014-01-16    TRUE
## 2014-01-17    TRUE
## 2014-01-18    TRUE
## 2014-01-19   FALSE
## 2014-01-20   FALSE
## 2014-01-21   FALSE
  x = ds[, -1]
  y = as.factor(ds$Grow_0)

Prediction comparison

BTC Modeling using several ML algos

a logistic regression model to see how today’s trend can be predicted based on historical data

Tools: 4 popular ML method x 30 results ( 3 repeats of 10-fold cross validation)

Methods: RF, lvw, svm, gbm

control = trainControl(method='cv',number=10, repeats=3)

set.seed(123)
model.rf  = caret::train(x,y, method='rf' ,trControl=control, metric="Accuracy")
model.lvq = caret::train(x,y, method='lvq',trControl=control, metric="Accuracy")
## Loading required package: class
model.svm = caret::train(x,y, method='svmLinearWeights',trControl=control, metric="Accuracy")

# model.gbm = caret::train(x,y, method='gbm',trControl=control, metric="Accuracy", verbose=FALSE)
# model.wsrf = caret::train(x,y, method='glmnet',trControl=control, metric="Accuracy", verbose=FALSE)


# collect resamples
results <- resamples(list(RF = model.rf, LVQ=model.lvq, SVM=model.svm))
# summarize the distributions
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: RF, LVQ, SVM 
## Number of resamples: 10 
## 
## Accuracy 
##          Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## RF  0.5454545 0.5737013 0.6051739 0.5976005 0.6213786 0.6322581    0
## LVQ 0.4870130 0.5202556 0.5630917 0.5543283 0.5845533 0.6168831    0
## SVM 0.5161290 0.5476854 0.5760369 0.5750733 0.5961039 0.6428571    0
## 
## Kappa 
##            Min.    1st Qu.    Median       Mean   3rd Qu.      Max. NA's
## RF   0.06634332 0.13623773 0.1925368 0.17573401 0.2208218 0.2490438    0
## LVQ -0.05479452 0.03299826 0.1259597 0.09997802 0.1668867 0.2360854    0
## SVM  0.01923564 0.08381188 0.1503385 0.14460743 0.1929021 0.2820817    0

All methods perform relatively well in terms of Accuracy, but Gradient Boosting Machine has the lowest median accuracy, same the lowest low Kappa. SVM has also the widest range:

# boxplots of results
bwplot(results)

# dot plots of results
dotplot(results)

Optimal choice so far: Random Forest

mean Accuracy 58,2%

mean Kappa: 14,1%

Kappa = the amount of agreement correct by the agreement expected by chance. In other words, how much better, the classifier is than what would be expected by random chance.

# boxplots of results

importance.variables <- varImp(model.rf, scale=FALSE)
plot(importance.variables)

Model details

model.rf 
## Random Forest 
## 
## 1546 samples
##    8 predictor
##    2 classes: 'FALSE', 'TRUE' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1391, 1392, 1391, 1391, 1392, 1392, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##   2     0.5976005  0.1757340
##   5     0.5930967  0.1669685
##   8     0.5782246  0.1357573
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was mtry = 2.

Build confusion matrix for Random Forest

table(ds$Grow_0, predict(model.rf))
##        
##         FALSE TRUE
##   FALSE   384  329
##   TRUE    215  618
ds <- btc %>%
    mutate(Price = `Weighted Price`) %>%
    dplyr::select(Date, Price) %>%
    arrange(Date) %>%
    mutate(Grow_0 = Price - lag(Price, 1), 
           Grow_1 = lag(Price, 1) - lag(Price, 2), 
           Grow_2 = lag(Price, 2) - lag(Price, 3), 
           Grow_3 = lag(Price, 3) - lag(Price, 4), 
           Grow_4 = lag(Price, 4) - lag(Price, 5), 
           Grow_5 = lag(Price, 5) - lag(Price, 6), 
           Grow_6 = lag(Price, 6) - lag(Price, 7), 
           Grow_5 = lag(Price, 7) - lag(Price, 8), 
           Grow_1w = lag(Price, 1)- lag(Price, 8), 
           Grow_2w = lag(Price, 1)- lag(Price, 15))
rownames(ds) <- ds$Date
ds <- ds %>%
    dplyr::select(-Date, - Price)

summary(lm(ds$Grow_0 ~ ., data=ds))
## 
## Call:
## lm(formula = ds$Grow_0 ~ ., data = ds)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2087.28    -7.27    -2.30     4.81  2147.22 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.755022   4.870120   0.566 0.571681    
## Grow_1       0.129440   0.036232   3.573 0.000364 ***
## Grow_2      -0.044416   0.036121  -1.230 0.219023    
## Grow_3       0.003266   0.035329   0.092 0.926353    
## Grow_4       0.015525   0.038441   0.404 0.686361    
## Grow_5      -0.016985   0.035182  -0.483 0.629314    
## Grow_6       0.047966   0.038435   1.248 0.212221    
## Grow_1w     -0.028412   0.027612  -1.029 0.303665    
## Grow_2w      0.012896   0.009214   1.400 0.161847    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 191.2 on 1537 degrees of freedom
##   (15 observations deleted due to missingness)
## Multiple R-squared:  0.01843,    Adjusted R-squared:  0.01332 
## F-statistic: 3.607 on 8 and 1537 DF,  p-value: 0.0003666
  ds <- ds[complete.cases(ds),]
  head(ds)
##                Grow_0     Grow_1     Grow_2     Grow_3     Grow_4
## 2014-01-16  -3.381252   9.994949  -1.339137 -17.363274 -18.135906
## 2014-01-17 -18.349781  -3.381252   9.994949  -1.339137 -17.363274
## 2014-01-18   5.325823 -18.349781  -3.381252   9.994949  -1.339137
## 2014-01-19   8.485118   5.325823 -18.349781  -3.381252   9.994949
## 2014-01-20   4.655515   8.485118   5.325823 -18.349781  -3.381252
## 2014-01-21  -1.236676   4.655515   8.485118   5.325823 -18.349781
##                Grow_5     Grow_6    Grow_1w    Grow_2w
## 2014-01-16   1.182519  -1.248237   1.348046  81.530415
## 2014-01-17  -1.248237  28.257132  -3.215724  59.681852
## 2014-01-18  28.257132 -18.135906 -20.317268   3.330761
## 2014-01-19 -18.135906 -17.363274 -43.248577  -1.807973
## 2014-01-20 -17.363274  -1.339137 -16.627553 -42.469236
## 2014-01-21  -1.339137   9.994949   5.391235 -84.459317
  x = ds[, -1]
  y = as.factor(ds$Grow_0)
  
  
control = trainControl(method='cv',number=10, repeats=3)
lmCVFit<-train(Grow_0 ~ ., data = ds, method = "lm", trControl = control, metric="Rsquared")
summary(lmCVFit)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2087.28    -7.27    -2.30     4.81  2147.22 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.755022   4.870120   0.566 0.571681    
## Grow_1       0.129440   0.036232   3.573 0.000364 ***
## Grow_2      -0.044416   0.036121  -1.230 0.219023    
## Grow_3       0.003266   0.035329   0.092 0.926353    
## Grow_4       0.015525   0.038441   0.404 0.686361    
## Grow_5      -0.016985   0.035182  -0.483 0.629314    
## Grow_6       0.047966   0.038435   1.248 0.212221    
## Grow_1w     -0.028412   0.027612  -1.029 0.303665    
## Grow_2w      0.012896   0.009214   1.400 0.161847    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 191.2 on 1537 degrees of freedom
## Multiple R-squared:  0.01843,    Adjusted R-squared:  0.01332 
## F-statistic: 3.607 on 8 and 1537 DF,  p-value: 0.0003666

Prophet’s Time series prediction

experiment with facebook’s prophet package

## Not run:
library(prophet)
## Loading required package: Rcpp
#history <- data.frame(ds = seq(as.Date('2015-01-01'), as.Date('2016-01-01'), by = 'd'),y = sin(1:366/200) +rnorm(366)/10)
history <- data.frame(ds = btc$Date,y = btc$`Weighted Price`)
history <- history[seq(dim(history)[1],1),]
head(history)
##              ds        y
## 1561 2014-01-01 535.5875
## 1560 2014-01-02 554.0548
## 1559 2014-01-03 592.0562
## 1558 2014-01-04 602.5207
## 1557 2014-01-05 651.6671
## 1556 2014-01-06 698.3127
m <- prophet(history)
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
## Initial log joint probability = -22.2709
## Optimization terminated normally: 
##   Convergence detected: relative gradient magnitude is below tolerance
future <- make_future_dataframe(m, periods = 365)
forecast <- predict(m, future)
plot(m, forecast)

## End(Not run)