Source of bitcoin data is https://blockchain.info/stats

Load required libraries

library(corrplot)
## Warning: package 'corrplot' was built under R version 3.4.2
## corrplot 0.84 loaded
library(ggplot2)
library(ggfortify)
library(zoo)
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lubridate)
## Warning: package 'lubridate' was built under R version 3.4.2
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(car)
## Warning: package 'car' was built under R version 3.4.4
## Loading required package: carData
## Warning: package 'carData' was built under R version 3.4.4
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(reshape2) # for melt
library(data.table)
## Warning: package 'data.table' was built under R version 3.4.2
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:reshape2':
## 
##     dcast, melt
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday,
##     week, yday, year
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library(animation)
library(feather)

Data Import and Cleaning

Read the dataset

# Load the raw data set; the first line of the CSV holds the column names.
bitcoin <- read.csv(file = "bitcoin_data.csv", header = TRUE)
# Print the type and the first few values of every column.
str(object = bitcoin)
## 'data.frame':    1711 obs. of  13 variables:
##  $ Date                       : Factor w/ 1711 levels "1/1/11","1/1/14",..: 110 130 140 150 10 20 30 40 50 65 ...
##  $ Day                        : int  3 5 7 9 11 13 15 17 19 21 ...
##  $ Week                       : int  1 2 2 2 3 3 3 3 4 4 ...
##  $ Price                      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Transaction_fees..BTC.     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ No_of_transactions         : int  1 0 0 14 106 116 136 109 120 115 ...
##  $ Output_value..BTC.         : num  0 0 0 0 329 ...
##  $ estimated_transaction_value: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Miners_revenue             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Cost_per_transaction       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Difficulty                 : num  1 0 0 1 1 1 1 1 1 1 ...
##  $ Hash_rate                  : num  4.97e-08 0.00 0.00 6.96e-07 5.27e-06 5.72e-06 6.31e-06 5.37e-06 5.92e-06 5.62e-06 ...
##  $ Trade_volume               : num  0 0 0 0 0 0 0 0 0 0 ...

Convert the Date column to the Date class (the "..BTC." suffixes in the column names are left unchanged)

# Parse the month/day/two-digit-year values into Date objects.
bitcoin[["Date"]] <- as.Date(bitcoin[["Date"]], format = "%m/%d/%y")
## Warning in strptime(x, format, tz = "GMT"): unknown timezone 'zone/tz/
## 2018c.1.0/zoneinfo/America/Los_Angeles'
# Preview the first six rows after the date conversion.
head(bitcoin, n = 6L)
##         Date Day Week Price Transaction_fees..BTC. No_of_transactions
## 1 2009-01-03   3    1     0                      0                  1
## 2 2009-01-05   5    2     0                      0                  0
## 3 2009-01-07   7    2     0                      0                  0
## 4 2009-01-09   9    2     0                      0                 14
## 5 2009-01-11  11    3     0                      0                106
## 6 2009-01-13  13    3     0                      0                116
##   Output_value..BTC. estimated_transaction_value Miners_revenue
## 1             0.0000                           0              0
## 2             0.0000                           0              0
## 3             0.0000                           0              0
## 4             0.0000                           0              0
## 5           328.5714                           0              0
## 6          1743.0000                           0              0
##   Cost_per_transaction Difficulty Hash_rate Trade_volume
## 1                    0          1  4.97e-08            0
## 2                    0          0  0.00e+00            0
## 3                    0          0  0.00e+00            0
## 4                    0          1  6.96e-07            0
## 5                    0          1  5.27e-06            0
## 6                    0          1  5.72e-06            0

Check for missing values

# TRUE if at least one cell of the data frame is NA.
any(is.na(bitcoin))
## [1] FALSE

Get summary of the columns in bitcoin dataset

# Per-column summary statistics (min, quartiles, mean, max).
summary(object = bitcoin)
##       Date                 Day             Week          Price          
##  Min.   :2009-01-03   Min.   :  1.0   Min.   : 1.0   Min.   :    0.000  
##  1st Qu.:2011-05-08   1st Qu.: 86.0   1st Qu.:13.0   1st Qu.:    2.528  
##  Median :2013-09-09   Median :176.0   Median :26.0   Median :  128.100  
##  Mean   :2013-09-09   Mean   :178.7   Mean   :26.4   Mean   :  986.565  
##  3rd Qu.:2016-01-12   3rd Qu.:271.0   3rd Qu.:40.0   3rd Qu.:  586.330  
##  Max.   :2018-05-16   Max.   :366.0   Max.   :53.0   Max.   :19289.785  
##  Transaction_fees..BTC. No_of_transactions Output_value..BTC.
##  Min.   :   0.000       Min.   :     0     Min.   :       0  
##  1st Qu.:   3.252       1st Qu.:  4380     1st Qu.:  280139  
##  Median :  16.172       Median : 55371     Median :  850113  
##  Mean   :  52.955       Mean   : 92739     Mean   : 1171801  
##  3rd Qu.:  44.526       3rd Qu.:176783     3rd Qu.: 1658677  
##  Max.   :1128.762       Max.   :425008     Max.   :21158969  
##  estimated_transaction_value Miners_revenue     Cost_per_transaction
##  Min.   :0.000e+00           Min.   :       0   Min.   :  0.000     
##  1st Qu.:2.704e+05           1st Qu.:   15842   1st Qu.:  2.293     
##  Median :2.264e+07           Median :  607245   Median :  7.191     
##  Mean   :1.782e+08           Mean   : 2387408   Mean   : 15.239     
##  3rd Qu.:1.111e+08           3rd Qu.: 1800091   3rd Qu.: 14.148     
##  Max.   :3.987e+09           Max.   :53191582   Max.   :146.595     
##    Difficulty          Hash_rate         Trade_volume      
##  Min.   :0.000e+00   Min.   :       0   Min.   :0.000e+00  
##  1st Qu.:1.097e+05   1st Qu.:       1   1st Qu.:5.952e+04  
##  Median :8.693e+07   Median :     834   Median :6.144e+06  
##  Mean   :2.436e+11   Mean   : 1857248   Mean   :8.856e+07  
##  3rd Qu.:1.085e+11   3rd Qu.:  830330   3rd Qu.:2.620e+07  
##  Max.   :4.140e+12   Max.   :36872804   Max.   :5.352e+09

There are no missing values in data

# Scatter plot of the Bitcoin price (USD) against the date.
# Columns are given as bare names inside aes() so that ggplot2 looks them up
# in the `data` argument instead of capturing external vectors via `$`.
ggplot(bitcoin, aes(x = Date, y = Price)) +
  geom_point(color = "dark blue") +
  ggtitle("Bitcoin values in USD trend") +
  labs(x = "Date", y = "Bitcoin price in USD") +
  theme_minimal()

Find variables with high correlation to bitcoin market price

# Pairwise correlations between all non-date columns (columns 2 to 13).
cor <- cor(bitcoin[, 2:13])
corrplot(
  cor,
  method = "square",
  type = "upper",
  tl.srt = 70,
  tl.col = "black",
  tl.cex = 0.5,
  title = "Correlation of Variables"
)

# Keep the variables whose correlation with Price is at least 0.75 and
# print their names.
correlation <- as.data.frame(cor)
strongly_related <- correlation[["Price"]] >= 0.75
highly_correlated <- correlation[strongly_related, ]
row.names(highly_correlated)
## [1] "Price"                       "estimated_transaction_value"
## [3] "Miners_revenue"              "Cost_per_transaction"       
## [5] "Difficulty"                  "Hash_rate"                  
## [7] "Trade_volume"

These are the highly correlated variables to bitcoin market price

Model building

Additive model with all variables

# Additive linear model: Price regressed on every other column of `bitcoin`.
fit.lm1 <- lm(formula = Price ~ ., data = bitcoin)
summary(fit.lm1)
## 
## Call:
## lm(formula = Price ~ ., data = bitcoin)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1373.49   -38.98     5.00    49.31  2708.72 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  8.356e+02  2.202e+02   3.795 0.000153 ***
## Date                        -5.308e-02  1.463e-02  -3.628 0.000294 ***
## Day                          1.044e+01  1.807e+00   5.778 8.97e-09 ***
## Week                        -7.184e+01  1.263e+01  -5.687 1.52e-08 ***
## Transaction_fees..BTC.      -1.007e+00  9.109e-02 -11.052  < 2e-16 ***
## No_of_transactions           3.111e-04  1.521e-04   2.046 0.040954 *  
## Output_value..BTC.          -8.394e-06  3.394e-06  -2.474 0.013474 *  
## estimated_transaction_value  8.710e-07  5.009e-08  17.388  < 2e-16 ***
## Miners_revenue               3.120e-04  4.919e-06  63.417  < 2e-16 ***
## Cost_per_transaction        -3.503e+00  5.714e-01  -6.132 1.08e-09 ***
## Difficulty                   2.437e-09  7.233e-11  33.685  < 2e-16 ***
## Hash_rate                   -1.991e-04  9.975e-06 -19.960  < 2e-16 ***
## Trade_volume                -7.904e-08  2.943e-08  -2.686 0.007305 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 194 on 1698 degrees of freedom
## Multiple R-squared:  0.9944, Adjusted R-squared:  0.9943 
## F-statistic: 2.499e+04 on 12 and 1698 DF,  p-value: < 2.2e-16

R^2 is 99%, which might indicate overfitting. All coefficients are significant at the 5% level, although No_of_transactions, Output_value..BTC. and Trade_volume contribute the least (they have the largest p-values in the model).

Check residuals
# Diagnostic plots for fit.lm1, drawn with the minimal ggplot2 theme.
diagnostics_lm1 <- autoplot(fit.lm1, label = 0, col = "goldenrod1")
diagnostics_lm1 + theme_minimal()

The residuals indicate heteroscedasticity and deviate from a normal distribution.

Running step function to get a model with the lowest AIC

# Refit the full additive model and run AIC-based stepwise selection on it.
# No scope is supplied, so the search can only drop terms (backward).
fit_full <- lm(formula = Price ~ ., data = bitcoin)
step(fit_full, direction = "backward")
## Start:  AIC=18038.74
## Price ~ Date + Day + Week + Transaction_fees..BTC. + No_of_transactions + 
##     Output_value..BTC. + estimated_transaction_value + Miners_revenue + 
##     Cost_per_transaction + Difficulty + Hash_rate + Trade_volume
## 
##                               Df Sum of Sq       RSS   AIC
## <none>                                      63875221 18039
## - No_of_transactions           1    157404  64032624 18041
## - Output_value..BTC.           1    230168  64105388 18043
## - Trade_volume                 1    271371  64146592 18044
## - Date                         1    495113  64370334 18050
## - Week                         1   1216459  65091680 18069
## - Day                          1   1255952  65131173 18070
## - Cost_per_transaction         1   1414393  65289614 18074
## - Transaction_fees..BTC.       1   4594609  68469830 18156
## - estimated_transaction_value  1  11373251  75248471 18317
## - Hash_rate                    1  14987001  78862221 18397
## - Difficulty                   1  42684308 106559529 18912
## - Miners_revenue               1 151287467 215162688 20115
## 
## Call:
## lm(formula = Price ~ Date + Day + Week + Transaction_fees..BTC. + 
##     No_of_transactions + Output_value..BTC. + estimated_transaction_value + 
##     Miners_revenue + Cost_per_transaction + Difficulty + Hash_rate + 
##     Trade_volume, data = bitcoin)
## 
## Coefficients:
##                 (Intercept)                         Date  
##                   8.356e+02                   -5.308e-02  
##                         Day                         Week  
##                   1.044e+01                   -7.184e+01  
##      Transaction_fees..BTC.           No_of_transactions  
##                  -1.007e+00                    3.111e-04  
##          Output_value..BTC.  estimated_transaction_value  
##                  -8.394e-06                    8.710e-07  
##              Miners_revenue         Cost_per_transaction  
##                   3.120e-04                   -3.503e+00  
##                  Difficulty                    Hash_rate  
##                   2.437e-09                   -1.991e-04  
##                Trade_volume  
##                  -7.904e-08

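Fit the model without Trade_volume and check the residuals (note that the step function above kept every variable, because dropping any single term increased the AIC)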

# Linear model for Price using every predictor of the full model except
# Trade_volume.
fit.lm2 <- lm(
  formula = Price ~ Date + Day + Week + Transaction_fees..BTC. +
    No_of_transactions + Output_value..BTC. + estimated_transaction_value +
    Miners_revenue + Cost_per_transaction + Difficulty + Hash_rate,
  data = bitcoin
)
summary(fit.lm2)
## 
## Call:
## lm(formula = Price ~ Date + Day + Week + Transaction_fees..BTC. + 
##     No_of_transactions + Output_value..BTC. + estimated_transaction_value + 
##     Miners_revenue + Cost_per_transaction + Difficulty + Hash_rate, 
##     data = bitcoin)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1364.96   -38.73     5.20    48.33  2666.94 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  8.430e+02  2.206e+02   3.821 0.000137 ***
## Date                        -5.362e-02  1.466e-02  -3.659 0.000261 ***
## Day                          1.030e+01  1.809e+00   5.690 1.50e-08 ***
## Week                        -7.084e+01  1.265e+01  -5.600 2.50e-08 ***
## Transaction_fees..BTC.      -1.022e+00  9.109e-02 -11.217  < 2e-16 ***
## No_of_transactions           3.492e-04  1.517e-04   2.302 0.021468 *  
## Output_value..BTC.          -8.377e-06  3.400e-06  -2.464 0.013839 *  
## estimated_transaction_value  8.489e-07  4.951e-08  17.149  < 2e-16 ***
## Miners_revenue               3.100e-04  4.875e-06  63.598  < 2e-16 ***
## Cost_per_transaction        -3.464e+00  5.722e-01  -6.053 1.75e-09 ***
## Difficulty                   2.401e-09  7.126e-11  33.697  < 2e-16 ***
## Hash_rate                   -1.949e-04  9.871e-06 -19.747  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 194.3 on 1699 degrees of freedom
## Multiple R-squared:  0.9943, Adjusted R-squared:  0.9943 
## F-statistic: 2.716e+04 on 11 and 1699 DF,  p-value: < 2.2e-16

All the variables are significant in the model

# Diagnostic plots for fit.lm2, drawn with the minimal ggplot2 theme.
diagnostics_lm2 <- autoplot(fit.lm2, label = 0, col = "goldenrod1")
diagnostics_lm2 + theme_minimal()

The residuals indicate heteroscedasticity and deviate from a normal distribution.

A polynomial transformation will not make much of an impact on the data.

Useful link on heteroscedasticity of time series data http://statisticsbyjim.com/regression/heteroscedasticity-regression/

Another method that can be useful in solving heteroscedasticity is weighted regression, https://newonlinecourses.science.psu.edu/stat501/node/431/

# Model the residual spread by regressing the absolute residuals of fit.lm2
# on its fitted values; each observation is then weighted by
# 1 / (estimated spread)^2.
abs_resid_model <- lm(abs(residuals(fit.lm2)) ~ fitted(fit.lm2))
wts <- 1 / fitted(abs_resid_model)^2

# Weighted least-squares refit with the same formula as fit.lm2.
fit.lm3 <- lm(
  formula = Price ~ Date + Day + Week + Transaction_fees..BTC. +
    No_of_transactions + Output_value..BTC. + estimated_transaction_value +
    Miners_revenue + Cost_per_transaction + Difficulty + Hash_rate,
  data = bitcoin,
  weights = wts
)

# Diagnostic plots for the weighted fit, drawn with the minimal ggplot2 theme.
diagnostics_lm3 <- autoplot(fit.lm3, label = 0, col = "goldenrod1")
diagnostics_lm3 + theme_minimal()

Applying weighted regression did not help much.

For time series data, these forms of transformation help in making the data stationary: https://datascienceplus.com/time-series-analysis-in-r-part-2-time-series-transformations/

Transforming the variables to % changes

# Percentage change of each element of `x` relative to the previous element.
# The first element has no predecessor and yields NA; a zero predecessor
# yields Inf/-Inf (or NaN for 0/0). dplyr::lag is called explicitly because
# `lag` is also defined in the stats package.
pct_change <- function(x) {
  previous <- dplyr::lag(x)
  (x - previous) / previous * 100
}

# Keep only the rows with a positive price, then add one percentage-change
# column per variable. Columns are referenced by bare name so mutate() reads
# them from the data frame it is transforming.
bitcoin_final <- subset(bitcoin, Price > 0)
bitcoin_final <- mutate(
  bitcoin_final,
  pChange = pct_change(Price),
  Transaction_fees..BTC._pChange = pct_change(Transaction_fees..BTC.),
  Output_value..BTC._pChange = pct_change(Output_value..BTC.),
  estimated_transaction_value_pChange = pct_change(estimated_transaction_value),
  Miners_revenue_pChange = pct_change(Miners_revenue),
  No_of_transactions_pChange = pct_change(No_of_transactions),
  Cost_per_transaction_pChange = pct_change(Cost_per_transaction),
  Difficulty_pChange = pct_change(Difficulty),
  Hash_rate_pChange = pct_change(Hash_rate)
)

# Mark infinite cells (from division by a zero previous value) as NA, then
# replace every NA cell, including the first-row values that have no
# predecessor, with 0.
infinite_cells <- sapply(bitcoin_final, is.infinite)
bitcoin_final[infinite_cells] <- NA
bitcoin_final[is.na(bitcoin_final)] <- 0
summary(object = bitcoin_final)
##       Date                 Day             Week        Price          
##  Min.   :2010-08-18   Min.   :  1.0   Min.   : 1   Min.   :    0.061  
##  1st Qu.:2012-07-25   1st Qu.: 89.0   1st Qu.:14   1st Qu.:   11.805  
##  Median :2014-07-02   Median :183.0   Median :27   Median :  270.680  
##  Mean   :2014-07-02   Mean   :183.1   Mean   :27   Mean   : 1192.942  
##  3rd Qu.:2016-06-08   3rd Qu.:277.0   3rd Qu.:40   3rd Qu.:  645.682  
##  Max.   :2018-05-16   Max.   :366.0   Max.   :53   Max.   :19289.785  
##  Transaction_fees..BTC. No_of_transactions Output_value..BTC.
##  Min.   :   0.00        Min.   :   271     Min.   :   23342  
##  1st Qu.:  11.83        1st Qu.: 28590     1st Qu.:  580806  
##  Median :  25.05        Median : 69721     Median : 1070697  
##  Mean   :  64.02        Mean   :112102     Mean   : 1413487  
##  3rd Qu.:  54.43        3rd Qu.:203423     3rd Qu.: 1800706  
##  Max.   :1128.76        Max.   :425008     Max.   :21158969  
##  estimated_transaction_value Miners_revenue     Cost_per_transaction
##  Min.   :5.300e+02           Min.   :     371   Min.   :  0.1349    
##  1st Qu.:1.916e+06           1st Qu.:   78499   1st Qu.:  5.1827    
##  Median :4.252e+07           Median : 1007436   Median :  8.3437    
##  Mean   :2.155e+08           Mean   : 2886823   Mean   : 18.4272    
##  3rd Qu.:1.386e+08           3rd Qu.: 2091327   3rd Qu.: 18.7835    
##  Max.   :3.987e+09           Max.   :53191582   Max.   :146.5951    
##    Difficulty          Hash_rate         Trade_volume      
##  Min.   :5.120e+02   Min.   :       0   Min.   :4.200e+01  
##  1st Qu.:1.889e+06   1st Qu.:      15   1st Qu.:4.750e+05  
##  Median :1.682e+10   Median :  124571   Median :1.295e+07  
##  Mean   :2.946e+11   Mean   : 2245760   Mean   :1.071e+08  
##  3rd Qu.:1.990e+11   3rd Qu.: 1441005   3rd Qu.:3.445e+07  
##  Max.   :4.140e+12   Max.   :36872804   Max.   :5.352e+09  
##     pChange         Transaction_fees..BTC._pChange
##  Min.   :-65.2000   Min.   :    -100              
##  1st Qu.: -2.1220   1st Qu.:     -16              
##  Median :  0.3511   Median :       0              
##  Mean   :  1.2657   Mean   :   56554              
##  3rd Qu.:  3.3566   3rd Qu.:      18              
##  Max.   :182.2125   Max.   :79999900              
##  Output_value..BTC._pChange estimated_transaction_value_pChange
##  Min.   :-86.2040           Min.   :-79.5447                   
##  1st Qu.: -6.9524           1st Qu.: -6.7300                   
##  Median :  0.5872           Median :  0.6419                   
##  Mean   :  2.4181           Mean   :  3.0586                   
##  3rd Qu.:  8.3175           3rd Qu.:  8.6480                   
##  Max.   :316.1967           Max.   :405.0760                   
##  Miners_revenue_pChange No_of_transactions_pChange
##  Min.   :-63.7608       Min.   : -96.7944         
##  1st Qu.: -8.5491       1st Qu.: -10.6309         
##  Median :  0.5814       Median :  -0.1394         
##  Mean   :  2.1243       Mean   :   6.4857         
##  3rd Qu.: 10.8179       3rd Qu.:  11.6223         
##  Max.   :223.5119       Max.   :2929.3233         
##  Cost_per_transaction_pChange Difficulty_pChange Hash_rate_pChange
##  Min.   : -98.465             Min.   :-18.031    Min.   :-52.778  
##  1st Qu.: -11.916             1st Qu.:  0.000    1st Qu.: -6.122  
##  Median :   0.917             Median :  0.000    Median :  1.442  
##  Mean   :   8.633             Mean   :  1.829    Mean   :  2.506  
##  3rd Qu.:  15.146             3rd Qu.:  0.000    3rd Qu.: 10.423  
##  Max.   :4135.040             Max.   : 78.146    Max.   : 75.000
# Model the percentage change in price on all remaining columns; column 4
# (Price) is dropped from the data before fitting.
fit.lm4 <- lm(pChange ~ ., data = bitcoin_final[, -4])
summary(fit.lm4)
## 
## Call:
## lm(formula = pChange ~ ., data = bitcoin_final[, -4])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -19.177  -0.698  -0.274   0.416  48.436 
## 
## Coefficients:
##                                       Estimate Std. Error t value Pr(>|t|)
## (Intercept)                         -2.090e+00  5.115e+00  -0.409  0.68296
## Date                                 1.611e-04  3.323e-04   0.485  0.62790
## Day                                  5.063e-02  2.977e-02   1.701  0.08919
## Week                                -3.526e-01  2.081e-01  -1.695  0.09038
## Transaction_fees..BTC.              -4.364e-03  1.437e-03  -3.037  0.00243
## No_of_transactions                   6.095e-07  2.859e-06   0.213  0.83124
## Output_value..BTC.                   1.081e-08  5.344e-08   0.202  0.83968
## estimated_transaction_value          1.287e-09  7.804e-10   1.649  0.09936
## Miners_revenue                      -1.076e-07  7.645e-08  -1.408  0.15940
## Cost_per_transaction                 9.504e-03  9.013e-03   1.054  0.29185
## Difficulty                          -5.706e-12  1.162e-12  -4.911 1.01e-06
## Hash_rate                            7.500e-07  1.598e-07   4.694 2.95e-06
## Trade_volume                        -5.668e-11  4.547e-10  -0.125  0.90082
## Transaction_fees..BTC._pChange       2.564e-08  3.759e-08   0.682  0.49520
## Output_value..BTC._pChange           5.551e-03  4.284e-03   1.296  0.19521
## estimated_transaction_value_pChange -2.153e-03  4.158e-03  -0.518  0.60463
## Miners_revenue_pChange               8.934e-01  7.451e-03 119.905  < 2e-16
## No_of_transactions_pChange          -3.125e-03  7.455e-04  -4.192 2.94e-05
## Cost_per_transaction_pChange        -3.102e-04  5.266e-04  -0.589  0.55600
## Difficulty_pChange                   7.337e-01  1.291e-02  56.848  < 2e-16
## Hash_rate_pChange                   -8.712e-01  9.724e-03 -89.595  < 2e-16
##                                        
## (Intercept)                            
## Date                                   
## Day                                 .  
## Week                                .  
## Transaction_fees..BTC.              ** 
## No_of_transactions                     
## Output_value..BTC.                     
## estimated_transaction_value         .  
## Miners_revenue                         
## Cost_per_transaction                   
## Difficulty                          ***
## Hash_rate                           ***
## Trade_volume                           
## Transaction_fees..BTC._pChange         
## Output_value..BTC._pChange             
## estimated_transaction_value_pChange    
## Miners_revenue_pChange              ***
## No_of_transactions_pChange          ***
## Cost_per_transaction_pChange           
## Difficulty_pChange                  ***
## Hash_rate_pChange                   ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.993 on 1394 degrees of freedom
## Multiple R-squared:  0.914,  Adjusted R-squared:  0.9127 
## F-statistic: 740.4 on 20 and 1394 DF,  p-value: < 2.2e-16

The insignificant variables can be removed one at a time, or the step function can be used to arrive at the model with the lowest AIC value:

# AIC-based stepwise selection starting from fit.lm4; no scope is supplied,
# so the search can only drop terms (backward).
step(fit.lm4, direction = "backward")
## Start:  AIC=3123.77
## pChange ~ Date + Day + Week + Transaction_fees..BTC. + No_of_transactions + 
##     Output_value..BTC. + estimated_transaction_value + Miners_revenue + 
##     Cost_per_transaction + Difficulty + Hash_rate + Trade_volume + 
##     Transaction_fees..BTC._pChange + Output_value..BTC._pChange + 
##     estimated_transaction_value_pChange + Miners_revenue_pChange + 
##     No_of_transactions_pChange + Cost_per_transaction_pChange + 
##     Difficulty_pChange + Hash_rate_pChange
## 
##                                       Df Sum of Sq    RSS    AIC
## - Trade_volume                         1         0  12492 3121.8
## - Output_value..BTC.                   1         0  12492 3121.8
## - No_of_transactions                   1         0  12492 3121.8
## - Date                                 1         2  12494 3122.0
## - estimated_transaction_value_pChange  1         2  12494 3122.0
## - Cost_per_transaction_pChange         1         3  12495 3122.1
## - Transaction_fees..BTC._pChange       1         4  12496 3122.2
## - Cost_per_transaction                 1        10  12502 3122.9
## - Output_value..BTC._pChange           1        15  12507 3123.5
## <none>                                              12492 3123.8
## - Miners_revenue                       1        18  12509 3123.8
## - estimated_transaction_value          1        24  12516 3124.5
## - Week                                 1        26  12517 3124.7
## - Day                                  1        26  12518 3124.7
## - Transaction_fees..BTC.               1        83  12574 3131.1
## - No_of_transactions_pChange           1       157  12649 3139.5
## - Hash_rate                            1       197  12689 3144.0
## - Difficulty                           1       216  12708 3146.0
## - Difficulty_pChange                   1     28959  41451 4819.0
## - Hash_rate_pChange                    1     71931  84423 5825.5
## - Miners_revenue_pChange               1    128834 141325 6554.6
## 
## Step:  AIC=3121.78
## pChange ~ Date + Day + Week + Transaction_fees..BTC. + No_of_transactions + 
##     Output_value..BTC. + estimated_transaction_value + Miners_revenue + 
##     Cost_per_transaction + Difficulty + Hash_rate + Transaction_fees..BTC._pChange + 
##     Output_value..BTC._pChange + estimated_transaction_value_pChange + 
##     Miners_revenue_pChange + No_of_transactions_pChange + Cost_per_transaction_pChange + 
##     Difficulty_pChange + Hash_rate_pChange
## 
##                                       Df Sum of Sq    RSS    AIC
## - Output_value..BTC.                   1         0  12492 3119.8
## - No_of_transactions                   1         0  12492 3119.8
## - Date                                 1         2  12494 3120.0
## - estimated_transaction_value_pChange  1         2  12494 3120.1
## - Cost_per_transaction_pChange         1         3  12495 3120.1
## - Transaction_fees..BTC._pChange       1         4  12496 3120.3
## - Cost_per_transaction                 1        10  12502 3120.9
## - Output_value..BTC._pChange           1        15  12507 3121.5
## <none>                                              12492 3121.8
## - Miners_revenue                       1        19  12510 3121.9
## - estimated_transaction_value          1        24  12516 3122.5
## - Week                                 1        26  12517 3122.7
## - Day                                  1        26  12518 3122.7
## - Transaction_fees..BTC.               1        83  12575 3129.2
## - No_of_transactions_pChange           1       158  12649 3137.5
## - Hash_rate                            1       204  12696 3142.7
## - Difficulty                           1       226  12717 3145.1
## - Difficulty_pChange                   1     28965  41457 4817.2
## - Hash_rate_pChange                    1     71965  84456 5824.1
## - Miners_revenue_pChange               1    128834 141326 6552.6
## 
## Step:  AIC=3119.82
## pChange ~ Date + Day + Week + Transaction_fees..BTC. + No_of_transactions + 
##     estimated_transaction_value + Miners_revenue + Cost_per_transaction + 
##     Difficulty + Hash_rate + Transaction_fees..BTC._pChange + 
##     Output_value..BTC._pChange + estimated_transaction_value_pChange + 
##     Miners_revenue_pChange + No_of_transactions_pChange + Cost_per_transaction_pChange + 
##     Difficulty_pChange + Hash_rate_pChange
## 
##                                       Df Sum of Sq    RSS    AIC
## - No_of_transactions                   1         1  12493 3117.9
## - Date                                 1         2  12494 3118.1
## - estimated_transaction_value_pChange  1         3  12495 3118.1
## - Cost_per_transaction_pChange         1         3  12495 3118.2
## - Transaction_fees..BTC._pChange       1         4  12496 3118.3
## - Cost_per_transaction                 1        10  12502 3118.9
## - Output_value..BTC._pChange           1        16  12508 3119.6
## <none>                                              12492 3119.8
## - Miners_revenue                       1        19  12511 3119.9
## - estimated_transaction_value          1        25  12517 3120.6
## - Week                                 1        27  12519 3120.8
## - Day                                  1        27  12519 3120.9
## - Transaction_fees..BTC.               1        84  12576 3127.3
## - No_of_transactions_pChange           1       158  12650 3135.6
## - Hash_rate                            1       204  12697 3140.8
## - Difficulty                           1       226  12718 3143.2
## - Difficulty_pChange                   1     28977  41469 4815.6
## - Hash_rate_pChange                    1     71972  84465 5822.2
## - Miners_revenue_pChange               1    128858 141350 6550.8
## 
## Step:  AIC=3117.88
## pChange ~ Date + Day + Week + Transaction_fees..BTC. + estimated_transaction_value + 
##     Miners_revenue + Cost_per_transaction + Difficulty + Hash_rate + 
##     Transaction_fees..BTC._pChange + Output_value..BTC._pChange + 
##     estimated_transaction_value_pChange + Miners_revenue_pChange + 
##     No_of_transactions_pChange + Cost_per_transaction_pChange + 
##     Difficulty_pChange + Hash_rate_pChange
## 
##                                       Df Sum of Sq    RSS    AIC
## - estimated_transaction_value_pChange  1         3  12495 3116.2
## - Cost_per_transaction_pChange         1         3  12496 3116.2
## - Transaction_fees..BTC._pChange       1         4  12497 3116.4
## - Cost_per_transaction                 1        10  12503 3117.0
## - Output_value..BTC._pChange           1        16  12509 3117.7
## <none>                                              12493 3117.9
## - Miners_revenue                       1        18  12511 3118.0
## - estimated_transaction_value          1        25  12518 3118.8
## - Week                                 1        26  12519 3118.8
## - Date                                 1        26  12519 3118.8
## - Day                                  1        26  12519 3118.9
## - Transaction_fees..BTC.               1        91  12584 3126.2
## - No_of_transactions_pChange           1       158  12650 3133.6
## - Hash_rate                            1       213  12706 3139.8
## - Difficulty                           1       235  12728 3142.3
## - Difficulty_pChange                   1     29273  41766 4823.7
## - Hash_rate_pChange                    1     72139  84631 5823.0
## - Miners_revenue_pChange               1    130387 142879 6564.0
## 
## Step:  AIC=3116.17
## pChange ~ Date + Day + Week + Transaction_fees..BTC. + estimated_transaction_value + 
##     Miners_revenue + Cost_per_transaction + Difficulty + Hash_rate + 
##     Transaction_fees..BTC._pChange + Output_value..BTC._pChange + 
##     Miners_revenue_pChange + No_of_transactions_pChange + Cost_per_transaction_pChange + 
##     Difficulty_pChange + Hash_rate_pChange
## 
##                                  Df Sum of Sq    RSS    AIC
## - Cost_per_transaction_pChange    1         3  12498 3114.5
## - Transaction_fees..BTC._pChange  1         4  12499 3114.7
## - Cost_per_transaction            1        10  12505 3115.3
## - Output_value..BTC._pChange      1        15  12510 3115.8
## - Miners_revenue                  1        18  12513 3116.2
## <none>                                         12495 3116.2
## - estimated_transaction_value     1        25  12520 3116.9
## - Week                            1        26  12522 3117.2
## - Day                             1        27  12522 3117.2
## - Date                            1        28  12523 3117.3
## - Transaction_fees..BTC.          1        92  12587 3124.5
## - No_of_transactions_pChange      1       156  12651 3131.7
## - Hash_rate                       1       212  12707 3138.0
## - Difficulty                      1       235  12730 3140.5
## - Difficulty_pChange              1     29303  41798 4822.8
## - Hash_rate_pChange               1     72139  84634 5821.1
## - Miners_revenue_pChange          1    130385 142880 6562.0
## 
## Step:  AIC=3114.51
## pChange ~ Date + Day + Week + Transaction_fees..BTC. + estimated_transaction_value + 
##     Miners_revenue + Cost_per_transaction + Difficulty + Hash_rate + 
##     Transaction_fees..BTC._pChange + Output_value..BTC._pChange + 
##     Miners_revenue_pChange + No_of_transactions_pChange + Difficulty_pChange + 
##     Hash_rate_pChange
## 
##                                  Df Sum of Sq    RSS    AIC
## - Transaction_fees..BTC._pChange  1         4  12502 3113.0
## - Cost_per_transaction            1        10  12508 3113.6
## - Output_value..BTC._pChange      1        14  12512 3114.0
## - Miners_revenue                  1        17  12515 3114.5
## <none>                                         12498 3114.5
## - estimated_transaction_value     1        24  12522 3115.2
## - Week                            1        27  12525 3115.5
## - Day                             1        27  12525 3115.5
## - Date                            1        29  12527 3115.8
## - Transaction_fees..BTC.          1        91  12590 3122.8
## - No_of_transactions_pChange      1       154  12652 3129.8
## - Hash_rate                       1       213  12711 3136.4
## - Difficulty                      1       236  12734 3139.0
## - Difficulty_pChange              1     29336  41834 4822.0
## - Hash_rate_pChange               1     72593  85091 5826.7
## - Miners_revenue_pChange          1    130647 143146 6562.7
## 
## Step:  AIC=3112.99
## pChange ~ Date + Day + Week + Transaction_fees..BTC. + estimated_transaction_value + 
##     Miners_revenue + Cost_per_transaction + Difficulty + Hash_rate + 
##     Output_value..BTC._pChange + Miners_revenue_pChange + No_of_transactions_pChange + 
##     Difficulty_pChange + Hash_rate_pChange
## 
##                               Df Sum of Sq    RSS    AIC
## - Cost_per_transaction         1        10  12512 3112.1
## - Output_value..BTC._pChange   1        13  12516 3112.5
## - Miners_revenue               1        17  12520 3113.0
## <none>                                      12502 3113.0
## - estimated_transaction_value  1        24  12527 3113.7
## - Week                         1        26  12528 3113.9
## - Day                          1        26  12529 3113.9
## - Date                         1        28  12530 3114.1
## - Transaction_fees..BTC.       1        91  12593 3121.2
## - No_of_transactions_pChange   1       154  12657 3128.4
## - Hash_rate                    1       214  12717 3135.0
## - Difficulty                   1       237  12739 3137.5
## - Difficulty_pChange           1     29334  41836 4820.1
## - Hash_rate_pChange            1     72713  85215 5826.7
## - Miners_revenue_pChange       1    130660 143162 6560.8
## 
## Step:  AIC=3112.08
## pChange ~ Date + Day + Week + Transaction_fees..BTC. + estimated_transaction_value + 
##     Miners_revenue + Difficulty + Hash_rate + Output_value..BTC._pChange + 
##     Miners_revenue_pChange + No_of_transactions_pChange + Difficulty_pChange + 
##     Hash_rate_pChange
## 
##                               Df Sum of Sq    RSS    AIC
## - Miners_revenue               1         8  12520 3111.0
## - Output_value..BTC._pChange   1        12  12524 3111.5
## - estimated_transaction_value  1        18  12530 3112.1
## <none>                                      12512 3112.1
## - Week                         1        27  12539 3113.1
## - Day                          1        27  12539 3113.2
## - Date                         1        30  12542 3113.4
## - Transaction_fees..BTC.       1       113  12625 3122.8
## - No_of_transactions_pChange   1       159  12671 3127.9
## - Hash_rate                    1       205  12717 3133.1
## - Difficulty                   1       227  12739 3135.5
## - Difficulty_pChange           1     29464  41976 4822.8
## - Hash_rate_pChange            1     73094  85606 5831.2
## - Miners_revenue_pChange       1    130812 143325 6560.4
## 
## Step:  AIC=3111
## pChange ~ Date + Day + Week + Transaction_fees..BTC. + estimated_transaction_value + 
##     Difficulty + Hash_rate + Output_value..BTC._pChange + Miners_revenue_pChange + 
##     No_of_transactions_pChange + Difficulty_pChange + Hash_rate_pChange
## 
##                               Df Sum of Sq    RSS    AIC
## - estimated_transaction_value  1        12  12532 3110.4
## - Output_value..BTC._pChange   1        13  12533 3110.5
## <none>                                      12520 3111.0
## - Week                         1        28  12548 3112.1
## - Day                          1        28  12548 3112.1
## - Date                         1        32  12553 3112.7
## - Transaction_fees..BTC.       1       107  12627 3121.0
## - No_of_transactions_pChange   1       159  12679 3126.8
## - Hash_rate                    1       200  12720 3131.5
## - Difficulty                   1       220  12740 3133.6
## - Difficulty_pChange           1     29469  41989 4821.2
## - Hash_rate_pChange            1     73091  85612 5829.3
## - Miners_revenue_pChange       1    131181 143701 6562.2
## 
## Step:  AIC=3110.37
## pChange ~ Date + Day + Week + Transaction_fees..BTC. + Difficulty + 
##     Hash_rate + Output_value..BTC._pChange + Miners_revenue_pChange + 
##     No_of_transactions_pChange + Difficulty_pChange + Hash_rate_pChange
## 
##                              Df Sum of Sq    RSS    AIC
## - Output_value..BTC._pChange  1        13  12545 3109.9
## <none>                                     12532 3110.4
## - Week                        1        25  12557 3111.2
## - Day                         1        25  12558 3111.2
## - Date                        1        28  12560 3111.5
## - Transaction_fees..BTC.      1       137  12669 3123.7
## - No_of_transactions_pChange  1       162  12695 3126.6
## - Difficulty                  1       246  12778 3135.8
## - Hash_rate                   1       251  12783 3136.4
## - Difficulty_pChange          1     29528  42060 4821.6
## - Hash_rate_pChange           1     73451  85983 5833.4
## - Miners_revenue_pChange      1    131201 143733 6560.5
## 
## Step:  AIC=3109.85
## pChange ~ Date + Day + Week + Transaction_fees..BTC. + Difficulty + 
##     Hash_rate + Miners_revenue_pChange + No_of_transactions_pChange + 
##     Difficulty_pChange + Hash_rate_pChange
## 
##                              Df Sum of Sq    RSS    AIC
## <none>                                     12545 3109.9
## - Week                        1        25  12570 3110.6
## - Day                         1        25  12570 3110.7
## - Date                        1        26  12571 3110.7
## - Transaction_fees..BTC.      1       137  12682 3123.2
## - No_of_transactions_pChange  1       170  12715 3126.9
## - Difficulty                  1       246  12791 3135.3
## - Hash_rate                   1       252  12797 3136.0
## - Difficulty_pChange          1     29556  42101 4821.0
## - Hash_rate_pChange           1     73464  86009 5831.9
## - Miners_revenue_pChange      1    131319 143864 6559.8
## 
## Call:
## lm(formula = pChange ~ Date + Day + Week + Transaction_fees..BTC. + 
##     Difficulty + Hash_rate + Miners_revenue_pChange + No_of_transactions_pChange + 
##     Difficulty_pChange + Hash_rate_pChange, data = bitcoin_final[, 
##     -4])
## 
## Coefficients:
##                (Intercept)                        Date  
##                 -3.061e+00                   2.273e-04  
##                        Day                        Week  
##                  4.855e-02                  -3.374e-01  
##     Transaction_fees..BTC.                  Difficulty  
##                 -3.303e-03                  -5.541e-12  
##                  Hash_rate      Miners_revenue_pChange  
##                  7.396e-07                   8.926e-01  
## No_of_transactions_pChange          Difficulty_pChange  
##                 -3.203e-03                   7.347e-01  
##          Hash_rate_pChange  
##                 -8.716e-01

Fitting the model with lowest AIC and checking residuals…

Remove the time-effect variables (Date, Day, Week), and use Difficulty, Hash_rate and No_of_transactions in their original form, since these cannot be expressed as pChange

# Reduced model for pChange without the calendar terms; column 4 of
# bitcoin_final is excluded from the data handed to lm().
fit.lm5 <- lm(
  pChange ~ Transaction_fees..BTC. +
    Difficulty +
    Hash_rate +
    Miners_revenue_pChange +
    No_of_transactions,
  data = bitcoin_final[, -4]
)
summary(fit.lm5)
## 
## Call:
## lm(formula = pChange ~ Transaction_fees..BTC. + Difficulty + 
##     Hash_rate + Miners_revenue_pChange + No_of_transactions, 
##     data = bitcoin_final[, -4])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -44.243  -4.124  -0.310   3.579 101.126 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             6.434e-01  3.230e-01   1.992   0.0465 *  
## Transaction_fees..BTC.  3.186e-04  2.500e-03   0.127   0.8986    
## Difficulty              1.326e-11  2.772e-12   4.784 1.90e-06 ***
## Hash_rate              -1.780e-06  3.668e-07  -4.854 1.34e-06 ***
## Miners_revenue_pChange  3.599e-01  1.245e-02  28.908  < 2e-16 ***
## No_of_transactions     -6.256e-07  2.975e-06  -0.210   0.8335    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.036 on 1409 degrees of freedom
## Multiple R-squared:  0.3733, Adjusted R-squared:  0.371 
## F-statistic: 167.8 on 5 and 1409 DF,  p-value: < 2.2e-16

Not all the variables are significant: Transaction_fees..BTC. and No_of_transactions have p-values above 0.8, and R^2 = 37.33%

# Diagnostic plots for fit.lm5, drawn with the minimal theme.
autoplot(
  fit.lm5,
  label = 0,
  col = "goldenrod1"
) + theme_minimal()

The residuals plot looks much better than before; let's run a test for homogeneous variance to check our hypothesis of constant variance

lmtest::bptest(formula = fit.lm5) # studentized Breusch-Pagan test on the fitted model
## 
##  studentized Breusch-Pagan test
## 
## data:  fit.lm5
## BP = 206.1, df = 5, p-value < 2.2e-16
car::ncvTest(model = fit.lm5) # score test for non-constant variance (Breusch-Pagan)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 2633.586    Df = 1     p = 0

The test for homogeneous variance has a p-value < 0.05, therefore we cannot assume constant variance

After trying various transformations and other regression methods, the data assumption of constant variance was not satisfied.

The next step is to explore the trend in the data that is causing non-constant variance

# Split each observation's Date into calendar parts used for plotting.
# Note that Day is replaced by a two-digit character string here.
bitcoin_final$Year <- format(bitcoin_final$Date, "%Y")
bitcoin_final$Month <- format(bitcoin_final$Date, "%b")
bitcoin_final$Day <- format(bitcoin_final$Date, "%d")

# Re-date every observation onto year 2000 (same day-of-year) so that all
# years share a single x axis.
bitcoin_final$CommonDate <- as.Date(
  paste0("2000-", format(bitcoin_final$Date, "%j")),
  "%Y-%j"
)

# The default shape palette only provides 6 shapes; with more years than that
# the remaining years lose their points. Supply one shape code per year.
year_shapes <- seq_along(unique(bitcoin_final$Year))

# Price by day-of-year, one facet row per calendar year.
ggplot(
  data = bitcoin_final,
  mapping = aes(x = CommonDate, y = Price, shape = Year, colour = Year)
) +
  geom_point() +
  geom_line() +
  scale_shape_manual(values = year_shapes) +
  facet_grid(Year ~ .) +
  scale_x_date(labels = function(x) format(x, "%d-%b"))
## Warning: The shape palette can deal with a maximum of 6 discrete values
## because more than 6 becomes difficult to discriminate; you have 9.
## Consider specifying shapes manually if you must have them.
## Warning: Removed 434 rows containing missing values (geom_point).

# Convert the most recently drawn ggplot into an interactive plotly widget.
ggplotly(p = ggplot2::last_plot())
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
## Warning: The shape palette can deal with a maximum of 6 discrete values
## because more than 6 becomes difficult to discriminate; you have 9.
## Consider specifying shapes manually if you must have them.

We can see that most of the bitcoin price fluctuations started around June/July 2017, and the bitcoin price before that was only a few hundred dollars, which may be causing the high variance in the data.

Useful links on time series ggplots: http://www.sthda.com/english/articles/32-r-graphics-essentials/128-plot-time-series-data-using-ggplot/ http://neondataskills.org/R/time-series-plot-ggplot/ https://plot.ly/ggplot2/time-series/

Start work on a new model using data from June 2016 onwards

Use the bitcoin_final dataset going forward, as it also has the transformed variables in case a transformation is needed.

# Keep only the observations dated on or after 1 June 2016.
bitcoin_new <- subset(bitcoin_final, Date >= "2016-06-01")
# Day was stored as text; turn it back into a number.
bitcoin_new[["Day"]] <- as.numeric(bitcoin_new[["Day"]])
head(bitcoin_new)
##            Date Day Week    Price Transaction_fees..BTC.
## 1058 2016-06-01   1   23 539.4700               56.18498
## 1059 2016-06-03   3   23 568.0000               54.69657
## 1060 2016-06-05   5   24 574.0200               44.36412
## 1061 2016-06-07   7   24 577.5400               56.51048
## 1062 2016-06-09   9   24 575.2941               53.21551
## 1063 2016-06-11  11   24 594.4400               46.24529
##      No_of_transactions Output_value..BTC. estimated_transaction_value
## 1058             234385            2703528                   203235833
## 1059             234669            2272984                   186207347
## 1060             190528            1848155                   161858533
## 1061             248576            1703861                   142680383
## 1062             236353            1651160                   143061530
## 1063             213159            1788936                   162636056
##      Miners_revenue Cost_per_transaction Difficulty Hash_rate Trade_volume
## 1058        1972576             8.415964   1.99e+11   1426731     59032596
## 1059        2047468             8.724917   1.99e+11   1406916     26922629
## 1060        2479401            13.013318   1.99e+11   1694243     45168190
## 1061        2169535             8.727854   1.99e+11   1406916     34509545
## 1062        2410789            10.199949   1.96e+11   1608134     28300189
## 1063        2364669            11.093452   1.96e+11   1520418     24377615
##         pChange Transaction_fees..BTC._pChange Output_value..BTC._pChange
## 1058  2.7268400                      -2.881464                  -5.237267
## 1059  5.2885239                      -2.649121                 -15.925274
## 1060  1.0598592                     -18.890496                 -18.690341
## 1061  0.6132191                      27.378798                  -7.807462
## 1062 -0.3888670                      -5.830722                  -3.093061
## 1063  3.3280106                     -13.098093                   8.344216
##      estimated_transaction_value_pChange Miners_revenue_pChange
## 1058                          10.3068451               1.986003
## 1059                          -8.3786832               3.796654
## 1060                         -13.0761829              21.095998
## 1061                         -11.8487114             -12.497627
## 1062                           0.2671331              11.120056
## 1063                          13.6825928              -1.913045
##      No_of_transactions_pChange Cost_per_transaction_pChange
## 1058                 10.8890140                    -8.028758
## 1059                  0.1211682                     3.671038
## 1060                -18.8098982                    49.151184
## 1061                 30.4669130                   -32.931368
## 1062                 -4.9172084                    16.866631
## 1063                 -9.8132878                     8.759875
##      Difficulty_pChange Hash_rate_pChange Year Month CommonDate
## 1058           0.000000        -0.6896552 2016   Jun 2000-06-01
## 1059           0.000000        -1.3888889 2016   Jun 2000-06-03
## 1060           0.000000        20.4225352 2016   Jun 2000-06-05
## 1061           0.000000       -16.9590643 2016   Jun 2000-06-07
## 1062          -1.507538        14.3020864 2016   Jun 2000-06-09
## 1063           0.000000        -5.4545454 2016   Jun 2000-06-11
# Price trend over time for the bitcoin_new subset. Columns are referenced
# by name inside aes() so they are looked up in the `data` argument.
ggplot(bitcoin_new, aes(x = Date, y = Price)) +
  geom_line() +
  geom_point(color = "dark blue") +
  ggtitle("Bitcoin values in USD trend") +
  labs(x = "Date", y = "Bitcoin price in USD") +
  theme_minimal()

Find variables with high correlation to bitcoin market price

# Pairwise correlations across columns 2 to 12 of bitcoin_new, drawn as the
# upper triangle of a square-tile correlation plot.
cor <- stats::cor(bitcoin_new[, 2:12])
corrplot(
  cor,
  method = "square",
  type = "upper",
  tl.srt = 50,
  tl.col = "black",
  tl.cex = 0.6,
  title = "Correlation of Variables"
)

# Keep the rows whose correlation with Price is at least 0.75 and list
# the corresponding variable names.
correlation <- as.data.frame(cor)
highly_correlated <- correlation[correlation[["Price"]] >= 0.75, ]
rownames(highly_correlated)
## [1] "Price"                       "estimated_transaction_value"
## [3] "Miners_revenue"              "Cost_per_transaction"       
## [5] "Difficulty"                  "Hash_rate"

These are the highly correlated variables to bitcoin market price

Model building

Additive model with all variables

# Additive model: Price regressed on every other column among the first 12.
bitcoin_new_set <- bitcoin_new[, 1:12]
fit.lm1_1 <- lm(Price ~ ., data = bitcoin_new_set)
summary(fit.lm1_1)
## 
## Call:
## lm(formula = Price ~ ., data = bitcoin_new_set)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -953.53 -143.20  -39.01  142.09 1831.33 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 -4.529e+04  3.592e+03 -12.609  < 2e-16 ***
## Date                         2.611e+00  2.138e-01  12.211  < 2e-16 ***
## Day                          2.484e-01  1.728e+00   0.144  0.88579    
## Week                         3.527e+00  1.140e+00   3.093  0.00214 ** 
## Transaction_fees..BTC.      -2.492e+00  1.635e-01 -15.242  < 2e-16 ***
## No_of_transactions           3.078e-03  5.485e-04   5.611 4.12e-08 ***
## Output_value..BTC.           3.756e-05  3.122e-05   1.203  0.22971    
## estimated_transaction_value  1.555e-07  8.502e-08   1.829  0.06825 .  
## Miners_revenue               3.120e-04  9.882e-06  31.578  < 2e-16 ***
## Cost_per_transaction         2.097e+01  2.411e+00   8.698  < 2e-16 ***
## Difficulty                   2.409e-09  1.095e-10  22.011  < 2e-16 ***
## Hash_rate                   -3.110e-04  1.579e-05 -19.696  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 280.3 on 346 degrees of freedom
## Multiple R-squared:  0.996,  Adjusted R-squared:  0.9959 
## F-statistic:  7894 on 11 and 346 DF,  p-value: < 2.2e-16

R^2 is almost 100%, which might indicate overfitting. Some variables, like Day, output value and estimated transaction value, are insignificant in the model.

Check residuals
# Diagnostic plots for fit.lm1_1, drawn with the minimal theme.
autoplot(
  fit.lm1_1,
  label = 0,
  col = "goldenrod1"
) + theme_minimal()

Residuals indicate heteroscedasticity and the data deviates from normal.

Running step function to get a model with the lowest AIC

# Stepwise term selection by AIC, starting from the full additive model.
step(object = fit.lm1_1)
## Start:  AIC=4047.12
## Price ~ Date + Day + Week + Transaction_fees..BTC. + No_of_transactions + 
##     Output_value..BTC. + estimated_transaction_value + Miners_revenue + 
##     Cost_per_transaction + Difficulty + Hash_rate
## 
##                               Df Sum of Sq       RSS    AIC
## - Day                          1      1623  27189967 4045.1
## - Output_value..BTC.           1    113763  27302106 4046.6
## <none>                                      27188343 4047.1
## - estimated_transaction_value  1    262887  27451230 4048.6
## - Week                         1    751974  27940317 4054.9
## - No_of_transactions           1   2474173  29662516 4076.3
## - Cost_per_transaction         1   5944403  33132746 4115.9
## - Date                         1  11717661  38906004 4173.4
## - Transaction_fees..BTC.       1  18255624  45443967 4229.0
## - Hash_rate                    1  30482339  57670682 4314.3
## - Difficulty                   1  38069298  65257641 4358.6
## - Miners_revenue               1  78355210 105543553 4530.7
## 
## Step:  AIC=4045.14
## Price ~ Date + Week + Transaction_fees..BTC. + No_of_transactions + 
##     Output_value..BTC. + estimated_transaction_value + Miners_revenue + 
##     Cost_per_transaction + Difficulty + Hash_rate
## 
##                               Df Sum of Sq       RSS    AIC
## - Output_value..BTC.           1    113124  27303091 4044.6
## <none>                                      27189967 4045.1
## - estimated_transaction_value  1    261372  27451339 4046.6
## - Week                         1    782817  27972784 4053.3
## - No_of_transactions           1   2496253  29686219 4074.6
## - Cost_per_transaction         1   6036872  33226839 4114.9
## - Date                         1  11718173  38908140 4171.4
## - Transaction_fees..BTC.       1  18517284  45707250 4229.1
## - Hash_rate                    1  30488404  57678370 4312.4
## - Difficulty                   1  38089590  65279557 4356.7
## - Miners_revenue               1  78774086 105964052 4530.1
## 
## Step:  AIC=4044.63
## Price ~ Date + Week + Transaction_fees..BTC. + No_of_transactions + 
##     estimated_transaction_value + Miners_revenue + Cost_per_transaction + 
##     Difficulty + Hash_rate
## 
##                               Df Sum of Sq       RSS    AIC
## <none>                                      27303091 4044.6
## - estimated_transaction_value  1    409702  27712793 4048.0
## - Week                         1    747011  28050102 4052.3
## - No_of_transactions           1   2405751  29708842 4072.9
## - Cost_per_transaction         1   5932360  33235451 4113.0
## - Date                         1  12404640  39707731 4176.7
## - Transaction_fees..BTC.       1  18460738  45763829 4227.5
## - Hash_rate                    1  30476236  57779327 4311.0
## - Difficulty                   1  38725343  66028434 4358.8
## - Miners_revenue               1  78865192 106168283 4528.8
## 
## Call:
## lm(formula = Price ~ Date + Week + Transaction_fees..BTC. + No_of_transactions + 
##     estimated_transaction_value + Miners_revenue + Cost_per_transaction + 
##     Difficulty + Hash_rate, data = bitcoin_new_set)
## 
## Coefficients:
##                 (Intercept)                         Date  
##                  -4.588e+04                    2.652e+00  
##                        Week       Transaction_fees..BTC.  
##                   3.463e+00                   -2.463e+00  
##          No_of_transactions  estimated_transaction_value  
##                   3.006e-03                    1.851e-07  
##              Miners_revenue         Cost_per_transaction  
##                   3.109e-04                    2.051e+01  
##                  Difficulty                    Hash_rate  
##                   2.384e-09                   -3.088e-04

Fit the model with lowest AIC and check for residuals

# Refit the terms retained by step(); the stray empty argument that preceded
# `data` in the lm() call has been removed.
fit.lm2_2 <- lm(
  Price ~ Date + Week + Transaction_fees..BTC. + estimated_transaction_value +
    No_of_transactions + Miners_revenue + Cost_per_transaction +
    Difficulty + Hash_rate,
  data = bitcoin_new_set
)
summary(fit.lm2_2)
## 
## Call:
## lm(formula = Price ~ Date + Week + Transaction_fees..BTC. + estimated_transaction_value + 
##     No_of_transactions + Miners_revenue + Cost_per_transaction + 
##     Difficulty + Hash_rate, data = bitcoin_new_set)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -954.12 -148.53  -33.14  142.41 1859.23 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 -4.588e+04  3.552e+03 -12.917  < 2e-16 ***
## Date                         2.652e+00  2.109e-01  12.574  < 2e-16 ***
## Week                         3.463e+00  1.122e+00   3.086  0.00219 ** 
## Transaction_fees..BTC.      -2.463e+00  1.606e-01 -15.339  < 2e-16 ***
## estimated_transaction_value  1.851e-07  8.098e-08   2.285  0.02290 *  
## No_of_transactions           3.006e-03  5.428e-04   5.537 6.06e-08 ***
## Miners_revenue               3.109e-04  9.806e-06  31.705  < 2e-16 ***
## Cost_per_transaction         2.051e+01  2.358e+00   8.696  < 2e-16 ***
## Difficulty                   2.385e-09  1.073e-10  22.217  < 2e-16 ***
## Hash_rate                   -3.088e-04  1.567e-05 -19.709  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 280.1 on 348 degrees of freedom
## Multiple R-squared:  0.996,  Adjusted R-squared:  0.9959 
## F-statistic:  9663 on 9 and 348 DF,  p-value: < 2.2e-16

All the variables are significant in the model

# Diagnostic plots for fit.lm2_2, drawn with the minimal theme.
autoplot(
  fit.lm2_2,
  label = 0,
  col = "goldenrod1"
) + theme_minimal()

Residuals indicate heteroscedasticity and the data deviates from normal.

Using weighted regression

# Weights: regress the absolute residuals of fit.lm2_2 on its fitted values,
# then take the inverse square of that regression's fitted values.
abs_resid_fit <- lm(abs(residuals(fit.lm2_2)) ~ fitted(fit.lm2_2))
wts <- 1 / fitted(abs_resid_fit)^2

# Weighted fit; the stray empty argument that preceded `data` in the lm()
# call has been removed.
fit.lm3_3 <- lm(
  Price ~ Transaction_fees..BTC. + No_of_transactions + Miners_revenue +
    Cost_per_transaction + Difficulty + Hash_rate,
  data = bitcoin_new_set,
  weights = wts
)

autoplot(fit.lm3_3, label = 0, col = "goldenrod1") +
  theme_minimal()

Applying weighted regression did not help much.

# Residuals against each predictor, with the accompanying curvature tests.
residualPlots(model = fit.lm3_3)

##                        Test stat Pr(>|Test stat|)    
## Transaction_fees..BTC.   -5.9524        6.421e-09 ***
## No_of_transactions        4.9525        1.142e-06 ***
## Miners_revenue           -2.2503         0.025051 *  
## Cost_per_transaction     -0.3181         0.750577    
## Difficulty              -10.0800        < 2.2e-16 ***
## Hash_rate                -7.9906        1.965e-14 ***
## Tukey test               -2.6430         0.008217 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Weighted regression does not add any value, so fit a normal (unweighted) regression

Using the transformed variables to % changes

Transaction_fees..BTC. - keep it as is; No_of_transactions - keep it as is; Miners_revenue - use pChange; Cost_per_transaction - keep it as is; Difficulty - as this is the difficulty level, keep it as is; Hash_rate - already a rate, keep it as is

# Model the percentage price change; Miners_revenue enters as its pChange
# version while the remaining predictors stay on their original scale.
fit.lm4_4 <- lm(
  pChange ~ Transaction_fees..BTC. +
    No_of_transactions +
    Miners_revenue_pChange +
    Cost_per_transaction +
    Difficulty +
    Hash_rate,
  data = bitcoin_new
)

summary(fit.lm4_4)
## 
## Call:
## lm(formula = pChange ~ Transaction_fees..BTC. + No_of_transactions + 
##     Miners_revenue_pChange + Cost_per_transaction + Difficulty + 
##     Hash_rate, data = bitcoin_new)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -18.8903  -3.1624  -0.4798   2.9476  27.2666 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            -4.177e+00  1.997e+00  -2.092 0.037200 *  
## Transaction_fees..BTC. -1.126e-02  2.963e-03  -3.801 0.000170 ***
## No_of_transactions      2.607e-05  8.228e-06   3.169 0.001664 ** 
## Miners_revenue_pChange  2.059e-01  2.176e-02   9.461  < 2e-16 ***
## Cost_per_transaction    8.200e-02  2.126e-02   3.856 0.000137 ***
## Difficulty              1.056e-11  2.255e-12   4.683 4.04e-06 ***
## Hash_rate              -1.714e-06  3.234e-07  -5.300 2.05e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.855 on 351 degrees of freedom
## Multiple R-squared:  0.2395, Adjusted R-squared:  0.2265 
## F-statistic: 18.42 on 6 and 351 DF,  p-value: < 2.2e-16
# Residuals against each predictor, with the accompanying curvature tests.
residualPlots(model = fit.lm4_4)

##                        Test stat Pr(>|Test stat|)    
## Transaction_fees..BTC.   -3.7714        0.0001905 ***
## No_of_transactions       -0.7250        0.4689640    
## Miners_revenue_pChange   -3.9642        8.930e-05 ***
## Cost_per_transaction     -1.6858        0.0927195 .  
## Difficulty               -1.1861        0.2363864    
## Hash_rate                -0.6745        0.5004553    
## Tukey test               -4.0364        5.428e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Apply a polynomial (poly) transformation to Transaction_fees..BTC. and Miners_revenue_pChange

# Same predictors as fit.lm4_4, with degree-2 polynomial terms for
# Transaction_fees..BTC. and Miners_revenue_pChange.
fit.lm5_5 <- lm(
  pChange ~ poly(Transaction_fees..BTC., 2) +
    No_of_transactions +
    poly(Miners_revenue_pChange, 2) +
    Cost_per_transaction +
    Difficulty +
    poly(Hash_rate, 1),
  data = bitcoin_new
)

summary(fit.lm5_5)
## 
## Call:
## lm(formula = pChange ~ poly(Transaction_fees..BTC., 2) + No_of_transactions + 
##     poly(Miners_revenue_pChange, 2) + Cost_per_transaction + 
##     Difficulty + poly(Hash_rate, 1), data = bitcoin_new)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -18.3129  -3.1599  -0.2331   3.1219  25.1745 
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)
## (Intercept)                      -2.189e+01  3.832e+00  -5.713 2.38e-08
## poly(Transaction_fees..BTC., 2)1 -2.968e+01  9.729e+00  -3.051  0.00245
## poly(Transaction_fees..BTC., 2)2 -2.605e+01  6.304e+00  -4.133 4.49e-05
## No_of_transactions                1.378e-05  8.203e-06   1.679  0.09397
## poly(Miners_revenue_pChange, 2)1  6.587e+01  6.157e+00  10.699  < 2e-16
## poly(Miners_revenue_pChange, 2)2 -2.576e+01  5.976e+00  -4.310 2.12e-05
## Cost_per_transaction              9.835e-02  2.087e-02   4.713 3.54e-06
## Difficulty                        1.441e-11  2.265e-12   6.365 6.15e-10
## poly(Hash_rate, 1)               -3.606e+02  5.162e+01  -6.985 1.45e-11
##                                     
## (Intercept)                      ***
## poly(Transaction_fees..BTC., 2)1 ** 
## poly(Transaction_fees..BTC., 2)2 ***
## No_of_transactions               .  
## poly(Miners_revenue_pChange, 2)1 ***
## poly(Miners_revenue_pChange, 2)2 ***
## Cost_per_transaction             ***
## Difficulty                       ***
## poly(Hash_rate, 1)               ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.608 on 349 degrees of freedom
## Multiple R-squared:  0.3061, Adjusted R-squared:  0.2902 
## F-statistic: 19.25 on 8 and 349 DF,  p-value: < 2.2e-16
# Diagnostic plots for fit.lm5_5, drawn with the minimal theme.
autoplot(
  fit.lm5_5,
  label = 0,
  col = "goldenrod1"
) + theme_minimal()

Residuals now look random; let's test for homogeneous variance

lmtest::bptest(formula = fit.lm5_5) # studentized Breusch-Pagan test on the fitted model
## 
##  studentized Breusch-Pagan test
## 
## data:  fit.lm5_5
## BP = 14.566, df = 8, p-value = 0.06815
car::ncvTest(model = fit.lm5_5) # score test for non-constant variance (Breusch-Pagan)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 0.2503768    Df = 1     p = 0.6168099

p-value > 0.05, we can assume the data has constant variance

# Shapiro-Wilk normality test applied to the residuals of fit.lm5_5.
shapiro.test(residuals(fit.lm5_5))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(fit.lm5_5)
## W = 0.97965, p-value = 6.041e-05
# Residuals against each predictor, with the accompanying curvature tests.
residualPlots(model = fit.lm5_5)

##                                 Test stat Pr(>|Test stat|)
## poly(Transaction_fees..BTC., 2)                           
## No_of_transactions                -0.8456           0.3984
## poly(Miners_revenue_pChange, 2)                           
## Cost_per_transaction              -0.8905           0.3738
## Difficulty                         0.0755           0.9399
## poly(Hash_rate, 1)                                        
## Tukey test                         0.6896           0.4904

The residuals vs. predictors plots look good!

The model fit.lm5_5 with R^2 = 31% can be considered as the final model that has constant variance
# Build a long table holding the observed and the fitted pChange series so
# both can be drawn as separate lines against Date.
bitcoin_new <- as.data.table(bitcoin_new)

# Both pieces carry identical column names and their own `type` label, so the
# stacked result does not depend on positional binding or on row order.
actual_dt <- bitcoin_new[, .(pChange, Date, type = "Real")]
fitted_dt <- data.table(
  pChange = fitted(fit.lm5_5),
  Date = bitcoin_new[, Date],
  type = "Fitted"
)
datas <- rbindlist(list(actual_dt, fitted_dt), use.names = TRUE)

ggplot(data = datas, aes(Date, pChange, group = type, colour = type)) +
  geom_line(size = 0.8) +
  theme_bw() +
  labs(x = "Date", y = "Bitcoin price change %",
       title = "Fit from Multiple Linear Regression")

Split the data to train and test set

# Random 20% / 80% split of bitcoin_new into test and train sets.
set.seed(123)
n_rows <- nrow(bitcoin_new)
# floor() makes the whole-number test-set size explicit.
indexes <- sample(seq_len(n_rows), size = floor(0.2 * n_rows))
test <- bitcoin_new[indexes, ]
# Select the complement by position rather than with `-indexes`: a negative
# index built from an empty vector would select no rows at all.
train_rows <- which(!(seq_len(n_rows) %in% indexes))
train <- bitcoin_new[train_rows, ]

Use model fit.lm5_5 on train set

# Fit the fit.lm5_5 formula on the training rows only, then store the
# model's predictions for the held-out rows in test$pred.
final_model <- lm(
  pChange ~ poly(Transaction_fees..BTC., 2) +
    No_of_transactions +
    poly(Miners_revenue_pChange, 2) +
    Cost_per_transaction +
    Difficulty +
    poly(Hash_rate, 1),
  data = train
)
test$pred <- predict(final_model, newdata = test)

Get the error in prediction

# Mean absolute difference between predicted and observed pChange on the
# test set.
abs_diff <- abs(test$pred - test$pChange)
error <- mean(abs_diff)
error
## [1] 4.743434