The goal of this data science project is to build a model which can accurately predict cryptocurrency and stock prices solely based on previous time series data. No external predictors will be used. The forecasting methods to be used are:
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
knitr::opts_chunk$set(cache = T, warning = F, message = F)
if(require(pacman)==FALSE) install.packages("pacman") # package manager
## Loading required package: pacman
pacman::p_load(tidyverse, # general functions
hrbrthemes, reactable,
magrittr, # for 2-way pipe
tidyquant, # for time series data
rvest, # stock names
data.table, DT,
h2o, rsample, prophet, # model building
forecast, timetk, imputeTS, fpp2, # time series helpers
doParallel, foreach, furrr) # parallelization
To get the stock names I will scrape the S&P 500 Wikipedia page.
content = read_html("https://en.wikipedia.org/w/index.php?title=List_of_S%26P_500_companies&oldid=1014924736")
tables <- content %>% html_table(fill = TRUE)
table <- tables[[1]]
sp_list <- table[-1,c(1,7)]
names(sp_list)[2]='AddedDate'
To ensure the time series has no empty values, I am filtering stocks which were created after 2009. Also, I am removing Berkshire Hathaway because it was giving problems. After, I will keep just 60 stocks.
sp_list %<>% filter(AddedDate < '2010-01-01')
sp_list = sp_list[-49,]
sp_list %<>% filter(Symbol != "BF.B")
stock_list = sp_list$Symbol[1:60]
Below I am setting the date ranges for the time series.
beg = as_date("2017-01-01")
end = as_date("2020-01-01")
stocks = tq_get(x = stock_list,
from = beg,
to = end, periodicity = "weekly") # getting the time series data
stocks %<>% select(symbol, date, adjusted)
head(stocks)
## # A tibble: 6 x 3
## symbol date adjusted
## <chr> <date> <dbl>
## 1 ABT 2017-01-01 37.0
## 2 ABT 2017-01-08 37.2
## 3 ABT 2017-01-15 37.0
## 4 ABT 2017-01-22 37.5
## 5 ABT 2017-01-29 39.1
## 6 ABT 2017-02-05 39.1
Now, I am getting the data for 5 stock market indexes.
ind_list = c('^DJI','^NYA','^GSPC','^IXIC','^N225')
ind = tq_get(x = ind_list, from = "2015-01-01", to = "2018-01-01", periodicity = "weekly") %>% select(symbol,date,adjusted)
head(ind)
## # A tibble: 6 x 3
## symbol date adjusted
## <chr> <date> <dbl>
## 1 ^DJI 2015-01-01 17585.
## 2 ^DJI 2015-01-08 17427.
## 3 ^DJI 2015-01-15 17554.
## 4 ^DJI 2015-01-22 17191.
## 5 ^DJI 2015-01-29 17673.
## 6 ^DJI 2015-02-05 17862.
To get the cryptocurrency data, I am using the CryptoCompare API. First I need to define some variables for the query, includes he cryptocurrencies I want.
pacman::p_load(jsonlite)
base = "https://min-api.cryptocompare.com/data/v2/histoday?fsym="
name = c("BTC","ETH","BNB","USDT","SOL","DOT","ADA","XRP","DOGE","SHIB","LTC","WBTC","UNI","BUSD","MATIC","MANA","ALGO","LINK","THETA","SAND","HNT","MIOTA","CAKE","MKR","NEO","BTT","EGLD","VET","ICP","XLM","AVAX","CRO","ATOM","NEAR","RUNE")
crypto_list = name
t = "&tsym=USD"
agg = "&aggregate=days"
lim = "&limit=360"
key = "&api_key=7bf4448fc2818704eace0e470e4f87eb108e90e760bef562f1df81687616b040"
Here I am preparing the urls which I will query in the next code chunk.
urls = c(paste0(base,name[1],t,lim,key),paste0(base,name[2],t,lim,key),paste0(base,name[3],t,lim,key),
paste0(base,name[4],t,lim,key),paste0(base,name[5],t,lim,key),paste0(base,name[6],t,lim,key),
paste0(base,name[7],t,lim,key),paste0(base,name[8],t,lim,key),paste0(base,name[9],t,lim,key),
paste0(base,name[10],t,lim,key),paste0(base,name[11],t,lim,key),paste0(base,name[12],t,lim,key),
paste0(base,name[13],t,lim,key),paste0(base,name[14],t,lim,key),paste0(base,name[15],t,lim,key),
paste0(base,name[16],t,lim,key),paste0(base,name[17],t,lim,key),paste0(base,name[18],t,lim,key),
paste0(base,name[19],t,lim,key),paste0(base,name[20],t,lim,key),paste0(base,name[21],t,lim,key),
paste0(base,name[22],t,lim,key),paste0(base,name[23],t,lim,key),paste0(base,name[24],t,lim,key),
paste0(base,name[25],t,lim,key),paste0(base,name[26],t,lim,key),paste0(base,name[27],t,lim,key),
paste0(base,name[28],t,lim,key),paste0(base,name[29],t,lim,key),paste0(base,name[30],t,lim,key),paste0(base,name[31],t,lim,key),paste0(base,name[32],t,lim,key),paste0(base,name[33],t,lim,key),paste0(base,name[34],t,lim,key),paste0(base,name[35],t,lim,key)
)
Now I will make 35 API calls at once for the data.
cryptolist = map(.x = urls, .f = fromJSON)
Here I convert the data types so I can format them neatly.
unlist_cc = function(x){
temp = x$Data$Data
}
crypto = map_df(.x = cryptolist, .f = unlist_cc)
The last step for the crypto data is to format it in a readable format.
crypto %<>% select(time, close)
crypto$time %<>% as_datetime() %>% as_date()
crypto$symbol = rep(name, each = (nrow(crypto)/length(crypto_list)))
crypto %<>% rename(date = time)
crypto %<>% relocate(symbol,.before=date)
head(crypto)
## symbol date close
## 1 BTC 2021-04-21 53803.25
## 2 BTC 2021-04-22 51717.61
## 3 BTC 2021-04-23 51178.03
## 4 BTC 2021-04-24 50115.99
## 5 BTC 2021-04-25 49120.97
## 6 BTC 2021-04-26 54062.29
For the sake of managing my R environment, I remove objects which are no longer useful for the upcoming analysis.
The final pre-processing step is renaming the price variable for consistency.
stocks %<>% rename(price = adjusted)
ind %<>% rename(price = adjusted)
crypto %<>% rename(price = close)
For time series data, it is not appropriate to split time series data into a training and validation set in which the first 80% (or other amount) of the dates are used for training. Rolling origin evaluation fixes this by first partitioning the data, but validation is done sequentially on each date starting at the beginning of the validation partition.
Since I am working with 100 time series, I must apply functions to each stock/crypto. Unfortunately, any apply function will not work for this project since I must aggregate results and run other functions within the loops. The first loop function is a rolling origin function which aggregates results.
rolling = function(dat, dat_list, initial){
results = data.frame(splits = c(), id = c())
for (i in dat_list){
temp = filter(dat, symbol == i)
temp = na_interpolation(temp)
temp_res = rolling_origin(temp, initial = which(temp$date == as.Date(initial)),
assess = 1,
cumulative = TRUE)
results = rbind(results, temp_res)
}
return(results)
}
Here we extract the training and validation data for the models.
getinfo = function(df, dat_list){
df %<>% mutate(
symbol = rep(dat_list, each = (nrow(df)/length(dat_list))),
data = map(.x = splits, .f = analysis),
trainDates = map(.x = data, .f = extract2, 'date'),
trainPrice = map(.x = data, .f = extract2, 'price'),
target_data = map(.x = splits, .f = assessment),
targetDate = map(.x = target_data, .f = extract2, 'date'),
targetPrice = map_dbl(.x = target_data, .f = extract2, 'price'))
return(df)
}
Since I created functions above, now all I have to do is pass each group of data (stocks, indexes, crypto) through the functions.
initial_s = stocks$date[.8*nrow(stocks)/length(stock_list)]
new_stocks = rolling(stocks, stock_list, initial_s)
new_stocks = getinfo(new_stocks, stock_list)
head(new_stocks)
## # A tibble: 6 x 9
## splits id symbol data trainDates trainPrice target_data targetDate
## <list> <chr> <chr> <list> <list> <list> <list> <list>
## 1 <split [~ Slice~ ABT <tibble~ <date [12~ <dbl [125~ <tibble [1 ~ <date [1]>
## 2 <split [~ Slice~ ABT <tibble~ <date [12~ <dbl [126~ <tibble [1 ~ <date [1]>
## 3 <split [~ Slice~ ABT <tibble~ <date [12~ <dbl [127~ <tibble [1 ~ <date [1]>
## 4 <split [~ Slice~ ABT <tibble~ <date [12~ <dbl [128~ <tibble [1 ~ <date [1]>
## 5 <split [~ Slice~ ABT <tibble~ <date [12~ <dbl [129~ <tibble [1 ~ <date [1]>
## 6 <split [~ Slice~ ABT <tibble~ <date [13~ <dbl [130~ <tibble [1 ~ <date [1]>
## # ... with 1 more variable: targetPrice <dbl>
initial_i = ind$date[.8*nrow(ind)/length(ind_list)]
new_index = rolling(ind, ind_list, initial_i)
new_index = getinfo(new_index, ind_list)
head(new_index)
## # A tibble: 6 x 9
## splits id symbol data trainDates trainPrice target_data targetDate
## <list> <chr> <chr> <list> <list> <list> <list> <list>
## 1 <split [~ Slice~ ^DJI <tibble~ <date [12~ <dbl [125~ <tibble [1 ~ <date [1]>
## 2 <split [~ Slice~ ^DJI <tibble~ <date [12~ <dbl [126~ <tibble [1 ~ <date [1]>
## 3 <split [~ Slice~ ^DJI <tibble~ <date [12~ <dbl [127~ <tibble [1 ~ <date [1]>
## 4 <split [~ Slice~ ^DJI <tibble~ <date [12~ <dbl [128~ <tibble [1 ~ <date [1]>
## 5 <split [~ Slice~ ^DJI <tibble~ <date [12~ <dbl [129~ <tibble [1 ~ <date [1]>
## 6 <split [~ Slice~ ^DJI <tibble~ <date [13~ <dbl [130~ <tibble [1 ~ <date [1]>
## # ... with 1 more variable: targetPrice <dbl>
initial_c = crypto$date[.8*nrow(crypto)/length(crypto_list)]
new_crypto = rolling(crypto, crypto_list, initial_c)
new_crypto = getinfo(new_crypto, crypto_list)
head(new_crypto)
## # A tibble: 6 x 9
## splits id symbol data trainDates trainPrice target_data targetDate
## <list> <chr> <chr> <list> <list> <list> <list> <list>
## 1 <split [~ Slice~ BTC <df [28~ <date [288~ <dbl [288~ <df [1 x 3~ <date [1]>
## 2 <split [~ Slice~ BTC <df [28~ <date [289~ <dbl [289~ <df [1 x 3~ <date [1]>
## 3 <split [~ Slice~ BTC <df [29~ <date [290~ <dbl [290~ <df [1 x 3~ <date [1]>
## 4 <split [~ Slice~ BTC <df [29~ <date [291~ <dbl [291~ <df [1 x 3~ <date [1]>
## 5 <split [~ Slice~ BTC <df [29~ <date [292~ <dbl [292~ <df [1 x 3~ <date [1]>
## 6 <split [~ Slice~ BTC <df [29~ <date [293~ <dbl [293~ <df [1 x 3~ <date [1]>
## # ... with 1 more variable: targetPrice <dbl>
In this section, I will apply naive, holt-winters, arima, autoML, and prophet forecasts to each time series. Later I will compare the results of each model.
This function applies naive, holt-winters, and arima forecasts to all the time series. I explain later why we cannot apply this simply to the other two models.
NHA = function(df){
plan(multiprocess)
df %<>% mutate(
Naive = future_map(.x = trainPrice, .f = naive, h = 1) %>%
map_dbl(.f = extract2, "mean"),
Holt = future_map(.x = trainPrice, .f = holt, h = 1) %>%
map_dbl(.f = extract2, "mean"),
Arima = future_map(.x = trainPrice, .f = auto.arima) %>%
map(.f = forecast, h = 1) %>%
map_dbl(.f = extract2, "mean")
)
return(df)
}
new_stocks = NHA(new_stocks)
new_index = NHA(new_index)
new_crypto = NHA(new_crypto)
For some odd reason, the prophet documentation states that in order to use the model, the dates in the time series must be named “ds” and the actual values must be named “y”. If you use any other labeling, R throws errors. Because of this, I had to create a function which converts the columns names and then applies the prophet function.
prophetloop = function(nf){
prophetfun = function(y, dates, targetDate){
pacman::p_load(tidyverse, magrittr, prophet)
df = as.data.frame(cbind(dates, y)) %>%
rename(ds=1,y=2)
df = unnest(df, cols = c(ds,y))
model = prophet(df, yearly.seasonality = "auto",
weekly.seasonality = "auto")
targetDate %<>% as.data.frame() %>%
rename(ds=1)
pred = predict(model, targetDate)
res = pred$yhat
return(res)
}
i = 1
for (i in 1:nrow(nf)){
val = prophetfun(y = nf[i,]$trainPrice, dates = nf[i,]$trainDates,
targetDate = nf[i,]$targetDate)
nf[i,]$Prophet = val
i = i + 1
}
return(nf)
}
Here I simply apply the custom prophet function to the three sets of time series.
new_stocks['Prophet'] = NA
new_stocks = prophetloop(new_stocks)
new_index['Prophet'] = NA
new_index = prophetloop(new_index)
new_crypto['Prophet'] = NA
new_crypto = prophetloop(new_crypto)
AutoML is easily the hardest model to get running for my many time series. This is because autoML does not like to loop more than 30 times in a single h2o connection. As a result, I applied this function in a separate r file four times (for a total of 100 time series). I then appended all the data and brought it back into this file.
Here is the function which creates new variables from just the time series data itself (no external data) so that the machine learning models have more data for prediction.
feat_df = tk_get_timeseries_signature(unique(data$date)) %>%
na.omit()
nums = unlist(lapply(feat_df, is.numeric))
feat_df = feat_df[,nums]
feat_df = feat_df[c(TRUE, lapply(feat_df[-1], var, na.rm = TRUE) != 0)]
Since autoML relies on an API, I had to first initiate autoML session and then convert my data to meet the autoML model input requirements. From there, I compile the model metrics.
automl = function(data, data_list, minutes){
feat_df = tk_get_timeseries_signature(unique(data$date)) %>%
na.omit()
nums = unlist(lapply(feat_df, is.numeric))
feat_df = feat_df[,nums]
feat_df = feat_df[c(TRUE, lapply(feat_df[-1], var, na.rm = TRUE) != 0)]
feat_df = cbind(unique(data$date)[-1], feat_df) %>% rename(index=1)
feat_df %<>% select(-year.iso)
results = data.frame(Symbol = c(), MSE = c(),
MAE = c(), RMSE = c())
h2o.init()
for (i in data_list){
df = filter(data, symbol == i)
df = merge(df, feat_df, by.x = "date", by.y = "index")
df %<>% select(-date)
df %<>% mutate(lag1 = lag(price), lag2 = lag(price, k = 2),
lag3 = lag(price, k = 3))
traindf = df[1:floor(.8*nrow(df)),] %>% as.h2o()
validdf = df[ceiling(.8*nrow(df)):floor(nrow(df)*.9),] %>% as.h2o()
testdf = df[ceiling(nrow(df)*.9):nrow(df),] %>% as.h2o()
x = 3:(ncol(df))
h2o.init()
model = h2o.automl(x = x, y = 'price',
training_frame = traindf,
validation_frame = validdf,
leaderboard_frame = testdf,
nfolds = 5,
stopping_metric = "AUTO",
max_runtime_secs = (minutes * 60))
best = model@leader
autoML_pred = h2o.predict(best, newdata = testdf)
error = h2o.performance(best, newdata = testdf)
MSE = error@metrics$MSE
RMSE = error@metrics$RMSE
MAE = error@metrics$mae
res = cbind(i, MSE, RMSE, MAE)
results = rbind(results, res)
}
results %<>% rename(Symbol = i)
return(results)
}
autoML_stocks = automl(stocks, stock_list, .5)
autoML_index = automl(ind, ind_list, 1)
autoML_crypto = automl(crypto, crypto_list, .5)
Below I calculate the error for all models except autoML (that was done within the autoML custom function) in one function.
res1 = function(df){
results = df %>% mutate(
errorNaive = targetPrice - Naive,
peNaive = errorNaive / targetPrice,
errorHolt = targetPrice - Holt,
peHolt = errorHolt / targetPrice,
errorArima = targetPrice - Arima,
peArima = errorArima / targetPrice,
errorProphet = targetPrice - Prophet,
peProphet = errorProphet / targetPrice
)
results %<>% select(type, symbol, errorNaive, peNaive, errorHolt, peHolt, errorArima, peArima, errorProphet, peProphet)
return(results)
}
This chunk merges the error dataframe completed in the last step with all the stock/crypto names.
res_nhap = rbind(new_stocks, new_index, new_crypto)
res_nhap$type = c(rep('Stock',nrow(new_stocks)), rep('Index',nrow(new_index)), rep('Crypto',nrow(new_crypto)))
res_nhap = res1(res_nhap)
To compare the results for the models (excluding autoML), I calculate the mean absolute error (MAE), mean absolute percent error (MAPE), and root mean square error (RMSE).
options(scipen=999)
results_nhap = res_nhap %>% group_by(symbol) %>% mutate(
MAE_Naive = mean(abs(errorNaive)),
MAPE_Naive = mean(abs(peNaive))*100,
RMSE_Naive = sqrt(mean(errorNaive^2)),
MAE_Holt = mean(abs(errorHolt)),
MAPE_Holt = mean(abs(peHolt))*100,
RMSE_Holt = sqrt(mean(errorHolt^2)),
MAE_Arima = mean(abs(errorArima)),
MAPE_Arima = mean(abs(peArima))*100,
RMSE_Arima = sqrt(mean(errorArima^2)),
MAE_Prophet = mean(abs(errorProphet)),
MAPE_Prophet = mean(abs(peProphet))*100,
RMSE_Prophet = sqrt(mean(errorProphet^2))
) %>% select(c(1,2,11:22)) %>% unique()
For apples-to-apples comparison for all models, I will use RMSE, since autoML does not calculate MAPE.
autoML_Results = readRDS('autoML_Results.rds') %>% select(-type,-MSE) %>% rename(RMSE_autoML=RMSE, MAE_autoML=MAE)
autoML_Results$RMSE_autoML %<>% as.numeric()
autoML_Results$MAE_autoML %<>% as.numeric()
final_results = merge(results_nhap, autoML_Results, by.x = 'symbol', by.y = 'Symbol') %>% mutate(across(is.numeric, base::round, digits=3))
Here is a chart with all accuracy metrics with their respective stock/cryptocurrency.
head(final_results)
## symbol type MAE_Naive MAPE_Naive RMSE_Naive MAE_Holt MAPE_Holt RMSE_Holt
## 1 ^DJI Index 162.350 0.709 202.206 143.243 0.626 184.834
## 2 ^GSPC Index 13.934 0.549 17.719 13.145 0.518 16.580
## 3 ^IXIC Index 62.455 0.956 75.063 62.036 0.950 72.137
## 4 ^N225 Index 258.030 1.212 343.636 257.229 1.211 333.674
## 5 ^NYA Index 71.135 0.585 90.486 67.413 0.554 87.601
## 6 A Stock 1.438 1.979 1.757 1.403 1.935 1.730
## MAE_Arima MAPE_Arima RMSE_Arima MAE_Prophet MAPE_Prophet RMSE_Prophet
## 1 153.128 0.671 190.596 301.782 1.332 360.628
## 2 13.542 0.534 17.401 25.158 0.993 30.525
## 3 62.610 0.959 73.002 56.591 0.872 69.022
## 4 258.030 1.212 343.636 432.946 2.075 553.617
## 5 71.135 0.585 90.486 162.773 1.339 212.442
## 6 1.432 1.972 1.752 3.097 4.222 3.890
## RMSE_autoML MAE_autoML
## 1 346.167 281.519
## 2 49.915 44.011
## 3 154.802 134.157
## 4 690.358 608.160
## 5 402.793 380.504
## 6 1.339 1.139
# All RMSEs
RMSE = final_results %>% select(symbol,type,starts_with('RMSE')) %>% pivot_longer(3:7) %>% rename(RMSE = value)
RMSE$name %<>% str_replace_all("RMSE_","")
Below are some data visualizations comparing the accuracy of the models.
RMSE %>% group_by(name) %>% summarize(RMSE=mean(RMSE)) %>% arrange(RMSE) %>% ggplot(aes(x = reorder(name,-RMSE), y = RMSE, fill = name)) + geom_col() + scale_fill_viridis_d() + coord_flip() + xlab('Forecasting Method') + ggtitle('Average Error by Forecasting Method') + labs(caption = "RMSE used because MAPE was not available for autoML") + geom_text(aes(label=base::round(RMSE,3)), nudge_y = 20) + theme_minimal()
The Arima, Naive, and Holt forecasts are essentially all the same (with the difference likely being due to the sample size of 100). I conclude that since no forecast can improve upon the naive forecast, none of these methods (with my selected parameters) are appropriate for predicting future values of stocks and cryptocurrencies.
It is likely that autoML or Prophet could be improved by feeding more variables (some of which might supply useful information) or adjusting model parameters.
RMSE %>% ggplot(aes(x = name, y = RMSE, fill = name)) + geom_boxplot() + theme_minimal() + coord_flip() + ylim(0,40) + scale_fill_viridis_d() + ggtitle('Error Distribution by Model') + xlab('Model') + theme(legend.position = "none")
RMSE %>% ggplot(aes(x = type, y = RMSE, fill = type)) + geom_violin() + coord_flip() + ylim(0,40) + scale_fill_viridis_d(begin = .2) + theme(legend.position = "none") + xlab('Time Series Type') + ggtitle('Error Distribution by Type of Time Series') + theme_minimal()
Note: There were a few ties between lowest RMSEs for a few stocks. Each tie was between Arima and Naive. Each tie will still be counted as +1 for both.
windf = RMSE %>% group_by(symbol) %>% summarize(RMSE = min(RMSE))
winners = merge(windf, RMSE, by.x = c('symbol','RMSE'), by.y = c('symbol','RMSE')) %>% select(name)
winners_summary = winners %>% count(winners$name) %>% rename(Model=1,NumberWins=2)
winners_summary %>% ggplot(aes(x=reorder(Model,NumberWins),y=NumberWins,fill=Model)) + geom_col() + scale_fill_viridis_d() + coord_flip() + theme_minimal() + ggtitle('Number of Wins for Each Model') + xlab('Model') + theme(legend.position = "none") + geom_text(aes(label=NumberWins), nudge_y = 2)