About me
In the context of economics mathematics, the Hidden Markov Model (HMM) has been widely utilized to forecast economic regimes and stock prices. In this study, we look at a new HMM approach to stock price prediction. In this work, we show how to measure the HMM’s performance with different numbers of states using the Akaike Information Criterion (AIC), the Hannan– Quinn Information Criterion (HQC), the Bayesian Information Criterion (BIC), and the Bozdogan Consistent Akaike Information Criterion (AIC). After that, we’ll show you how to use HMM to forecast stock prices and how to validate the model with a test set.
This involves collecting data for the study, which we performed by following the steps mentioned below :
We apply the HMM for numerous independent variables in this investigation, such as the BTC open, low, high, and closing prices. To keep the model simple and practical with cyclical economic cycles, we limit the number of states in the HMM to six. The AIC, BIC, HQC, and CAIC are four metrics used to choose the best HMM model. We utilize the HMM to predict the BTC price after selecting the best model.
The following R packages are required for this
study.
## Load Packages
# Importing the required packageS
library(Metrics)
library(zoo,warn.conflicts=FALSE)
library(lubridate,warn.conflicts=FALSE)
library(mgcv,warn.conflicts=FALSE)
library(rugarch,warn.conflicts=FALSE)
# visualization
suppressPackageStartupMessages(library(ggplot2))
# calculating returns
suppressPackageStartupMessages(library(PerformanceAnalytics))
library(tidyverse) # ggplot(), %>%, mutate(), and friends
library(tidyr) #For ggplot
library(broom) # Convert models to data frames
library(scales) # Format numbers with functions like comma(), percent(), and dollar()
library(modelsummary) # Create side-by-side regression tables
# To read csv and xlsx files
library(readr)
library(here)
library(readxl)
#library(nlme)
#Transform data to time series
library(zoo)
library(dplyr)
library(ggrepel)
library(ggpubr)
library(forecast)
library(pastecs)
#For Maximum Likelihood method
library(bbmle)
#For Garch model
library(fBasics)
library(fGarch)
library(datetime)
library(depmixS4)
## Numbers X X.1 X.2
## Length:46391 Length:46391 Length:46391 Length:46391
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## X.3 X.4 X.5 X.6
## Length:46391 Length:46391 Length:46391 Length:46391
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## X.7 X.8 X.9 X.10
## Length:46391 Mode:logical Length:46391 Length:46391
## Class :character NA's:46391 Class :character Class :character
## Mode :character Mode :character Mode :character
## X.11 X.12 X.13 X.14
## Length:46391 Length:46391 Length:46391 Length:46391
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## 'data.frame': 46391 obs. of 16 variables:
## $ Numbers: chr "Numbers" "1650240000000" "1650236400000" "1650232800000" ...
## $ X : chr "date" "4/18/2022 0:00" "4/17/2022 23:00" "4/17/2022 22:00" ...
## $ X.1 : chr "symbol" "BTC/USD" "BTC/USD" "BTC/USD" ...
## $ X.2 : chr "open" "39694.59" "39996.41" "40214.81" ...
## $ X.3 : chr "high" "39773.19" "40001.55" "40333.3" ...
## $ X.4 : chr "low" "39597.36" "39572.66" "39962.41" ...
## $ X.5 : chr "close" "39738.54" "39694.59" "39996.41" ...
## $ X.6 : chr "Volume BTC" "32.48284535" "59.81035106" "21.98654713" ...
## $ X.7 : chr "Volume USD" "1290820.8492548" "2374147.3630828" "879382.9534958" ...
## $ X.8 : logi NA NA NA NA NA NA ...
## $ X.9 : chr "" "" "open" "high" ...
## $ X.10 : chr "" "MIN" "760.38" "769.1" ...
## $ X.11 : chr "" "MAX" "68636.96" "69000" ...
## $ X.12 : chr "" "MEAN" "16903.499974006" "16998.313438917" ...
## $ X.13 : chr "" "S.D" "17563.812175808" "17662.29582737" ...
## $ X.14 : chr "" "MEDIAN" "8953.47" "9013.11" ...
data<-data[1:9]
column_names<-data[1,]
names(data)<-column_names
head(rename(data,volume_BTC = `Volume BTC`))
## Numbers date symbol open high low close
## 1 Numbers date symbol open high low close
## 2 1650240000000 4/18/2022 0:00 BTC/USD 39694.59 39773.19 39597.36 39738.54
## 3 1650236400000 4/17/2022 23:00 BTC/USD 39996.41 40001.55 39572.66 39694.59
## 4 1650232800000 4/17/2022 22:00 BTC/USD 40214.81 40333.3 39962.41 39996.41
## 5 1650229200000 4/17/2022 21:00 BTC/USD 40272.78 40289.72 40145.42 40214.81
## 6 1650225600000 4/17/2022 20:00 BTC/USD 40378.75 40464.42 40269.08 40272.78
## volume_BTC Volume USD
## 1 Volume BTC Volume USD
## 2 32.48284535 1290820.8492548
## 3 59.81035106 2374147.3630828
## 4 21.98654713 879382.9534958
## 5 8.43811461 339337.17579937
## 6 13.71748536 552441.2700565
data<-data[-1,]
data$date<-as_datetime(data$date, format="%m/%d/%Y %H:%M")
for (i in 4:9) {
data[[i]]<-as.numeric(data[[i]])
}
str(data)
## 'data.frame': 46390 obs. of 9 variables:
## $ Numbers : chr "1650240000000" "1650236400000" "1650232800000" "1650229200000" ...
## $ date : POSIXct, format: "2022-04-18 00:00:00" "2022-04-17 23:00:00" ...
## $ symbol : chr "BTC/USD" "BTC/USD" "BTC/USD" "BTC/USD" ...
## $ open : num 39695 39996 40215 40273 40379 ...
## $ high : num 39773 40002 40333 40290 40464 ...
## $ low : num 39597 39573 39962 40145 40269 ...
## $ close : num 39739 39695 39996 40215 40273 ...
## $ Volume BTC: num 32.48 59.81 21.99 8.44 13.72 ...
## $ Volume USD: num 1290821 2374147 879383 339337 552441 ...
summary(data)
## Numbers date symbol
## Length:46390 Min. :2017-01-01 00:00:00.00 Length:46390
## Class :character 1st Qu.:2018-04-29 05:15:00.00 Class :character
## Mode :character Median :2019-08-25 10:30:00.00 Mode :character
## Mean :2019-08-25 11:51:19.21
## 3rd Qu.:2020-12-20 18:45:00.00
## Max. :2022-04-18 00:00:00.00
## open high low close
## Min. : 760.4 Min. : 769.1 Min. : 752 Min. : 760.4
## 1st Qu.: 5915.8 1st Qu.: 5958.4 1st Qu.: 5876 1st Qu.: 5916.5
## Median : 8953.5 Median : 9013.1 Median : 8905 Median : 8954.8
## Mean :16903.5 Mean :16998.3 Mean :16801 Mean :16904.3
## 3rd Qu.:23200.2 3rd Qu.:23312.1 3rd Qu.:23009 3rd Qu.:23208.9
## Max. :68637.0 Max. :69000.0 Max. :68478 Max. :68637.0
## Volume BTC Volume USD
## Min. : 0.00 Min. : 0
## 1st Qu.: 28.52 1st Qu.: 256908
## Median : 68.52 Median : 756279
## Mean : 153.54 Mean : 1782066
## 3rd Qu.: 165.82 3rd Qu.: 2011121
## Max. :8526.75 Max. :103962871
hourly_data <- data %>%
mutate(date_hour = floor_date(date, "hour")) %>%
group_by(date_hour) %>% summarise(open = open,high = high,low = low,close = close,`Volume BTC` = `Volume BTC`,`Volume USD` = `Volume USD`)
time_frame <- as_datetime(c("2017-01-01 00:00:00
", "2022-04-18 00:00:00"))
date_hour <- data.frame(
date_hour = seq(time_frame[1], time_frame[2], by = "hour")
)
hourly_data <- hourly_data %>%
right_join(date_hour, by = "date_hour")
hourly_data <- na.omit(hourly_data)
head(hourly_data)
## # A tibble: 6 × 7
## date_hour open high low close `Volume BTC` `Volume USD`
## <dttm> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2017-01-01 00:00:00 975. 975. 965. 972 10.9 10609.
## 2 2017-01-01 01:00:00 972 972 970. 971. 2.20 2139.
## 3 2017-01-01 02:00:00 971. 974. 968. 970. 81.7 79222.
## 4 2017-01-01 03:00:00 970. 972. 968. 968. 2.38 2305.
## 5 2017-01-01 04:00:00 968. 968. 968. 968. 5.04 4880.
## 6 2017-01-01 05:00:00 968. 968. 965 967. 0.394 381.
tail(hourly_data)
## # A tibble: 6 × 7
## date_hour open high low close `Volume BTC` `Volume USD`
## <dttm> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2022-04-17 19:00:00 40120. 40447. 40102. 40379. 16.3 656464.
## 2 2022-04-17 20:00:00 40379. 40464. 40269. 40273. 13.7 552441.
## 3 2022-04-17 21:00:00 40273. 40290. 40145. 40215. 8.44 339337.
## 4 2022-04-17 22:00:00 40215. 40333. 39962. 39996. 22.0 879383.
## 5 2022-04-17 23:00:00 39996. 40002. 39573. 39695. 59.8 2374147.
## 6 2022-04-18 00:00:00 39695. 39773. 39597. 39739. 32.5 1290821.
Let’s print some statistics such as the mean, median, min, max and standard deviation values for the above feature (column).
stat.desc(hourly_data) # or we can use describe(hourly_data)
## date_hour open high low close
## nbr.val 4.639000e+04 4.639000e+04 4.639000e+04 4.639000e+04 4.639000e+04
## nbr.null 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## nbr.na 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## min 1.483229e+09 7.603800e+02 7.691000e+02 7.520000e+02 7.603800e+02
## max 1.650240e+09 6.863696e+04 6.900000e+04 6.847794e+04 6.863696e+04
## range 1.670112e+08 6.787658e+04 6.823090e+04 6.772594e+04 6.787658e+04
## sum 7.268078e+13 7.841534e+08 7.885518e+08 7.794161e+08 7.841921e+08
## median 1.566729e+09 8.953470e+03 9.013110e+03 8.904650e+03 8.954780e+03
## mean 1.566734e+09 1.690350e+04 1.699831e+04 1.680138e+04 1.690433e+04
## SE.mean 2.238569e+05 8.154674e+01 8.200399e+01 8.105320e+01 8.154750e+01
## CI.mean.0.95 4.387629e+05 1.598328e+02 1.607291e+02 1.588655e+02 1.598343e+02
## var 2.324691e+15 3.084875e+08 3.119567e+08 3.047647e+08 3.084933e+08
## std.dev 4.821505e+07 1.756381e+04 1.766230e+04 1.745751e+04 1.756398e+04
## coef.var 3.077425e-02 1.039064e+00 1.039062e+00 1.039052e+00 1.039022e+00
## Volume BTC Volume USD
## nbr.val 4.639000e+04 4.639000e+04
## nbr.null 2.190000e+02 2.190000e+02
## nbr.na 0.000000e+00 0.000000e+00
## min 0.000000e+00 0.000000e+00
## max 8.526751e+03 1.039629e+08
## range 8.526751e+03 1.039629e+08
## sum 7.122614e+06 8.267002e+10
## median 6.852298e+01 7.562795e+05
## mean 1.535377e+02 1.782066e+06
## SE.mean 1.287140e+00 1.522183e+04
## CI.mean.0.95 2.522815e+00 2.983503e+04
## var 7.685573e+04 1.074876e+13
## std.dev 2.772287e+02 3.278530e+06
## coef.var 1.805606e+00 1.839736e+00
# For Close price
Close_ts <- zoo(
x = hourly_data[["close"]],
order.by = hourly_data[["date_hour"]],
frequency = 24
)
# For Open price
Open_ts <- zoo(
x = hourly_data[["open"]],
order.by = hourly_data[["date_hour"]],
frequency = 24
)
# For High price
High_ts <- zoo(
x = hourly_data[["high"]],
order.by = hourly_data[["date_hour"]],
frequency = 24
)
# For Low price
Low_ts <- zoo(
x = hourly_data[["low"]],
order.by = hourly_data[["date_hour"]],
frequency = 24
)
# For Volume BTC
Volume_BTC_ts <- zoo(
x = hourly_data[["Volume BTC"]],
order.by = hourly_data[["date_hour"]],
frequency = 24
)
# For Volume USD
Volume_USD_ts <- zoo(
x = hourly_data[["Volume USD"]],
order.by = hourly_data[["date_hour"]],
frequency = 24
)
head(Close_ts)
## 2017-01-01 00:00:00 2017-01-01 01:00:00 2017-01-01 02:00:00 2017-01-01 03:00:00
## 972.00 970.55 969.89 967.80
## 2017-01-01 04:00:00 2017-01-01 05:00:00
## 967.80 966.56
tail(Close_ts)
## 2022-04-17 19:00:00 2022-04-17 20:00:00 2022-04-17 21:00:00 2022-04-17 22:00:00
## 40378.75 40272.78 40214.81 39996.41
## 2022-04-17 23:00:00 2022-04-18 00:00:00
## 39694.59 39738.54
# For Close price
autoplot(Close_ts,col="Close" ,main = 'Time series plot of hourly close price from Jan. 01,
2017 12h AM, to Apr. 18, 2022 12h AM '
,xlab='Date: From Jan. 01, 2017 12h AM, to Apr. 18, 2022 12h AM', ylab="Prices")+
scale_color_manual(values = c('Close'='blue'), breaks = c('Close'))+
theme(plot.title = element_text(hjust = 0.5))
# For Open price
autoplot(Open_ts,col="Open" ,main = 'Time series plot of hourly open price from Jan. 01,
2017 12h AM, to Apr. 18, 2022 12h AM '
,xlab='Date: From Jan. 01, 2017 12h AM, to Apr. 18, 2022 12h AM', ylab="Prices")+
scale_color_manual(values = c('Open'='green'), breaks = c('Open'))+
theme(plot.title = element_text(hjust = 0.5))
# For High price
autoplot(High_ts,col="High" ,main = 'Time series plot of hourly high price from Jan. 01,
2017 12h AM, to Apr. 18, 2022 12h AM '
,xlab='Date: From Jan. 01, 2017 12h AM, to Apr. 18, 2022 12h AM', ylab="Prices")+
scale_color_manual(values = c('High'='black'), breaks = c('High'))+
theme(plot.title = element_text(hjust = 0.5))
# For Low price
autoplot(Low_ts,col="Low" ,main = 'Time series plot of hourly low price from Jan. 01,
2017 12h AM, to Apr. 18, 2022 12h AM '
,xlab='Date: From Jan. 01, 2017 12h AM, to Apr. 18, 2022 12h AM', ylab="Prices")+
scale_color_manual(values = c('Low'='red'), breaks = c('Low'))+
theme(plot.title = element_text(hjust = 0.5))
The figure above represents the series (Open, Close, High, Low). The first observations we can take from this series are the following: • The series has an increasing trend. • We can use the log of the series to remove this variation of amplitude in the series. • The series is not stationary, the variance is not constant over time.
# For Volume BTC
autoplot(Volume_BTC_ts,col="Volume BTC" ,main = 'Time series plot of hourly volume BTC from
Jan. 01, 2017 12h AM, to Apr. 18, 2022 12h AM '
,xlab='Date: From Jan. 01, 2017 12h AM, to Apr. 18, 2022 12h AM', ylab="Prices")+
scale_color_manual(values = c('Volume BTC'='brown'), breaks = c('Volume BTC'))+
theme(plot.title = element_text(hjust = 0.5))
# For Volume USD
autoplot(Volume_USD_ts,col="Volume USD" ,main = 'Time series plot of hourly volume USD
from Jan. 01, 2017 12h AM, to Apr. 18, 2022 12h AM '
,xlab='Date: From Jan. 01, 2017 12h AM, to Apr. 18, 2022 12h AM', ylab="Prices")+
scale_color_manual(values = c('Volume USD'='orange'), breaks = c('Volume USD'))+
theme(plot.title = element_text(hjust = 0.5))
#---bloc of data size 1000----
bloc_data<-hourly_data[2:5]# We chose close, open, high, low prices
bloc_data<-bloc_data[35000:36000,] # Chose periode from 2020 December 29 10:00:00 ---› 2021 February 09 02:00:00
bloc_data<-as.matrix(bloc_data)
The parameters of an HMM are the constant matrix \(A\), the observation probability matrix \(B\) and the vector \(p\), which is summarized in a compact notation: \(\lambda =\{N, M, A, B, p\}\)
N: Number of state.M: Number of different observations that price can make
in one given time step in this case we’re looking at four different
prices (close, open, low, high) that we can observe.A: Two-dimensional matrix in principle of transition
probabilities. The probability of mving from state i to
another state j every time step the price must move from
one state to another even if that transition means staying in
place.B: Vector of probabilities of seeing the different
observations given that you’re in a particular state. Otherwise, B is a
varaible that captures the probability of seeing the different prices
depending on what state you’re in.p: Variable that captures what the probability is
starting in a particular state.#a<-matrix(rep(1/number_state,number_state*number_state), number_state)# A initial
#p<-numeric(number_state) # p initial
#p[1]<-1
b_norm<-function(price,number_state) # function for B initial
{
parameter<-matrix(0,nrow = 2,ncol=number_state)#B is a list of matrice observation
v<-sd(price)
m<-mean(price)
for (j in 1:number_state) {
parameter[1,j]<-m + rnorm(1)
parameter[2,j]<-v
}
symbol<-unique(price)
M<-length(symbol)
B<-matrix(0,nrow = number_state ,ncol= M)
for (j in 1:number_state) {
m<-parameter[1,j]
v<-parameter[2,j]
B[j,]<-pnorm(symbol,m,v) #normal probability
}
return(B)
}
#--data block
move_data<-function(data_set,bloc_number,size_bloc)
{
T<-bloc_number + size_bloc
bloc_data<-data_set[bloc_number:T,]
return(bloc_data)
}
#----------AIC and BIC with depmixs4
criteria_depmix<-function(number_state,size_bloc,v_data){
m<-size_bloc
aic_result<-numeric(m)
bic_result<-numeric(m)
for (i in 1:m) {
#print(i)
block<-move_data(v_data,i,size_bloc)
y1<-block[,1]
y2<-block[,2]
y3<-block[,3]
y4<-block[,4]
df<-data.frame(y1,y2,y3,y4)
m2 <-mix(list(y1~1,y2~1,y3~1,y4~1),data=df,ns=number_state,family=list(gaussian(),gaussian(),gaussian(),gaussian()),ntimes=length(y4))
#mix(y4~1,data=df,ns=number_state,family=gaussian())
fm2<-fit(m2)
aic_result[i]<-AIC(fm2)
bic_result[i]<-BIC(fm2)
}
return(list(aic = aic_result,bic = bic_result))
}
#-----graphes
#result_a<-matrix(0,8,120)
#result_b<-matrix(0,8,120)
#for (i in 2:8) {
# r<-criteria_depmix(i,120,bloc_data)
# result_a[i-1,]<-r[[1]]
# result_b[i-1,]<-r[[2]]
#}
#result_L<-result_L[,1:120]
#AIC graphe
#result_a1<- result_a
#result_a<- result_a1[,1:60]
#plot(result_a[1,],type="l",main="AIC for 120 HMM’s parameter calibrations BTC hourly #prices",ylab="AIC",
# xlab="index",ylim=c(7500,8500),pch=5)
#lines(result_a[2,], pch=1,lty="dashed",col="blue")
#lines(result_a[3,], pch=2,lty="dashed",col="red")
#lines(result_a[4,], pch=3,lty="dashed",col="green")
#lines(result_a[5,], pch=4,lty="dashed",col="maroon")
#lines(result_a[6,], pch=4,lty="dashed",col="yellow")
#lines(result_a[7,], pch=4,lty="dashed",col="magenta")
#legend(x = "topright", # Position
# legend = c("2-state","3-state", "4-state","5-state", "6-state", "7-state", "8-state"),
# Legend texts
# lty = c(1,2,2,2), # Line types
# col = c("black","blue","red","green","maroon","yellow", "magenta"), # Line colors
# lwd = 2 , box.lty=0) # Line width
#BIC graphe
#result_b1<- result_b
#result_b<- result_b1[,1:60]
#plot(result_b[1,],type="l",main="BIC for 120 HMM’s parameter calibrations BTC hourly prices",ylab="BIC",
# xlab="index",ylim=c(7500,8500),pch=5)
#lines(result_b[2,], pch=1,lty="dashed",col="blue")
#lines(result_b[3,], pch=2,lty="dashed",col="red")
#lines(result_b[4,], pch=3,lty="dashed",col="green")
#lines(result_b[5,], pch=4,lty="dashed",col="maroon")
#lines(result_b[6,], pch=4,lty="dashed",col="yellow")
#lines(result_b[7,], pch=4,lty="dashed",col="magenta")
#legend(x = "topright", # Position
# legend = c("2-state","3-state", "4-state","5-state", "6-state", "7-state", "8-state"),
# Legend texts
# lty = c(1,2,2,2), # Line types
# col = c("black","blue","red","green","maroon","yellow", "magenta"), # Line colors
# lwd = 2 , box.lty=0) # Line width