library(quantmod)
## 필요한 패키지를 로딩중입니다: xts
## 필요한 패키지를 로딩중입니다: zoo
##
## 다음의 패키지를 부착합니다: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## 필요한 패키지를 로딩중입니다: TTR
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(magrittr)
getSymbols(Symbols = 'NVDA', from = '2016-01-01')
## [1] "NVDA"
head(NVDA)
## NVDA.Open NVDA.High NVDA.Low NVDA.Close NVDA.Volume NVDA.Adjusted
## 2016-01-04 8.0725 8.1450 8.0100 8.0925 35807600 7.902439
## 2016-01-05 8.2450 8.3600 8.1250 8.2225 49027200 8.029387
## 2016-01-06 8.0875 8.1250 7.7900 7.8825 44934400 7.697371
## 2016-01-07 7.6850 7.7375 7.4700 7.5700 64530400 7.392212
## 2016-01-08 7.6675 7.6750 7.3925 7.4075 39847200 7.233527
## 2016-01-11 7.4150 7.4725 7.2875 7.4200 40937200 7.245734
nvda <- NVDA$NVDA.Adjusted
tail(nvda)
## NVDA.Adjusted
## 2023-05-05 286.80
## 2023-05-08 291.51
## 2023-05-09 285.71
## 2023-05-10 288.85
## 2023-05-11 285.78
## 2023-05-12 283.40
#Visualize Stock Price
library(ggplot2)
nvda %>%
ggplot(aes(x=index(nvda),y=NVDA.Adjusted))+
geom_line(color='#4373B6')+
labs(x='Date',y='Price',title='NVDA Daily Price (2016 - 2023)')+
theme_bw()
#Compute return using diff log
nvda_return <- diff(log(nvda)) %>% na.omit()
head(nvda_return)
## NVDA.Adjusted
## 2016-01-05 0.015936740
## 2016-01-06 -0.042229344
## 2016-01-07 -0.040451828
## 2016-01-08 -0.021700268
## 2016-01-11 0.001686136
## 2016-01-12 0.016706137
#Visualize Stock Return
nvda_return %>%
ggplot(aes(x=index(nvda_return),y=NVDA.Adjusted))+
geom_line(color='#4373B6')+
labs(x='Date',y='Return',title='NVDA Daily Return (2016 - 2023)')+
theme_bw()+
scale_y_continuous(labels = scales::percent)
length(nvda_return)
## [1] 1852
#Use data from 2016 to 2022 define the order of (p,d,q) for arima
nvda_return[1761,]
## NVDA.Adjusted
## 2022-12-30 0.0007530183
train <- nvda_return[1:1761,]
head(train)
## NVDA.Adjusted
## 2016-01-05 0.015936740
## 2016-01-06 -0.042229344
## 2016-01-07 -0.040451828
## 2016-01-08 -0.021700268
## 2016-01-11 0.001686136
## 2016-01-12 0.016706137
tail(train)
## NVDA.Adjusted
## 2022-12-22 -0.0730223754
## 2022-12-23 -0.0087085104
## 2022-12-27 -0.0740270523
## 2022-12-28 -0.0060375816
## 2022-12-29 0.0396015280
## 2022-12-30 0.0007530183
library(forecast)
#Custom code for ACF and PACF Plot
#Refernce: https://rh8liuqy.github.io/ACF_PACF_by_ggplot2.html#correlogram-formula
ggplot.corr <- function(data, lag.max = 24, ci = 0.95, large.sample.size = TRUE, horizontal = TRUE,...) {
options(dplyr.banner_expression = FALSE)
require(ggplot2)
require(dplyr)
require(cowplot)
if(horizontal == TRUE) {numofrow <- 1} else {numofrow <- 2}
list.acf <- acf(data, lag.max = lag.max, type = "correlation", plot = FALSE)
N <- as.numeric(list.acf$n.used)
df1 <- data.frame(lag = list.acf$lag, acf = list.acf$acf)
df1$lag.acf <- dplyr::lag(df1$acf, default = 0)
df1$lag.acf[2] <- 0
df1$lag.acf.cumsum <- cumsum((df1$lag.acf)^2)
df1$acfstd <- sqrt(1/N * (1 + 2 * df1$lag.acf.cumsum))
df1$acfstd[1] <- 0
df1 <- dplyr::select(df1, lag, acf, acfstd)
list.pacf <- acf(data, lag.max = lag.max, type = "partial", plot = FALSE)
df2 <- data.frame(lag = list.pacf$lag,pacf = list.pacf$acf)
df2$pacfstd <- sqrt(1/N)
if(large.sample.size == TRUE) {
plot.acf <- ggplot(data = df1, aes( x = lag, y = acf)) +
geom_area(aes(x = lag, y = qnorm((1+ci)/2)*acfstd), fill = "#B9CFE7") +
geom_area(aes(x = lag, y = -qnorm((1+ci)/2)*acfstd), fill = "#B9CFE7") +
geom_col(fill = "#4373B6", width = 0.7) +
scale_x_continuous(breaks = seq(0,max(df1$lag),6)) +
scale_y_continuous(name = element_blank(),
limits = c(min(df1$acf,df2$pacf),1)) +
ggtitle("ACF") +
theme_bw() +
theme(plot.title = element_text(size=10))
plot.pacf <- ggplot(data = df2, aes(x = lag, y = pacf)) +
geom_area(aes(x = lag, y = qnorm((1+ci)/2)*pacfstd), fill = "#B9CFE7") +
geom_area(aes(x = lag, y = -qnorm((1+ci)/2)*pacfstd), fill = "#B9CFE7") +
geom_col(fill = "#4373B6", width = 0.7) +
scale_x_continuous(breaks = seq(0,max(df2$lag, na.rm = TRUE),6)) +
scale_y_continuous(name = element_blank(),
limits = c(min(df1$acf,df2$pacf),1)) +
ggtitle("PACF") +
theme_bw() +
theme(plot.title = element_text(size=10))
}
else {
plot.acf <- ggplot(data = df1, aes( x = lag, y = acf)) +
geom_col(fill = "#4373B6", width = 0.7) +
geom_hline(yintercept = qnorm((1+ci)/2)/sqrt(N),
colour = "sandybrown",
linetype = "dashed",
size=1) +
geom_hline(yintercept = - qnorm((1+ci)/2)/sqrt(N),
colour = "sandybrown",
linetype = "dashed",
size=1) +
scale_x_continuous(breaks = seq(0,max(df1$lag),6)) +
scale_y_continuous(name = element_blank(),
limits = c(min(df1$acf,df2$pacf),1)) +
ggtitle("ACF") +
theme_bw() +
theme(plot.title = element_text(size=10))
plot.pacf <- ggplot(data = df2, aes(x = lag, y = pacf)) +
geom_col(fill = "#4373B6", width = 0.7) +
geom_hline(yintercept = qnorm((1+ci)/2)/sqrt(N),
colour = "sandybrown",
linetype = "dashed",
size=1) +
geom_hline(yintercept = - qnorm((1+ci)/2)/sqrt(N),
colour = "sandybrown",
linetype = "dashed",
size=1) +
scale_x_continuous(breaks = seq(0,max(df2$lag, na.rm = TRUE),6)) +
scale_y_continuous(name = element_blank(),
limits = c(min(df1$acf,df2$pacf),1)) +
ggtitle("PACF") +
theme_bw() +
theme(plot.title = element_text(size=10))
}
cowplot::plot_grid(plot.acf, plot.pacf, nrow = numofrow)
}
#Custom code for ARIMA Model ACF PACF reisduals
#Refernce: https://rh8liuqy.github.io/ACF_PACF_by_ggplot2.html#correlogram-formula
library(tidyr)
##
## 다음의 패키지를 부착합니다: 'tidyr'
## The following object is masked from 'package:magrittr':
##
## extract
arima.order <- c(2,0,2)
p <- 0:arima.order[1]
d <- 0:arima.order[2] #Revise this part depending on how many differencing was conducted
q <- 0:arima.order[3]
df.arima.order <- data.frame(t(expand.grid(p=p,d=d,q=q)))
df.colnames <- data.frame(expand.grid(p=p,d=d,q=q))
df.colnames$type <- with(df.colnames,paste0
("ARIMA(",p,",",d,",",q,")"))
list.arima.model <- lapply(df.arima.order,
function(data, x, ...) {
arima(x = data, order = x, ...)
},
data = train) # Data has to be in ts or xts format
df.arima.residual <- sapply(list.arima.model,
function(x) {
x$residuals
})
df.arima.residual <- data.frame(df.arima.residual)
colnames(df.arima.residual) <- df.colnames$type
df.arima.residual$time <- time(train) # Data has to be in ts or xts format
df.arima.residual.plot <- gather(data = df.arima.residual, value = "residual", key = "type",-time)
df.acf <- sapply(subset(df.arima.residual, select = -time),
function(x, lag.max,...) {
acf(x = x, lag.max = lag.max, plot = FALSE)$acf
},
lag.max = 24)
df.acf <- data.frame(df.acf)
colnames(df.acf) <- df.colnames$type
df.acf$lag <- 0:(dim(df.acf)[1]-1)
df.acf.plot <- gather(df.acf,
value = "value",
key = "Model",
-lag)
df.pacf <- sapply(subset(df.arima.residual, select = -time),
function(x, lag.max,...) {
acfvalue <- as.numeric(acf(x = x,
lag.max = lag.max,
type = "partial",
plot = FALSE)$acf)
return(c(1,acfvalue))
},
lag.max = 24)
df.pacf <- data.frame(df.pacf)
colnames(df.pacf) <- df.colnames$type
df.pacf$lag <- 0:(dim(df.pacf)[1]-1)
df.pacf.plot <- gather(df.pacf,
value = "value",
key = "Model",
-lag)
fig.acf <- ggplot(data = df.acf.plot, aes(x = lag, y = value)) +
geom_col(width = 0.4, fill = "#667fa2") +
geom_hline(yintercept = qnorm((1+0.95)/2)/sqrt(dim(df.acf.plot)[1]),
linetype = "dashed",
colour = "grey50") +
geom_hline(yintercept = -qnorm((1+0.95)/2)/sqrt(dim(df.acf.plot)[1]),
linetype = "dashed",
colour = "grey50") +
scale_y_continuous(name = element_blank()) +
scale_x_continuous(breaks = seq(0,24,4)) +
ggtitle("ACF of residuals") +
facet_wrap(~Model) +
theme_bw() +
theme(panel.grid.minor = element_blank(),
strip.background = element_rect(fill = "#f8ddc4"))
fig.pacf <- ggplot(data = df.pacf.plot, aes(x = lag, y = value)) +
geom_col(width = 0.4, fill = "#667fa2") +
geom_hline(yintercept = qnorm((1+0.95)/2)/sqrt(dim(df.pacf.plot)[1]),
linetype = "dashed",
colour = "grey50") +
geom_hline(yintercept = -qnorm((1+0.95)/2)/sqrt(dim(df.pacf.plot)[1]),
linetype = "dashed",
colour = "grey50") +
scale_y_continuous(name = element_blank()) +
scale_x_continuous(breaks = seq(0,24,4)) +
ggtitle("PACF of residuals") +
facet_wrap(~Model) +
theme_bw() +
theme(panel.grid.minor = element_blank(),
strip.background = element_rect(fill = "#f8ddc4"))
fig.residual <- ggplot(data = df.arima.residual.plot, aes(x = time, y = residual)) +
geom_line(colour = "#667fa2") +
ggtitle("Residuals") +
facet_wrap(~type) +
theme_bw() +
theme(panel.grid.minor = element_blank(),
strip.background = element_rect(fill = "#f8ddc4"))
ggplot.corr(train,large.sample.size = FALSE)
## 필요한 패키지를 로딩중입니다: dplyr
##
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:xts':
##
## first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## 필요한 패키지를 로딩중입니다: cowplot
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## i Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
From the correlogram, it seems that AR(1), MA(1), or ARIMA(1,0,1) is adequate for forecast. We will see AIC for final order for ARIMA.
fig.acf
fig.pacf
fig.residual
#Custom Code for AIC
aic <- sapply(list.arima.model, function(X) {X$aic})
aic <- round(as.numeric(aic),2)
df.aic.plot <- data.frame(aic = aic, p = df.colnames$p, q = df.colnames$q)
ggplot(data = df.aic.plot, aes(x = p, y = q)) +
geom_raster(aes(fill = aic)) +
geom_text(aes( label = aic), colour = "black") +
scale_x_continuous(expand = c(0.05,0.05)) +
scale_color_continuous( high = "#FFA800", low = "#4AC200") +
theme_bw() +
theme(panel.grid.minor = element_blank()) +
scale_fill_distiller(palette = "Spectral", direction = -1) +
ggtitle('Akaike Information Criterion (AIC)')
Although ARIMA(0,0,2) gives the lowest AIC, we will adopt AR(1) due to parismony principle.
First, let’s predict 2023 January 3rd NVDA return using AR(1) and compare to actual value
#Predict 2023-01-05 Stock Return
a <- arima(train,order = c(1,0,0)) %>% forecast
a$mean[1]
## [1] 0.001728894
#Actual Value
nvda_return[1762,]
## NVDA.Adjusted
## 2023-01-03 -0.0206721
Now, starting from 2023-01-03, we will predict the stock return as actual observation gets updated. The following code below shows
#Predict 2023-01-05 Return
train1 <- nvda_return[1:1761,] #Data until 2022-12-30
a1 <- arima(train1,order=c(1,0,0)) %>% forecast
a1$mean[1]
## [1] 0.001728894
#Predict 2023-01-06 Return
train2 <- nvda_return[1:1762,] #Data until 2022-01-05
a2 <- arima(train2,order=c(1,0,0)) %>% forecast
a2$mean[1]
## [1] 0.003464433
#Predict 2023-01-07 Return
train3 <- nvda_return[1:1763,]#Data until 2022-01-06
a3 <- arima(train3,order=c(1,0,0)) %>% forecast
a3$mean[1]
## [1] -0.0006542375
Using this logic, we will automate the process and predict all the Y2023 returns from ARIMA model.
# Set the ARIMA order
# Change the value of order for diferent outcome for cumulative return
order <- c(1, 0, 0)
# Set the length of each training set
train_length <- 1761
# Create an empty vector to store the results
a_means <- numeric()
# Iterate over the different subsets of nvda_return
for (i in seq(train_length, length(nvda_return)-1, by = 1)) {
# Extract the current training set
train <- nvda_return[1:i, ]
# Fit an ARIMA model to the training set and generate a forecast
a <- arima(train, order = order) %>% forecast
# Store the first mean forecast in the results vector
a_means[i - train_length + 1] <- a$mean[1]
}
# Print the results
a_means
## [1] 1.728894e-03 3.464433e-03 -6.542375e-04 4.527535e-03 -1.597737e-03
## [6] -2.314524e-03 3.834736e-04 1.368736e-03 -6.971491e-04 -1.198816e-05
## [11] -1.837152e-03 3.381323e-03 4.741396e-03 -3.168510e-03 -3.838217e-03
## [16] 1.643691e-03 1.702481e-03 2.732159e-05 -2.261552e-04 6.752674e-03
## [21] 3.996825e-04 -3.502117e-03 -7.800693e-04 4.201025e-03 2.022231e-03
## [26] -1.895142e-03 1.898210e-03 1.551586e-03 5.816565e-03 8.979662e-05
## [31] -2.087561e-03 2.726395e-03 4.646644e-03 4.161445e-03 4.621498e-03
## [36] 1.591768e-03 -7.969210e-03 3.261831e-03 1.322649e-03 2.959700e-03
## [41] 3.737523e-03 -4.186473e-05 1.620211e-04 3.114134e-03 2.875768e-03
## [46] -8.593897e-04 4.449467e-03 3.572742e-03 2.001527e-03 -1.569207e-03
## [51] 1.508273e-03 -1.989517e-03 1.517240e-03 1.550714e-03 1.197009e-03
## [56] 1.298960e-03 3.714267e-05 3.253827e-03 2.789219e-03 2.423277e-03
## [61] 4.417127e-04 9.729408e-04 1.011332e-03 1.589039e-03 3.499460e-03
## [66] 3.675943e-03 1.639513e-03 5.824284e-04 3.219647e-03 3.968260e-03
## [71] 2.153093e-03 1.227094e-03 1.383743e-03 2.440439e-04 1.371134e-03
## [76] 4.343375e-03 2.027029e-03 2.280930e-03 4.316984e-03 2.239228e-05
## [81] 1.311998e-03 6.369718e-04 -9.870499e-04 3.942354e-03 3.174607e-03
## [86] 2.721769e-03 -9.171190e-04 8.718254e-04 3.601667e-03 1.265322e-03
## [91] 2.891395e-03
#Create a new dataframe that shows 2023 predicted value and actual value
df <- data.frame(Date=index(nvda_return[1762:nrow(nvda_return),]),
Predict = a_means,
Actual = nvda_return[1762:nrow(nvda_return),]%>%as.numeric())
library(DT)
datatable(df)
# create a new column 'signal' based on the predict column
# If the Predicted value is negative, we are going to short
# If the Predicted Valye is positive, we are going to long
df$signal <- ifelse(df$Predict > 0, "Long", "Short")
datatable(df)
#Create signal direction to calculate return
df$direction <- ifelse(df$signal == "Short", -1, 1)
#We are going to multiply -1 * actual return for short position
#Multiplu 1 for long position
df$return <- df$direction*df$Actual
datatable(df)
#Calculate cumulative return for buy&hold strategy Vs. ARIMA strategy
#Buy&Hold return
df$buy_hold <- cumprod(1+df$Actual)
#ARIMA return
df$arima <- cumprod(1+df$return)
# plot the total returns on a graph
ggplot(df, aes(x = Date)) +
geom_line(aes(y = buy_hold, color = "Buy&Hold"),size=1) +
geom_line(aes(y = arima, color = "ARIMA"),size=1) +
scale_color_manual(values = c("Buy&Hold" = "grey30", "ARIMA" = '#4373B6')) +
labs(x = "Date", y = "Cumulative Return", color = "",title='Buy&Hold vs ARIMA') +
scale_x_date(breaks='2 month')+
theme_bw()+
theme(legend.position = "top")
From the result, it seems that Buy&Hold strategy gave more returns for NVDA. However, it still gave a positive return, which means that it is still a valiud strategy.
Now, let’s predict how the stock price would look like in the future period. To decide the order, we will use all the data points from the past and use auto.arima
unloadNamespace('ggfortify')
unloadNamespace('forecast')
library('forecast')
#Convert to ts format
ts_nvda <- nvda%>%ts()
#Forecast the stock price using auto.arima
fit <- auto.arima(ts_nvda) %>% forecast(h=300)
#Plot ARIMA model prediction
autoplot(ts_nvda,color='grey30') +
autolayer(fit,series = 'ARIMA') +
labs(x='Time', y='Return', title='NVDA Stock Price Prediction from ARIMA') +
theme_bw() +
scale_colour_manual(name='Method', values=c('red'))+
theme(legend.position=c(0.02, 0.98),
legend.justification=c('left', 'top'))