# PAGE: Exploratory Analysis
data <- na.omit(read.csv("data/exrates.csv", header = TRUE)) # same for Rich and Damon
data$dates <- as.Date(data$DATE, "%m/%d/%Y")
data <- data[,-1]
# Data preparation: use the tidyverse packages to collapse the currency columns into key-value pairs
library("tidyverse")
df <- data %>%
  dplyr::select(dates, USD.EUR, USD.GBP) %>%
  gather(key = "variable", value = "value", -dates)
head(df)
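Note that gather() is superseded in current tidyr; a sketch of the equivalent reshape with pivot_longer() (same long format, newer API) is:
# Equivalent reshape with the newer tidyr verb (optional alternative to gather())
df2 <- data %>%
  dplyr::select(dates, USD.EUR, USD.GBP) %>%
  tidyr::pivot_longer(cols = -dates, names_to = "variable", values_to = "value")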
# Visualize both series with a single geom_line() call
library(gridExtra)
library(grid)
library(ggplot2)
library(lattice)
library(scales)
library(ggpmisc)
library(forecast)
# Multiple line plot
p1 <- ggplot(df, aes(x = dates, y = value)) +
  geom_line(aes(color = variable, linetype = variable), size = 1) +
  theme(legend.position = c(0.2, 0.8)) +
  scale_x_date(labels = date_format("%Y-%b"), date_breaks = "1 year",
               date_minor_breaks = "1 month") +
  scale_color_manual(values = c("darkred", "steelblue")) +
  ylab("USD to GBP and EUR") +
  stat_smooth(aes(color = variable, fill = variable), method = "loess")
# stat_peaks(geom = "text", colour = "darkred",
# vjust = -0.5, x.label.fmt = "%b-%Y")
p2 <- ggplot(data, aes(x = dates, y = USD.JPY)) +
  geom_line(size = 1, color = "steelblue", linetype = "dotdash") +
  stat_smooth(color = "#FC4E07", fill = "#FC4E07", size = 1, linetype = "11",
              method = "loess")
p3 <- ggplot(data, aes(x = dates, y = USD.CNY)) +
  geom_line(size = 1, color = "steelblue", linetype = "twodash") +
  stat_smooth(color = "#FC4E07", fill = "#FC4E07", method = "loess")
ggpubr::ggarrange(p1,p2,p3, nrow=3)
These ACF plots give another indication of non-stationarity: for a stationary time series the ACF drops to zero relatively quickly, while the ACF of non-stationary data decays slowly.
library(stats)
cnames <- colnames(data)[-5] # currency columns; column 5 holds the dates
par(mfrow = c(2, 2))
for(i in 1:4){
  acf(data[,i], ylab = paste("ACF of", cnames[i]), main = "")
}
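As a numeric complement to the visual ACF check (an addition to the original analysis), the augmented Dickey-Fuller test can be run on each series; a large p-value is consistent with non-stationarity. This assumes the tseries package is available.
# Hedged sketch: ADF unit-root tests (requires the tseries package)
library(tseries)
for(i in 1:4){
  print(adf.test(data[,i])) # null hypothesis: the series has a unit root (non-stationary)
}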
# Time-series Analysis of USD.GBP Currency
## Detrending by localised least-squares regression
As we can see in each of the plots, the trends are non-linear, so a localised curve-fitting (LOESS) model is estimated, with the span optimised by cross-validation.
### Steps for choosing an optimum least-squares model
1. For a range of spans, perform 10-fold cross-validation.
2. For each span, average the MSE across the 10 folds.
3. Select the span with the minimum average MSE.
library("tidyverse")
library("dplyr")
best.loess.fit = list()
best.spann = list()
loess.fit <- function(currency){
  df <- data %>%
    select(x = dates, y = all_of(currency)) %>% mutate(x = as.numeric(x))
span.seq <- seq(from = 0.15, to = 0.95, by = 0.05) #explores range of spans
k <- 10 #number of folds
set.seed(1) # replicate results
folds <- sample(x = 1:k, size = length(df$x), replace = TRUE)
cv.error.mtrx <- matrix(rep(x = NA, times = k * length(span.seq)),
nrow = length(span.seq), ncol = k)
for(i in 1:length(span.seq)) {
for(j in 1:k) {
loess.fit <- loess(formula = y ~ x, data = df[folds != j, ], span = span.seq[i])
preds <- predict(object = loess.fit, newdata = df[folds == j, ])
cv.error.mtrx[i, j] <- mean((df$y[folds == j] - preds)^2, na.rm = TRUE)
# some predictions result in `NA` because of the `x` ranges in each fold
}
}
cv.errors <- rowMeans(cv.error.mtrx)
best.span.i <- which.min(cv.errors)
par(mfrow=c(2,1))
plot(x = span.seq, y = cv.errors, type = "l",
     main = paste("CV plot for", currency, "- best span =", span.seq[best.span.i]))
points(x = span.seq, y = cv.errors,
pch = 20, cex = 0.75, col = "blue")
points(x = span.seq[best.span.i], y = cv.errors[best.span.i],
pch = 20, cex = 1, col = "red")
best.loess <- loess(formula = y ~ x, data = df,
span = span.seq[best.span.i])
best.loess.fit[[which(colnames(data) == currency)]] <<- best.loess
best.spann[[which(colnames(data) == currency)]] <<- span.seq[best.span.i]
plot(x = as.Date(df$x, origin = "1970-01-01"), y = df$y,
     main = paste("Best span plot for", currency, "- span =", span.seq[best.span.i]),
     ylab = "Least-squares fitted", xlab = "Dates")
lines(x = as.Date(df$x, origin = "1970-01-01"),
      y = predict(object = best.loess, newdata = data.frame(x = df$x)),
      col = "steelblue", lwd = 2)
}
loess.fit("USD.CNY")
loess.fit("USD.JPY")
loess.fit("USD.EUR")
loess.fit("USD.GBP")
## Detrending by subtracting out the trend
len<-dim(data)[1]
df_detrended = data
colnames(df_detrended) = colnames(data)
cc<-colnames(data)[-5]
detrend <- function(currency){
  idx <- which(colnames(data) == currency)
  df <- data %>%
    select(x = dates, y = all_of(currency)) %>% mutate(x = as.numeric(x))
  tt <- df$y - predict(object = best.loess.fit[[idx]],
                       newdata = data.frame(x = df$x))
  df_detrended[, currency] <<- tt
}
detrend("USD.CNY")
detrend("USD.JPY")
detrend("USD.EUR")
detrend("USD.GBP")
A classical additive decomposition is \[ Y_t = T_t + S_t + R_t, \] where \(T_t\) is the trend, \(S_t\) the seasonal component and \(R_t\) the residual component. On the detrended data, the strength of seasonality is defined as \[ F_s = \max\Big(0,\, 1 - \frac{\mathrm{Var}(R_t)}{\mathrm{Var}(R_t + S_t)}\Big). \] If \(F_s\) is close to 0 there is essentially no seasonality; values close to 1 indicate strong seasonality.
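As a compact illustration of this definition (the decomposition loop below computes the same quantity inline), here is a sketch of a helper that takes a stats::decompose() result m:
# F_s = max(0, 1 - Var(R_t) / Var(R_t + S_t)), computed from a decompose() result
seasonal_strength <- function(m){
  r <- m$random   # residual component R_t (NA-padded at the series ends)
  s <- m$seasonal # seasonal component S_t
  max(0, 1 - var(r, na.rm = TRUE) / var(s + r, na.rm = TRUE))
}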
#Reference: https://stackoverflow.com/questions/43054417/how-to-customize-title-axis-labels-etc-in-a-plot-of-a-decomposed-time-series
my_plot.decomposed.ts = function(x, title="", ...) {
xx <- x$x
if (is.null(xx))
xx <- with(x, if (type == "additive")
random + trend + seasonal
else random * trend * seasonal)
plot(cbind(observed = xx, trend = x$trend, seasonal = x$seasonal, random = x$random),
main=title, ...)
} #end my_plot.decompose.ts
#refer:https://stackoverflow.com/questions/23604432/daily-time-series-with-ts-how-to-specify-start-and-end
#answer by: https://stackoverflow.com/users/3001626/david-arenburg
#compute the week number
startW <- as.numeric(strftime(head(df_detrended$dates, 1), format = "%W"))
#compute the day number
startD <- as.numeric(strftime(head(df_detrended$dates, 1) + 1, format = "%w"))
endW <- as.numeric(strftime(tail(df_detrended$dates, 1), format = "%W"))
endD <- as.numeric(strftime(tail(df_detrended$dates, 1) + 1, format = "%w"))
for(i in 1:4){
  xx <- ts(df_detrended[,i], start = c(startW, startD), frequency = 7) # daily observations, weekly seasonal cycle
  m <- decompose(xx)
  # remove the seasonal component
  df_ <- xx - m$seasonal
  fs <- max(0, 1 - (var(df_) / var(xx)))
  my_plot.decomposed.ts(m, paste("For", colnames(df_detrended)[i], "seasonality =", round(fs, 3)))
}
Both the ACF and PACF plots should be taken into consideration before choosing the orders of an ARIMA model.
for(i in 1:4){
  par(mfrow = c(1, 2))
  acf(ts(df_detrended[,i], start = df_detrended[1,5], frequency = 1/7),
      ylab = paste("ACF", colnames(df_detrended)[i]), main = "")
  pacf(ts(df_detrended[,i], start = df_detrended[1,5], frequency = 1/7),
       ylab = paste("PACF", colnames(df_detrended)[i]), lag.max = 10, main = "")
}
All the ACF plots are sinusoidal and taper towards zero, while the PACF plots cut off after lag 2. Following references [2, 3], this pattern points to an AR(2) model.
The non-seasonal part of an ARIMA model is specified by three components (p, d, q): the AR order, the degree of differencing, and the MA order. An order of (2, 0, 0) has been chosen here.
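As a cross-check on the visually chosen order (an addition; the original relies on the ACF/PACF reading alone), forecast::auto.arima() can search over (p, d, q) by AICc and should land on, or near, (2, 0, 0):
# Hedged sketch: automatic order search on each detrended series
for(i in 1:4){
  fit.auto <- forecast::auto.arima(df_detrended[,i], seasonal = FALSE)
  cat(colnames(df_detrended)[i], "-> (p,d,q):",
      paste(forecast::arimaorder(fit.auto), collapse = ","), "\n")
}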
library(forecast)
library(ggplot2)
library(xts)
forecast_len = 20
addtrend <- function(currency, df_detrended){
  idx <- which(colnames(data) == currency)
  df <- df_detrended %>%
    select(x = dates, y = all_of(currency)) %>% mutate(x = as.numeric(x))
  tt <- df$y + predict(object = best.loess.fit[[idx]],
                       newdata = data.frame(x = df$x))
  return(tt)
}
for(i in 1:4){
  ff <- df_detrended[,i] %>%
    Arima(order = c(2, 0, 0)) %>% forecast(h = forecast_len)
  print(autoplot(ff, ylab = paste("ARIMA", colnames(df_detrended)[i]),
                 main = "AR(2) on detrended data") +
          guides(colour = guide_legend(title = "Data series"),
                 fill = guide_legend(title = "Prediction interval")))
  pred <- predict(object = best.loess.fit[[i]],
                  newdata = data.frame(x = as.numeric(df_detrended$dates)))
  pred <- ts(pred, start = min(df_detrended$dates), end = max(df_detrended$dates), frequency = 1/7)
  row_orig <- dim(df_detrended)[1]
  # anchor the forecast at the last observed value (row row_orig) on the original scale, then add ff$mean
  addTrend <- addtrend(colnames(data)[i], df_detrended)
  pred_forecasted <- ts(rep(addTrend[row_orig], forecast_len) + ff$mean,
                        start = max(df_detrended$dates) + 7, frequency = 1/7)
  new_pred <- ts(c(pred, pred_forecasted), start = min(time(pred)),
                 end = max(time(pred_forecasted)), frequency = 1/7)
  new_fitted_lower_80 <- ts(as.vector(ff$lower[,1]) + as.vector(pred_forecasted),
                            start = max(df_detrended$dates) + 7, frequency = 1/7)
  new_fitted_upper_80 <- ts(as.vector(ff$upper[,1]) + as.vector(pred_forecasted),
                            start = max(df_detrended$dates) + 7, frequency = 1/7)
  df_forecast <- data.frame(arima_fitted = new_pred, forecasted = FALSE)
  df_forecast[, "new_fitted_lower_80"] <- rep(0, row_orig + forecast_len)
  df_forecast[, "new_fitted_upper_80"] <- rep(0, row_orig + forecast_len)
  df_forecast[(row_orig + 1):(row_orig + forecast_len), "new_fitted_lower_80"] <- new_fitted_lower_80
  df_forecast[(row_orig + 1):(row_orig + forecast_len), "new_fitted_upper_80"] <- new_fitted_upper_80
  df_forecast[(row_orig + 1):(row_orig + forecast_len), "forecasted"] <- rep(TRUE, forecast_len)
  # convert the numeric (days-since-epoch) time index back to Date format
  df_forecast$time <- as.Date(as.vector(time(new_pred)), origin = "1970-01-01")
  forecast_strt.x <- df_forecast$time[min(which(df_forecast$forecasted == TRUE))]
  forecast_str.y <- df_forecast$arima_fitted[min(which(df_forecast$forecasted == TRUE))]
  p <- ggplot(data = df_forecast, aes(y = arima_fitted, x = time)) +
    geom_point(size = 0.2, alpha = 0.2) +
    geom_line(aes(color = forecasted), size = 0.8) +
    geom_ribbon(data = df_forecast[df_forecast$forecasted == TRUE, ],
                aes(ymin = new_fitted_lower_80, ymax = new_fitted_upper_80, x = time),
                linetype = 2, alpha = 0.5) +
    ggtitle(paste("AR(2) fitted,", colnames(data)[i]),
            subtitle = "20 weeks forecast with an 80% prediction interval") +
    scale_x_date(breaks = date_breaks("2 month"), labels = date_format("%Y-%m-%d")) +
    theme(axis.text.x = element_text(angle = 90, vjust = 1, hjust = 1)) +
    annotate(geom = "text", x = forecast_strt.x, y = forecast_str.y,
             label = paste("start at", forecast_strt.x), color = "purple") +
    geom_point(x = forecast_strt.x, y = forecast_str.y)
print(p)
}
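One diagnostic the section omits: checking that the AR(2) residuals are close to white noise. A sketch using forecast::checkresiduals(), which plots the residuals, their ACF and a histogram, and reports a Ljung-Box test (a small p-value indicates remaining autocorrelation):
for(i in 1:4){
  fit <- Arima(df_detrended[,i], order = c(2, 0, 0))
  checkresiduals(fit) # residual plot, ACF, histogram + Ljung-Box test
}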
To compute log-returns, simple returns and basic differences (or their inverse operations) from the given data, we use the qrmtools package.
library(xts)
library(qrmtools)
## Build and plot log-returns of the price index
prices <- as.data.frame(data)
all.ts <- na.omit(xts(prices[,-5], order.by = data$dates))
X. <- returns(all.ts) # compute log-returns
plot.zoo(X., xlab = "Time t", ylab = colnames(X.), cex.lab = 0.4,
         main = "Risk-factor changes (log-returns) of Prices")
## We use monthly data here as basis (and compute monthly log-returns)
X <- apply.monthly(X., FUN = colSums)
## Custom panel (after the example in ?plot.zoo): can be supplied via
## plot.zoo(..., panel = my.panel) to format the x-axis of the bottom panel
my.panel <- function(x, y, ..., pf = parent.frame()) {
  lines(x, y, ...)
  # draw the date axis only if this is the bottom panel
  if (with(pf, length(panel.number) == 0 ||
           panel.number %% nr == 0 || panel.number == nser)) {
    axis(side = 1, at = x, labels = format(x, "%b-%y"), tcl = -0.7, cex.axis = 0.7)
  }
}
plot.zoo(X, type = "h", xlab = "Time t", ylab = colnames(X), main = "Monthly risk-factor changes (log-returns) of Prices")
labs <-as.yearmon(time(X))
axis(side = 1, labels = labs,at=time(X), cex.axis = 0.7)
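As a cross-check on returns() (an addition, not in the original code): with its default method, qrmtools::returns() computes log-returns, i.e. first differences of log-prices, so a hand computation should agree wherever both are defined.
# Log-returns by hand; diff() on an xts object pads the first row with NA by default
X.check <- diff(log(all.ts))[-1, ]
head(X.[, 1]); head(X.check[, 1])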
References:

1. Wang, X., Smith, K. A., & Hyndman, R. J. (2006). Characteristic-based clustering for time series data. Data Mining and Knowledge Discovery, 13(3), 335–364.
2. Justification for AR(2)
3. ARIMA models for time-series forecasting
4. Frequency selection