# PAGE: Exploratory Analysis
data <- na.omit(read.csv("data/exrates.csv", header = TRUE)) # same for Rich and Damon
data$dates <- as.Date(data$DATE, "%m/%d/%Y")
data <- data[,-1]
# Data preparation: use the tidyverse packages to collapse the currency columns into key-value pairs
library("tidyverse")
df <- data %>%
  dplyr::select(dates, USD.EUR, USD.GBP) %>%
  gather(key = "variable", value = "value", -dates)
head(df)
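Note that gather() is superseded in current tidyr; a sketch of the equivalent reshape with pivot_longer() (same long format, newer API) is:
# Equivalent reshape with the newer tidyr verb (optional alternative to gather())
df2 <- data %>%
  dplyr::select(dates, USD.EUR, USD.GBP) %>%
  tidyr::pivot_longer(cols = -dates, names_to = "variable", values_to = "value")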
# Visualize both series with a single geom_line() call
library(gridExtra)
library(grid)
library(ggplot2)
library(lattice)
library(scales)
library(ggpmisc)
library(forecast)
# Multiple line plot
p1 <- ggplot(df, aes(x = dates, y = value)) +
  geom_line(aes(color = variable, linetype = variable), size = 1) +
  theme(legend.position = c(0.2, 0.8)) +
  scale_x_date(labels = date_format("%Y-%b"), date_breaks = "1 year",
               date_minor_breaks = "1 month") +
  scale_color_manual(values = c("darkred", "steelblue")) +
  ylab("USD to GBP and EUR") +
  stat_smooth(aes(color = variable, fill = variable), method = "loess")
# stat_peaks(geom = "text", colour = "darkred",
# vjust = -0.5, x.label.fmt = "%b-%Y")
p2 <- ggplot(data, aes(x = dates, y = USD.JPY)) +
  geom_line(size = 1, color = "steelblue", linetype = "dotdash") +
  stat_smooth(color = "#FC4E07", fill = "#FC4E07", size = 1, linetype = "11",
              method = "loess")
p3 <- ggplot(data, aes(x = dates, y = USD.CNY)) +
  geom_line(size = 1, color = "steelblue", linetype = "twodash") +
  stat_smooth(color = "#FC4E07", fill = "#FC4E07", method = "loess")
ggpubr::ggarrange(p1,p2,p3, nrow=3)
These ACF plots give another indication of non-stationarity: for a stationary time series the ACF drops to zero relatively quickly, while the ACF of non-stationary data decays slowly.
library(stats)
cnames <- colnames(data)[-5] # currency columns; column 5 holds the dates
par(mfrow = c(2, 2))
for(i in 1:4){
  acf(data[,i], ylab = paste("ACF of", cnames[i]), main = "")
}
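As a numeric complement to the visual ACF check (an addition to the original analysis), the augmented Dickey-Fuller test can be run on each series; a large p-value is consistent with non-stationarity. This assumes the tseries package is available.
# Hedged sketch: ADF unit-root tests (requires the tseries package)
library(tseries)
for(i in 1:4){
  print(adf.test(data[,i])) # null hypothesis: the series has a unit root (non-stationary)
}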
# Time-series Analysis of USD.GBP Currency
## Detrending by localised least-squares regression
As we can see in each of the plots, the trends are non-linear, so a localised curve-fitting (LOESS) model is estimated, with the span optimised by cross-validation.
### Steps for choosing an optimum least-squares model
1. For a range of spans, perform 10-fold cross-validation.
2. For each span, average the MSE across the 10 folds.
3. Select the span with the minimum average MSE.
library("tidyverse")
library("dplyr")
best.loess.fit = list()
best.spann = list()
loess.fit <- function(currency){
  df <- data %>%
    select(x = dates, y = all_of(currency)) %>% mutate(x = as.numeric(x))
span.seq <- seq(from = 0.15, to = 0.95, by = 0.05) #explores range of spans
k <- 10 #number of folds
set.seed(1) # replicate results
folds <- sample(x = 1:k, size = length(df$x), replace = TRUE)
cv.error.mtrx <- matrix(rep(x = NA, times = k * length(span.seq)),
nrow = length(span.seq), ncol = k)
for(i in 1:length(span.seq)) {
for(j in 1:k) {
loess.fit <- loess(formula = y ~ x, data = df[folds != j, ], span = span.seq[i])
preds <- predict(object = loess.fit, newdata = df[folds == j, ])
cv.error.mtrx[i, j] <- mean((df$y[folds == j] - preds)^2, na.rm = TRUE)
# some predictions result in `NA` because of the `x` ranges in each fold
}
}
cv.errors <- rowMeans(cv.error.mtrx)
best.span.i <- which.min(cv.errors)
par(mfrow=c(2,1))
plot(x = span.seq, y = cv.errors, type = "l",
     main = paste("CV plot for", currency, "- best span =", span.seq[best.span.i]))
points(x = span.seq, y = cv.errors,
pch = 20, cex = 0.75, col = "blue")
points(x = span.seq[best.span.i], y = cv.errors[best.span.i],
pch = 20, cex = 1, col = "red")
best.loess <- loess(formula = y ~ x, data = df,
span = span.seq[best.span.i])
best.loess.fit[[which(colnames(data) == currency)]] <<- best.loess
best.spann[[which(colnames(data) == currency)]] <<- span.seq[best.span.i]
plot(x = as.Date(df$x, origin = "1970-01-01"), y = df$y,
     main = paste("Best span plot for", currency, "- span =", span.seq[best.span.i]),
     ylab = "Least-squares fitted", xlab = "Dates")
lines(x = as.Date(df$x, origin = "1970-01-01"),
      y = predict(object = best.loess, newdata = data.frame(x = df$x)),
      col = "steelblue", lwd = 2)
}
loess.fit("USD.CNY")
loess.fit("USD.JPY")
loess.fit("USD.EUR")
loess.fit("USD.GBP")
## Detrending by subtracting out the trend
len<-dim(data)[1]
df_detrended = data
colnames(df_detrended) = colnames(data)
cc<-colnames(data)[-5]
detrend <- function(currency){
  idx <- which(colnames(data) == currency)
  df <- data %>%
    select(x = dates, y = all_of(currency)) %>% mutate(x = as.numeric(x))
  tt <- df$y - predict(object = best.loess.fit[[idx]],
                       newdata = data.frame(x = df$x))
  df_detrended[, currency] <<- tt
}
detrend("USD.CNY")
detrend("USD.JPY")
detrend("USD.EUR")
detrend("USD.GBP")
A classical additive decomposition is \[ Y_t = T_t + S_t + R_t, \] where \(T_t\) is the trend, \(S_t\) the seasonal component and \(R_t\) the residual component. On the detrended data, the strength of seasonality is defined as \[ F_s = \max\Big(0,\, 1 - \frac{\mathrm{Var}(R_t)}{\mathrm{Var}(R_t + S_t)}\Big). \] If \(F_s\) is close to 0 there is essentially no seasonality; values close to 1 indicate strong seasonality.
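As a compact illustration of this definition (the decomposition loop below computes the same quantity inline), here is a sketch of a helper that takes a stats::decompose() result m:
# F_s = max(0, 1 - Var(R_t) / Var(R_t + S_t)), computed from a decompose() result
seasonal_strength <- function(m){
  r <- m$random   # residual component R_t (NA-padded at the series ends)
  s <- m$seasonal # seasonal component S_t
  max(0, 1 - var(r, na.rm = TRUE) / var(s + r, na.rm = TRUE))
}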
#Reference: https://stackoverflow.com/questions/43054417/how-to-customize-title-axis-labels-etc-in-a-plot-of-a-decomposed-time-series
my_plot.decomposed.ts = function(x, title="", ...) {
xx <- x$x
if (is.null(xx))
xx <- with(x, if (type == "additive")
random + trend + seasonal
else random * trend * seasonal)
plot(cbind(observed = xx, trend = x$trend, seasonal = x$seasonal, random = x$random),
main=title, ...)
} #end my_plot.decompose.ts
#refer:https://stackoverflow.com/questions/23604432/daily-time-series-with-ts-how-to-specify-start-and-end
#answer by: https://stackoverflow.com/users/3001626/david-arenburg
#compute the week number
startW <- as.numeric(strftime(head(df_detrended$dates, 1), format = "%W"))
#compute the day number
startD <- as.numeric(strftime(head(df_detrended$dates, 1) + 1, format = "%w"))
endW <- as.numeric(strftime(tail(df_detrended$dates, 1), format = "%W"))
endD <- as.numeric(strftime(tail(df_detrended$dates, 1) + 1, format = "%w"))
for(i in 1:4){
  xx <- ts(df_detrended[,i], start = c(startW, startD), frequency = 7) # daily observations, weekly seasonal cycle
  m <- decompose(xx)
  # remove the seasonal component
  df_ <- xx - m$seasonal
  fs <- max(0, 1 - (var(df_) / var(xx)))
  my_plot.decomposed.ts(m, paste("For", colnames(df_detrended)[i], "seasonality =", round(fs, 3)))
}
Both the ACF and PACF plots should be taken into consideration before choosing the orders of an ARIMA model.
for(i in 1:4){
  par(mfrow = c(1, 2))
  acf(ts(df_detrended[,i], start = df_detrended[1,5], frequency = 1/7),
      ylab = paste("ACF", colnames(df_detrended)[i]), main = "")
  pacf(ts(df_detrended[,i], start = df_detrended[1,5], frequency = 1/7),
       ylab = paste("PACF", colnames(df_detrended)[i]), lag.max = 10, main = "")
}
All the ACF plots are sinusoidal and taper towards zero, while the PACF plots cut off after lag 2. Following references [2, 3], this pattern points to an AR(2) model.
The non-seasonal part of an ARIMA model is specified by three components (p, d, q): the AR order, the degree of differencing, and the MA order. An order of (2, 0, 0) has been chosen here.
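As a cross-check on the visually chosen order (an addition; the original relies on the ACF/PACF reading alone), forecast::auto.arima() can search over (p, d, q) by AICc and should land on, or near, (2, 0, 0):
# Hedged sketch: automatic order search on each detrended series
for(i in 1:4){
  fit.auto <- forecast::auto.arima(df_detrended[,i], seasonal = FALSE)
  cat(colnames(df_detrended)[i], "-> (p,d,q):",
      paste(forecast::arimaorder(fit.auto), collapse = ","), "\n")
}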
library(forecast)
library(ggplot2)
library(xts)
forecast_len = 20
addtrend <- function(currency, df_detrended){
  idx <- which(colnames(data) == currency)
  df <- df_detrended %>%
    select(x = dates, y = all_of(currency)) %>% mutate(x = as.numeric(x))
  tt <- df$y + predict(object = best.loess.fit[[idx]],
                       newdata = data.frame(x = df$x))
  return(tt)
}
for(i in 1:4){
  ff <- df_detrended[,i] %>%
    Arima(order = c(2, 0, 0)) %>% forecast(h = forecast_len)
  print(autoplot(ff, ylab = paste("ARIMA", colnames(df_detrended)[i]),
                 main = "AR(2) on detrended data") +
          guides(colour = guide_legend(title = "Data series"),
                 fill = guide_legend(title = "Prediction interval")))
  pred <- predict(object = best.loess.fit[[i]],
                  newdata = data.frame(x = as.numeric(df_detrended$dates)))
  pred <- ts(pred, start = min(df_detrended$dates), end = max(df_detrended$dates), frequency = 1/7)
  row_orig <- dim(df_detrended)[1]
  # anchor the forecast at the last observed value (row row_orig) on the original scale, then add ff$mean
  addTrend <- addtrend(colnames(data)[i], df_detrended)
  pred_forecasted <- ts(rep(addTrend[row_orig], forecast_len) + ff$mean,
                        start = max(df_detrended$dates) + 7, frequency = 1/7)
  new_pred <- ts(c(pred, pred_forecasted), start = min(time(pred)),
                 end = max(time(pred_forecasted)), frequency = 1/7)
  new_fitted_lower_80 <- ts(as.vector(ff$lower[,1]) + as.vector(pred_forecasted),
                            start = max(df_detrended$dates) + 7, frequency = 1/7)
  new_fitted_upper_80 <- ts(as.vector(ff$upper[,1]) + as.vector(pred_forecasted),
                            start = max(df_detrended$dates) + 7, frequency = 1/7)
  df_forecast <- data.frame(arima_fitted = new_pred, forecasted = FALSE)
  df_forecast[, "new_fitted_lower_80"] <- rep(0, row_orig + forecast_len)
  df_forecast[, "new_fitted_upper_80"] <- rep(0, row_orig + forecast_len)
  df_forecast[(row_orig + 1):(row_orig + forecast_len), "new_fitted_lower_80"] <- new_fitted_lower_80
  df_forecast[(row_orig + 1):(row_orig + forecast_len), "new_fitted_upper_80"] <- new_fitted_upper_80
  df_forecast[(row_orig + 1):(row_orig + forecast_len), "forecasted"] <- rep(TRUE, forecast_len)
  # convert the numeric (days-since-epoch) time index back to Date format
  df_forecast$time <- as.Date(as.vector(time(new_pred)), origin = "1970-01-01")
  forecast_strt.x <- df_forecast$time[min(which(df_forecast$forecasted == TRUE))]
  forecast_str.y <- df_forecast$arima_fitted[min(which(df_forecast$forecasted == TRUE))]
  p <- ggplot(data = df_forecast, aes(y = arima_fitted, x = time)) +
    geom_point(size = 0.2, alpha = 0.2) +
    geom_line(aes(color = forecasted), size = 0.8) +
    geom_ribbon(data = df_forecast[df_forecast$forecasted == TRUE, ],
                aes(ymin = new_fitted_lower_80, ymax = new_fitted_upper_80, x = time),
                linetype = 2, alpha = 0.5) +
    ggtitle(paste("AR(2) fitted,", colnames(data)[i]),
            subtitle = "20 weeks forecast with an 80% prediction interval") +
    scale_x_date(breaks = date_breaks("2 month"), labels = date_format("%Y-%m-%d")) +
    theme(axis.text.x = element_text(angle = 90, vjust = 1, hjust = 1)) +
    annotate(geom = "text", x = forecast_strt.x, y = forecast_str.y,
             label = paste("start at", forecast_strt.x), color = "purple") +
    geom_point(x = forecast_strt.x, y = forecast_str.y)
print(p)
}
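One diagnostic the section omits: checking that the AR(2) residuals are close to white noise. A sketch using forecast::checkresiduals(), which plots the residuals, their ACF and a histogram, and reports a Ljung-Box test (a small p-value indicates remaining autocorrelation):
for(i in 1:4){
  fit <- Arima(df_detrended[,i], order = c(2, 0, 0))
  checkresiduals(fit) # residual plot, ACF, histogram + Ljung-Box test
}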
To compute log-returns, simple returns and basic differences (or their inverse operations) from the given data, we use the qrmtools package.
library(xts)
library(qrmtools)
## Build and plot log-returns of the price index
prices <- as.data.frame(data)
all.ts <- na.omit(xts(prices[,-5], order.by = data$dates))
X. <- returns(all.ts) # compute log-returns
plot.zoo(X., xlab = "Time t", ylab = colnames(X.), cex.lab = 0.4,
         main = "Risk-factor changes (log-returns) of Prices")
## We use monthly data here as basis (and compute monthly log-returns)
X <- apply.monthly(X., FUN = colSums)
## Custom panel (after the example in ?plot.zoo): can be supplied via
## plot.zoo(..., panel = my.panel) to format the x-axis of the bottom panel
my.panel <- function(x, y, ..., pf = parent.frame()) {
  lines(x, y, ...)
  # draw the date axis only if this is the bottom panel
  if (with(pf, length(panel.number) == 0 ||
           panel.number %% nr == 0 || panel.number == nser)) {
    axis(side = 1, at = x, labels = format(x, "%b-%y"), tcl = -0.7, cex.axis = 0.7)
  }
}
plot.zoo(X, type = "h", xlab = "Time t", ylab = colnames(X), main = "Monthly risk-factor changes (log-returns) of Prices")
labs <-as.yearmon(time(X))
axis(side = 1, labels = labs,at=time(X), cex.axis = 0.7)
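As a cross-check on returns() (an addition, not in the original code): with its default method, qrmtools::returns() computes log-returns, i.e. first differences of log-prices, so a hand computation should agree wherever both are defined.
# Log-returns by hand; diff() on an xts object pads the first row with NA by default
X.check <- diff(log(all.ts))[-1, ]
head(X.[, 1]); head(X.check[, 1])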
References:

1. Wang, X., Smith, K. A., & Hyndman, R. J. (2006). Characteristic-based clustering for time series data. Data Mining and Knowledge Discovery, 13(3), 335–364.
2. Justification for AR(2)
3. ARIMA models for time-series forecasting
4. Frequency selection