Emily Gill - CVEN6833 - Homework #3

December 16, 2013

Question 2: ARMA and Nonparametric Time Series Simulation ::: NONSEASONAL ARMA

rm(list = ls())
setwd("~/Documents/University of Colorado/Fall 2013/CVEN6833/HW3")
source("bestARIMA.r")
source("skew.r")
source("stats.r")
source("plotMay.r")
library(sm)

## Package `sm', version 2.2-5: type help(sm) for summary information

Read in the monthly Lees Ferry streamflow and remove the seasonal cycle (remove the mean and divide by the standard deviation) :: NONSEASONAL

LFmon <- read.table("http://civil.colorado.edu/~balajir/CVEN6833/R-sessions/session3/LeesFerry-monflows-1906-2006.txt")

years <- LFmon[, 1]
LFmon <- as.data.frame(LFmon[, 2:13]/(10^6))  # convert to Million Acre Feet (MaF)
names(LFmon) <- c("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", 
    "Oct", "Nov", "Dec")

LFmean <- apply(LFmon, 2, mean)  # monthly mean
LFsd <- apply(LFmon, 2, sd)  # monthly sd

X <- t((t(LFmon) - LFmean)/LFsd)  # same as scale(X)
X <- array(t(X))  # turn into an array to use for ARMA

Find the best ARIMA model. I created a function that evaluates AIC for various combinations of p and q. $bestmod is the model chosen with lowest AIC.

p <- c(1, 2, 3, 4)  # run the code for a range of p values
q <- c(1, 2, 3, 4)  # run the code for a range of q values
z <- bestARIMA(X, p, q)  # home-made code compares AIC values
z$aics

##     AIC p q
## 1  2755 1 1
## 2  2753 1 2
## 3  2750 1 3
## 4  2804 1 4
## 5  2749 2 1
## 6  2757 2 2
## 7  2751 2 3
## 8  2752 2 4
## 9  2749 3 1
## 10 2751 3 2
## 11 2752 3 3
## 12 2754 3 4
## 13 2751 4 1
## 14 2832 4 2
## 15 2753 4 3
## 16 2758 4 4

p <- z$bestp  # selects p-val with lowest AIC
q <- z$bestq  # selects q-val with lowest AIC
zz <- z$bestmod  # zz is the best model - contains coefs

Set up for 250 simulations. Initialize matrices to store statistics of simulations.

nsim <- 250  # number of simulations 
nyrs <- length(LFmon[, 1])  # number of years of historical data
nyrs1 <- nyrs - 1  # number of years minus one

# initialize the matrices that store the statistics
armean <- matrix(0, nsim, 12)
arstdev <- matrix(0, nsim, 12)
arcor <- matrix(0, nsim, 12)
arcor2 <- matrix(0, nsim, 12)
arcor3 <- matrix(0, nsim, 12)
arskw <- matrix(0, nsim, 12)
armax <- matrix(0, nsim, 12)
armin <- matrix(0, nsim, 12)

# define points to evaluate the May pdf over
May <- LFmon[, 5]
xeval <- seq(min(May) - 0.25 * sd(May), max(May) + 0.25 * sd(May), length = 100)
nevals <- length(xeval)

# store the May simulations and create an array for the PDF
maysim <- 1:nyrs
simpdf <- matrix(0, nrow = nsim, ncol = nevals)

Perform simulation

for (k in 1:nsim) {
    nmons <- nyrs * 12  # number of values to be generated

    xsim <- arima.sim(n = nmons, list(ar = c(zz$coef[1:p]), ma = c(zz$coef[(p + 
        1):(p + q)])), sd = sqrt(zz$sigma2)) + mean(X)

    simdismon <- matrix(xsim, nrow = 12)  # put the array into a matrix 
    simdismon <- simdismon * LFsd + LFmean  # back standardize
    simdismon <- t(simdismon)  # transpose

    # evaluate May simulated PDF at each time step
    maysim <- simdismon[, 5]
    simpdf[k, ] <- sm.density(maysim, eval.points = xeval, display = "none")$estimate

    # compute stats on simulated timeseries
    ar.stats <- stats(simdismon, corr = TRUE, multicorr = TRUE)
    armean[k, ] <- ar.stats$mean
    armax[k, ] <- ar.stats$max
    armin[k, ] <- ar.stats$min
    arstdev[k, ] <- ar.stats$stdev
    arskw[k, ] <- ar.stats$skew
    arcor[k, ] <- ar.stats$cor
    arcor2[k, ] <- ar.stats$mcor2
    arcor3[k, ] <- ar.stats$mcor3
}

Compute statistics from historical data and boxplot versus simulated

obs.stats <- stats(LFmon, corr = TRUE, multicorr = TRUE)
mos <- as.character(c("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", 
    "Sep", "Oct", "Nov", "Dec"))
par(mfrow = c(2, 3))
boxplot(armean, xaxt = "n")
lines(obs.stats$mean, col = "red")
title(main = "Mean")
axis(1, at = 1:12, labels = mos)
boxplot(armax, xaxt = "n")
lines(obs.stats$max, col = "red")
title(main = "Max")
axis(1, at = 1:12, labels = mos)
boxplot(armin, xaxt = "n")
lines(obs.stats$min, col = "red")
title(main = "Min")
axis(1, at = 1:12, labels = mos)
boxplot(arskw, xaxt = "n")
lines(obs.stats$skew, col = "red")
title(main = "Skew")
axis(1, at = 1:12, labels = mos)
boxplot(arstdev, xaxt = "n")
lines(obs.stats$stdev, col = "red")
title(main = "SD")
axis(1, at = 1:12, labels = mos)
boxplot(arcor, xaxt = "n")
lines(obs.stats$cor, col = "red")
title(main = "Lag-1 Corr")
axis(1, at = 1:12, labels = mos)
par(oma = c(2, 2, 3, 2))
title("Statistics for ARMA model", outer = TRUE)

plot of chunk unnamed-chunk-6

Comments on Statistics Plots:

Using an ARMA nonseasonal model we can notice a couple things from the statistical plots comparing simulatd and observed timeseries:

The mean, max and SD are captured nicely by the simulations. The min is generally captured.

Skew is not captured, as expected since this is a parametric method assuming normality

Lag-1 correlation is not captured since this is a nonseasonal model.

Boxplot the May PDF

plotMay(May, simpdf)  # used own function to consolidate May PDf plotting commands
par(oma = c(2, 2, 3, 2))
title("May PDF from Simulation", outer = TRUE)

plot of chunk unnamed-chunk-7

Comments on May PDF:

The bimodality in the May PDF is not captured. This is expected since this code is utilizing ARMA (parametric) without any alterations to capture non-normality. We will need to add a non-parametric alteration (such as KNN) in order to capture the bimodality.

Compare multiple lag correlation results

par(mfrow = c(1, 3))
boxplot(arcor, xaxt = "n")
lines(obs.stats$cor, col = "red")
title(main = "Lag-1 Corr")
axis(1, at = 1:12, labels = mos)
boxplot(arcor2, xaxt = "n")
lines(obs.stats$mcor2, col = "red")
title(main = "Lag-2 Corr")
axis(1, at = 1:12, labels = mos)
boxplot(arcor3, xaxt = "n")
lines(obs.stats$mcor3, col = "red")
title(main = "Lag-3 Corr")
axis(1, at = 1:12, labels = mos)
par(oma = c(2, 2, 3, 2))
title("Comparison of multiple lags", outer = TRUE)

plot of chunk unnamed-chunk-8

Comments on lag correlations

This method does not capture any of the lag correlations - not even the lag 1 - since it is a nonseasonal model. We will compare this with the remaining techniques of simulating a time series.