Emily Gill - CVEN6833 - Homework #3

December 16, 2013

Question 4: ARMA & Nonparametric Time Series Simulation ::: NONPARAMETRIC SEASONAL LAG-1 MODEL

rm(list = ls())
setwd("~/Documents/University of Colorado/Fall 2013/CVEN6833/HW3")
source("bestARIMA.r")
source("skew.r")
source("stats.r")
source("plotMay.r")
library(sm)

## Package `sm', version 2.2-5: type help(sm) for summary information

Read in the monthly Lees Ferry streamflow and remove the seasonal cycle (remove the mean and divide by the standard deviation) :: NONSEASONAL

LFmon <- read.table("http://civil.colorado.edu/~balajir/CVEN6833/R-sessions/session3/LeesFerry-monflows-1906-2006.txt")

years <- LFmon[, 1]
LFmon <- as.data.frame(LFmon[, 2:13]/(10^6))  # convert to Million Acre Feet (MaF)
names(LFmon) <- c("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", 
    "Oct", "Nov", "Dec")

X <- array(t(LFmon))  # turn into an array to use for ARMA
N <- length(X)
nmon <- N/12
nyrs <- length(LFmon[, 1])  # number of years of historical data
nyrs1 <- nyrs - 1  # number of years minus one

Fit an Np-AR-1 model

ilag <- 1  # for a lag-1 model
K <- round(sqrt(N))  # define K nearest neighbors based on N
W <- 1:K
W <- 1/W
W <- W/sum(W)
W <- cumsum(W)

Set up for 250 simulations. Initialize matrices to store statistics of simulations.

nsim <- 100
nsim1 <- nsim + 1

# initialize the matrices that store the statistics
armean <- matrix(0, nsim, 12)
arstdev <- matrix(0, nsim, 12)
arcor <- matrix(0, nsim, 12)
arcor2 <- matrix(0, nsim, 12)
arcor3 <- matrix(0, nsim, 12)
arskw <- matrix(0, nsim, 12)
armax <- matrix(0, nsim, 12)
armin <- matrix(0, nsim, 12)

# set up points to evaluate May PDFs over
May <- LFmon[, 5]
xeval <- seq(min(May) - 0.25 * sd(May), max(May) + 0.25 * sd(May), length = 100)
nevals <- length(xeval)

# store the May simulations and create an array for the PDF
maysim <- 1:nyrs
simpdf <- matrix(0, nrow = nsim, ncol = nevals)

Perform simulation

for (i in 1:nsim) {
    it <- round(runif(1, 2, nyrs))
    zsim <- 1:N

    ilag1 <- ilag - 1
    index <- (it - 1) * 12 + 1
    zsim[1:ilag] <- X[index:(index + ilag1)]
    xp <- zsim[1:ilag]

    for (j in (ilag + 1):N) {
        mo <- j%%12
        if (mo == 0) {
            data <- LFmon
        }
        if (mo > 0) {
            data <- cbind(LFmon[, (mo + 1):12], LFmon[, 1:mo])
        }
        if (ilag == 1) {
            xdist <- order(abs(xp - data[, 11]))
        }
        if (ilag > 1) {
            xx <- rbind(xp, data[, (12 - ilag):11])
            xdist <- order(as.matrix(dist(xx))[1, 2:nyrs + 1])
        }

        xx <- runif(1, 0, 1)
        xy <- c(xx, W)
        xx <- rank(xy)
        i1 <- xdist[xx[1]]

        ydata <- data[, 12]
        zsim[j] <- ydata[i1]
        xp <- zsim[(j - ilag + 1):j]
    }

    simdismon = t(matrix(zsim, nrow = 12))  # makes a 12 column matrix with jan thru dec

    maysim = simdismon[, 5]
    simpdf[i, ] = sm.density(maysim, eval.points = xeval, display = "none")$estimate

    # compute stats on simulated timeseries
    ar.stats <- stats(simdismon, corr = TRUE, multicorr = TRUE)
    armean[i, ] <- ar.stats$mean
    armax[i, ] <- ar.stats$max
    armin[i, ] <- ar.stats$min
    arstdev[i, ] <- ar.stats$stdev
    arskw[i, ] <- ar.stats$skew
    arcor[i, ] <- ar.stats$cor
    arcor2[i, ] <- ar.stats$mcor2
    arcor3[i, ] <- ar.stats$mcor3

}

Compute statistics from historical data and boxplot versus simulated

obs.stats <- stats(LFmon, corr = TRUE, multicorr = TRUE)
mos <- as.character(c("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", 
    "Sep", "Oct", "Nov", "Dec"))
par(mfrow = c(2, 3))
boxplot(armean, xaxt = "n")
lines(obs.stats$mean, col = "red")
title(main = "Mean")
axis(1, at = 1:12, labels = mos)
boxplot(armax, xaxt = "n")
lines(obs.stats$max, col = "red")
title(main = "Max")
axis(1, at = 1:12, labels = mos)
boxplot(armin, xaxt = "n")
lines(obs.stats$min, col = "red")
title(main = "Min")
axis(1, at = 1:12, labels = mos)
boxplot(arskw, xaxt = "n")
lines(obs.stats$skew, col = "red")
title(main = "Skew")
axis(1, at = 1:12, labels = mos)
boxplot(arstdev, xaxt = "n")
lines(obs.stats$stdev, col = "red")
title(main = "SD")
axis(1, at = 1:12, labels = mos)
boxplot(arcor, xaxt = "n")
lines(obs.stats$cor, col = "red")
title(main = "Lag-1 Corr")
axis(1, at = 1:12, labels = mos)
par(oma = c(2, 2, 3, 2))
title("Statistics for Nonparametric KNN model", outer = TRUE)

plot of chunk unnamed-chunk-6

Comments on Statistical Plots:

Using the KNN approach, we are able to simulate and replicate the skew seen in the original dataset. As before the remaining statistics match. Additionally, the minimum values match better now than with the previous two models. This could be due to better simulation of the tails since (seasonal) lag-1 ARMA gives us good fit on the Lag-1 Correlation plot (compare this to the lack of one in the stationary nonseasonal ARMA). This is because we are basically using 12 different models taking into account the change from month to month. As expected, mean, maximum, minimum and SD are also good fits. The skew is off since this is still assuming normal distribution.

Boxplot the May PDF

plotMay(May, simpdf)  # used own function to consolidate May PDf plotting commands
par(oma = c(2, 2, 3, 2))
title("May PDF from Simulation", outer = TRUE)

plot of chunk unnamed-chunk-7

Comments on May PDF

With the use of this nonparametric method, we successfully capture the bimodality in the May PDF. This is because we are using a method that does not assume a normal distribution.

Compare multiple lag correlation results

par(mfrow = c(1, 3))
boxplot(arcor, xaxt = "n")
lines(obs.stats$cor, col = "red")
title(main = "Lag-1 Corr")
axis(1, at = 1:12, labels = mos)
boxplot(arcor2, xaxt = "n")
lines(obs.stats$mcor2, col = "red")
title(main = "Lag-2 Corr")
axis(1, at = 1:12, labels = mos)
boxplot(arcor3, xaxt = "n")
lines(obs.stats$mcor3, col = "red")
title(main = "Lag-3 Corr")
axis(1, at = 1:12, labels = mos)
par(oma = c(2, 2, 3, 2))
title("Comparison of multiple lags", outer = TRUE)

plot of chunk unnamed-chunk-8

Comments on lag correlations

This method does not capture any correlation greater than one. This is because when we simulate, we are only using the previous month (i.e Jan-Feb, Feb-Mar). Therefore, lag correlations greater than one month would not be captured by the simulation.