Emily Gill - CVEN6833 - Homework #3

December 16, 2013

Note that rawspec.r, smoothspec.r, ssab.r and skew.r from Balaji's functions were used in this homework. This particular file outlines the functions I made and used in addition to these.

bestARIMA.r

It uses an inputs of p and q vectors, tests all ARIMA models for various combinations of p and q, and then chooses the best model as the one with the lowest AIC value. It outputs all of the information, as well as 'ar' and 'ma' coefficients of the best ARIMA to be called upon in the actual problem codes.

bestARIMA <- function(array, p, q) {
    np <- length(p)
    nq <- length(q)
    aics <- matrix(1, (np * nq), 3)
    it <- 0
    for (i in 1:np) {
        for (j in 1:nq) {
            it <- it + 1
            zz <- arima0(array, order = c(p[i], 0, q[j]))
            aics[it, ] <- cbind(zz$aic, p[i], q[j])
        }
    }
    aics <- as.data.frame(aics)
    names(aics) <- c("AIC", "p", "q")

    min <- which(aics == min(aics[, 1]))
    bestp <- aics[min, 2]
    bestq <- aics[min, 3]
    bestmodel <- arima0(array, order = c(bestp, 0, bestq))
    results <- list(aics = aics, bestp = bestp, bestq = bestq, bestmod = bestmodel)
    return(results)
}

stats.r

It is used in most of the problems of this homework to store the statistics on both the simulated nad historical data. It was generalized to fit both and to fit in scenarios when the lag 1 correlation or multiple month (2, 3) correlations were not appropriate. In this, you can specify “corr=TRUE” for lag-1 correlations and “multicorr=TRUE” for lag-2 and lag-3 correlations.

stats <- function(data, corr, multicorr) {
    setwd("~/Documents/University of Colorado/Fall 2013/CVEN6833/HW3")
    source("skew.r")

    nc <- length(data[1, ])

    nyrs <- length(data[, 1])
    nyrs1 <- nyrs - 1

    mean <- 1:nc
    stdev <- 1:nc
    cor <- 1:nc
    mcor2 <- 1:nc
    mcor3 <- 1:nc
    skw <- 1:nc
    max <- 1:nc
    min <- 1:nc

    for (i in 1:nc) {
        max[i] <- max(data[, i])
        min[i] <- min(data[, i])
        mean[i] <- mean(data[, i])
        stdev[i] <- sd(data[, i])
        skw[i] <- skew(data[, i])
    }

    results <- list(mean = mean, stdev = stdev, min = min, max = max, skew = skw)


    # if-statement to perform lag-1 correlation or not
    if (corr == "TRUE") {
        for (i in 2:12) {
            i1 <- i - 1
            cor[i] <- cor(data[, i], data[, i1])
        }

        cor[1] <- cor(data[1:nyrs1, 12], data[2:nyrs, 1])

        results <- list(mean = mean, stdev = stdev, min = min, max = max, skew = skw, 
            cor = cor)

    }


    # if-statement to perform lag-2 and lag-3 correlation or not
    if (multicorr == "TRUE") {
        for (i in 3:12) {
            i2 <- i - 2
            mcor2[i] <- cor(data[, i], data[, i2])
        }
        mcor2[1] <- cor(data[1:nyrs1, 11], data[2:nyrs, 1])
        mcor2[2] <- cor(data[1:nyrs1, 10], data[2:nyrs, 2])

        for (i in 4:12) {
            i3 <- i - 3
            mcor3[i] <- cor(data[, i], data[, i3])
        }
        mcor3[1] <- cor(data[1:nyrs1, 10], data[2:nyrs, 1])
        mcor3[2] <- cor(data[1:nyrs1, 11], data[2:nyrs, 2])
        mcor3[3] <- cor(data[1:nyrs1, 12], data[2:nyrs, 2])

        results <- list(mean = mean, stdev = stdev, min = min, max = max, skew = skw, 
            cor = cor, mcor2 = mcor2, mcor3 = mcor3)
    }

    return(results)

}

plotMay.r

This is a simple function used to plot the May flow PDFs. This is mostly taken from Balaji's code online, but put into a function in order to tighten-up the problem codes.

plotMay <- function(May, simpdf) {

    # re-define the evaluation points for the purpose of the function - these
    # are the same as were used during simulation creation
    xeval = seq(min(May) - 0.25 * sd(May), max(May) + 0.25 * sd(May), length = 100)

    # calculate the historical May pdf
    zz = sm.density(May, eval.points = xeval, display = "none")
    xdensityorig = zz$estimate

    # obtain boxplot of simulated PDFs splot into the number of evaluation
    # points
    par(mfrow = c(1, 1))
    xs = 1:length(xeval)
    zz = boxplot(split(t(simpdf), xs), plot = F, cex = 1)
    zz$names = rep("", length(zz$names))
    z1 = bxp(zz, ylim = range(simpdf, xdensityorig), xlab = "May flow MAF", 
        ylab = "PDF", cex = 1.25)

    # organize for better axis plotting:
    z2 = 1:6
    n1 = 1:6
    z2[1] = z1[1]
    z2[2] = z1[20]
    z2[3] = z1[40]
    z2[4] = z1[60]
    z2[5] = z1[80]
    z2[6] = z1[100]

    n1[1] = xeval[1]
    n1[2] = xeval[20]
    n1[3] = xeval[40]
    n1[4] = xeval[60]
    n1[5] = xeval[80]
    n1[6] = xeval[100]
    n1 = round(n1, dig = 0)
    n1 = as.character(n1)

    axis(1, at = z2, labels = n1, cex = 1)

    # overlay the actual Lees Ferry May pdf in red
    lines(z1, xdensityorig, lty = 2, lwd = 2, col = "red")

}

bestTREE.r

This function fits trees for all combinations of 3 predictors (i.e. seven total tree models) and spits out the deviance of each in order to select the tree with the lowest deviance.

bestTREE <- function(data) {

    n <- length(data[1, ]) - 1

    deviances <- 1:7
    models <- c("full", "2,3", "3,4", "2,4", "2", "3", "4")

    ztree <- tree(data[, 1] ~ data[, 2:4])
    deviances[1] <- deviance(ztree)

    ztree <- tree(data[, 1] ~ data[, 2:3])
    deviances[2] <- deviance(ztree)

    ztree <- tree(data[, 1] ~ data[, 3:4])
    deviances[3] <- deviance(ztree)

    ztree <- tree(data[, 1] ~ data[, 2] + data[, 4])
    deviances[4] <- deviance(ztree)

    ztree <- tree(data[, 1] ~ data[, 2])
    deviances[5] <- deviance(ztree)

    ztree <- tree(data[, 1] ~ data[, 3])
    deviances[6] <- deviance(ztree)

    ztree <- tree(data[, 1] ~ data[, 4])
    deviances[7] <- deviance(ztree)

    low <- which(deviances == min(deviances))
    deviances <- cbind(deviances, models)

    return(deviances)

}