Rebecca Di Bari

CVEN 6833 Homework 3

# function to compute skew and interquartile range together
source("http://civil.colorado.edu/~balajir/CVEN5454/R-sessions/sess1/skew-iqr.r")
source("http://civil.colorado.edu/~balajir/CVEN5454/R-sessions/sess1/skew.r")

# functions to produce boxplots with whiskers at 5th and 95th percentile
source("http://civil.colorado.edu/~balajir/CVEN5454/R-sessions/sess1/myboxplot.r")
source("http://civil.colorado.edu/~balajir/CVEN5454/R-sessions/sess1/myboxplot-stats.r")

library(sm)
library(MASS)

CART.pred = function(X, Y) # X is data outside fo specific epoch. Must be a dataframe
{
    LF.pred = tree(X[, 2] ~ X[, 3] + X[, 4] + X[, 5], data = X)
    X = as.data.frame(Y)
    LFpred = predict(LF.pred, newdata = X)

    plot(Y[, 2], LFpred)
    lines(Y[, 2], Y[, 2], col = "red")
    points(mean(LFpred), mean(LFpred), type = "p", bg = "green", pch = 21, col = "black")

    rmse = sqrt(mean((Y[, 2] - LFpred)^2))
    SST = sum((Y[, 2] - mean(Y[, 2]))^2)
    SSR = sum((Y[, 2] - LFpred)^2)
    rsq = 1 - (SSR/SST)

    myreturn = list(rmse = rmse, rsq = rsq)
}

fun = function(p, q, X) {
    modelaic = matrix(0, nrow = length(p), ncol = length(q))
    for (i in p[1]:p[length(p)]) {
        for (j in q[1]:q[length(q)]) {
            zz = arima0(X, order = c(i, 0, j))
            modelaic[i, j] = zz$aic  #this gives the AIC
            colnames(modelaic) = paste("q=", q[1]:q[length(q)], sep = "")
        }
        rownames(modelaic) = paste("p=", p[1]:p[length(p)], sep = "")
    }
    inds = which(modelaic == min(modelaic), arr.ind = TRUE)
    myresult = list(lowest.aic = modelaic[order(modelaic)[1]], p = inds[1], 
        q = inds[2])
}

CART & Forecasting

1. Colorado River at Lees Ferry, AZ, is an important location on the river, through which 85% of the flow passes through. It has been shown that it is modulated by large scale climate features such as ENSO, PDO and AMO. You wish to predict using CART, to this end perform the following.

### Data: Lee's Ferry: Used Question 1, Question 2, Question 3, Question
### 4, Question 5
LFannual = read.table("http://civil.colorado.edu/~balajir/CVEN6833/R-sessions/session3/files-4HW3/LF_1906-2005.txt", 
    F)
LF.annual = LFannual[, 2]/1e+06
year = LFannual[, 1]

AMO = read.table("http://civil.colorado.edu/~balajir/CVEN6833/R-sessions/session3/files-4HW3/AMO_1856-2011.txt", 
    F)
AMO = AMO[51:150, 2]  #get 1906- 2005
PDO = read.table("http://civil.colorado.edu/~balajir/CVEN6833/R-sessions/session3/files-4HW3/PDO_1900-2011.txt", 
    F)
PDO = PDO[7:106, 2]
ENSO = read.table("http://civil.colorado.edu/~balajir/CVEN6833/R-sessions/session3/files-4HW3/nino3_1905-2007.txt", 
    F)
ENSO = ENSO[2:101, 2]

(i) Fit a best regression tree for the entire period - using CV and Deviance and display the tree

library(tree)

LFdata = cbind(year, LF.annual, ENSO, PDO, AMO)
names(LFdata) <- c("year", "LF.annual", "ENSO", "PDO", "AMO")
LFdata = as.data.frame(LFdata)

# Fit regression tree
ztree <- tree(LFdata[, 2] ~ LFdata[, 3] + LFdata[, 4] + LFdata[, 5], LFdata)

plot(ztree)
text(ztree, cex = 0.75)

plot of chunk unnamed-chunk-3

(ii) Evaluate the performance of the regression tree on three different epochs - 1906 -1921; 1975 - 1990 and 1990 - 2006. Essentialy, fit a best tree on the data outside of these epochs and predict the mean flow for these epochs. Plot the observed and predicted values; compute R2and RMSE

par(mfrow = c(3, 1))
# first epoch: 1906-1921
LFdata.1 = as.data.frame(LFdata[1:16, ])
LFdata.not1 = as.data.frame(LFdata[17:100, ])
epoch.1 = CART.pred(LFdata.not1, LFdata.1)

# second epoch: 1975-1990
LFdata.2 = LFdata[70:85, ]
LFdata.not2 = rbind(LFdata[1:69, ], LFdata[86:100, ])
epoch.2 = CART.pred(LFdata.not2, LFdata.2)

# third epoch: 1990-2005
LFdata.3 = LFdata[1:84, ]
LFdata.not3 = LFdata[85:100, ]
epoch.3 = CART.pred(LFdata.not3, LFdata.3)

plot of chunk unnamed-chunk-4


performance.table = matrix(c(epoch.1$rmse, epoch.1$rsq, epoch.2$rmse, epoch.2$rsq, 
    epoch.3$rmse, epoch.3$rsq), ncol = 2, byrow = T)
rownames(performance.table) = c("epoch1", "epoch2", "epoch3")
colnames(performance.table) = c("rmse", "rsq")
as.table(performance.table)
##           rmse     rsq
## epoch1  4.7955 -0.6207
## epoch2  5.4047  0.1131
## epoch3  5.3122 -0.4772

Cart does not seem to be a great method for predicting, as the predicted verse actual is not great. CART is more tuned for analyzing data and determining relationships.

Question 2. Stochastic simulation of system variables (e.g., streamflow) is important for risk-based decision making. ARMA models are standard linear time series simulation methods. Fit an ARMA model and simulate the monthly Lees Ferry streamflow, the steps are as follows. Remove the seasonal cycle from the data - i.e., remove the monthly mean and divide by monthly standard deviation.

monthdata = matrix(scan("http://civil.colorado.edu/~balajir/CVEN6833/R-sessions/session3/Leesferry-mon-data.txt"), 
    ncol = 13, byrow = T)
WYmonflow = monthdata[, 2:13]
WYmonflow = WYmonflow/10^6  # convert to MAF
yrs = monthdata[, 1]
nyrs = length(yrs)
nyrs1 = nyrs - 1

### Scale the data..
fmean = apply(WYmonflow, 2, mean)  #monthly mean
fsdev = apply(WYmonflow, 2, sd)  #monthly standard deviation

# get the data into one long array...
X = t((t(WYmonflow) - fmean)/fsdev)
X = array(t(X))  #standardized anomalies..

(i) Fit a best ARMA model for the entire monthly time series.

## Fit various ARMA models
bestmodel = fun(1:3, 1:3, X)
p = bestmodel$p
q = bestmodel$q
pq = matrix(c(p, q), ncol = 2, byrow = T)
colnames(pq) = c("p", "q")
rownames(pq) = "[1]"
as.table(pq)
##     p q
## [1] 3 1
zz = arima0(X, order = c(p, 0, q))

(ii) Generate 250 simulations each of same length as the historical data.

Add the seasonal cycle back and create boxplots of annual and monthly, mean, variance,skew, lag-1 correlation and, PDFs of May and annual flows. Comment on what you observe and also on why some of the monthly statistics are not reproduced by the “best” model.

################### Simulate
nsim = 250

source("C:/Users/Rebex/Documents/R/AdvDataAnalysis/arma.sim.txt")
fu = arma.sim(nsim, "mean", zz, X, nyrs, p, q, 12)
LF.sim.mean = fu$LF.mean
fu = arma.sim(nsim, "sd", zz, X, nyrs, p, q, 12)
LF.sim.sd = fu$LF.sd
fu = arma.sim(nsim, "skew", zz, X, nyrs, p, q, 12)
LF.sim.skew = fu$LF.skew
fu = arma.sim(nsim, "cor", zz, X, nyrs, p, q, 12)
LF.sim.cor = fu$LF.cor
fu = arma.sim(nsim, "max", zz, X, nyrs, p, q, 12)
LF.sim.max = fu$LF.max
fu = arma.sim(nsim, "min", zz, X, nyrs, p, q, 12)
LF.sim.min = fu$LF.min

source("C:/Users/Rebex/Documents/R/AdvDataAnalysis/arma.sim.month.txt")
May = WYmonflow[, 5]
fu = arma.sim.month(nsim, May, zz, X, p, q, nyrs)
May.sim = fu$Monthpdf
xeval = fu$xeval

source("C:/Users/Rebex/Documents/R/AdvDataAnalysis/obs.stats.txt")
fu = obs.stats("mean", WYmonflow, nyrs)
Obs.mean = fu$obs.mean
fu = obs.stats("cor", WYmonflow, nyrs)
Obs.cor = fu$obs.cor
fu = obs.stats("sd", WYmonflow, nyrs)
Obs.sd = fu$obs.sd
fu = obs.stats("skew", WYmonflow, nyrs)
Obs.skew = fu$obs.skew
fu = obs.stats("max", WYmonflow, nyrs)
Obs.max = fu$obs.max
fu = obs.stats("min", WYmonflow, nyrs)
Obs.min = fu$obs.min

########### boxplot the stats ###############
months = month.abb
sim.list = rbind(LF.sim.mean, LF.sim.sd, LF.sim.skew, LF.sim.cor, LF.sim.max, 
    LF.sim.min)
o = list(1:250, 251:500, 501:750, 751:1000, 1001:1250, 1251:1500)
obs.list = rbind(Obs.mean, Obs.sd, Obs.skew, Obs.cor, Obs.max, Obs.min)
titles = c("Mean", "Standard Deviation", "Skew", "Correlation", "Max", "Min")

par(mfrow = c(3, 2))
for (i in 1:length(o)) {
    xmeans = rbind(obs.list[i, ], sim.list[o[i][[1]], ])  #the first row is the means of the original data
    xmeans1 = sim.list[o[i][[1]], ]
    zz = boxplot(split(xmeans1, col(xmeans1)), plot = F, cex = 1)
    zz$names = rep("", length(zz$names))
    z1 = bxp(zz, ylim = range(xmeans), cex = 1)
    axis(1, at = 1:12, labels = months)
    points(z1, xmeans[1, ], pch = 16, col = "red")
    lines(z1, xmeans[1, ], pch = 16, col = "red")
    title(main = titles[i])
}

plot of chunk unnamed-chunk-7


# boxplots of the May PDF....
zz = sm.density(May, eval.points = xeval, display = "none")
xdensityorig = zz$estimate

par(mfrow = c(1, 1))
xs = 1:length(xeval)
zz = boxplot(split(t(May.sim), xs), plot = F, cex = 1)
zz$names = rep("", length(zz$names))
z1 = bxp(zz, ylim = range(May.sim, xdensityorig), xlab = "May flow MAF", ylab = "PDF", 
    cex = 1.25)

evaluate = c(1, 20, 40, 60, 80, 100)
index = 1:6
z2 = 1:6
n1 = 1:6
for (i in 1:length(index)) {
    z2[index[i]] = z1[evaluate[i]]
    n1[index[i]] = xeval[evaluate[i]]
}
axis(1, at = z2, labels = n1, cex = 1)
lines(z1, xdensityorig, lty = 2, lwd = 2, col = "red")

plot of chunk unnamed-chunk-7

ARMA assumes constant stationarity and therefore cannot accurately calculate correleation and skew. This is reflected in the results above. Further, the MAY bimodality is not captured because this is a parametric method.

3) To improve on the above, fit a seasonal AR(1) model - i.e., nonstationary time series model. Fit a best GLM for each monthly flow with residual resampling and repeat 2. Comment on the performance in comparison.

nsim = 250

source("C:/Users/Rebex/Documents/R/AdvDataAnalysis/seasonalAR1.sim.txt")
fu = seasonalAR1.sim(nsim, "mean", WYmonflow, nyrs)
LF.sim.mean = fu$LF.mean
fu = seasonalAR1.sim(nsim, "sd", WYmonflow, nyrs)
LF.sim.sd = fu$LF.sd
fu = seasonalAR1.sim(nsim, "skew", WYmonflow, nyrs)
LF.sim.skew = fu$LF.skew
fu = seasonalAR1.sim(nsim, "cor", WYmonflow, nyrs)
LF.sim.cor = fu$LF.cor
fu = seasonalAR1.sim(nsim, "max", WYmonflow, nyrs)
LF.sim.max = fu$LF.max
fu = seasonalAR1.sim(nsim, "min", WYmonflow, nyrs)
LF.sim.min = fu$LF.min

source("C:/Users/Rebex/Documents/R/AdvDataAnalysis/seasonalAR1.sim.month.txt")
May = WYmonflow[, 5]
fu = seasonalAR1.sim.month(nsim, May, WYmonflow, nyrs)
May.sim = fu$Monthpdf
xeval = fu$xeval

########### boxplot the stats ###############
months = month.abb
sim.list = rbind(LF.sim.mean, LF.sim.sd, LF.sim.skew, LF.sim.cor, LF.sim.max, 
    LF.sim.min)
o = list(1:250, 251:500, 501:750, 751:1000, 1001:1250, 1251:1500)
obs.list = rbind(Obs.mean, Obs.sd, Obs.skew, Obs.cor, Obs.max, Obs.min)
titles = c("Mean", "Standard Deviation", "Skew", "Correlation", "Max", "Min")

par(mfrow = c(3, 2))
for (i in 1:length(o)) {
    xmeans = rbind(obs.list[i, ], sim.list[o[i][[1]], ])  #the first row is the means of the original data
    xmeans1 = sim.list[o[i][[1]], ]
    zz = boxplot(split(xmeans1, col(xmeans1)), plot = F, cex = 1)
    zz$names = rep("", length(zz$names))
    z1 = bxp(zz, ylim = range(xmeans), cex = 1)
    axis(1, at = 1:12, labels = months)
    points(z1, xmeans[1, ], pch = 16, col = "red")
    lines(z1, xmeans[1, ], pch = 16, col = "red")
    title(main = titles[i])
}

plot of chunk unnamed-chunk-8


# boxplots of the May PDF....
zz = sm.density(May, eval.points = xeval, display = "none")
xdensityorig = zz$estimate

par(mfrow = c(1, 1))
xs = 1:length(xeval)
zz = boxplot(split(t(May.sim), xs), plot = F, cex = 1)
zz$names = rep("", length(zz$names))
z1 = bxp(zz, ylim = range(May.sim, xdensityorig), xlab = "May flow MAF", ylab = "PDF", 
    cex = 1.25)

evaluate = c(1, 20, 40, 60, 80, 100)
index = 1:6
z2 = 1:6
n1 = 1:6
for (i in 1:length(index)) {
    z2[index[i]] = z1[evaluate[i]]
    n1[index[i]] = xeval[evaluate[i]]
}
axis(1, at = z2, labels = n1, cex = 1)
lines(z1, xdensityorig, lty = 2, lwd = 2, col = "red")

plot of chunk unnamed-chunk-8

Correlation is matched because we are suing a seasonal AR, which is a non-stationarity time serires model. However, the MAY bimodality is not captured because this is a parametric method.

4) Fit a nonparametric seasonal lag-1 model and repeat 1. You canuse either the the K-nn bootstrap technique or LOCFIT/residual resampling (which will be a complement to the GLM/resampling approach) and repeat 1. What advantages/disadvantages you see with this nonparametric approach.

# you have to string it to create a long vector
x = array(t(WYmonflow))
N = length(x)
nsim = 100

source("C:/Users/Rebex/Documents/R/AdvDataAnalysis/knn1.sim.txt")
fu = knn1.sim(nsim, "mean", WYmonflow, x, nyrs, N)
LF.sim.mean = fu$LF.mean
fu = knn1.sim(nsim, "sd", WYmonflow, x, nyrs, N)
LF.sim.sd = fu$LF.sd
fu = knn1.sim(nsim, "skew", WYmonflow, x, nyrs, N)
LF.sim.skew = fu$LF.skew
fu = knn1.sim(nsim, "cor", WYmonflow, x, nyrs, N)
LF.sim.cor = fu$LF.cor
fu = knn1.sim(nsim, "max", WYmonflow, x, nyrs, N)
LF.sim.max = fu$LF.max
fu = knn1.sim(nsim, "min", WYmonflow, x, nyrs, N)
LF.sim.min = fu$LF.min

source("C:/Users/Rebex/Documents/R/AdvDataAnalysis/knn1.sim.month.txt")
May = WYmonflow[, 5]
fu = knn1.sim.month(nsim, May, WYmonflow, nyrs, N)
May.sim = fu$Monthpdf
xeval = fu$xeval

########### boxplot the stats ###############
months = month.abb
sim.list = rbind(LF.sim.mean, LF.sim.sd, LF.sim.skew, LF.sim.cor, LF.sim.max, 
    LF.sim.min)
o = list(1:101, 102:202, 203:303, 304:404, 405:505, 506:606)
obs.list = rbind(Obs.mean, Obs.sd, Obs.skew, Obs.cor, Obs.max, Obs.min)
titles = c("Mean", "Standard Deviation", "Skew", "Correlation", "Max", "Min")

par(mfrow = c(3, 2))
for (i in 1:length(o)) {
    xmeans = rbind(obs.list[i, ], sim.list[o[i][[1]], ])  #the first row is the means of the original data
    xmeans1 = sim.list[o[i][[1]], ]
    zz = boxplot(split(xmeans1, col(xmeans1)), plot = F, cex = 1)
    zz$names = rep("", length(zz$names))
    z1 = bxp(zz, ylim = range(xmeans), cex = 1)
    axis(1, at = 1:12, labels = months)
    points(z1, xmeans[1, ], pch = 16, col = "red")
    lines(z1, xmeans[1, ], pch = 16, col = "red")
    title(main = titles[i])
}

plot of chunk unnamed-chunk-9


# boxplots of the May PDF....
zz = sm.density(May, eval.points = xeval, display = "none")
xdensityorig = zz$estimate

par(mfrow = c(1, 1))
xs = 1:length(xeval)
zz = boxplot(split(t(May.sim), xs), plot = F, cex = 1)
zz$names = rep("", length(zz$names))
z1 = bxp(zz, ylim = range(May.sim, xdensityorig), xlab = "May flow MAF", ylab = "PDF", 
    cex = 1.25)

evaluate = c(1, 20, 40, 60, 80, 100)
index = 1:6
z2 = 1:6
n1 = 1:6
for (i in 1:length(index)) {
    z2[index[i]] = z1[evaluate[i]]
    n1[index[i]] = xeval[evaluate[i]]
}
axis(1, at = z2, labels = n1, cex = 1)
lines(z1, xdensityorig, lty = 2, lwd = 2, col = "red")

plot of chunk unnamed-chunk-9

K-nn timeseries model provides the best results for all metrics. K-nn is a non-parametric approach, which allows it to caputre the bimodality. It also assumes no underlying trend about the data.

5) Another approach to simulate the monthly streamflow is using PCA.

(i) Perform PCA on the monthly streamflow data. Seem how many PCs to retain based on the Eigen spectrum, retain as many as you need to capture most of the variance, say ~75-80%

monthdata = matrix(scan("http://civil.colorado.edu/~balajir/CVEN6833/R-sessions/session3/Leesferry-mon-data.txt"), 
    ncol = 13, byrow = T)
nyrs = length(monthdata[, 1])
WYmonflow = monthdata[, 2:13]
WYmonflow = WYmonflow/10^6
fmean = apply(WYmonflow, 2, mean)  #monthly mean
fsdev = apply(WYmonflow, 2, sd)
# scale data

WYmonflow1 = t((t(WYmonflow) - fmean)/fsdev)

zs = var(WYmonflow1)  #variance matrix
zsvd = svd(zs)  #Eigen decomposition..
pcsLF = t(t(zsvd$u) %*% t(WYmonflow1))
evect = zsvd$u  # EigenVector
lambdas = (zsvd$d/sum(zsvd$d))

plot(1:12, lambdas[1:12], type = "l", xlab = "Modes", ylab = "Frac. Var. explained")
points(1:12, lambdas[1:12], col = "red")

plot of chunk unnamed-chunk-10

# keep first 4 PCs

(ii) Fit a best ARMA model for each retained PC and a best fit Normal distribution for the others (i.e., the noise PCs)

## Fit various ARMA models
pq = matrix(0, nrow = 4, ncol = 2)
for (i in 1:4) {
    bestmodel = fun(1:3, 1:3, pcsLF[, i])
    p = bestmodel$p
    q = bestmodel$q
    pq[i, 1] = p
    pq[i, 2] = q
}
colnames(pq) = c("p", "q")
as.table(pq)
##   p q
## A 1 1
## B 1 1
## C 1 1
## D 3 1

(iii) Simulate each PC from their respective model and invert back to the flow space using Eigen Vector matrix. Repeat to generate ensembles Boxplot the statistics listed in 2. and compare with the results from the previous methods

################### Simulate
nsim = 100
nyrs1 = nyrs - 1

May = WYmonflow[, 5]
xeval = seq(min(May) - 0.25 * sd(May), max(May) + 0.25 * sd(May), length = 100)
nevals = length(xeval)
zz = sm.density(May, eval.points = xeval, display = "none")
xdensityorig = zz$estimate

armean = matrix(0, nsim, 12)
arstdev = matrix(0, nsim, 12)
arcor = matrix(0, nsim, 12)
arskw = matrix(0, nsim, 12)
armax = matrix(0, nsim, 12)
armin = matrix(0, nsim, 12)
simpdf = matrix(0, nrow = nsim, ncol = nevals)
for (k in 1:nsim) {
    p = pq[1, 1]
    q = pq[1, 2]
    zz = arima0(pcsLF[, 1], order = c(p, 0, q))
    xsim1 = arima.sim(n = nyrs, list(ar = c(zz$coef[1:p]), ma = c(zz$coef[(p + 
        1):(p + q)])), sd = sqrt(zz$sigma2)) + mean(pcsLF)
    p = pq[2, 1]
    q = pq[2, 2]
    zz = arima0(pcsLF[, 2], order = c(p, 0, q))
    xsim2 = arima.sim(n = nyrs, list(ar = c(zz$coef[1:p]), ma = c(zz$coef[(p + 
        1):(p + q)])), sd = sqrt(zz$sigma2)) + mean(pcsLF)
    p = pq[3, 1]
    q = pq[3, 2]
    zz = arima0(pcsLF[, 3], order = c(p, 0, q))
    xsim3 = arima.sim(n = nyrs, list(ar = c(zz$coef[1:p]), ma = c(zz$coef[(p + 
        1):(p + q)])), sd = sqrt(zz$sigma2)) + mean(pcsLF)
    p = pq[4, 1]
    q = pq[4, 2]
    zz = arima0(pcsLF[, 4], order = c(p, 0, q))
    xsim4 = arima.sim(n = nyrs, list(ar = c(zz$coef[1:p]), ma = c(zz$coef[(p + 
        1):(p + q)])), sd = sqrt(zz$sigma2)) + mean(pcsLF)

    # Simulate the remaining pcs with a normal distribution
    pred = matrix(0, nrow = nyrs, ncol = 8)
    j1 = c(5, 6, 7, 8, 9, 10, 11, 12)
    for (j in 1:8) {
        PC = pcsLF[, j1[j]]
        pred[, j] = rnorm(nyrs, mean = mean(PC), sd = sd(PC))
    }

    # combine all simulated pcs
    PC.sim = matrix(cbind(xsim1, xsim2, xsim3, xsim4), nrow = 95)
    pc.sim = cbind(PC.sim, pred)

    # Put statistics back
    LF.sim = pc.sim %*% t(evect)

    # Back standardize
    LFsim = t((t(LF.sim) * fsdev) + fmean)

    for (j in 1:12) {
        armean[k, j] = mean(LFsim[, j])
        armax[k, j] = max(LFsim[, j])
        armin[k, j] = min(LFsim[, j])
        arstdev[k, j] = sd(LFsim[, j])
        arskw[k, j] = skew(LFsim[, j])
    }
    # correlation between one month to another..
    for (j in 2:12) {
        j1 = j - 1
        arcor[k, j] = cor(LFsim[, j], LFsim[, j1])
    }
    arcor[k, 1] = cor(LFsim[1:nyrs1, 12], LFsim[2:nyrs, 1])

    ###### Simulate May May.sim=LFsim[,5]
    ###### xeval=seq(min(May)-0.25*sd(May),max(May)+0.25*sd(May),length=length(May))
    ###### nevals=length(xeval)

    # simpdf[k,]=sm.density(May.sim,eval.points=xeval,display='none')$estimate
}

########### boxplot the stats ###############
months = month.abb
sim.list = rbind(armean, arstdev, arskw, arcor, armax, armin)
o = list(1:100, 101:200, 201:300, 301:400, 401:500, 501:600)
obs.list = rbind(Obs.mean, Obs.sd, Obs.skew, Obs.cor, Obs.max, Obs.min)
titles = c("Mean", "Standard Deviation", "Skew", "Lag 1 Cor", "Max", "Min")

par(mfrow = c(3, 2))
for (i in 1:length(o)) {
    xmeans = rbind(obs.list[i, ], sim.list[o[i][[1]], ])  #the first row is the means of the original data
    xmeans1 = sim.list[o[i][[1]], ]
    zz = boxplot(split(xmeans1, col(xmeans1)), plot = F, cex = 1)
    zz$names = rep("", length(zz$names))
    z1 = bxp(zz, ylim = range(xmeans), cex = 1)
    axis(1, at = 1:12, labels = months)
    points(z1, xmeans[1, ], pch = 16, col = "red")
    lines(z1, xmeans[1, ], pch = 16, col = "red")
    title(main = titles[i])
}

plot of chunk unnamed-chunk-12


#### PDF of may
par(mfrow = c(1, 1))
xs = 1:length(xeval)
zz = boxplot(split(t(simpdf), xs), plot = F, cex = 1)
zz$names = rep("", length(zz$names))
z1 = bxp(zz, ylim = range(simpdf, xdensityorig), xlab = "May flow MAF", ylab = "PDF", 
    cex = 1.25)

evaluate = c(1, 20, 40, 60, 80, 100)
index = 1:6
z2 = 1:6
n1 = 1:6
for (i in 1:length(index)) {
    z2[index[i]] = z1[evaluate[i]]
    n1[index[i]] = xeval[evaluate[i]]
}
axis(1, at = z2, labels = n1, cex = 1)
lines(z1, xdensityorig, lty = 2, lwd = 2, col = "red")

plot of chunk unnamed-chunk-12

This method produces better results than the arma model as it matches correlations relatively well. However, this method still can't accurately generate Skew or Minimum. Again, because the ARMA model assumes stationarity, the bimodality is not captured.

6. Fit AR model and simulate sprecum. Calculate/ boxplot all basic statistics as as well as the May PDF

library(tseries)
library(MASS)
source("http://civil.colorado.edu/~balajir/CVEN6833/R-sessions/session3/files-4HW3/rawspec.R")
source("C:/Users/Rebex/Documents/R/AdvDataAnalysis/smoothspec.txt")
source("C:/Users/Rebex/Documents/R/AdvDataAnalysis/spec.txt")

lf = matrix(scan("http://civil.colorado.edu/~balajir/CVEN6833/R-sessions/session3/Leesferry-mon-data.txt"), 
    ncol = 13, byrow = T)
lf = lf[, 2:13]
lf = lf/10^6  #convert to MAF

## Select the May monthly flow..
y = lf[, 5]

nsim = 100
deltaT = 1
t = 1:length(y)

f = spec1(y, nsim)
specsimrawar = f$raw.spec.ar
specsimrawars = f$smooth.spec.ar
specsimrawsur = f$raw.spec.sur
specsimrawsurs = f$smooth.spec.sur
pgram = f$raw.spec.orig
pgrams = f$smooth.spec.orig
plotfreqs = f$plotfreqs

######################### Plot the raw spectrum from AR simulations
plot(plotfreqs, pgram, xlab = "Freq. cy/yr", ylab = "Spectrum", ylim = range(specsimrawar, 
    pgram), col = "red", lwd = 2)

for (i in 1:nsim) {
    lines(plotfreqs, specsimrawar[, i], col = "grey")
}
lines(plotfreqs, pgram, col = "red", lwd = 3)

plot of chunk unnamed-chunk-13


############## plot the raw spectrum from surrogate data
plot(plotfreqs, pgram, xlab = "Freq. cy/yr", ylab = "Spectrum", ylim = range(specsimrawsur, 
    pgram), col = "red", lwd = 3, type = "l")

for (i in 1:nsim) {
    lines(plotfreqs, specsimrawsur[, i], col = "grey")
}
lines(plotfreqs, pgram, col = "red", lwd = 3)

plot of chunk unnamed-chunk-13


############## Plot the smooth spectrum of the AR simulations
plot(plotfreqs, pgrams, xlab = "Freq. cy/yr", ylab = "Spectrum", ylim = range(specsimrawars, 
    pgrams), col = "red", lwd = 3, type = "l")

for (i in 1:nsim) {
    lines(plotfreqs, specsimrawars[, i], col = "grey")
}

lines(plotfreqs, pgrams, col = "red", lwd = 3)

plot of chunk unnamed-chunk-13


############## Plot the smooth spectrum of the surrogate data
plot(plotfreqs, pgrams, xlab = "Freq. cy/yr", ylab = "Spectrum", ylim = range(specsimrawsurs, 
    pgrams), col = "red", lwd = 3, type = "l")

for (i in 1:nsim) {
    lines(plotfreqs, specsimrawsurs[, i], col = "grey")
}
lines(plotfreqs, pgrams, col = "red", lwd = 3)

plot of chunk unnamed-chunk-13


############## Calculate mean, sd, skew, cor, max, min from AR model
N = length(specsimrawar[, 1])
armean = matrix(0, nrow = N, ncol = 1)
armax = matrix(0, nrow = N, ncol = 1)
armin = matrix(0, nrow = N, ncol = 1)
arcor = matrix(0, nrow = N, ncol = 1)
arskw = matrix(0, nrow = N, ncol = 1)
arstdev = matrix(0, nrow = N, ncol = 1)
for (j in 1:N) {
    armean[j] = mean(specsimrawar[j, ])
    armax[j] = max(specsimrawar[j, ])
    armin[j] = min(specsimrawar[j, ])
    arstdev[j] = sd(specsimrawar[j, ])
    arskw[j] = skew(specsimrawar[j, ])
}
# correlation between one year to another..
for (j in 2:N) {
    j1 = j - 1
    arcor[j] = cor(specsimrawar[j, ], specsimrawar[j1, ])
}
arcor[1] = cor(specsimrawar[1:(N - 1), 2], specsimrawar[2:N, 2])

# Observed stats
obs.mean = as.matrix(mean(pgram))
obs.skew = as.matrix(skew(pgram))
obs.sd = as.matrix(sd(pgram))
obs.min = as.matrix(min(pgram))
obs.max = as.matrix(max(pgram))
obs.cor = as.matrix(cor(pgram[1:(N - 1)], pgram[2:N]))

########### boxplot the stats ###############
simulated = c("armean", "arstdev", "arskw", "arcor", "armax", "armin")
observ = c("obs.mean", "obs.sd", "obs.skew", "obs.cor", "obs.max", "obs.min")
titles = c("Mean", "Standard Deviation", "Skew", "Correlation", "Max", "Min")

par(mfrow = c(1, 2))
for (i in 1:length(observ)) {
    xmeans = rbind(get(observ[i]), get(simulated[i]))
    xmeans1 = get(simulated[i])
    zz = boxplot(xmeans1, plot = F)
    z1 = bxp(zz, ylim = range(xmeans), cex = 1)
    points(z1, xmeans[1], pch = 16, col = "red")
    lines(z1, xmeans[1], pch = 16, col = "red")
    title(main = titles[i])
}

plot of chunk unnamed-chunk-13 plot of chunk unnamed-chunk-13 plot of chunk unnamed-chunk-13


#### Plot the PDF of May
simpdf = matrix(0, nrow = 48, ncol = nsim)
# xeval=seq(min(y)-0.25*sd(y),max(y)+0.25*sd(y),length=95)
for (i in 1:100) {
    xeval = seq(min(specsimrawar[, i]) - 0.25 * sd(specsimrawar[, i]), max(specsimrawar[, 
        i]) + 0.25 * sd(specsimrawar[, i]), length = N)
    zz = sm.density(specsimrawar[, i], eval.points = xeval, display = "none")
    simpdf[, i] = zz$estimate
}

xeval = seq(min(pgram) - 0.25 * sd(pgram), max(pgram) + 0.25 * sd(pgram), length = N)
zz = sm.density(pgram, eval.points = xeval, display = "none")
xdensityorig = zz$estimate

par(mfrow = c(1, 1))
xs = 1:length(xeval)
zz = boxplot(split(t(simpdf), xs), plot = F, cex = 1)
zz$names = rep("", length(zz$names))
z1 = bxp(zz, ylim = range(simpdf, xdensityorig), xlab = "May flow MAF", ylab = "PDF", 
    cex = 1.25)

evaluate = c(1, 20, 40, 60, 80, 100)
index = 1:6
z2 = 1:6
n1 = 1:6
for (i in 1:length(index)) {
    z2[index[i]] = z1[evaluate[i]]
    n1[index[i]] = xeval[evaluate[i]]
}
axis(1, at = z2, labels = n1, cex = 1)
lines(z1, xdensityorig, lty = 2, lwd = 2, col = "red")

plot of chunk unnamed-chunk-13

7. Another way to simulate a time series is using Markov Chains + resampling

(i) For the annual Lees Ferry streamflow time series using median flow as the threshold create a binary time series of the flow data (i.e. sequences of 1s and 0s - 1 being above threshold and 0 below)

# Put data into binary.
LFann1 <- read.table("http://civil.colorado.edu/~balajir/CVEN6833/R-sessions/session3/files-4HW3/LF_1906-2005.txt")
LFann = LFann1/(10^6)
ENSOann <- read.table("http://civil.colorado.edu/~balajir/CVEN6833/R-sessions/session3/files-4HW3/nino3_1905-2007.txt")
PDOann <- read.table("http://civil.colorado.edu/~balajir/CVEN6833/R-sessions/session3/files-4HW3/PDO_1900-2011.txt")
AMOann <- read.table("http://civil.colorado.edu/~balajir/CVEN6833/R-sessions/session3/files-4HW3/AMO_1856-2011.txt")

# Compile data from matching years
years <- seq(1906, 2005, 1)

LFann = LFann[, 2]
N = length(LFann)

yth = median(LFann)
LFann[LFann < yth] = 0
LFann[LFann >= yth] = 1

ydes1 = LFann[-1]
ydeslag = LFann[-N]

ENSOann1 = ENSOann[match(years, ENSOann[, 1]), 2]
PDOann1 = PDOann[match(years, PDOann[, 1]), 2]
AMOann1 = AMOann[match(years, AMOann[, 1]), 2]
ENSOann1 = ENSOann1[-1]
PDOann1 = PDOann1[-1]
AMOann1 = AMOann1[-1]

X = as.data.frame(cbind(ydeslag, ENSOann1, PDOann1, AMOann1))

names(X) <- c("LF1", "ENSO", "PDO", "AMO")

(ii) Fit an best GLM for the state series - S_t = f(S_t-1, ENSO_t, AMO_t, PDO_t) + eps

X = cbind(ydeslag, PDOann1)

glmmarkov = glm(ydes1 ~ X, family = binomial())

sim = predict(glmmarkov)
Nl = length(sim)
statesim = 1:Nl
for (i in 1:Nl) {
    pk = sim[i]
    statesim[i] <- ifelse(runif(1) < pk, 1, 0)
}
statesim = as.matrix(statesim)

(iii) Get state sequence from the best fit GLM.

##### get states from original data
historical = LFann1[, 2]/(10^6)

LFann2 = LFann1[, 2]/(10^6)
N = length(LFann2)

yth = median(LFann2)
LFann2[LFann2 < yth] = 0
LFann2[LFann2 >= yth] = 1

pairs.obs = cbind(LFann2[1:(N - 1)], LFann2[2:N])
p00.obs = length(which(pairs.obs[, 1] == 0 & pairs.obs[, 2] == 0))
p10.obs = length(which(pairs.obs[, 1] == 1 & pairs.obs[, 2] == 0))
p01.obs = length(which(pairs.obs[, 1] == 0 & pairs.obs[, 2] == 1))
p11.obs = length(which(pairs.obs[, 1] == 1 & pairs.obs[, 2] == 1))

trans.obs = matrix(0, nrow = 2, ncol = 2)
rownames = c("0", "1")
colnames = c("0", "1")
trans.obs[1, 1] = p00.obs/(N - 1)
trans.obs[1, 2] = p01.obs/(N - 1)
trans.obs[2, 1] = p10.obs/(N - 1)
trans.obs[2, 2] = p11.obs/(N - 1)
trans.obs
##        [,1]   [,2]
## [1,] 0.2626 0.2424
## [2,] 0.2424 0.2525

##### get pairs from simulated data
n = length(statesim)
pairs.sim = cbind(statesim[1:(n - 1)], statesim[2:n])
colnames = c("t-1", "t")

p00 = length(which(pairs.sim[, 1] == 0 & pairs.sim[, 2] == 0))
p10 = length(which(pairs.sim[, 1] == 1 & pairs.sim[, 2] == 0))
p01 = length(which(pairs.sim[, 1] == 0 & pairs.sim[, 2] == 1))
p11 = length(which(pairs.sim[, 1] == 1 & pairs.sim[, 2] == 1))

# Transition Prob Matirx
trans.sim1 = matrix(0, nrow = 2, ncol = 2)
rownames = c("0", "1")
colnames = c("0", "1")
trans.sim1[1, 1] = p00/(n - 1)
trans.sim1[1, 2] = p01/(n - 1)
trans.sim1[2, 1] = p10/(n - 1)
trans.sim1[2, 2] = p11/(n - 1)
trans.sim1
##         [,1]    [,2]
## [1,] 0.83673 0.07143
## [2,] 0.07143 0.02041
# the prediction of the glm seems to return alot mor 0s than the original
# data; the transition matrix is very different from the historical. Use
# the function simulate and see if it returns better results.

sim.fnc = simulate(glmmarkov, 1)

n = length(sim.fnc[, 1])
pairs.sim = cbind(sim.fnc[1:(n - 1), ], sim.fnc[2:n, ])
colnames = c("t-1", "t")

p00 = length(which(pairs.sim[, 1] == 0 & pairs.sim[, 2] == 0))
p10 = length(which(pairs.sim[, 1] == 1 & pairs.sim[, 2] == 0))
p01 = length(which(pairs.sim[, 1] == 0 & pairs.sim[, 2] == 1))
p11 = length(which(pairs.sim[, 1] == 1 & pairs.sim[, 2] == 1))

# Transition Prob Matirx
trans.sim2 = matrix(0, nrow = 2, ncol = 2)
rownames = c("0", "1")
colnames = c("0", "1")
trans.sim2[1, 1] = p00/(n - 1)
trans.sim2[1, 2] = p01/(n - 1)
trans.sim2[2, 1] = p10/(n - 1)
trans.sim2[2, 2] = p11/(n - 1)
trans.sim2
##        [,1]   [,2]
## [1,] 0.2857 0.2143
## [2,] 0.2143 0.2857
# this is much closer to the transition matrix of the historical data

(iv) For each simulated state, simulate a flow magnitude. Use K-nearest neighbor resampling of the form Xt conditioned on [Xt-1, St, St-1]

############## Fist flow ##############
nsim = 500
N = length(sim.fnc[, 1])
simflow = matrix(0, ncol = nsim, nrow = N)

## Since p00.obs, p01.obs, p10.obs and p11.obs are close, the weight
## metric is the same
N1 = p00.obs
K = round(sqrt(N1))
W = 1:K
W = 1/W
W = W/sum(W)
W = cumsum(W)

#### Simulate flow
for (i in 1:nsim) {
    for (j in 2:N) {
        # what state is flow in
        j1 = j - 1
        val1 = sim.fnc[j1, ]

        # associate that value with a flow. Randomly chose that flow
        if (val1 == 0) {
            indX = which(LFann2 == 0)
            if (j == 2) {
                indX = sample(indX, 1)
            }
            Hist0 = historical[indX]
        }

        if (val1 == 1) {
            indX = which(LFann2 == 1)
            if (j == 2) {
                index = sample(indX, 1)
            }
            Hist1 = historical[indX]
        }

        # look at second value in simulated sequence
        val2 = sim.fnc[j, ]

        if (val1 == 0 && val2 == 0) {
            # associate flow values for all 00 sequences
            all00 = which(pairs.obs[, 1] == 0 & pairs.obs[, 2] == 0)
            Hist00 = historical[all00]

            # calculate distance
            xdist = order(abs(Hist0 - Hist00))

            # select one of the nearest neighbor with the weight function above.
            xx = runif(1, 0, 1)
            xy = c(xx, W)
            xx = rank(xy)
            index = xdist[xx[1]]

            # Next flow
            simflow[j, i] = historical[index]
        }

        if (val1 == 0 && val2 == 1) {
            # associate flow values for all 01 sequences
            all01 = which(pairs.obs[, 1] == 0 & pairs.obs[, 2] == 1)
            Hist01 = historical[all01]

            # calculate distance
            xdist = order(abs(Hist0 - Hist01))

            # select one of the nearest neighbor with the weight function above.
            xx = runif(1, 0, 1)
            xy = c(xx, W)
            xx = rank(xy)
            index = xdist[xx[1]]

            # Next flow
            simflow[j, i] = historical[index]
        }

        if (val1 == 1 && val2 == 0) {
            # associate flow values for all 01 sequences
            all10 = which(pairs.obs[, 1] == 1 & pairs.obs[, 2] == 0)
            Hist10 = historical[all10]

            # calculate distance
            xdist = order(abs(Hist1 - Hist10))

            # select one of the nearest neighbor with the weight function above.
            xx = runif(1, 0, 1)
            xy = c(xx, W)
            xx = rank(xy)
            index = xdist[xx[1]]

            # Next flow
            simflow[j, i] = historical[index]
        }

        if (val1 == 1 && val2 == 1) {
            # associate flow values for all 01 sequences
            all11 = which(pairs.obs[, 1] == 1 & pairs.obs[, 2] == 1)
            Hist11 = historical[all11]

            # calculate distance
            xdist = order(abs(Hist1 - Hist11))

            # select one of the nearest neighbor with the weight function above.
            xx = runif(1, 0, 1)
            xy = c(xx, W)
            xx = rank(xy)
            index = xdist[xx[1]]

            # Next flow
            simflow[j, i] = historical[index]
        }
    }
    simflow[1, i] = sample(historical, 1)
}

(v) Boxplot the mean, variance, skew, lag-1 correlation and PDF from the simulations with the corresponding values from the historical data plotted on them.

armean = matrix(0, nrow = N, ncol = 1)
armax = matrix(0, nrow = N, ncol = 1)
armin = matrix(0, nrow = N, ncol = 1)
arcor = matrix(0, nrow = N, ncol = 1)
arskw = matrix(0, nrow = N, ncol = 1)
arstdev = matrix(0, nrow = N, ncol = 1)
for (j in 1:N) {
    armean[j] = mean(simflow[j, ])
    armax[j] = max(simflow[j, ])
    armin[j] = min(simflow[j, ])
    arstdev[j] = sd(simflow[j, ])
    arskw[j] = skew(simflow[j, ])
}
# correlation between one year to another..
for (j in 2:N) {
    j1 = j - 1
    arcor[j] = cor(simflow[j, ], simflow[j1, ])
}
arcor[1] = cor(simflow[1:(N - 1), 2], simflow[2:N, 2])

# Observed stats
obs.mean = as.matrix(mean(historical))
obs.skew = as.matrix(skew(historical))
obs.sd = as.matrix(sd(historical))
obs.min = as.matrix(min(historical))
obs.max = as.matrix(max(historical))
obs.cor = as.matrix(cor(historical[1:(N - 1)], historical[2:N]))

########### boxplot the stats ###############
simulated = c("armean", "arstdev", "arskw", "arcor", "armax", "armin")
observ = c("obs.mean", "obs.sd", "obs.skew", "obs.cor", "obs.max", "obs.min")
titles = c("Mean", "Standard Deviation", "Skew", "Correlation", "Max", "Min")

par(mfrow = c(1, 2))
for (i in 1:length(observ)) {
    xmeans = rbind(get(observ[i]), get(simulated[i]))
    xmeans1 = get(simulated[i])
    zz = boxplot(xmeans1, plot = F)
    z1 = bxp(zz, ylim = range(xmeans), cex = 1)
    points(z1, xmeans[1], pch = 16, col = "red")
    lines(z1, xmeans[1], pch = 16, col = "red")
    title(main = titles[i])
}

plot of chunk unnamed-chunk-18 plot of chunk unnamed-chunk-18 plot of chunk unnamed-chunk-18

(vi) Compute the spectrum from the simulations and overlay the historical spectrum. Comment on what you find in comparison to the simulation methods above.

## simulate from spectrum
nsim = 100
deltaT = 1
t = 1:length(historical)

fu = spec1(historical, nsim)
specsimrawsur = fu$raw.spec.sur
specsimrawsurs = fu$smooth.spec.sur
pgram = fu$raw.spec.orig
pgrams = fu$smooth.spec.orig
plotfreqs = fu$plotfreqs

plot(plotfreqs, pgram, xlab = "Freq. cy/yr", ylab = "Spectrum", ylim = range(specsimrawsur, 
    pgram), col = "red", lwd = 3, type = "l")

for (i in 1:nsim) {
    lines(plotfreqs, specsimrawsur[, i], col = "grey")
}
lines(plotfreqs, pgram, col = "red", lwd = 3)

plot of chunk unnamed-chunk-19


plot(plotfreqs, pgrams, xlab = "Freq. cy/yr", ylab = "Spectrum", ylim = range(specsimrawsurs, 
    pgrams), col = "red", lwd = 3, type = "l")

for (i in 1:nsim) {
    lines(plotfreqs, specsimrawsurs[, i], col = "grey")
}
lines(plotfreqs, pgrams, col = "red", lwd = 3)

plot of chunk unnamed-chunk-19

Modeling the flow as a markov chain produces relatively good results. The model captures mean, standard deviation and skew. This fails to capture correlation probably due to the fact that flows are being grouped into two states.

8. For the annual Lees Ferry flow data perform SSA and make predictions from it.

(i) Select a window size of about 15-25 years (feel free to experiment with the window size); create the Toeplitz matrix and perform SSA.

(ii) Plot the Eigen spectrum and identify the dominant modes.

source("http://civil.colorado.edu/~balajir/CVEN6833/R-sessions/session4/ssa-b.txt")

lf = matrix(scan("http://civil.colorado.edu/~balajir/CVEN6833/R-sessions/session4/LeesFerry-monflows-1906-2006.txt"), 
    ncol = 13, byrow = T)
yr = lf[, 1]
lf = lf[, 2:13]
lf = lf * 0.0004690502
lfann = apply(lf, 1, sum)
lfmean = colMeans(lf)
lfsd = apply(lf, 2, sd)
lfscale = t((t(lf) - lfmean)/lfsd)

### SSA on Annual total volume
M = c(20, 25, 30)  # try different M's
titles1 = c("M=20", "M=25", "M=30")
index = c()
par(mfrow = c(3, 2))
for (i in 1:length(M)) {
    ssaout = ssab(lfann, M[i])
    # Eigen Spectrum
    lambdas = ssaout$lambdas  #Fraction Eigen Value

    plot(1:M[i], lambdas[1:M[i]], type = "l", xlab = "Modes", ylab = "Frac. Var. explained")
    points(1:M[i], lambdas[1:M[i]], col = "red")
    title(main = titles1[i])

    plot(1:M[i], cumsum(lambdas))
    title(main = titles1[i])

    index[i] = list(which(cumsum(lambdas) > 0.6))
}

plot of chunk unnamed-chunk-20

index
## [[1]]
##  [1]  9 10 11 12 13 14 15 16 17 18 19 20
## 
## [[2]]
##  [1] 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## 
## [[3]]
##  [1] 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30

(iii) Reconstruct the dominant modes (i.e. Reconstructed Compnents - RCs) and plot them. Infer from them the dominant periodicities.

(iv) Sum the leading modes and plot them along with the original time series.

(v) Plot the dominant modes and show their corresponding wavelet spectrum

# Use M=20 as the first 9 modes explain over 60% of the variance
ssaout = ssab(lfann, 25)

# Plot the leading RCs, each against the timeseries
plot(yr, lfann, type = "l", col = "black", ylim = range(ssaout$Rpc[, 1], ssaout$Rpc[, 
    2], ssaout$Rpc[, 3], ssaout$Rpc[, 4]))
lines(yr, ssaout$Rpc[, 1], col = "red", lwd = 2)
lines(yr, ssaout$Rpc[, 2], col = "purple", lwd = 2)
lines(yr, ssaout$Rpc[, 3], col = "green", lwd = 2)
lines(yr, ssaout$Rpc[, 4], col = "blue", lwd = 2)
legend("bottomleft", legend = c("Original Data", "RC1", "RC2", "RC3", "RC4"), 
    lty = 1, lwd = 1, col = c("black", "red", "purple", "green", "blue"))

plot of chunk unnamed-chunk-21


# Add the first K reconstructed components
K = 10

par(mfrow = c(1, 1))
Recon.LF = apply(ssaout$Rpc[, 1:K], 1, sum)
plot(yr, lfann, type = "l", xlab = "Year", ylab = "Lees Ferry Annual Flow", 
    ylim = range(lfann, Recon.LF))
lines(yr, Recon.LF, col = "red", lwd = 2)

plot of chunk unnamed-chunk-21


xrecover = apply(ssaout$Rpc, 1, sum)
plot(xrecover, lfann, main = "summed Reconstructed Components vs Annual Lee's Ferry Data")

plot of chunk unnamed-chunk-21

# This plot proves that the reconstructed components are exactly eqaul to
# the data set.

(vi) Fit a best AR model for each RC and predict the RCs for 1998, 1999, and 2000. Sum all the predicted RCs to get a 'forcast'. Repeat this for all years through 2006. Compute performance of the forcasts- Rsquared, RMSE, observed vs forcasted.

# Find best M
M = c(10, 15, 20, 25, 30, 35, 40)  # try different M's
titles1 = c("M=10", "M=15", "M=20", "M=25", "M=30", "M=35", "M=40")
index = c()
par(mfrow = c(3, 2))
for (i in 1:length(M)) {
    ssaout = ssab(lfann[1:92], M[i])  #fit model 1906-1997
    # Eigen Spectrum
    lambdas = ssaout$lambdas  #Fraction Eigen Value

    plot(1:M[i], lambdas[1:M[i]], type = "l", xlab = "Modes", ylab = "Frac. Var. explained")
    points(1:M[i], lambdas[1:M[i]], col = "red")
    title(main = titles1[i])

    plot(1:M[i], cumsum(lambdas))
    title(main = titles1[i])

    index[i] = list(which(cumsum(lambdas) > 0.6))
}

plot of chunk unnamed-chunk-22 plot of chunk unnamed-chunk-22

index
## [[1]]
## [1]  5  6  7  8  9 10
## 
## [[2]]
## [1]  7  8  9 10 11 12 13 14 15
## 
## [[3]]
##  [1]  9 10 11 12 13 14 15 16 17 18 19 20
## 
## [[4]]
##  [1] 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## 
## [[5]]
##  [1] 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
## 
## [[6]]
##  [1] 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34
## [24] 35
## 
## [[7]]
##  [1] 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34
## [24] 35 36 37 38 39 40

pred.yr = c(1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006)

pred = 1:K
pred.norm = 1:length((K + 1):M)
forcast = matrix(0, nrow = length(pred.yr), ncol = 1)
RC.noise = (K + 1):M
for (j in 1:length(pred.yr)) {

    if (j == 1) {
        M = 15
        ssaout = ssab(lfann[1:92], M)
    } else {
        ssaout = ssab(c(lfann[1:92], forcast[1:(j - 1)]), M)
    }

    lambdas = ssaout$lambdas  #Fraction Eigen Value
    K = which(cumsum(lambdas) > 0.6)[1]

    for (i in 1:K) {
        rc.ar = ar(ssaout$Rpc[, i])
        xp = predict(rc.ar, n.ahead = 1)
        pred[i] = xp$pred
    }

    for (i in 1:length(RC.noise)) {
        pred.norm[i] = rnorm(1, mean(ssaout$Rpc[, RC.noise[i]]), sd(ssaout$Rpc[, 
            RC.noise[i]]))
    }

    forcast[j] = sum(c(pred, pred.norm))
}

rownames(forcast) = pred.yr
forcast
##      [,1]
## 1998 9140
## 1999 8387
## 2000 7855
## 2001 6548
## 2002 5545
## 2003 4740
## 2004 2462
## 2005 2404
## 2006 2838

# calculate Stats: RMSE and rsquared
data = lfann[93:101]

par(mfrow = c(1, 1))

plot of chunk unnamed-chunk-22

plot(data, forcast)
lines(data, data, col = "red")

plot of chunk unnamed-chunk-22


rmse = sqrt(mean((data - forcast)^2))
SST = sum((data - mean(data))^2)
SSR = sum((data - forcast)^2)
rsq = 1 - (SST/SSR)
rsq
## [1] 0.6344
rmse
## [1] 2685

SSA predicts relatively well, as the predicted points fall on either side of the actual line. The amount of reconstructed components used vaires based on how many components it taked to explain 60% of the variance.