Emily Gill - CVEN6833 - Homework #2

November 11, 2013

Question 4: Field Forecasting

Set directory, read in libraries and source functions:

rm(list = ls())

# Set working directory
setwd("~/Documents/University of Colorado/Fall 2013/CVEN6833/HW2")

# Source functions and libraries
library(fields)
library(maps)
library(MASS)
source("SSTxygrid.r")
source("FullSVD.r")

# Define n's
Nrows <- 72  # 72 longitude points on gridded SST Kaplan data
Ncols <- 36  # 36 latitude points on gridded SST Kaplan data
Ntime <- 110  # Monthly SST from Jan 1900 to Dec 2010

Read in SST data and Western precipitation winter data:

index1 <- scan("http://civil.colorado.edu/~emgi6568/index1.txt")

KapSST <- readBin("http://iridl.ldeo.columbia.edu/expert/SOURCES/.KAPLAN/.EXTENDED/.v2/.ssta/T/%28Dec%201900%29%28Dec%202010%29RANGE/T/4/boxAverage/T/12/STEP/data.r4", 
    what = "numeric", n = (Nrows * Ncols * Ntime), size = 4, endian = "swap")
results <- SSTxygrid(KapSST)
SSTdata <- results$SSTdata
xygrid <- results$grid

pdata <- matrix(scan("http://iridl.ldeo.columbia.edu/expert/SOURCES/.NOAA/.NCDC/.CIRS/.ClimateDivision/.pcp/T/%28Dec%201900%29%28Dec%202010%29RANGE/T/4/boxAverage/T/12/STEP/IDIV/201/202/203/204/205/206/207/VALUES/data.ch"), 
    ncol = 110, byrow = T)
precip <- scale(t(pdata))
mean.p <- attr(precip, "scaled:center")
sd.p <- attr(precip, "scaled:scale")

coords <- read.table("http://civil.colorado.edu/~balajir/CVEN6833/R-sessions/session2/files-4HW2/climdivcoords.txt", 
    header = TRUE, sep = "\t")
coords1 <- coords[coords$X < -104, ]
coords1 <- coords1[order(coords1$CLIMDIV), ]
latlonUS <- coords1[, 4:5]
lons <- coords1$X
lats <- coords1$Y

Part (i)

Perform PCA on the two winter fields. Plot the EVS and the leading four spatial and temporal PCs for each field.

PCA.SST <- FullSVD(SSTdata, Nlamb = 40, temporal = FALSE, spatial = "FALSE", 
    xygrid = xygrid)

plot of chunk unnamed-chunk-3

PCA.P <- FullSVD(precip, Nlamb = 7, temporal = FALSE, spatial = "FALSE", xygrid = xygrid)

plot of chunk unnamed-chunk-3

Comments on Eigen Value Spectrum:

For SST (top), the first ten explain > 65%. For precip (bottom), the first PC explains over 90% of the variance. We will fit a model to this first precip PC and assume the remaining six are noise.

Part (ii)

Fit a regression model for each retained PC of winter precip with the SST PCs

X <- as.data.frame(PCA.SST$pcs[, 1:10])
Y <- as.matrix(PCA.P$pcs[, 1])
fit <- lm(Y ~ ., data = X)
bmAIC <- stepAIC(fit)
## Start:  AIC=190.1
## Y ~ V1 + V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9 + V10
## 
##        Df Sum of Sq RSS AIC
## - V4    1       2.0 509 188
## - V7    1       2.3 509 189
## - V5    1       2.6 510 189
## - V3    1       4.8 512 189
## <none>              507 190
## - V8    1      10.4 517 190
## - V10   1      14.3 521 191
## - V9    1      15.7 523 191
## - V6    1      27.7 535 194
## - V2    1      36.6 543 196
## - V1    1      67.1 574 202
## 
## Step:  AIC=188.5
## Y ~ V1 + V2 + V3 + V5 + V6 + V7 + V8 + V9 + V10
## 
##        Df Sum of Sq RSS AIC
## - V7    1       2.3 511 187
## - V5    1       2.6 512 187
## - V3    1       4.8 514 188
## <none>              509 188
## - V8    1      10.4 519 189
## - V10   1      14.3 523 190
## - V9    1      15.7 525 190
## - V6    1      27.7 537 192
## - V2    1      36.6 546 194
## - V1    1      67.1 576 200
## 
## Step:  AIC=187
## Y ~ V1 + V2 + V3 + V5 + V6 + V8 + V9 + V10
## 
##        Df Sum of Sq RSS AIC
## - V5    1       2.6 514 186
## - V3    1       4.8 516 186
## <none>              511 187
## - V8    1      10.4 522 187
## - V10   1      14.3 526 188
## - V9    1      15.7 527 188
## - V6    1      27.7 539 191
## - V2    1      36.6 548 193
## - V1    1      67.1 578 199
## 
## Step:  AIC=185.6
## Y ~ V1 + V2 + V3 + V6 + V8 + V9 + V10
## 
##        Df Sum of Sq RSS AIC
## - V3    1       4.8 519 185
## <none>              514 186
## - V8    1      10.4 524 186
## - V10   1      14.3 528 187
## - V9    1      15.7 530 187
## - V6    1      27.7 542 189
## - V2    1      36.6 551 191
## - V1    1      67.1 581 197
## 
## Step:  AIC=184.6
## Y ~ V1 + V2 + V6 + V8 + V9 + V10
## 
##        Df Sum of Sq RSS AIC
## <none>              519 185
## - V8    1      10.4 529 185
## - V10   1      14.3 533 186
## - V9    1      15.7 534 186
## - V6    1      27.7 546 188
## - V2    1      36.6 555 190
## - V1    1      67.1 586 196
bmBIC <- stepAIC(fit, k = log(110))
## Start:  AIC=219.8
## Y ~ V1 + V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9 + V10
## 
##        Df Sum of Sq RSS AIC
## - V4    1       2.0 509 216
## - V7    1       2.3 509 216
## - V5    1       2.6 510 216
## - V3    1       4.8 512 216
## - V8    1      10.4 517 217
## - V10   1      14.3 521 218
## - V9    1      15.7 523 218
## <none>              507 220
## - V6    1      27.7 535 221
## - V2    1      36.6 543 223
## - V1    1      67.1 574 229
## 
## Step:  AIC=215.5
## Y ~ V1 + V2 + V3 + V5 + V6 + V7 + V8 + V9 + V10
## 
##        Df Sum of Sq RSS AIC
## - V7    1       2.3 511 211
## - V5    1       2.6 512 211
## - V3    1       4.8 514 212
## - V8    1      10.4 519 213
## - V10   1      14.3 523 214
## - V9    1      15.7 525 214
## <none>              509 216
## - V6    1      27.7 537 217
## - V2    1      36.6 546 218
## - V1    1      67.1 576 224
## 
## Step:  AIC=211.3
## Y ~ V1 + V2 + V3 + V5 + V6 + V8 + V9 + V10
## 
##        Df Sum of Sq RSS AIC
## - V5    1       2.6 514 207
## - V3    1       4.8 516 208
## - V8    1      10.4 522 209
## - V10   1      14.3 526 210
## - V9    1      15.7 527 210
## <none>              511 211
## - V6    1      27.7 539 212
## - V2    1      36.6 548 214
## - V1    1      67.1 578 220
## 
## Step:  AIC=207.2
## Y ~ V1 + V2 + V3 + V6 + V8 + V9 + V10
## 
##        Df Sum of Sq RSS AIC
## - V3    1       4.8 519 204
## - V8    1      10.4 524 205
## - V10   1      14.3 528 206
## - V9    1      15.7 530 206
## <none>              514 207
## - V6    1      27.7 542 208
## - V2    1      36.6 551 210
## - V1    1      67.1 581 216
## 
## Step:  AIC=203.5
## Y ~ V1 + V2 + V6 + V8 + V9 + V10
## 
##        Df Sum of Sq RSS AIC
## - V8    1      10.4 529 201
## - V10   1      14.3 533 202
## - V9    1      15.7 534 202
## <none>              519 204
## - V6    1      27.7 546 204
## - V2    1      36.6 555 206
## - V1    1      67.1 586 212
## 
## Step:  AIC=201
## Y ~ V1 + V2 + V6 + V9 + V10
## 
##        Df Sum of Sq RSS AIC
## - V10   1      14.3 543 199
## - V9    1      15.7 545 200
## <none>              529 201
## - V6    1      27.7 557 202
## - V2    1      36.6 566 204
## - V1    1      67.1 596 209
## 
## Step:  AIC=199.2
## Y ~ V1 + V2 + V6 + V9
## 
##        Df Sum of Sq RSS AIC
## - V9    1      15.7 559 198
## <none>              543 199
## - V6    1      27.7 571 200
## - V2    1      36.6 580 202
## - V1    1      67.1 610 207
## 
## Step:  AIC=197.6
## Y ~ V1 + V2 + V6
## 
##        Df Sum of Sq RSS AIC
## <none>              559 198
## - V6    1      27.7 587 198
## - V2    1      36.6 596 200
## - V1    1      67.1 626 205

Comments on model fitting:

StepBIC is more parsimonious, so we will use this as the best model.

Part (iii)

Forecast the retained PCs of precipitation using the fitted model.

predPC1 <- predict(bmBIC)  # predict first PC using BIC model 

Part (iv)

For the other PCs, randomly sample.

# Fit the remaining PCs as noise and bind them to the predicted PCs from the
# model
AZpredPCs <- predPC1
for (i in 1:6) {
    predPC <- rnorm(Ntime, mean(PCA.P$pcs[, i + 1]))
    AZpredPCs <- cbind(AZpredPCs, predPC)
}

Part (v)

Using the Eigen vectors, predict precipitation at all 7 divisions simultaneously.

# Multiple by the eigen vector to transform PC's back to predicted
# precipitation anomalies.
AZpredP <- AZpredPCs %*% PCA.P$u

Part (vi)

Plot the forecasted precipitation and the observed at each location and compute R2 and RMSE.

par(mfrow = c(3, 3))
titles <- c("1st", "2nd", "3rd", "4th", "5th", "6th", "7th")
for (i in 1:7) {
    plot(precip[, i], AZpredP[, i], xlab = "Y actual", ylab = "Y estimate", 
        main = titles[i], xlim = c(-3, 4), ylim = c(-3, 4))
    lines(precip[, i], precip[, i])
}
par(oma = c(2, 2, 3, 2))
title("Actual versus Predicted AZ Precip Anomalies", outer = TRUE)

plot of chunk unnamed-chunk-8

# Look at model skill by computing RMSE and R2 values for each of the 7
# stations
RMSE <- diag(cor(precip, AZpredP))
RMSE
## [1]  0.20992 -0.15278 -0.06799  0.31284 -0.06038 -0.07843 -0.06364
RSQ <- RMSE^2
RSQ
## [1] 0.044066 0.023341 0.004622 0.097870 0.003645 0.006151 0.004051

Part (vii)

Predict 2011 and 2012 winter precip using the 1900-2010 model developed in part (ii).

Ntime <- 2
# Read in 2011-2012 precipitation data
pdata.x = matrix(scan("http://iridl.ldeo.columbia.edu/expert/SOURCES/.NOAA/.NCDC/.CIRS/.ClimateDivision/.pcp/T/%28Dec%202010%29%28Dec%202012%29RANGE/T/4/boxAverage/T/12/STEP/IDIV/201/202/203/204/205/206/207/VALUES/data.ch"), 
    ncol = Ntime, byrow = T)
pdata.x = t(pdata.x)

data.x <- readBin("http://iridl.ldeo.columbia.edu/expert/SOURCES/.KAPLAN/.EXTENDED/.v2/.ssta/T/%28Dec%202010%29%28Dec%202012%29RANGE/T/4/boxAverage/T/12/STEP/data.r4", 
    what = "numeric", n = (Nrows * Ncols * Ntime), size = 4, endian = "swap")
data.x <- array(data = data.x, dim = c(Nrows, Ncols, Ntime))
results.x <- SSTxygrid(data.x)
SSTdata.x <- results.x$SSTdata
xygrid.x <- results.x$grid
SSTdata.2011 <- scale(SSTdata.x[1, ])
SSTdata.2012 <- scale(SSTdata.x[2, ])

Predict 2011 precip:

# PCA on 2011 SST
zPCA.2011 <- FullSVD(SSTdata.2011, Nlamb = 40, temporal = FALSE, spatial = "FALSE", 
    xygrid = xygrid)

new2011 <- as.data.frame(matrix(zPCA.2011$pcs[1:10], 1, 10))
predPC1.2011 <- predict(bmAIC, newdata = new2011)

PCpred2011 <- c(predPC1.2011, apply(PCA.P$pcs[, 2:7], 2, mean))
Ppred2011 <- ((PCpred2011 %*% PCA.P$u) + mean.p)/sd.p
Ppred2011  # predicted precipitation at each station
##       [,1]  [,2]  [,3] [,4]  [,5]  [,6]  [,7]
## [1,] 1.818 2.082 1.701 1.91 1.309 1.656 1.764
pdata.x[1, ]  # actual precipitation at each station
## [1] 1.1200 1.0750 1.0550 1.2525 0.4175 0.6450 0.3150

Predict 2012 precip:

# PCA on 2012 SST
zPCA.2012 <- FullSVD(SSTdata.2012, Nlamb = 40, temporal = FALSE, spatial = "FALSE", 
    xygrid = xygrid)

new2012 <- as.data.frame(matrix(zPCA.2012$pcs[1:10], 1, 10))
predPC1.2012 <- predict(bmAIC, newdata = new2012)

PCpred2012 <- c(predPC1.2012, apply(PCA.P$pcs[, 2:7], 2, mean))
Ppred2012 <- ((PCpred2012 %*% PCA.P$u) + mean.p)/sd.p
Ppred2012  # predicted precipitation at each station
##       [,1]  [,2]  [,3]  [,4]  [,5]  [,6]  [,7]
## [1,] 1.771 2.145 1.742 1.874 1.358 1.655 1.787
pdata.x[2, ]  # actual precipitation at each station
## [1] 0.5825 0.8675 1.0450 1.1750 0.2525 0.6675 0.7975

Comments on fitting:

The predicted values for 2011 and 2012 are not great. We will see this these improve when we do CCA Forecasting.

Question 5: CCA Forecasting

Part (i and ii)

Perform CCA on the two fields. Take the leading 7 PCs of winter SSTs and perform CCA with the 7 CLIMDIV winter precipitation. Fit a regression for each canonical variate. Use the regressions to predict the flow variates and back transform them to the original space.

SSTpc <- PCA.SST$pcs[, 1:7]  # choose the first seven SST PCs
X1 = scale(SSTpc)
M = dim(SSTpc)[2]
J = dim(precip)[2]
J = min(M, J)  # chooses the lowest dimension
N = length(precip[, 1])
Qx1 = qr.Q(qr(X1))
Qy1 = qr.Q(qr(precip))
T11 = qr.R(qr(X1))
T22 = qr.R(qr(precip))
VV = t(Qx1) %*% Qy1
BB = svd(VV)$v
AA = svd(VV)$u
BB = solve(T22) %*% svd(VV)$v * sqrt(N - 1)
wm1 = precip %*% BB
AA = solve(T11) %*% svd(VV)$u * sqrt(N - 1)
vm1 = X1 %*% AA
cancorln = svd(VV)$d[1:J]  # canonical correlation
Fyy = var(precip) %*% BB

betahat = solve(t(AA) %*% t(X1) %*% X1 %*% AA) %*% t(AA) %*% t(X1) %*% precip
ypred = X1 %*% AA %*% betahat  # predict precipitation PCs at the AZ 7 stations

ypred1 <- ypred * sd.p + mean.p  # transform back to precipitation

Part (iii)

Evaluate the performance and predict 2011 and 2012.

# Calculate the RMSE and R2
RMSE <- diag(cor(precip, ypred))
RMSE
## [1] 0.3688 0.3943 0.4441 0.4237 0.4561 0.4884 0.5439
RSQ <- RMSE^2
RSQ
## [1] 0.1360 0.1555 0.1972 0.1795 0.2080 0.2385 0.2958

# Plot the observed versus the predicted
precipx <- precip * sd.p + mean.p
par(mfrow = c(3, 3))
titles <- c("1st", "2nd", "3rd", "4th", "5th", "6th", "7th")
for (i in 1:7) {
    plot(precipx[, i], ypred1[, i], xlab = "Y actual", ylab = "Y estimate", 
        main = titles[i], xlim = c(0, 4), ylim = c(0, 4))
    lines(precipx[, i], precipx[, i])
}
par(oma = c(2, 2, 3, 2))
title("Actual versus Predicted AZ Precip Anomalies", outer = TRUE)

plot of chunk unnamed-chunk-14

Comments on fit:

RSQ values are much better and the observed fit is also better with CCA.

# Predict 2011 using the same betas and AA matrix from original model
X2011 <- matrix(zPCA.2011$pcs[1:7], 1, 7)
ypred <- X2011 %*% AA %*% betahat
ypred2011 <- ypred * sd.p + mean.p
ypred2011
##        [,1]  [,2]  [,3]  [,4]   [,5]  [,6]   [,7]
## [1,] 0.7957 1.096 1.288 1.766 0.3245 0.821 0.8021
pdata.x[1, ]
## [1] 1.1200 1.0750 1.0550 1.2525 0.4175 0.6450 0.3150

# Predict 2012 using the same betas and AA matrix from original model
X2012 <- matrix(zPCA.2012$pcs[1:7], 1, 7)
ypred <- X2012 %*% AA %*% betahat
ypred2012 <- ypred * sd.p + mean.p
ypred2012
##        [,1]  [,2]  [,3]  [,4]   [,5]   [,6]   [,7]
## [1,] 0.8386 1.004 1.188 1.677 0.3229 0.7301 0.7499
pdata.x[2, ]
## [1] 0.5825 0.8675 1.0450 1.1750 0.2525 0.6675 0.7975

Comments on predictions:

Here we see that the predictions using CCA are much closer to observed precipitation than in question #4. This is expected.