Set directory, read in libraries and source functions:
rm(list = ls())
# Set working directory
setwd("~/Documents/University of Colorado/Fall 2013/CVEN6833/HW2")
# Source functions and libraries
library(fields)
library(maps)
library(MASS)
source("SSTxygrid.r")
source("FullSVD.r")
# Define n's
Nrows <- 72 # 72 longitude points on gridded SST Kaplan data
Ncols <- 36 # 36 latitude points on gridded SST Kaplan data
Ntime <- 110 # Monthly SST from Jan 1900 to Dec 2010
Read in SST data and Western precipitation winter data:
index1 <- scan("http://civil.colorado.edu/~emgi6568/index1.txt")
KapSST <- readBin("http://iridl.ldeo.columbia.edu/expert/SOURCES/.KAPLAN/.EXTENDED/.v2/.ssta/T/%28Dec%201900%29%28Dec%202010%29RANGE/T/4/boxAverage/T/12/STEP/data.r4",
what = "numeric", n = (Nrows * Ncols * Ntime), size = 4, endian = "swap")
results <- SSTxygrid(KapSST)
SSTdata <- results$SSTdata
xygrid <- results$grid
pdata <- matrix(scan("http://iridl.ldeo.columbia.edu/expert/SOURCES/.NOAA/.NCDC/.CIRS/.ClimateDivision/.pcp/T/%28Dec%201900%29%28Dec%202010%29RANGE/T/4/boxAverage/T/12/STEP/IDIV/201/202/203/204/205/206/207/VALUES/data.ch"),
ncol = 110, byrow = T)
precip <- scale(t(pdata))
mean.p <- attr(precip, "scaled:center")
sd.p <- attr(precip, "scaled:scale")
coords <- read.table("http://civil.colorado.edu/~balajir/CVEN6833/R-sessions/session2/files-4HW2/climdivcoords.txt",
header = TRUE, sep = "\t")
coords1 <- coords[coords$X < -104, ]
coords1 <- coords1[order(coords1$CLIMDIV), ]
latlonUS <- coords1[, 4:5]
lons <- coords1$X
lats <- coords1$Y
Perform PCA on the two winter fields. Plot the EVS and the leading four spatial and temporal PCs for each field.
PCA.SST <- FullSVD(SSTdata, Nlamb = 40, temporal = FALSE, spatial = "FALSE",
xygrid = xygrid)
PCA.P <- FullSVD(precip, Nlamb = 7, temporal = FALSE, spatial = "FALSE", xygrid = xygrid)
Comments on Eigen Value Spectrum:
For SST (top), the first ten explain > 65%. For precip (bottom), the first PC explains over 90% of the variance. We will fit a model to this first precip PC and assume the remaining six are noise.
Fit a regression model for each retained PC of winter precip with the SST PCs
X <- as.data.frame(PCA.SST$pcs[, 1:10])
Y <- as.matrix(PCA.P$pcs[, 1])
fit <- lm(Y ~ ., data = X)
bmAIC <- stepAIC(fit)
## Start: AIC=190.1
## Y ~ V1 + V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9 + V10
##
## Df Sum of Sq RSS AIC
## - V4 1 2.0 509 188
## - V7 1 2.3 509 189
## - V5 1 2.6 510 189
## - V3 1 4.8 512 189
## <none> 507 190
## - V8 1 10.4 517 190
## - V10 1 14.3 521 191
## - V9 1 15.7 523 191
## - V6 1 27.7 535 194
## - V2 1 36.6 543 196
## - V1 1 67.1 574 202
##
## Step: AIC=188.5
## Y ~ V1 + V2 + V3 + V5 + V6 + V7 + V8 + V9 + V10
##
## Df Sum of Sq RSS AIC
## - V7 1 2.3 511 187
## - V5 1 2.6 512 187
## - V3 1 4.8 514 188
## <none> 509 188
## - V8 1 10.4 519 189
## - V10 1 14.3 523 190
## - V9 1 15.7 525 190
## - V6 1 27.7 537 192
## - V2 1 36.6 546 194
## - V1 1 67.1 576 200
##
## Step: AIC=187
## Y ~ V1 + V2 + V3 + V5 + V6 + V8 + V9 + V10
##
## Df Sum of Sq RSS AIC
## - V5 1 2.6 514 186
## - V3 1 4.8 516 186
## <none> 511 187
## - V8 1 10.4 522 187
## - V10 1 14.3 526 188
## - V9 1 15.7 527 188
## - V6 1 27.7 539 191
## - V2 1 36.6 548 193
## - V1 1 67.1 578 199
##
## Step: AIC=185.6
## Y ~ V1 + V2 + V3 + V6 + V8 + V9 + V10
##
## Df Sum of Sq RSS AIC
## - V3 1 4.8 519 185
## <none> 514 186
## - V8 1 10.4 524 186
## - V10 1 14.3 528 187
## - V9 1 15.7 530 187
## - V6 1 27.7 542 189
## - V2 1 36.6 551 191
## - V1 1 67.1 581 197
##
## Step: AIC=184.6
## Y ~ V1 + V2 + V6 + V8 + V9 + V10
##
## Df Sum of Sq RSS AIC
## <none> 519 185
## - V8 1 10.4 529 185
## - V10 1 14.3 533 186
## - V9 1 15.7 534 186
## - V6 1 27.7 546 188
## - V2 1 36.6 555 190
## - V1 1 67.1 586 196
bmBIC <- stepAIC(fit, k = log(110))
## Start: AIC=219.8
## Y ~ V1 + V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9 + V10
##
## Df Sum of Sq RSS AIC
## - V4 1 2.0 509 216
## - V7 1 2.3 509 216
## - V5 1 2.6 510 216
## - V3 1 4.8 512 216
## - V8 1 10.4 517 217
## - V10 1 14.3 521 218
## - V9 1 15.7 523 218
## <none> 507 220
## - V6 1 27.7 535 221
## - V2 1 36.6 543 223
## - V1 1 67.1 574 229
##
## Step: AIC=215.5
## Y ~ V1 + V2 + V3 + V5 + V6 + V7 + V8 + V9 + V10
##
## Df Sum of Sq RSS AIC
## - V7 1 2.3 511 211
## - V5 1 2.6 512 211
## - V3 1 4.8 514 212
## - V8 1 10.4 519 213
## - V10 1 14.3 523 214
## - V9 1 15.7 525 214
## <none> 509 216
## - V6 1 27.7 537 217
## - V2 1 36.6 546 218
## - V1 1 67.1 576 224
##
## Step: AIC=211.3
## Y ~ V1 + V2 + V3 + V5 + V6 + V8 + V9 + V10
##
## Df Sum of Sq RSS AIC
## - V5 1 2.6 514 207
## - V3 1 4.8 516 208
## - V8 1 10.4 522 209
## - V10 1 14.3 526 210
## - V9 1 15.7 527 210
## <none> 511 211
## - V6 1 27.7 539 212
## - V2 1 36.6 548 214
## - V1 1 67.1 578 220
##
## Step: AIC=207.2
## Y ~ V1 + V2 + V3 + V6 + V8 + V9 + V10
##
## Df Sum of Sq RSS AIC
## - V3 1 4.8 519 204
## - V8 1 10.4 524 205
## - V10 1 14.3 528 206
## - V9 1 15.7 530 206
## <none> 514 207
## - V6 1 27.7 542 208
## - V2 1 36.6 551 210
## - V1 1 67.1 581 216
##
## Step: AIC=203.5
## Y ~ V1 + V2 + V6 + V8 + V9 + V10
##
## Df Sum of Sq RSS AIC
## - V8 1 10.4 529 201
## - V10 1 14.3 533 202
## - V9 1 15.7 534 202
## <none> 519 204
## - V6 1 27.7 546 204
## - V2 1 36.6 555 206
## - V1 1 67.1 586 212
##
## Step: AIC=201
## Y ~ V1 + V2 + V6 + V9 + V10
##
## Df Sum of Sq RSS AIC
## - V10 1 14.3 543 199
## - V9 1 15.7 545 200
## <none> 529 201
## - V6 1 27.7 557 202
## - V2 1 36.6 566 204
## - V1 1 67.1 596 209
##
## Step: AIC=199.2
## Y ~ V1 + V2 + V6 + V9
##
## Df Sum of Sq RSS AIC
## - V9 1 15.7 559 198
## <none> 543 199
## - V6 1 27.7 571 200
## - V2 1 36.6 580 202
## - V1 1 67.1 610 207
##
## Step: AIC=197.6
## Y ~ V1 + V2 + V6
##
## Df Sum of Sq RSS AIC
## <none> 559 198
## - V6 1 27.7 587 198
## - V2 1 36.6 596 200
## - V1 1 67.1 626 205
Comments on model fitting:
StepBIC is more parsimonious, so we will use this as the best model.
Forecast the retained PCs of precipitation using the fitted model.
predPC1 <- predict(bmBIC) # predict first PC using BIC model
For the other PCs, randomly sample.
# Fit the remaining PCs as noise and bind them to the predicted PCs from the
# model
AZpredPCs <- predPC1
for (i in 1:6) {
predPC <- rnorm(Ntime, mean(PCA.P$pcs[, i + 1]))
AZpredPCs <- cbind(AZpredPCs, predPC)
}
Using the Eigen vectors, predict precipitation at all 7 divisions simultaneously.
# Multiple by the eigen vector to transform PC's back to predicted
# precipitation anomalies.
AZpredP <- AZpredPCs %*% PCA.P$u
Plot the forecasted precipitation and the observed at each location and compute R2 and RMSE.
par(mfrow = c(3, 3))
titles <- c("1st", "2nd", "3rd", "4th", "5th", "6th", "7th")
for (i in 1:7) {
plot(precip[, i], AZpredP[, i], xlab = "Y actual", ylab = "Y estimate",
main = titles[i], xlim = c(-3, 4), ylim = c(-3, 4))
lines(precip[, i], precip[, i])
}
par(oma = c(2, 2, 3, 2))
title("Actual versus Predicted AZ Precip Anomalies", outer = TRUE)
# Look at model skill by computing RMSE and R2 values for each of the 7
# stations
RMSE <- diag(cor(precip, AZpredP))
RMSE
## [1] 0.20992 -0.15278 -0.06799 0.31284 -0.06038 -0.07843 -0.06364
RSQ <- RMSE^2
RSQ
## [1] 0.044066 0.023341 0.004622 0.097870 0.003645 0.006151 0.004051
Predict 2011 and 2012 winter precip using the 1900-2010 model developed in part (ii).
Ntime <- 2
# Read in 2011-2012 precipitation data
pdata.x = matrix(scan("http://iridl.ldeo.columbia.edu/expert/SOURCES/.NOAA/.NCDC/.CIRS/.ClimateDivision/.pcp/T/%28Dec%202010%29%28Dec%202012%29RANGE/T/4/boxAverage/T/12/STEP/IDIV/201/202/203/204/205/206/207/VALUES/data.ch"),
ncol = Ntime, byrow = T)
pdata.x = t(pdata.x)
data.x <- readBin("http://iridl.ldeo.columbia.edu/expert/SOURCES/.KAPLAN/.EXTENDED/.v2/.ssta/T/%28Dec%202010%29%28Dec%202012%29RANGE/T/4/boxAverage/T/12/STEP/data.r4",
what = "numeric", n = (Nrows * Ncols * Ntime), size = 4, endian = "swap")
data.x <- array(data = data.x, dim = c(Nrows, Ncols, Ntime))
results.x <- SSTxygrid(data.x)
SSTdata.x <- results.x$SSTdata
xygrid.x <- results.x$grid
SSTdata.2011 <- scale(SSTdata.x[1, ])
SSTdata.2012 <- scale(SSTdata.x[2, ])
Predict 2011 precip:
# PCA on 2011 SST
zPCA.2011 <- FullSVD(SSTdata.2011, Nlamb = 40, temporal = FALSE, spatial = "FALSE",
xygrid = xygrid)
new2011 <- as.data.frame(matrix(zPCA.2011$pcs[1:10], 1, 10))
predPC1.2011 <- predict(bmAIC, newdata = new2011)
PCpred2011 <- c(predPC1.2011, apply(PCA.P$pcs[, 2:7], 2, mean))
Ppred2011 <- ((PCpred2011 %*% PCA.P$u) + mean.p)/sd.p
Ppred2011 # predicted precipitation at each station
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] 1.818 2.082 1.701 1.91 1.309 1.656 1.764
pdata.x[1, ] # actual precipitation at each station
## [1] 1.1200 1.0750 1.0550 1.2525 0.4175 0.6450 0.3150
Predict 2012 precip:
# PCA on 2012 SST
zPCA.2012 <- FullSVD(SSTdata.2012, Nlamb = 40, temporal = FALSE, spatial = "FALSE",
xygrid = xygrid)
new2012 <- as.data.frame(matrix(zPCA.2012$pcs[1:10], 1, 10))
predPC1.2012 <- predict(bmAIC, newdata = new2012)
PCpred2012 <- c(predPC1.2012, apply(PCA.P$pcs[, 2:7], 2, mean))
Ppred2012 <- ((PCpred2012 %*% PCA.P$u) + mean.p)/sd.p
Ppred2012 # predicted precipitation at each station
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] 1.771 2.145 1.742 1.874 1.358 1.655 1.787
pdata.x[2, ] # actual precipitation at each station
## [1] 0.5825 0.8675 1.0450 1.1750 0.2525 0.6675 0.7975
Comments on fitting:
The predicted values for 2011 and 2012 are not great. We will see this these improve when we do CCA Forecasting.
Perform CCA on the two fields. Take the leading 7 PCs of winter SSTs and perform CCA with the 7 CLIMDIV winter precipitation. Fit a regression for each canonical variate. Use the regressions to predict the flow variates and back transform them to the original space.
SSTpc <- PCA.SST$pcs[, 1:7] # choose the first seven SST PCs
X1 = scale(SSTpc)
M = dim(SSTpc)[2]
J = dim(precip)[2]
J = min(M, J) # chooses the lowest dimension
N = length(precip[, 1])
Qx1 = qr.Q(qr(X1))
Qy1 = qr.Q(qr(precip))
T11 = qr.R(qr(X1))
T22 = qr.R(qr(precip))
VV = t(Qx1) %*% Qy1
BB = svd(VV)$v
AA = svd(VV)$u
BB = solve(T22) %*% svd(VV)$v * sqrt(N - 1)
wm1 = precip %*% BB
AA = solve(T11) %*% svd(VV)$u * sqrt(N - 1)
vm1 = X1 %*% AA
cancorln = svd(VV)$d[1:J] # canonical correlation
Fyy = var(precip) %*% BB
betahat = solve(t(AA) %*% t(X1) %*% X1 %*% AA) %*% t(AA) %*% t(X1) %*% precip
ypred = X1 %*% AA %*% betahat # predict precipitation PCs at the AZ 7 stations
ypred1 <- ypred * sd.p + mean.p # transform back to precipitation
Evaluate the performance and predict 2011 and 2012.
# Calculate the RMSE and R2
RMSE <- diag(cor(precip, ypred))
RMSE
## [1] 0.3688 0.3943 0.4441 0.4237 0.4561 0.4884 0.5439
RSQ <- RMSE^2
RSQ
## [1] 0.1360 0.1555 0.1972 0.1795 0.2080 0.2385 0.2958
# Plot the observed versus the predicted
precipx <- precip * sd.p + mean.p
par(mfrow = c(3, 3))
titles <- c("1st", "2nd", "3rd", "4th", "5th", "6th", "7th")
for (i in 1:7) {
plot(precipx[, i], ypred1[, i], xlab = "Y actual", ylab = "Y estimate",
main = titles[i], xlim = c(0, 4), ylim = c(0, 4))
lines(precipx[, i], precipx[, i])
}
par(oma = c(2, 2, 3, 2))
title("Actual versus Predicted AZ Precip Anomalies", outer = TRUE)
Comments on fit:
RSQ values are much better and the observed fit is also better with CCA.
# Predict 2011 using the same betas and AA matrix from original model
X2011 <- matrix(zPCA.2011$pcs[1:7], 1, 7)
ypred <- X2011 %*% AA %*% betahat
ypred2011 <- ypred * sd.p + mean.p
ypred2011
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] 0.7957 1.096 1.288 1.766 0.3245 0.821 0.8021
pdata.x[1, ]
## [1] 1.1200 1.0750 1.0550 1.2525 0.4175 0.6450 0.3150
# Predict 2012 using the same betas and AA matrix from original model
X2012 <- matrix(zPCA.2012$pcs[1:7], 1, 7)
ypred <- X2012 %*% AA %*% betahat
ypred2012 <- ypred * sd.p + mean.p
ypred2012
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] 0.8386 1.004 1.188 1.677 0.3229 0.7301 0.7499
pdata.x[2, ]
## [1] 0.5825 0.8675 1.0450 1.1750 0.2525 0.6675 0.7975
Comments on predictions:
Here we see that the predictions using CCA are much closer to observed precipitation than in question #4. This is expected.