# HW 2 Q 5 JLCY, 9 Nov 2013

############################################################ 

# part i

# Read in Nov-Mar 1900-2010 SST average data Read in AZ Winter PPT of the
# same period at 7 locations First Perform PCA on the winter SSTs from
# 1900-2010 Then perform CCA on 1st 7 winter SST PCs and AZ Winter PPT at 7
# locations Regress them to do forecast - part ii

# SST

nrows = 72
ncols = 36

ntime = 111  # Dec 1900 to Mar 1901 - Dec 2010 to Mar 2011

ntimep = ntime

nglobe = nrows * ncols

# Lat - Long grid..  create grid below:

ygrid = seq(-87.5, 87.5, by = 5)
ny = length(ygrid)  # = 36

xgrid = seq(27.5, 382.5, by = 5)
xgrid[xgrid > 180] = xgrid[xgrid > 180]
nx = length(xgrid)  # = 72

xygrid = matrix(0, nrow = nx * ny, ncol = 2)

i = 0
for (iy in 1:ny) {
    for (ix in 1:nx) {
        i = i + 1
        xygrid[i, 1] = ygrid[iy]
        xygrid[i, 2] = xgrid[ix]
    }
}

############################################################ 

# Read Kaplan SST data -- already annual averaged

data = readBin("C:\\Users\\jfklee90\\Desktop\\CVEN 6833\\HW 2\\Kaplan-SST-DJFM-1900-2011.r4", 
    what = "numeric", n = (nrows * ncols * ntime), size = 4, endian = "swap")

data <- array(data = data, dim = c(nrows, ncols, ntime))

# read data for the 1st month
data1 = data[, , 1]

# the lat -long data grid..

# all the grid points on the globe
index = 1:(nx * ny)

# only non-missing data, missing data = 1.E30 include only grid points on
# the ocean sort out NaN values
index1 = index[data1 != "NaN"]

# choose only ocean's grid points for xygrid
xygrid1 = xygrid[index1, ]
x1 = xygrid1[, 2]

nsites = length(index1)
data2 = data1[index1]

# SSTdata matrix - rows are months annd columns are locations
sstdata = matrix(0, nrow = ntimep, ncol = nsites)

for (i in 1:ntimep) {
    data1 = data[, , i]
    index1 = index[data1 != "NaN"]
    data2 = data1[index1]
    sstdata[i, ] = data2
}

indexgrid = index1
rm("data")  #remove the object data to clear up space

############################################################ 

# PCA

sstanavg = sstdata

# get variance matrix..
zs_sst = var(scale(sstanavg))

# do an Eigen(Singular Value) decomposition..
zsvd_sst = svd(zs_sst)

# Principal Components...
pcs_sst = t(t(zsvd_sst$u) %*% t(sstanavg))

# Eigen Values.. - fraction variance
lambdas = (zsvd_sst$d/sum(zsvd_sst$d))

plot(1:25, lambdas[1:25], type = "l", xlab = "Modes", ylab = "Frac. Var. explained", 
    main = "Egien Variance Spectrum - Winter SST 1900-2010")
points(1:25, lambdas[1:25], col = "red")

plot of chunk unnamed-chunk-1


par(ask = TRUE)

############################################################ 

# PPT

ntime = 111  # Dec 1900 to Mar 1901 - Dec 2010 to Mar 2011

ntimep = ntime

nlocation = 7  # 7 stations in AZ

# Lat - Long grid..  create grid below:

# read coord online
# locs=matrix(scan('http://civil.colorado.edu/~balajir/CVEN6833/R-sessions/session2/files-4HW2/WesternUS-coords.txt'),
# ncol=5, byrow=T)

# read coord from pc
locs = matrix(scan("C:\\Users\\jfklee90\\Desktop\\CVEN 6833\\HW 2\\WesternUS-coords.txt"), 
    ncol = 5, byrow = T)

xlat = locs[, 3]
ylon = locs[, 4]

xygrid = matrix(0, nrow = 81, ncol = 2)

xygrid[, 1] = xlat
xygrid[, 2] = ylon

############################################################ 

# Read Arizona PPT data -- already annual averaged

data = scan("C:\\Users\\jfklee90\\Desktop\\CVEN 6833\\HW 2\\AZ-cdiv-ppt-DJFM-1900-2011.txt")

data_m <- array(dim = c(ntime, nlocation))

# set data matrix: 111 years x 7 locations
i = 1
j = 1
for (i in 1:7) {
    data_m[, i] = data[j:(j + 110)]
    i = i + 1
    j = j + 111
}

pptanavg_m = data_m

pptanavg = scale(pptanavg_m)

ppt_Mean = apply(data_m, 2, mean)
ppt_Sd = apply(data_m, 2, sd)
ppt_Scale = t((t(data_m) - ppt_Mean)/+ppt_Sd)

############################################################ 

# CCA on 1st 7 PCs of SST with 7 PPT locations

sstPc = pcs_sst[, 1:7]
X1 = scale(sstPc)

M = dim(sstPc)[2]
J = dim(ppt_Scale)[2]
J = min(M, J)

N = length(ppt_Scale[, 1])

Qx1 = qr.Q(qr(X1))
Qy1 = qr.Q(qr(ppt_Scale))

T11 = qr.R(qr(X1))
T22 = qr.R(qr(ppt_Scale))

VV = t(Qx1) %*% Qy1

BB = svd(VV)$v
AA = svd(VV)$u

BB = solve(T22) %*% svd(VV)$v * sqrt(N - 1)

wm1 = ppt_Scale %*% BB

AA = solve(T11) %*% svd(VV)$u * sqrt(N - 1)

vm1 = X1 %*% AA

cancorln = svd(VV)$d[1:J]  #canonical correlation

Fyy = var(ppt_Scale) %*% BB

############################################################ 

# part ii

# Prediction

betahat = solve(t(AA) %*% t(X1) %*% X1 %*% AA) %*% t(AA) %*% t(X1) %*% ppt_Scale

# predicted
ypred = X1 %*% AA %*% betahat

# scale back the data - mean forecast
ppt_pred = t(t(ypred) * ppt_Sd + ppt_Mean)

############################################################ 

# part iii

# plot observed ppt & forecast ppt for each location find out R^2 & RMSE

# location 1

plot(data_m[, 1], ppt_pred[, 1], main = "Observed PPT & CCA Forecast PPT - Location 1")

# add 1:1 line
abline(0, 1)

plot of chunk unnamed-chunk-1


par(ask = TRUE)

# create ppt_mean matrix with same mean for each column
ppt_mean_m = matrix(0, nrow = 111, ncol = 7)
for (i in 1:111) {
    ppt_mean_m[i, ] = ppt_Mean
    i = i + 1
}

# find out R^2 & RMSE SSR_1 = sum((ppt_pred[,1] - ppt_mean_m[,1])^2) SST_1 =
# sum((data_m[,1] - ppt_mean_m[,1])^2) R2_1 = SSR_1 / SST_1 # for fitting
R2_1 = (cor(data_m[, 1], ppt_pred[, 1]))^2  # for prediction
cat("R^2 at location 1 = ", R2_1, "\n", "\n")

## R^2 at location 1 =  0.1178 
##


RMSE_1 = sqrt((sum((ppt_pred[, 1] - data_m[, 1])^2))/N)
cat("RMSE at location 1 = ", RMSE_1, "\n", "\n")

## RMSE at location 1 =  0.5318 
##


############################################################ 

# location 2

plot(data_m[, 2], ppt_pred[, 2], main = "Observed PPT & CCA Forecast PPT - Location 2")
abline(0, 1)

plot of chunk unnamed-chunk-1


par(ask = TRUE)

R2_2 = (cor(data_m[, 2], ppt_pred[, 2]))^2  # for prediction
cat("R^2 at location 2 = ", R2_2, "\n", "\n")

## R^2 at location 2 =  0.1485 
##


RMSE_2 = sqrt((sum((ppt_pred[, 2] - data_m[, 2])^2))/N)
cat("RMSE at location 2 = ", RMSE_2, "\n", "\n")

## RMSE at location 2 =  0.5377 
##


############################################################ 

# location 3

plot(data_m[, 3], ppt_pred[, 3], main = "Observed PPT & CCA Forecast PPT - Location 3")
abline(0, 1)

plot of chunk unnamed-chunk-1


par(ask = TRUE)

R2_3 = (cor(data_m[, 3], ppt_pred[, 3]))^2  # for prediction
cat("R^2 at location 3 = ", R2_3, "\n", "\n")

## R^2 at location 3 =  0.1892 
##


RMSE_3 = sqrt((sum((ppt_pred[, 3] - data_m[, 3])^2))/N)
cat("RMSE at location 3 = ", RMSE_3, "\n", "\n")

## RMSE at location 3 =  0.7764 
##


############################################################ 

# location 4

plot(data_m[, 4], ppt_pred[, 4], main = "Observed PPT & CCA Forecast PPT - Location 4")
abline(0, 1)

plot of chunk unnamed-chunk-1


par(ask = TRUE)

R2_4 = (cor(data_m[, 4], ppt_pred[, 4]))^2  # for prediction
cat("R^2 at location 4 = ", R2_4, "\n", "\n")

## R^2 at location 4 =  0.172 
##


RMSE_4 = sqrt((sum((ppt_pred[, 4] - data_m[, 4])^2))/N)
cat("RMSE at location 4 = ", RMSE_4, "\n", "\n")

## RMSE at location 4 =  0.9623 
##


############################################################ 

# location 5

plot(data_m[, 5], ppt_pred[, 5], main = "Observed PPT & CCA Forecast PPT - Location 5")
abline(0, 1)

plot of chunk unnamed-chunk-1


par(ask = TRUE)

R2_5 = (cor(data_m[, 5], ppt_pred[, 5]))^2  # for prediction
cat("R^2 at location 5 = ", R2_5, "\n", "\n")

## R^2 at location 5 =  0.2099 
##


RMSE_5 = sqrt((sum((ppt_pred[, 5] - data_m[, 5])^2))/N)
cat("RMSE at location 5 = ", RMSE_5, "\n", "\n")

## RMSE at location 5 =  0.333 
##


############################################################ 

# location 6

plot(data_m[, 6], ppt_pred[, 6], main = "Observed PPT & CCA Forecast PPT - Location 6")
abline(0, 1)

plot of chunk unnamed-chunk-1


par(ask = TRUE)

R2_6 = (cor(data_m[, 6], ppt_pred[, 6]))^2  # for prediction
cat("R^2 at location 6 = ", R2_6, "\n", "\n")

## R^2 at location 6 =  0.2441 
##


RMSE_6 = sqrt((sum((ppt_pred[, 6] - data_m[, 6])^2))/N)
cat("RMSE at location 6 = ", RMSE_6, "\n", "\n")

## RMSE at location 6 =  0.5553 
##


############################################################ 

# location 7

plot(data_m[, 7], ppt_pred[, 7], main = "Observed PPT & CCA Forecast PPT - Location 7")
abline(0, 1)

plot of chunk unnamed-chunk-1


par(ask = TRUE)

R2_7 = (cor(data_m[, 7], ppt_pred[, 7]))^2  # for prediction
cat("R^2 at location 7 = ", R2_7, "\n", "\n")

## R^2 at location 7 =  0.3125 
##


RMSE_7 = sqrt((sum((ppt_pred[, 7] - data_m[, 7])^2))/N)
cat("RMSE at location 7 = ", RMSE_7, "\n", "\n")

## RMSE at location 7 =  0.4638 
##


############################################################ 

# forecast for Winter AZ PPT 2011-2012

# SST

ntime = 1  # Dec 2011 to Mar 2012

ntimep = ntime

nglobe = nrows * ncols

# Lat - Long grid..  create grid below:

ygrid = seq(-87.5, 87.5, by = 5)
ny = length(ygrid)  # = 36

xgrid = seq(27.5, 382.5, by = 5)
xgrid[xgrid > 180] = xgrid[xgrid > 180]
nx = length(xgrid)  # = 72

xygrid = matrix(0, nrow = nx * ny, ncol = 2)

i = 0
for (iy in 1:ny) {
    for (ix in 1:nx) {
        i = i + 1
        xygrid[i, 1] = ygrid[iy]
        xygrid[i, 2] = xgrid[ix]
    }
}

############################################################ 

# Read Kaplan SST data -- already annual averaged

data = readBin("C:\\Users\\jfklee90\\Desktop\\CVEN 6833\\HW 2\\Kaplan-SST-DJFM-2011-2012.r4", 
    what = "numeric", n = (nrows * ncols * ntime), size = 4, endian = "swap")

data <- array(data = data, dim = c(nrows, ncols, ntime))

# read data for the 1st month
data1 = data[, , 1]

# the lat -long data grid..

# all the grid points on the globe
index = 1:(nx * ny)

# only non-missing data, missing data = 1.E30 include only grid points on
# the ocean sort out NaN values
index1 = index[data1 != "NaN"]

# choose only ocean's grid points for xygrid
xygrid1 = xygrid[index1, ]
x1 = xygrid1[, 2]

nsites = length(index1)
data2 = data1[index1]

# SSTdata matrix - rows are months annd columns are locations
sstdata_11_12 = matrix(0, nrow = ntimep, ncol = nsites)

for (i in 1:ntimep) {
    data1 = data[, , i]
    index1 = index[data1 != "NaN"]
    data2 = data1[index1]
    sstdata_11_12[i, ] = data2
}

indexgrid = index1
rm("data")  #remove the object data to clear up space

############################################################ 

# PPT

nlocation = 7  # 7 stations in AZ

# Read Arizona PPT data -- already annual averaged

data_m_11_12 = scan("C:\\Users\\jfklee90\\Desktop\\CVEN 6833\\HW 2\\AZ-cdiv-ppt-DJFM-2011-2012.txt")

############################################################ 

# transform to PC(SST) & predict PC(PPT) with best model

sstanavg_11_12 = sstdata_11_12

# PC(SST_11_12)
pcs_sst_11_12 = sstanavg_11_12 %*% zsvd_sst$u

############################################################ 

# CCA forecast

# set new sstPc & X1
sstPc_11_12 = pcs_sst_11_12[1:7]
X1_11_12 = t(scale(sstPc_11_12))

# forecast
ypred_11_12 = X1_11_12 %*% AA %*% betahat

# scale back the data - mean forecast
ppt_pred_11_12 = t(t(ypred_11_12) * ppt_Sd + ppt_Mean)

# compare with observation
plot(data_m_11_12, ppt_pred_11_12, main = "Observed PPT & CCA Forecast PPT - 2011-2012")

# add 1:1 line
abline(0, 1)

plot of chunk unnamed-chunk-1


par(ask = TRUE)

############################################################ 

# forecast for Winter AZ PPT 2012-2013

# SST

ntime = 1  # Dec 2012 to Mar 2013

ntimep = ntime

nglobe = nrows * ncols

# Lat - Long grid..  create grid below:

ygrid = seq(-87.5, 87.5, by = 5)
ny = length(ygrid)  # = 36

xgrid = seq(27.5, 382.5, by = 5)
xgrid[xgrid > 180] = xgrid[xgrid > 180]
nx = length(xgrid)  # = 72

xygrid = matrix(0, nrow = nx * ny, ncol = 2)

i = 0
for (iy in 1:ny) {
    for (ix in 1:nx) {
        i = i + 1
        xygrid[i, 1] = ygrid[iy]
        xygrid[i, 2] = xgrid[ix]
    }
}

############################################################ 

# Read Kaplan SST data -- already annual averaged

data = readBin("C:\\Users\\jfklee90\\Desktop\\CVEN 6833\\HW 2\\Kaplan-SST-DJFM-2012-2013.r4", 
    what = "numeric", n = (nrows * ncols * ntime), size = 4, endian = "swap")

data <- array(data = data, dim = c(nrows, ncols, ntime))

# read data for the 1st month
data1 = data[, , 1]

# the lat -long data grid..

# all the grid points on the globe
index = 1:(nx * ny)

# only non-missing data, missing data = 1.E30 include only grid points on
# the ocean sort out NaN values
index1 = index[data1 != "NaN"]

# choose only ocean's grid points for xygrid
xygrid1 = xygrid[index1, ]
x1 = xygrid1[, 2]

nsites = length(index1)
data2 = data1[index1]

# SSTdata matrix - rows are months annd columns are locations
sstdata_12_13 = matrix(0, nrow = ntimep, ncol = nsites)

for (i in 1:ntimep) {
    data1 = data[, , i]
    index1 = index[data1 != "NaN"]
    data2 = data1[index1]
    sstdata_12_13[i, ] = data2
}

indexgrid = index1
rm("data")  #remove the object data to clear up space

############################################################ 

# PPT

nlocation = 7  # 7 stations in AZ

# Read Arizona PPT data -- already annual averaged

data_m_12_13 = scan("C:\\Users\\jfklee90\\Desktop\\CVEN 6833\\HW 2\\AZ-cdiv-ppt-DJFM-2012-2013.txt")

############################################################ 

# transform to PC(SST) & predict PC(PPT) with best model

sstanavg_12_13 = sstdata_12_13

# PC(SST_12_13)
pcs_sst_12_13 = sstanavg_12_13 %*% zsvd_sst$u

############################################################ 

# CCA forecast

# set new sstPc & X1
sstPc_12_13 = pcs_sst_12_13[1:7]
X1_12_13 = t(scale(sstPc_12_13))

# forecast
ypred_12_13 = X1_12_13 %*% AA %*% betahat

# scale back the data - mean forecast
ppt_pred_12_13 = t(t(ypred_12_13) * ppt_Sd + ppt_Mean)

# compare with observation
plot(data_m_12_13, ppt_pred_12_13, main = "Observed PPT & CCA Forecast PPT - 2012-2013")

# add 1:1 line
abline(0, 1)

plot of chunk unnamed-chunk-1


par(ask = TRUE)

############################################################