# HW 2 Bonus JLCY, 9 Nov 2013

############################################################ 

# part i

# SST

nrows = 72
ncols = 36

ntime = 111  # Dec 1900 to Mar 1901 - Dec 2010 to Mar 2011

ntimep = ntime

nglobe = nrows * ncols

# Lat - Long grid..  create grid below:

ygrid = seq(-87.5, 87.5, by = 5)
ny = length(ygrid)  # = 36

xgrid = seq(27.5, 382.5, by = 5)
xgrid[xgrid > 180] = xgrid[xgrid > 180]
nx = length(xgrid)  # = 72

xygrid = matrix(0, nrow = nx * ny, ncol = 2)

i = 0
for (iy in 1:ny) {
    for (ix in 1:nx) {
        i = i + 1
        xygrid[i, 1] = ygrid[iy]
        xygrid[i, 2] = xgrid[ix]
    }
}

############################################################ 

# Read Kaplan SST data -- already annual averaged

data = readBin("C:\\Users\\jfklee90\\Desktop\\CVEN 6833\\HW 2\\Kaplan-SST-DJFM-1900-2011.r4", 
    what = "numeric", n = (nrows * ncols * ntime), size = 4, endian = "swap")

data <- array(data = data, dim = c(nrows, ncols, ntime))

# read data for the 1st month
data1 = data[, , 1]

# the lat -long data grid..

# all the grid points on the globe
index = 1:(nx * ny)

# only non-missing data, missing data = 1.E30 include only grid points on
# the ocean sort out NaN values
index1 = index[data1 != "NaN"]

# choose only ocean's grid points for xygrid
xygrid1 = xygrid[index1, ]
x1 = xygrid1[, 2]

nsites = length(index1)
data2 = data1[index1]

# SSTdata matrix - rows are months annd columns are locations
sstdata = matrix(0, nrow = ntimep, ncol = nsites)

for (i in 1:ntimep) {
    data1 = data[, , i]
    index1 = index[data1 != "NaN"]
    data2 = data1[index1]
    sstdata[i, ] = data2
}

indexgrid = index1
rm("data")  #remove the object data to clear up space

############################################################ 

# PCA

sstanavg = sstdata

# get variance matrix..
zs_sst = var(scale(sstanavg))

# do an Eigen(Singular Value) decomposition..
zsvd_sst = svd(zs_sst)

# Principal Components...
pcs_sst = t(t(zsvd_sst$u) %*% t(sstanavg))

# Eigen Values.. - fraction variance
lambdas = (zsvd_sst$d/sum(zsvd_sst$d))

plot(1:25, lambdas[1:25], type = "l", xlab = "Modes", ylab = "Frac. Var. explained", 
    main = "Egien Variance Spectrum - Winter SST 1900-2010")
points(1:25, lambdas[1:25], col = "red")

plot of chunk unnamed-chunk-1


par(ask = TRUE)

############################################################ 

# PPT

ntime = 111  # Dec 1900 to Mar 1901 - Dec 2010 to Mar 2011

ntimep = ntime

nlocation = 7  # 7 stations in AZ

# Lat - Long grid..  create grid below:

# read coord online
# locs=matrix(scan('http://civil.colorado.edu/~balajir/CVEN6833/R-sessions/session2/files-4HW2/WesternUS-coords.txt'),
# ncol=5, byrow=T)

# read coord from pc
locs = matrix(scan("C:\\Users\\jfklee90\\Desktop\\CVEN 6833\\HW 2\\WesternUS-coords.txt"), 
    ncol = 5, byrow = T)

xlat = locs[, 3]
ylon = locs[, 4]

xygrid = matrix(0, nrow = 81, ncol = 2)

xygrid[, 1] = xlat
xygrid[, 2] = ylon

############################################################ 

# Read Arizona PPT data -- already annual averaged

data = scan("C:\\Users\\jfklee90\\Desktop\\CVEN 6833\\HW 2\\AZ-cdiv-ppt-DJFM-1900-2011.txt")

data_m <- array(dim = c(ntime, nlocation))

# set data matrix: 111 years x 7 locations
i = 1
j = 1
for (i in 1:7) {
    data_m[, i] = data[j:(j + 110)]
    i = i + 1
    j = j + 111
}

# column mean & standard deviation -- for each of the 7 locations
ppt_mean = apply(data_m, 2, mean)
ppt_sd = apply(data_m, 2, sd)

############################################################ 

# PCA

pptanavg_m = data_m

pptanavg = scale(pptanavg_m)

# get variance matrix..
zs_ppt = var(pptanavg)

# do an Eigen(Singular Value) decomposition..
zsvd_ppt = svd(zs_ppt)

# Principal Components...
pcs_ppt = t(t(zsvd_ppt$u) %*% t(pptanavg))  # = pptanavg %*% zsvd_ppt$u

# therefore, pptanavg = pcs_ppt %*% t(zsvd_ppt$u)

# Eigen Values.. - fraction variance
lambdas = (zsvd_ppt$d/sum(zsvd_ppt$d))

plot(1:7, lambdas[1:7], type = "l", xlab = "Modes", ylab = "Frac. Var. explained", 
    main = "Egien Variance Spectrum - Winter AZ PPT 1900-2010")
points(1:7, lambdas[1:7], col = "red")

plot of chunk unnamed-chunk-1


par(ask = TRUE)

############################################################ 

# pick first 3 PCs

############################################################ 

# part ii

# Email on 4 Nov 1. Retain more than 3 PCS for SST, say first 10 2. Retain
# significant PCs of precip (PC1 explains a lot of variance and the rest is
# noise..)  --> Retain PC1(PPT) & PC2(PPT) 3. Fit a best model bet PC1 of
# precip and the SST PCs 4. resample the remaining PCs as they are noise --
# take the mean 5. backtransform

############################################################ 

# define PC(SST) as predictors
PC1_sst = pcs_sst[, 1]
PC2_sst = pcs_sst[, 2]
PC3_sst = pcs_sst[, 3]
PC4_sst = pcs_sst[, 4]
PC5_sst = pcs_sst[, 5]
PC6_sst = pcs_sst[, 6]
PC7_sst = pcs_sst[, 7]
PC8_sst = pcs_sst[, 8]
PC9_sst = pcs_sst[, 9]
PC10_sst = pcs_sst[, 10]

# define PC(PPT) as preditand
PC1_ppt = pcs_ppt[, 1]
PC2_ppt = pcs_ppt[, 2]
PC3_ppt = pcs_ppt[, 3]
PC4_ppt = pcs_ppt[, 4]
PC5_ppt = pcs_ppt[, 5]
PC6_ppt = pcs_ppt[, 6]
PC7_ppt = pcs_ppt[, 7]

# PC3 to PC7 of PPT are noise -- not fitting model to them

############################################################ 

# fitting PC1_ppt

# fitting with gaussian distribution, link = identity
fit_1_gaussian_identity = glm(PC1_ppt ~ PC1_sst + PC2_sst + PC3_sst + PC4_sst + 
    PC5_sst + PC6_sst + PC7_sst + PC8_sst + PC9_sst + PC10_sst, family = gaussian(link = "identity"))
# ss = summary(fit_1_gaussian_identity) print(ss) AIC = 504.75

# GCV, model parameters = m = 3
m = 3
N = 111
MSE = (sum((fit_1_gaussian_identity$fit - PC1_ppt)^2))/N
GCV_fit_1_gaussian_identity = MSE/((1 - m/N)^2)  # = 0.2908073
cat("GCV_fit_1_gaussian_identity = ", GCV_fit_1_gaussian_identity, "\n", "\n")

## GCV_fit_1_gaussian_identity =  4.702 
##


library(MPV)

## 
## Attaching package: 'MPV'
## 
## The following object is masked from 'package:datasets':
## 
##     stackloss


# PRESS
press_fit_1_gaussian_identity = PRESS(fit_1_gaussian_identity)  # = 599.1912
cat("PRESS_fit_1_gaussian_identity = ", press_fit_1_gaussian_identity, "\n", 
    "\n")

## PRESS_fit_1_gaussian_identity =  599.2 
##


############################################################ 

# binomial, gamma, poisson, inverse.gaussian are not applicable to PC(PPT)
# fit_1_gaussian_identity has low AIC, GCV and PRESS values
# fit_1_gaussian_identity is a good fit

############################################################ 

# Stepwise Regression and display results

library(MASS)

## Warning: package 'MASS' was built under R version 3.0.2

## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:MPV':
## 
##     cement


# AIC AIC_fit_1_gaussian_identity = stepAIC(fit_1_gaussian_identity,
# direction='both') AIC_fit_1_gaussian_identity$anova AIC = 500.04 for
# PC1_ppt ~ PC1_sst + PC2_sst + PC3_sst + PC4_sst + PC8_sst + PC9_sst

n = length(PC1_ppt)

# BIC (SBC)
BIC_fit_1_gaussian_identity = stepAIC(fit_1_gaussian_identity, direction = "both", 
    k = log(n))

## Start:  AIC=534.5
## PC1_ppt ~ PC1_sst + PC2_sst + PC3_sst + PC4_sst + PC5_sst + PC6_sst + 
##     PC7_sst + PC8_sst + PC9_sst + PC10_sst
## 
##            Df Deviance AIC
## - PC10_sst  1      494 530
## - PC5_sst   1      495 530
## - PC6_sst   1      500 531
## - PC7_sst   1      501 531
## - PC1_sst   1      503 532
## - PC3_sst   1      505 532
## - PC4_sst   1      507 533
## <none>             494 535
## - PC9_sst   1      519 535
## - PC8_sst   1      530 538
## - PC2_sst   1      584 548
## 
## Step:  AIC=529.8
## PC1_ppt ~ PC1_sst + PC2_sst + PC3_sst + PC4_sst + PC5_sst + PC6_sst + 
##     PC7_sst + PC8_sst + PC9_sst
## 
##            Df Deviance AIC
## - PC5_sst   1      495 525
## - PC6_sst   1      500 526
## - PC7_sst   1      501 527
## - PC1_sst   1      503 527
## - PC3_sst   1      505 528
## - PC4_sst   1      507 528
## <none>             494 530
## - PC9_sst   1      519 531
## - PC8_sst   1      530 533
## + PC10_sst  1      494 535
## - PC2_sst   1      585 544
## 
## Step:  AIC=525.3
## PC1_ppt ~ PC1_sst + PC2_sst + PC3_sst + PC4_sst + PC6_sst + PC7_sst + 
##     PC8_sst + PC9_sst
## 
##            Df Deviance AIC
## - PC6_sst   1      501 522
## - PC7_sst   1      502 522
## - PC1_sst   1      504 523
## - PC3_sst   1      505 523
## - PC4_sst   1      507 523
## <none>             495 525
## - PC9_sst   1      520 526
## - PC8_sst   1      530 528
## + PC5_sst   1      494 530
## + PC10_sst  1      495 530
## - PC2_sst   1      586 539
## 
## Step:  AIC=521.9
## PC1_ppt ~ PC1_sst + PC2_sst + PC3_sst + PC4_sst + PC7_sst + PC8_sst + 
##     PC9_sst
## 
##            Df Deviance AIC
## - PC7_sst   1      509 519
## - PC1_sst   1      510 519
## - PC3_sst   1      511 519
## - PC4_sst   1      512 520
## <none>             501 522
## - PC9_sst   1      524 522
## - PC8_sst   1      535 525
## + PC6_sst   1      495 525
## + PC5_sst   1      500 526
## + PC10_sst  1      501 527
## - PC2_sst   1      589 535
## 
## Step:  AIC=519
## PC1_ppt ~ PC1_sst + PC2_sst + PC3_sst + PC4_sst + PC8_sst + PC9_sst
## 
##            Df Deviance AIC
## - PC4_sst   1      519 516
## - PC3_sst   1      520 517
## - PC1_sst   1      522 517
## <none>             509 519
## - PC9_sst   1      533 519
## + PC7_sst   1      501 522
## - PC8_sst   1      545 522
## + PC6_sst   1      502 522
## + PC5_sst   1      508 524
## + PC10_sst  1      509 524
## - PC2_sst   1      593 531
## 
## Step:  AIC=516.5
## PC1_ppt ~ PC1_sst + PC2_sst + PC3_sst + PC8_sst + PC9_sst
## 
##            Df Deviance AIC
## - PC3_sst   1      528 514
## - PC1_sst   1      535 515
## <none>             519 516
## - PC9_sst   1      543 517
## + PC4_sst   1      509 519
## - PC8_sst   1      556 519
## + PC7_sst   1      512 520
## + PC6_sst   1      514 520
## + PC5_sst   1      518 521
## + PC10_sst  1      519 521
## - PC2_sst   1      600 528
## 
## Step:  AIC=513.7
## PC1_ppt ~ PC1_sst + PC2_sst + PC8_sst + PC9_sst
## 
##            Df Deviance AIC
## - PC1_sst   1      548 513
## <none>             528 514
## - PC9_sst   1      553 514
## - PC8_sst   1      563 516
## + PC3_sst   1      519 516
## + PC4_sst   1      520 517
## + PC7_sst   1      520 517
## + PC6_sst   1      523 517
## + PC5_sst   1      528 518
## + PC10_sst  1      528 518
## - PC2_sst   1      606 524
## 
## Step:  AIC=513
## PC1_ppt ~ PC2_sst + PC8_sst + PC9_sst
## 
##            Df Deviance AIC
## <none>             548 513
## + PC1_sst   1      528 514
## - PC8_sst   1      580 515
## - PC9_sst   1      582 515
## + PC3_sst   1      535 515
## + PC7_sst   1      535 515
## + PC4_sst   1      537 515
## + PC6_sst   1      543 517
## + PC5_sst   1      547 517
## + PC10_sst  1      547 518
## - PC2_sst   1      634 525

# BIC_fit_1_gaussian_identity$anova BIC = 513.02 for PC1_ppt ~ PC2_sst +
# PC8_sst + PC9_sst

# use BIC rather than AIC - less predictors needed

# Best model: PC1_ppt ~ PC2_sst + PC8_sst + PC9_sst
bestfit_1 = glm(PC1_ppt ~ PC2_sst + PC8_sst + PC9_sst, family = gaussian(link = "identity"))

############################################################ 

# fitting PC2_ppt

# fitting with gaussian distribution, link = identity
fit_2_gaussian_identity = glm(PC2_ppt ~ PC1_sst + PC2_sst + PC3_sst + PC4_sst + 
    PC5_sst + PC6_sst + PC7_sst + PC8_sst + PC9_sst + PC10_sst, family = gaussian(link = "identity"))
# ss = summary(fit_2_gaussian_identity) print(ss) AIC = 184.03

# GCV, model parameters = m = 3
m = 3
MSE = (sum((fit_2_gaussian_identity$fit - PC2_ppt)^2))/N
GCV_fit_2_gaussian_identity = MSE/((1 - m/N)^2)  # = 0.01617222
cat("GCV_fit_2_gaussian_identity = ", GCV_fit_2_gaussian_identity, "\n", "\n")

## GCV_fit_2_gaussian_identity =  0.2615 
##


# PRESS
press_fit_2_gaussian_identity = PRESS(fit_2_gaussian_identity)  # = 34.34974
cat("PRESS_fit_2_gaussian_identity = ", press_fit_2_gaussian_identity, "\n", 
    "\n")

## PRESS_fit_2_gaussian_identity =  34.35 
##


############################################################ 

# binomial, gamma, poisson, inverse.gaussian are not applicable to PC(PPT)
# fit_2_gaussian_identity has low AIC, GCV and PRESS values
# fit_2_gaussian_identity is a good fit

############################################################ 

# Stepwise Regression and display results

library(MASS)

# AIC AIC_fit_2_gaussian_identity = stepAIC(fit_2_gaussian_identity,
# direction='both') AIC_fit_2_gaussian_identity$anova AIC = 176.31 for
# PC2_ppt ~ PC2_sst + PC7_sst + PC8_sst + PC10_sst

# BIC (SBC)
BIC_fit_2_gaussian_identity = stepAIC(fit_2_gaussian_identity, direction = "both", 
    k = log(n))

## Start:  AIC=213.8
## PC2_ppt ~ PC1_sst + PC2_sst + PC3_sst + PC4_sst + PC5_sst + PC6_sst + 
##     PC7_sst + PC8_sst + PC9_sst + PC10_sst
## 
##            Df Deviance AIC
## - PC9_sst   1     27.5 209
## - PC5_sst   1     27.5 209
## - PC4_sst   1     27.5 209
## - PC3_sst   1     27.6 210
## - PC1_sst   1     27.9 211
## - PC6_sst   1     28.0 211
## - PC8_sst   1     28.1 212
## - PC7_sst   1     28.2 212
## - PC10_sst  1     28.6 214
## <none>            27.5 214
## - PC2_sst   1     29.1 216
## 
## Step:  AIC=209.1
## PC2_ppt ~ PC1_sst + PC2_sst + PC3_sst + PC4_sst + PC5_sst + PC6_sst + 
##     PC7_sst + PC8_sst + PC10_sst
## 
##            Df Deviance AIC
## - PC5_sst   1     27.5 204
## - PC4_sst   1     27.5 205
## - PC3_sst   1     27.6 205
## - PC1_sst   1     27.9 206
## - PC6_sst   1     28.0 206
## - PC8_sst   1     28.1 207
## - PC7_sst   1     28.2 208
## - PC10_sst  1     28.7 209
## <none>            27.5 209
## - PC2_sst   1     29.1 211
## + PC9_sst   1     27.5 214
## 
## Step:  AIC=204.5
## PC2_ppt ~ PC1_sst + PC2_sst + PC3_sst + PC4_sst + PC6_sst + PC7_sst + 
##     PC8_sst + PC10_sst
## 
##            Df Deviance AIC
## - PC4_sst   1     27.5 200
## - PC3_sst   1     27.6 200
## - PC1_sst   1     27.9 201
## - PC6_sst   1     28.0 202
## - PC8_sst   1     28.1 202
## - PC7_sst   1     28.3 203
## <none>            27.5 204
## - PC10_sst  1     28.7 204
## - PC2_sst   1     29.1 206
## + PC5_sst   1     27.5 209
## + PC9_sst   1     27.5 209
## 
## Step:  AIC=199.9
## PC2_ppt ~ PC1_sst + PC2_sst + PC3_sst + PC6_sst + PC7_sst + PC8_sst + 
##     PC10_sst
## 
##            Df Deviance AIC
## - PC3_sst   1     27.6 196
## - PC1_sst   1     28.0 197
## - PC6_sst   1     28.0 197
## - PC8_sst   1     28.2 198
## - PC7_sst   1     28.3 198
## - PC10_sst  1     28.7 200
## <none>            27.5 200
## - PC2_sst   1     29.1 202
## + PC4_sst   1     27.5 204
## + PC5_sst   1     27.5 205
## + PC9_sst   1     27.5 205
## 
## Step:  AIC=195.5
## PC2_ppt ~ PC1_sst + PC2_sst + PC6_sst + PC7_sst + PC8_sst + PC10_sst
## 
##            Df Deviance AIC
## - PC6_sst   1     28.1 193
## - PC1_sst   1     28.1 193
## - PC8_sst   1     28.3 193
## - PC7_sst   1     28.4 194
## - PC10_sst  1     28.8 195
## <none>            27.6 196
## - PC2_sst   1     29.2 197
## + PC3_sst   1     27.5 200
## + PC4_sst   1     27.6 200
## + PC5_sst   1     27.6 200
## + PC9_sst   1     27.6 200
## 
## Step:  AIC=192.8
## PC2_ppt ~ PC1_sst + PC2_sst + PC7_sst + PC8_sst + PC10_sst
## 
##            Df Deviance AIC
## - PC1_sst   1     28.6 190
## - PC8_sst   1     28.8 191
## - PC7_sst   1     29.0 192
## - PC10_sst  1     29.2 192
## <none>            28.1 193
## - PC2_sst   1     29.6 194
## + PC6_sst   1     27.6 196
## + PC3_sst   1     28.0 197
## + PC4_sst   1     28.1 197
## + PC9_sst   1     28.1 198
## + PC5_sst   1     28.1 198
## 
## Step:  AIC=189.9
## PC2_ppt ~ PC2_sst + PC7_sst + PC8_sst + PC10_sst
## 
##            Df Deviance AIC
## - PC8_sst   1     29.3 188
## - PC7_sst   1     29.7 189
## <none>            28.6 190
## - PC10_sst  1     30.0 190
## - PC2_sst   1     30.2 191
## + PC1_sst   1     28.1 193
## + PC6_sst   1     28.1 193
## + PC3_sst   1     28.4 194
## + PC4_sst   1     28.5 194
## + PC5_sst   1     28.5 194
## + PC9_sst   1     28.6 195
## 
## Step:  AIC=188
## PC2_ppt ~ PC2_sst + PC7_sst + PC10_sst
## 
##            Df Deviance AIC
## - PC7_sst   1     30.4 187
## <none>            29.3 188
## - PC10_sst  1     30.8 189
## - PC2_sst   1     31.0 190
## + PC8_sst   1     28.6 190
## + PC1_sst   1     28.8 191
## + PC6_sst   1     28.8 191
## + PC3_sst   1     29.1 192
## + PC5_sst   1     29.2 192
## + PC4_sst   1     29.3 192
## + PC9_sst   1     29.3 193
## 
## Step:  AIC=187.2
## PC2_ppt ~ PC2_sst + PC10_sst
## 
##            Df Deviance AIC
## <none>            30.3 187
## - PC2_sst   1     31.9 188
## + PC7_sst   1     29.3 188
## - PC10_sst  1     32.0 188
## + PC1_sst   1     29.6 189
## + PC8_sst   1     29.7 189
## + PC6_sst   1     29.8 190
## + PC3_sst   1     30.1 191
## + PC5_sst   1     30.3 192
## + PC4_sst   1     30.3 192
## + PC9_sst   1     30.3 192

# BIC_fit_2_gaussian_identity$anova BIC = 187.19 for PC2_ppt ~ PC2_sst +
# PC10_sst

# use BIC rather than AIC - less predictors needed

# Best model: PC2_ppt ~ PC2_sst + PC10_sst
bestfit_2 = glm(PC2_ppt ~ PC2_sst + PC10_sst, family = gaussian(link = "identity"))

############################################################ 

# part iii

# set 1st 10 PC(SST) as data frame
X = as.data.frame(pcs_sst[, 1:10])

# forecast retained PC1(PPT) from PC(SST) and model
PC1_ppt_pred = predict(bestfit_1, X, se.fit = T)

plot(PC1_ppt, PC1_ppt_pred$fit, main = "PC1(PPT) & Forecast PC1(PPT)")

# add 1:1 line
abline(0, 1)

plot of chunk unnamed-chunk-1


par(ask = TRUE)

# forecast retained PC2(PPT) from PC(SST) and model
PC2_ppt_pred = predict(bestfit_2, X, se.fit = T)

plot(PC2_ppt, PC2_ppt_pred$fit, main = "PC2(PPT) & Forecast PC2(PPT)")

# add 1:1 line
abline(0, 1)

plot of chunk unnamed-chunk-1


par(ask = TRUE)

############################################################ 

# create an ensemble of 100 members PC(PPT) ensemble matrix = 100 members x
# 7 locations
EN = 100

# create a normal distribution based on PC1_ppt_pred mean = 0, sd = standard
# error of PC1_ppt_pred
PC1_ppt_se_norm = rnorm(EN, mean = 0, sd = PC1_ppt_pred$se)

# create a normal distribution based on PC2_ppt_pred mean = 0, sd = standard
# error of PC2_ppt_pred
PC2_ppt_se_norm = rnorm(EN, mean = 0, sd = PC2_ppt_pred$se)

# PC1_ppt_se_norm & PC2_ppt_se_norm are 1st 2 columns of PC(PPT) ensemble
# matrix

############################################################ 

# part iv

# set empty array
PC3to7_sample = c()

# bootstrapping PC3(PPT) to PC7(PPT) -- noise sampling for 100 times
# replacement = true PC3to7_sample: 100 members x 5 locations
for (i in 3:7) {
    PC3to7_sample = cbind(PC3to7_sample, sample(pcs_ppt[, i], EN, replace = T))
}

############################################################ 

# part v

# create a 3 dimensional array for PC(PPT) 111 years x 7 locations x 100
# ensemble members
forecast_PC_ppt_EN = array(0, dim = c(111, 7, EN))

# for 1st to 111th year, 1st to 7th PC 1st ensemble column = forecast
# PC1(PPT) + PC1(PPT) standard error samples 2nd ensemble column = forecast
# PC2(PPT) + PC2(PPT) standard error samples 3rd to 7th ensemble column =
# random PC3(PPT) to PC7(PPT) samples

for (i in 1:111) {
    forecast_PC_ppt_EN[i, 1, ] = PC1_ppt_pred$fit[i] + PC1_ppt_se_norm
    forecast_PC_ppt_EN[i, 2, ] = PC2_ppt_pred$fit[i] + PC2_ppt_se_norm
    forecast_PC_ppt_EN[i, 3:7, ] = PC3to7_sample
}

forecast_ppt_EN_anomaly = array(0, dim = c(111, 7, EN))

# back transform: X = Y %*% E^T
for (i in 1:EN) {
    forecast_ppt_EN_anomaly[, , i] = forecast_PC_ppt_EN[, , i] %*% t(zsvd_ppt$u)
}

# set forecast ppt ensemble matrix
forecast_ppt_EN = array(0, dim = c(111, 7, EN))

# back transform: forecast_ppt_EN = (forecast_ppt_EN_anomaly * ppt_sd) +
# ppt_mean

for (i in 1:111) {
    for (k in 1:EN) {
        for (j in 1:7) {
            meg = forecast_ppt_EN_anomaly[i, j, k] * ppt_sd[j]
            forecast_ppt_EN[i, j, k] = meg + ppt_mean[j]
            j = j + 1
        }
        k = k + 1
    }
    i = i + 1
}

# negative PPT does not make sense setting negative values in
# forecast_ppt_EN to 0

for (i in 1:111) {
    for (k in 1:EN) {
        for (j in 1:7) {
            if (forecast_ppt_EN[i, j, k] < 0) {
                forecast_ppt_EN[i, j, k] = 0
            }
            j = j + 1
        }
        k = k + 1
    }
    i = i + 1
}

############################################################ 

# plot time series of forecast at location 1

# create data matrix: 100 ensemble members x 111 years
forecast_ppt_loc_1 = matrix(0, nrow = EN, ncol = 111)

for (i in 1:EN) {
    forecast_ppt_loc_1[i, ] = forecast_ppt_EN[, 1, i]
    i = i + 1
}

# set data frame
forecast_ppt_loc_1 = data.frame(forecast_ppt_loc_1)

library(graphics)

# boxplot time series
boxplot(forecast_ppt_loc_1, ylim = c(-0.1, 3), main = "Boxplot time series of forecast PPT - location 1: 1900 - 2010; Obs as red line")

points(data_m[, 1], col = "red", pch = 16)
lines(data_m[, 1], col = "red", lwd = 1)

plot of chunk unnamed-chunk-1

par(ask = TRUE)

############################################################ 

# plot time series of forecast at location 2

# create data matrix: 100 ensemble members x 111 years
forecast_ppt_loc_2 = matrix(0, nrow = EN, ncol = 111)

for (i in 1:EN) {
    forecast_ppt_loc_2[i, ] = forecast_ppt_EN[, 2, i]
    i = i + 1
}

# set data frame
forecast_ppt_loc_2 = data.frame(forecast_ppt_loc_2)

# boxplot time series
boxplot(forecast_ppt_loc_2, ylim = c(-0.1, 3.5), main = "Boxplot time series of forecast PPT - location 2: 1900 - 2010; Obs as red line")

points(data_m[, 2], col = "red", pch = 16)
lines(data_m[, 2], col = "red", lwd = 1)

plot of chunk unnamed-chunk-1


par(ask = TRUE)

############################################################ 

# plot time series of forecast at location 3

# create data matrix: 100 ensemble members x 111 years
forecast_ppt_loc_3 = matrix(0, nrow = EN, ncol = 111)

for (i in 1:EN) {
    forecast_ppt_loc_3[i, ] = forecast_ppt_EN[, 3, i]
    i = i + 1
}

# set data frame
forecast_ppt_loc_3 = data.frame(forecast_ppt_loc_3)

# boxplot time series
boxplot(forecast_ppt_loc_3, ylim = c(-0.1, 5), main = "Boxplot time series of forecast PPT - location 3: 1900 - 2010; Obs as red line")

points(data_m[, 3], col = "red", pch = 16)
lines(data_m[, 3], col = "red", lwd = 1)

plot of chunk unnamed-chunk-1


par(ask = TRUE)

############################################################ 

# plot time series of forecast at location 4

# create data matrix: 100 ensemble members x 111 years
forecast_ppt_loc_4 = matrix(0, nrow = EN, ncol = 111)

for (i in 1:EN) {
    forecast_ppt_loc_4[i, ] = forecast_ppt_EN[, 4, i]
    i = i + 1
}

# set data frame
forecast_ppt_loc_4 = data.frame(forecast_ppt_loc_4)

# boxplot time series
boxplot(forecast_ppt_loc_4, ylim = c(-0.1, 6), main = "Boxplot time series of forecast PPT - location 4: 1900 - 2010; Obs as red line")

points(data_m[, 4], col = "red", pch = 16)
lines(data_m[, 4], col = "red", lwd = 1)

plot of chunk unnamed-chunk-1


par(ask = TRUE)

############################################################ 

# plot time series of forecast at location 5

# create data matrix: 100 ensemble members x 111 years
forecast_ppt_loc_5 = matrix(0, nrow = EN, ncol = 111)

for (i in 1:EN) {
    forecast_ppt_loc_5[i, ] = forecast_ppt_EN[, 5, i]
    i = i + 1
}

# set data frame
forecast_ppt_loc_5 = data.frame(forecast_ppt_loc_5)

# boxplot time series
boxplot(forecast_ppt_loc_5, ylim = c(-0.1, 2), main = "Boxplot time series of forecast PPT - location 5: 1900 - 2010; Obs as red line")

points(data_m[, 5], col = "red", pch = 16)
lines(data_m[, 5], col = "red", lwd = 1)

plot of chunk unnamed-chunk-1


par(ask = TRUE)

############################################################ 

# plot time series of forecast at location 6

# create data matrix: 100 ensemble members x 111 years
forecast_ppt_loc_6 = matrix(0, nrow = EN, ncol = 111)

for (i in 1:EN) {
    forecast_ppt_loc_6[i, ] = forecast_ppt_EN[, 6, i]
    i = i + 1
}

# set data frame
forecast_ppt_loc_6 = data.frame(forecast_ppt_loc_6)

# boxplot time series
boxplot(forecast_ppt_loc_6, ylim = c(-0.1, 3.7), main = "Boxplot time series of forecast PPT - location 6: 1900 - 2010; Obs as red line")

points(data_m[, 6], col = "red", pch = 16)
lines(data_m[, 6], col = "red", lwd = 1)

plot of chunk unnamed-chunk-1


par(ask = TRUE)

############################################################ 

# plot time series of forecast at location 7

# create data matrix: 100 ensemble members x 111 years
forecast_ppt_loc_7 = matrix(0, nrow = EN, ncol = 111)

for (i in 1:EN) {
    forecast_ppt_loc_7[i, ] = forecast_ppt_EN[, 7, i]
    i = i + 1
}

# set data frame
forecast_ppt_loc_7 = data.frame(forecast_ppt_loc_7)

# boxplot time series
boxplot(forecast_ppt_loc_7, ylim = c(-0.1, 3), main = "Boxplot time series of forecast PPT - location 7: 1900 - 2010; Obs as red line")

points(data_m[, 7], col = "red", pch = 16)
lines(data_m[, 7], col = "red", lwd = 1)

plot of chunk unnamed-chunk-1


par(ask = TRUE)

############################################################ 

# part vi

# forecast for Winter AZ PPT 2011-2012

# SST

ntime = 1  # Dec 2011 to Mar 2012

ntimep = ntime

nglobe = nrows * ncols

# Lat - Long grid..  create grid below:

ygrid = seq(-87.5, 87.5, by = 5)
ny = length(ygrid)  # = 36

xgrid = seq(27.5, 382.5, by = 5)
xgrid[xgrid > 180] = xgrid[xgrid > 180]
nx = length(xgrid)  # = 72

xygrid = matrix(0, nrow = nx * ny, ncol = 2)

i = 0
for (iy in 1:ny) {
    for (ix in 1:nx) {
        i = i + 1
        xygrid[i, 1] = ygrid[iy]
        xygrid[i, 2] = xgrid[ix]
    }
}

############################################################ 

# Read Kaplan SST data -- already annual averaged

data = readBin("C:\\Users\\jfklee90\\Desktop\\CVEN 6833\\HW 2\\Kaplan-SST-DJFM-2011-2012.r4", 
    what = "numeric", n = (nrows * ncols * ntime), size = 4, endian = "swap")

data <- array(data = data, dim = c(nrows, ncols, ntime))

# read data for the 1st month
data1 = data[, , 1]

# the lat -long data grid..

# all the grid points on the globe
index = 1:(nx * ny)

# only non-missing data, missing data = 1.E30 include only grid points on
# the ocean sort out NaN values
index1 = index[data1 != "NaN"]

# choose only ocean's grid points for xygrid
xygrid1 = xygrid[index1, ]
x1 = xygrid1[, 2]

nsites = length(index1)
data2 = data1[index1]

# SSTdata matrix - rows are months annd columns are locations
sstdata_11_12 = matrix(0, nrow = ntimep, ncol = nsites)

for (i in 1:ntimep) {
    data1 = data[, , i]
    index1 = index[data1 != "NaN"]
    data2 = data1[index1]
    sstdata_11_12[i, ] = data2
}

indexgrid = index1
rm("data")  #remove the object data to clear up space

############################################################ 

# PPT

nlocation = 7  # 7 stations in AZ

# Read Arizona PPT data -- already annual averaged

data_m_11_12 = scan("C:\\Users\\jfklee90\\Desktop\\CVEN 6833\\HW 2\\AZ-cdiv-ppt-DJFM-2011-2012.txt")

############################################################ 

# transform to PC(SST) & predict PC(PPT) with best model

sstanavg_11_12 = sstdata_11_12

# PC(SST_11_12)
pcs_sst_11_12 = sstanavg_11_12 %*% zsvd_sst$u

############################################################ 

# bestfit_1 = glm(PC1_ppt ~ PC2_sst + PC8_sst + PC9_sst, family =
# gaussian(link = 'identity'))

# re-write bestfit_1 to predict PC1(SST_11_12)

PC2 = PC2_sst
PC8 = PC8_sst
PC9 = PC9_sst
xe = cbind(PC2, PC8, PC9)

bestfit_1 = glm(PC1_ppt ~ ., data = as.data.frame(xe), family = gaussian(link = "identity"))

# check with original data X = as.data.frame(pcs_sst[,1:10]) PC1_ppt_pred =
# predict(bestfit_1, X, se.fit = T) plot(PC1_ppt, PC1_ppt_pred$fit, main =
# 'PC1(PPT) & Forecast PC1(PPT)') still give the same plot

PC2 = pcs_sst_11_12[2]
PC8 = pcs_sst_11_12[8]
PC9 = pcs_sst_11_12[9]
xe = cbind(PC2, PC8, PC9)

PC1_ppt_pred_11_12 = predict(bestfit_1, newdata = as.data.frame(xe), se.fit = T)

############################################################ 

# bestfit_2 = glm(PC2_ppt ~ PC2_sst + PC10_sst, family = gaussian(link =
# 'identity'))

# re-write bestfit_2 to predict PC2(SST_11_12)

PC2 = PC2_sst
PC10 = PC10_sst
xe = cbind(PC2, PC10)

bestfit_2 = glm(PC2_ppt ~ ., data = as.data.frame(xe), family = gaussian(link = "identity"))

# check with original data X = as.data.frame(pcs_sst[,1:10]) PC2_ppt_pred =
# predict(bestfit_2, X, se.fit = T) plot(PC2_ppt, PC2_ppt_pred$fit, main =
# 'PC2PPT) & Forecast PC2(PPT)') still give the same plot

PC2 = pcs_sst_11_12[2]
PC10 = pcs_sst_11_12[10]
xe = cbind(PC2, PC10)

PC2_ppt_pred_11_12 = predict(bestfit_2, newdata = as.data.frame(xe), se.fit = T)

############################################################ 

# create a 2 dimensional array for PC_11_12(PPT) (1 year x) 7 locations x
# 100 ensemble members
forecast_PC_ppt_11_12_EN = array(0, dim = c(7, EN))

# 1st to 7th PC 1st ensemble column = forecast PC1_11_12(PPT) + PC1(PPT)
# standard error samples 2nd ensemble column = forecast PC2_11_12(PPT) +
# PC2(PPT) standard error samples 3rd to 7th ensemble column = random
# PC3(PPT) to PC7(PPT) samples

forecast_PC_ppt_11_12_EN[1, ] = PC1_ppt_pred_11_12$fit + PC1_ppt_se_norm
forecast_PC_ppt_11_12_EN[2, ] = PC2_ppt_pred_11_12$fit + PC2_ppt_se_norm
forecast_PC_ppt_11_12_EN[3:7, ] = PC3to7_sample

forecast_ppt_11_12_EN_anomaly = array(0, dim = c(7, EN))

# back transform: X = Y %*% E^T
for (i in 1:EN) {
    forecast_ppt_11_12_EN_anomaly[, i] = forecast_PC_ppt_11_12_EN[, i] %*% t(zsvd_ppt$u)
}

# set forecast ppt ensemble matrix
forecast_ppt_11_12_EN = array(0, dim = c(7, 100))

# back transform: forecast_ppt_11_12_EN = (forecast_ppt_11_12_EN_anomaly *
# ppt_sd) + ppt_mean

for (k in 1:EN) {
    for (j in 1:7) {
        meg = forecast_ppt_11_12_EN_anomaly[j, k] * ppt_sd[j]
        forecast_ppt_11_12_EN[j, k] = meg + ppt_mean[j]
        j = j + 1
    }
    k = k + 1
}

# negative PPT does not make sense setting negative values in
# forecast_ppt_EN to 0

for (k in 1:EN) {
    for (j in 1:7) {
        if (forecast_ppt_11_12_EN[j, k] < 0) {
            forecast_ppt_11_12_EN[j, k] = 0
        }
        j = j + 1
    }
    k = k + 1
}

############################################################ 

# forecast for Winter AZ PPT 2012-2013

# SST

ntime = 1  # Dec 2012 to Mar 2013

ntimep = ntime

nglobe = nrows * ncols

# Lat - Long grid..  create grid below:

ygrid = seq(-87.5, 87.5, by = 5)
ny = length(ygrid)  # = 36

xgrid = seq(27.5, 382.5, by = 5)
xgrid[xgrid > 180] = xgrid[xgrid > 180]
nx = length(xgrid)  # = 72

xygrid = matrix(0, nrow = nx * ny, ncol = 2)

i = 0
for (iy in 1:ny) {
    for (ix in 1:nx) {
        i = i + 1
        xygrid[i, 1] = ygrid[iy]
        xygrid[i, 2] = xgrid[ix]
    }
}

############################################################ 

# Read Kaplan SST data -- already annual averaged

data = readBin("C:\\Users\\jfklee90\\Desktop\\CVEN 6833\\HW 2\\Kaplan-SST-DJFM-2012-2013.r4", 
    what = "numeric", n = (nrows * ncols * ntime), size = 4, endian = "swap")

data <- array(data = data, dim = c(nrows, ncols, ntime))

# read data for the 1st month
data1 = data[, , 1]

# the lat -long data grid..

# all the grid points on the globe
index = 1:(nx * ny)

# only non-missing data, missing data = 1.E30 include only grid points on
# the ocean sort out NaN values
index1 = index[data1 != "NaN"]

# choose only ocean's grid points for xygrid
xygrid1 = xygrid[index1, ]
x1 = xygrid1[, 2]

nsites = length(index1)
data2 = data1[index1]

# SSTdata matrix - rows are months annd columns are locations
sstdata_12_13 = matrix(0, nrow = ntimep, ncol = nsites)

for (i in 1:ntimep) {
    data1 = data[, , i]
    index1 = index[data1 != "NaN"]
    data2 = data1[index1]
    sstdata_12_13[i, ] = data2
}

indexgrid = index1
rm("data")  #remove the object data to clear up space

############################################################ 

# PPT

nlocation = 7  # 7 stations in AZ

# Read Arizona PPT data -- already annual averaged

data_m_12_13 = scan("C:\\Users\\jfklee90\\Desktop\\CVEN 6833\\HW 2\\AZ-cdiv-ppt-DJFM-2012-2013.txt")

############################################################ 

# transform to PC(SST) & predict PC(PPT) with best model

sstanavg_12_13 = sstdata_12_13

# PC(SST_12_13)
pcs_sst_12_13 = sstanavg_12_13 %*% zsvd_sst$u

############################################################ 

# bestfit_1 = glm(PC1_ppt ~ PC2_sst + PC8_sst + PC9_sst, family =
# gaussian(link = 'identity'))

# re-write bestfit_1 to predict PC1(SST_12_13)

PC2 = PC2_sst
PC8 = PC8_sst
PC9 = PC9_sst
xe = cbind(PC2, PC8, PC9)

bestfit_1 = glm(PC1_ppt ~ ., data = as.data.frame(xe), family = gaussian(link = "identity"))

PC2 = pcs_sst_12_13[2]
PC8 = pcs_sst_12_13[8]
PC9 = pcs_sst_12_13[9]
xe = cbind(PC2, PC8, PC9)

PC1_ppt_pred_12_13 = predict(bestfit_1, newdata = as.data.frame(xe), se.fit = T)

############################################################ 

# bestfit_2 = glm(PC2_ppt ~ PC2_sst + PC10_sst, family = gaussian(link =
# 'identity'))

# re-write bestfit_2 to predict PC2(SST_12_13)

PC2 = PC2_sst
PC10 = PC10_sst
xe = cbind(PC2, PC10)

bestfit_2 = glm(PC2_ppt ~ ., data = as.data.frame(xe), family = gaussian(link = "identity"))

PC2 = pcs_sst_12_13[2]
PC10 = pcs_sst_12_13[10]
xe = cbind(PC2, PC10)

PC2_ppt_pred_12_13 = predict(bestfit_2, newdata = as.data.frame(xe), se.fit = T)

############################################################ 

# create a 2 dimensional array for PC_12_13(PPT) (1 year x) 7 locations x
# 100 ensemble members
forecast_PC_ppt_12_13_EN = array(0, dim = c(7, EN))

# 1st to 7th PC 1st ensemble column = forecast PC1_12_13(PPT) + PC1(PPT)
# standard error samples 2nd ensemble column = forecast PC2_12_13(PPT) +
# PC2(PPT) standard error samples 3rd to 7th ensemble column = random
# PC3(PPT) to PC7(PPT) samples

forecast_PC_ppt_12_13_EN[1, ] = PC1_ppt_pred_12_13$fit + PC1_ppt_se_norm
forecast_PC_ppt_12_13_EN[2, ] = PC2_ppt_pred_12_13$fit + PC2_ppt_se_norm
forecast_PC_ppt_12_13_EN[3:7, ] = PC3to7_sample

forecast_ppt_12_13_EN_anomaly = array(0, dim = c(7, EN))

# back transform: X = Y %*% E^T
for (i in 1:EN) {
    forecast_ppt_12_13_EN_anomaly[, i] = forecast_PC_ppt_12_13_EN[, i] %*% t(zsvd_ppt$u)
}

# set forecast ppt ensemble matrix
forecast_ppt_12_13_EN = array(0, dim = c(7, 100))

# back transform: forecast_ppt_12_13_EN = (forecast_ppt_12_13_EN_anomaly *
# ppt_sd) + ppt_mean

for (k in 1:EN) {
    for (j in 1:7) {
        meg = forecast_ppt_12_13_EN_anomaly[j, k] * ppt_sd[j]
        forecast_ppt_12_13_EN[j, k] = meg + ppt_mean[j]
        j = j + 1
    }
    k = k + 1
}

# negative PPT does not make sense setting negative values in
# forecast_ppt_EN to 0

for (k in 1:EN) {
    for (j in 1:7) {
        if (forecast_ppt_12_13_EN[j, k] < 0) {
            forecast_ppt_12_13_EN[j, k] = 0
        }
        j = j + 1
    }
    k = k + 1
}

############################################################ 

# part vii

# plot boxplots for all 7 locations for winter 2011-12

# set data frame
forecast_ppt_11_12_EN = data.frame(t(forecast_ppt_11_12_EN))

# boxplot time series
boxplot(forecast_ppt_11_12_EN, ylim = c(-0.1, 3), main = "Boxplot of forecast PPT for all 7 locations - 2011-2012; Obs as red dots")

points(data_m_11_12, col = "red", pch = 16)

plot of chunk unnamed-chunk-1


par(ask = TRUE)

############################################################ 

# plot boxplots for all 7 locations for winter 2012-13

# set data frame
forecast_ppt_12_13_EN = data.frame(t(forecast_ppt_12_13_EN))

# boxplot time series
boxplot(forecast_ppt_12_13_EN, ylim = c(-0.1, 3), main = "Boxplot of forecast PPT for all 7 locations - 2012-2013; Obs as red dots")

points(data_m_12_13, col = "red", pch = 16)

plot of chunk unnamed-chunk-1


par(ask = TRUE)

############################################################