# HW 2 Q 4 JLCY, 9 Nov 2013
############################################################
# part i
# SST
nrows = 72
ncols = 36
ntime = 111 # Dec 1900 to Mar 1901 - Dec 2010 to Mar 2011
ntimep = ntime
nglobe = nrows * ncols
# Lat - Long grid.. create grid below:
ygrid = seq(-87.5, 87.5, by = 5)
ny = length(ygrid) # = 36
xgrid = seq(27.5, 382.5, by = 5)
xgrid[xgrid > 180] = xgrid[xgrid > 180]
nx = length(xgrid) # = 72
xygrid = matrix(0, nrow = nx * ny, ncol = 2)
i = 0
for (iy in 1:ny) {
for (ix in 1:nx) {
i = i + 1
xygrid[i, 1] = ygrid[iy]
xygrid[i, 2] = xgrid[ix]
}
}
############################################################
# Read Kaplan SST data -- already annual averaged
data = readBin("C:\\Users\\jfklee90\\Desktop\\CVEN 6833\\HW 2\\Kaplan-SST-DJFM-1900-2011.r4",
what = "numeric", n = (nrows * ncols * ntime), size = 4, endian = "swap")
data <- array(data = data, dim = c(nrows, ncols, ntime))
# read data for the 1st month
data1 = data[, , 1]
# the lat -long data grid..
# all the grid points on the globe
index = 1:(nx * ny)
# only non-missing data, missing data = 1.E30 include only grid points on
# the ocean sort out NaN values
index1 = index[data1 != "NaN"]
# choose only ocean's grid points for xygrid
xygrid1 = xygrid[index1, ]
x1 = xygrid1[, 2]
nsites = length(index1)
data2 = data1[index1]
# SSTdata matrix - rows are months annd columns are locations
sstdata = matrix(0, nrow = ntimep, ncol = nsites)
for (i in 1:ntimep) {
data1 = data[, , i]
index1 = index[data1 != "NaN"]
data2 = data1[index1]
sstdata[i, ] = data2
}
indexgrid = index1
rm("data") #remove the object data to clear up space
############################################################
# PCA
sstanavg = sstdata
# get variance matrix..
zs_sst = var(scale(sstanavg))
# do an Eigen(Singular Value) decomposition..
zsvd_sst = svd(zs_sst)
# Principal Components...
pcs_sst = t(t(zsvd_sst$u) %*% t(sstanavg))
# Eigen Values.. - fraction variance
lambdas = (zsvd_sst$d/sum(zsvd_sst$d))
plot(1:25, lambdas[1:25], type = "l", xlab = "Modes", ylab = "Frac. Var. explained",
main = "Egien Variance Spectrum - Winter SST 1900-2010")
points(1:25, lambdas[1:25], col = "red")
par(ask = TRUE)
############################################################
# PPT
ntime = 111 # Dec 1900 to Mar 1901 - Dec 2010 to Mar 2011
ntimep = ntime
nlocation = 7 # 7 stations in AZ
# Lat - Long grid.. create grid below:
# read coord online
# locs=matrix(scan('http://civil.colorado.edu/~balajir/CVEN6833/R-sessions/session2/files-4HW2/WesternUS-coords.txt'),
# ncol=5, byrow=T)
# read coord from pc
locs = matrix(scan("C:\\Users\\jfklee90\\Desktop\\CVEN 6833\\HW 2\\WesternUS-coords.txt"),
ncol = 5, byrow = T)
xlat = locs[, 3]
ylon = locs[, 4]
xygrid = matrix(0, nrow = 81, ncol = 2)
xygrid[, 1] = xlat
xygrid[, 2] = ylon
############################################################
# Read Arizona PPT data -- already annual averaged
data = scan("C:\\Users\\jfklee90\\Desktop\\CVEN 6833\\HW 2\\AZ-cdiv-ppt-DJFM-1900-2011.txt")
data_m <- array(dim = c(ntime, nlocation))
# set data matrix: 111 years x 7 locations
i = 1
j = 1
for (i in 1:7) {
data_m[, i] = data[j:(j + 110)]
i = i + 1
j = j + 111
}
# column mean & standard deviation -- for each of the 7 locations
ppt_mean = apply(data_m, 2, mean)
ppt_sd = apply(data_m, 2, sd)
############################################################
# PCA
pptanavg_m = data_m
pptanavg = scale(pptanavg_m)
# get variance matrix..
zs_ppt = var(pptanavg)
# do an Eigen(Singular Value) decomposition..
zsvd_ppt = svd(zs_ppt)
# Principal Components...
pcs_ppt = t(t(zsvd_ppt$u) %*% t(pptanavg)) # = pptanavg %*% zsvd_ppt$u
# therefore, pptanavg = pcs_ppt %*% t(zsvd_ppt$u)
# Eigen Values.. - fraction variance
lambdas = (zsvd_ppt$d/sum(zsvd_ppt$d))
plot(1:7, lambdas[1:7], type = "l", xlab = "Modes", ylab = "Frac. Var. explained",
main = "Egien Variance Spectrum - Winter AZ PPT 1900-2010")
points(1:7, lambdas[1:7], col = "red")
par(ask = TRUE)
############################################################
# pick first 3 PCs
############################################################
# part ii
# Email on 4 Nov 1. Retain more than 3 PCS for SST, say first 10 2. Retain
# significant PCs of precip (PC1 explains a lot of variance and the rest is
# noise..) --> Retain PC1(PPT) & PC2(PPT) 3. Fit a best model bet PC1 of
# precip and the SST PCs 4. resample the remaining PCs as they are noise --
# take the mean 5. backtransform
############################################################
# define PC(SST) as predictors
PC1_sst = pcs_sst[, 1]
PC2_sst = pcs_sst[, 2]
PC3_sst = pcs_sst[, 3]
PC4_sst = pcs_sst[, 4]
PC5_sst = pcs_sst[, 5]
PC6_sst = pcs_sst[, 6]
PC7_sst = pcs_sst[, 7]
PC8_sst = pcs_sst[, 8]
PC9_sst = pcs_sst[, 9]
PC10_sst = pcs_sst[, 10]
# define PC(PPT) as preditand
PC1_ppt = pcs_ppt[, 1]
PC2_ppt = pcs_ppt[, 2]
PC3_ppt = pcs_ppt[, 3]
PC4_ppt = pcs_ppt[, 4]
PC5_ppt = pcs_ppt[, 5]
PC6_ppt = pcs_ppt[, 6]
PC7_ppt = pcs_ppt[, 7]
# PC3 to PC7 of PPT are noise -- not fitting model to them
############################################################
# fitting PC1_ppt
# fitting with gaussian distribution, link = identity = default glm
fit_1_gaussian_identity = glm(PC1_ppt ~ PC1_sst + PC2_sst + PC3_sst + PC4_sst +
PC5_sst + PC6_sst + PC7_sst + PC8_sst + PC9_sst + PC10_sst, family = gaussian(link = "identity"))
# ss = summary(fit_1_gaussian_identity) print(ss) AIC = 504.75
# GCV, model parameters = m = 3
m = 3
N = 111
MSE = (sum((fit_1_gaussian_identity$fit - PC1_ppt)^2))/N
GCV_fit_1_gaussian_identity = MSE/((1 - m/N)^2) # = 0.2908073
cat("GCV_fit_1_gaussian_identity = ", GCV_fit_1_gaussian_identity, "\n", "\n")
## GCV_fit_1_gaussian_identity = 4.702
##
library(MPV)
##
## Attaching package: 'MPV'
##
## The following object is masked from 'package:datasets':
##
## stackloss
# PRESS
press_fit_1_gaussian_identity = PRESS(fit_1_gaussian_identity) # = 599.1912
cat("PRESS_fit_1_gaussian_identity = ", press_fit_1_gaussian_identity, "\n",
"\n")
## PRESS_fit_1_gaussian_identity = 599.2
##
############################################################
# binomial, gamma, poisson, inverse.gaussian are not applicable to PC(PPT)
# fit_1_gaussian_identity has low AIC, GCV and PRESS values
# fit_1_gaussian_identity is a good fit
############################################################
# Stepwise Regression and display results
library(MASS)
## Warning: package 'MASS' was built under R version 3.0.2
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:MPV':
##
## cement
# AIC AIC_fit_1_gaussian_identity = stepAIC(fit_1_gaussian_identity,
# direction='both') AIC_fit_1_gaussian_identity$anova AIC = 500.04 for
# PC1_ppt ~ PC1_sst + PC2_sst + PC3_sst + PC4_sst + PC8_sst + PC9_sst
n = length(PC1_ppt)
# BIC (SBC)
BIC_fit_1_gaussian_identity = stepAIC(fit_1_gaussian_identity, direction = "both",
k = log(n))
## Start: AIC=534.5
## PC1_ppt ~ PC1_sst + PC2_sst + PC3_sst + PC4_sst + PC5_sst + PC6_sst +
## PC7_sst + PC8_sst + PC9_sst + PC10_sst
##
## Df Deviance AIC
## - PC10_sst 1 494 530
## - PC5_sst 1 495 530
## - PC6_sst 1 500 531
## - PC7_sst 1 501 531
## - PC1_sst 1 503 532
## - PC3_sst 1 505 532
## - PC4_sst 1 507 533
## <none> 494 535
## - PC9_sst 1 519 535
## - PC8_sst 1 530 538
## - PC2_sst 1 584 548
##
## Step: AIC=529.8
## PC1_ppt ~ PC1_sst + PC2_sst + PC3_sst + PC4_sst + PC5_sst + PC6_sst +
## PC7_sst + PC8_sst + PC9_sst
##
## Df Deviance AIC
## - PC5_sst 1 495 525
## - PC6_sst 1 500 526
## - PC7_sst 1 501 527
## - PC1_sst 1 503 527
## - PC3_sst 1 505 528
## - PC4_sst 1 507 528
## <none> 494 530
## - PC9_sst 1 519 531
## - PC8_sst 1 530 533
## + PC10_sst 1 494 535
## - PC2_sst 1 585 544
##
## Step: AIC=525.3
## PC1_ppt ~ PC1_sst + PC2_sst + PC3_sst + PC4_sst + PC6_sst + PC7_sst +
## PC8_sst + PC9_sst
##
## Df Deviance AIC
## - PC6_sst 1 501 522
## - PC7_sst 1 502 522
## - PC1_sst 1 504 523
## - PC3_sst 1 505 523
## - PC4_sst 1 507 523
## <none> 495 525
## - PC9_sst 1 520 526
## - PC8_sst 1 530 528
## + PC5_sst 1 494 530
## + PC10_sst 1 495 530
## - PC2_sst 1 586 539
##
## Step: AIC=521.9
## PC1_ppt ~ PC1_sst + PC2_sst + PC3_sst + PC4_sst + PC7_sst + PC8_sst +
## PC9_sst
##
## Df Deviance AIC
## - PC7_sst 1 509 519
## - PC1_sst 1 510 519
## - PC3_sst 1 511 519
## - PC4_sst 1 512 520
## <none> 501 522
## - PC9_sst 1 524 522
## - PC8_sst 1 535 525
## + PC6_sst 1 495 525
## + PC5_sst 1 500 526
## + PC10_sst 1 501 527
## - PC2_sst 1 589 535
##
## Step: AIC=519
## PC1_ppt ~ PC1_sst + PC2_sst + PC3_sst + PC4_sst + PC8_sst + PC9_sst
##
## Df Deviance AIC
## - PC4_sst 1 519 516
## - PC3_sst 1 520 517
## - PC1_sst 1 522 517
## <none> 509 519
## - PC9_sst 1 533 519
## + PC7_sst 1 501 522
## - PC8_sst 1 545 522
## + PC6_sst 1 502 522
## + PC5_sst 1 508 524
## + PC10_sst 1 509 524
## - PC2_sst 1 593 531
##
## Step: AIC=516.5
## PC1_ppt ~ PC1_sst + PC2_sst + PC3_sst + PC8_sst + PC9_sst
##
## Df Deviance AIC
## - PC3_sst 1 528 514
## - PC1_sst 1 535 515
## <none> 519 516
## - PC9_sst 1 543 517
## + PC4_sst 1 509 519
## - PC8_sst 1 556 519
## + PC7_sst 1 512 520
## + PC6_sst 1 514 520
## + PC5_sst 1 518 521
## + PC10_sst 1 519 521
## - PC2_sst 1 600 528
##
## Step: AIC=513.7
## PC1_ppt ~ PC1_sst + PC2_sst + PC8_sst + PC9_sst
##
## Df Deviance AIC
## - PC1_sst 1 548 513
## <none> 528 514
## - PC9_sst 1 553 514
## - PC8_sst 1 563 516
## + PC3_sst 1 519 516
## + PC4_sst 1 520 517
## + PC7_sst 1 520 517
## + PC6_sst 1 523 517
## + PC5_sst 1 528 518
## + PC10_sst 1 528 518
## - PC2_sst 1 606 524
##
## Step: AIC=513
## PC1_ppt ~ PC2_sst + PC8_sst + PC9_sst
##
## Df Deviance AIC
## <none> 548 513
## + PC1_sst 1 528 514
## - PC8_sst 1 580 515
## - PC9_sst 1 582 515
## + PC3_sst 1 535 515
## + PC7_sst 1 535 515
## + PC4_sst 1 537 515
## + PC6_sst 1 543 517
## + PC5_sst 1 547 517
## + PC10_sst 1 547 518
## - PC2_sst 1 634 525
# BIC_fit_1_gaussian_identity$anova BIC = 513.02 for PC1_ppt ~ PC2_sst +
# PC8_sst + PC9_sst
# use BIC rather than AIC - less predictors needed
# Best model: PC1_ppt ~ PC2_sst + PC8_sst + PC9_sst
bestfit_1 = glm(PC1_ppt ~ PC2_sst + PC8_sst + PC9_sst, family = gaussian(link = "identity"))
############################################################
# fitting PC2_ppt
# fitting with gaussian distribution, link = identity = default glm
fit_2_gaussian_identity = glm(PC2_ppt ~ PC1_sst + PC2_sst + PC3_sst + PC4_sst +
PC5_sst + PC6_sst + PC7_sst + PC8_sst + PC9_sst + PC10_sst, family = gaussian(link = "identity"))
# ss = summary(fit_2_gaussian_identity) print(ss) AIC = 184.03
# GCV, model parameters = m = 3
m = 3
MSE = (sum((fit_2_gaussian_identity$fit - PC2_ppt)^2))/N
GCV_fit_2_gaussian_identity = MSE/((1 - m/N)^2) # = 0.01617222
cat("GCV_fit_2_gaussian_identity = ", GCV_fit_2_gaussian_identity, "\n", "\n")
## GCV_fit_2_gaussian_identity = 0.2615
##
# PRES
press_fit_2_gaussian_identity = PRESS(fit_2_gaussian_identity) # = 34.34974
cat("PRESS_fit_2_gaussian_identity = ", press_fit_2_gaussian_identity, "\n",
"\n")
## PRESS_fit_2_gaussian_identity = 34.35
##
############################################################
# binomial, gamma, poisson, inverse.gaussian are not applicable to PC(PPT)
# fit_2_gaussian_identity has low AIC, GCV and PRESS values
# fit_2_gaussian_identity is a good fit
############################################################
# Stepwise Regression and display results
library(MASS)
# AIC AIC_fit_2_gaussian_identity = stepAIC(fit_2_gaussian_identity,
# direction='both') AIC_fit_2_gaussian_identity$anova AIC = 176.31 for
# PC2_ppt ~ PC2_sst + PC7_sst + PC8_sst + PC10_sst
# BIC (SBC)
BIC_fit_2_gaussian_identity = stepAIC(fit_2_gaussian_identity, direction = "both",
k = log(n))
## Start: AIC=213.8
## PC2_ppt ~ PC1_sst + PC2_sst + PC3_sst + PC4_sst + PC5_sst + PC6_sst +
## PC7_sst + PC8_sst + PC9_sst + PC10_sst
##
## Df Deviance AIC
## - PC9_sst 1 27.5 209
## - PC5_sst 1 27.5 209
## - PC4_sst 1 27.5 209
## - PC3_sst 1 27.6 210
## - PC1_sst 1 27.9 211
## - PC6_sst 1 28.0 211
## - PC8_sst 1 28.1 212
## - PC7_sst 1 28.2 212
## - PC10_sst 1 28.6 214
## <none> 27.5 214
## - PC2_sst 1 29.1 216
##
## Step: AIC=209.1
## PC2_ppt ~ PC1_sst + PC2_sst + PC3_sst + PC4_sst + PC5_sst + PC6_sst +
## PC7_sst + PC8_sst + PC10_sst
##
## Df Deviance AIC
## - PC5_sst 1 27.5 204
## - PC4_sst 1 27.5 205
## - PC3_sst 1 27.6 205
## - PC1_sst 1 27.9 206
## - PC6_sst 1 28.0 206
## - PC8_sst 1 28.1 207
## - PC7_sst 1 28.2 208
## - PC10_sst 1 28.7 209
## <none> 27.5 209
## - PC2_sst 1 29.1 211
## + PC9_sst 1 27.5 214
##
## Step: AIC=204.5
## PC2_ppt ~ PC1_sst + PC2_sst + PC3_sst + PC4_sst + PC6_sst + PC7_sst +
## PC8_sst + PC10_sst
##
## Df Deviance AIC
## - PC4_sst 1 27.5 200
## - PC3_sst 1 27.6 200
## - PC1_sst 1 27.9 201
## - PC6_sst 1 28.0 202
## - PC8_sst 1 28.1 202
## - PC7_sst 1 28.3 203
## <none> 27.5 204
## - PC10_sst 1 28.7 204
## - PC2_sst 1 29.1 206
## + PC5_sst 1 27.5 209
## + PC9_sst 1 27.5 209
##
## Step: AIC=199.9
## PC2_ppt ~ PC1_sst + PC2_sst + PC3_sst + PC6_sst + PC7_sst + PC8_sst +
## PC10_sst
##
## Df Deviance AIC
## - PC3_sst 1 27.6 196
## - PC1_sst 1 28.0 197
## - PC6_sst 1 28.0 197
## - PC8_sst 1 28.2 198
## - PC7_sst 1 28.3 198
## - PC10_sst 1 28.7 200
## <none> 27.5 200
## - PC2_sst 1 29.1 202
## + PC4_sst 1 27.5 204
## + PC5_sst 1 27.5 205
## + PC9_sst 1 27.5 205
##
## Step: AIC=195.5
## PC2_ppt ~ PC1_sst + PC2_sst + PC6_sst + PC7_sst + PC8_sst + PC10_sst
##
## Df Deviance AIC
## - PC6_sst 1 28.1 193
## - PC1_sst 1 28.1 193
## - PC8_sst 1 28.3 193
## - PC7_sst 1 28.4 194
## - PC10_sst 1 28.8 195
## <none> 27.6 196
## - PC2_sst 1 29.2 197
## + PC3_sst 1 27.5 200
## + PC4_sst 1 27.6 200
## + PC5_sst 1 27.6 200
## + PC9_sst 1 27.6 200
##
## Step: AIC=192.8
## PC2_ppt ~ PC1_sst + PC2_sst + PC7_sst + PC8_sst + PC10_sst
##
## Df Deviance AIC
## - PC1_sst 1 28.6 190
## - PC8_sst 1 28.8 191
## - PC7_sst 1 29.0 192
## - PC10_sst 1 29.2 192
## <none> 28.1 193
## - PC2_sst 1 29.6 194
## + PC6_sst 1 27.6 196
## + PC3_sst 1 28.0 197
## + PC4_sst 1 28.1 197
## + PC9_sst 1 28.1 198
## + PC5_sst 1 28.1 198
##
## Step: AIC=189.9
## PC2_ppt ~ PC2_sst + PC7_sst + PC8_sst + PC10_sst
##
## Df Deviance AIC
## - PC8_sst 1 29.3 188
## - PC7_sst 1 29.7 189
## <none> 28.6 190
## - PC10_sst 1 30.0 190
## - PC2_sst 1 30.2 191
## + PC1_sst 1 28.1 193
## + PC6_sst 1 28.1 193
## + PC3_sst 1 28.4 194
## + PC4_sst 1 28.5 194
## + PC5_sst 1 28.5 194
## + PC9_sst 1 28.6 195
##
## Step: AIC=188
## PC2_ppt ~ PC2_sst + PC7_sst + PC10_sst
##
## Df Deviance AIC
## - PC7_sst 1 30.4 187
## <none> 29.3 188
## - PC10_sst 1 30.8 189
## - PC2_sst 1 31.0 190
## + PC8_sst 1 28.6 190
## + PC1_sst 1 28.8 191
## + PC6_sst 1 28.8 191
## + PC3_sst 1 29.1 192
## + PC5_sst 1 29.2 192
## + PC4_sst 1 29.3 192
## + PC9_sst 1 29.3 193
##
## Step: AIC=187.2
## PC2_ppt ~ PC2_sst + PC10_sst
##
## Df Deviance AIC
## <none> 30.3 187
## - PC2_sst 1 31.9 188
## + PC7_sst 1 29.3 188
## - PC10_sst 1 32.0 188
## + PC1_sst 1 29.6 189
## + PC8_sst 1 29.7 189
## + PC6_sst 1 29.8 190
## + PC3_sst 1 30.1 191
## + PC5_sst 1 30.3 192
## + PC4_sst 1 30.3 192
## + PC9_sst 1 30.3 192
# BIC_fit_2_gaussian_identity$anova BIC = 187.19 for PC2_ppt ~ PC2_sst +
# PC10_sst
# use BIC rather than AIC - less predictors needed
# Best model: PC2_ppt ~ PC2_sst + PC10_sst
bestfit_2 = glm(PC2_ppt ~ PC2_sst + PC10_sst, family = gaussian(link = "identity"))
############################################################
# part iii
# set 1st 10 PC(SST) as data frame
X = as.data.frame(pcs_sst[, 1:10])
# forecast retained PC1(PPT) from PC(SST) and model
PC1_ppt_pred = predict(bestfit_1, X, se.fit = T)
# plot PC1 & forecast PC1
plot(PC1_ppt, PC1_ppt_pred$fit, main = "PC1(PPT) & Forecast PC1(PPT)")
# add 1:1 line
abline(0, 1)
par(ask = TRUE)
# forecast retained PC2(PPT) from PC(SST) and model
PC2_ppt_pred = predict(bestfit_2, X, se.fit = T)
# plot PC2 & forecast PC2
plot(PC2_ppt, PC2_ppt_pred$fit, main = "PC2(PPT) & Forecast PC2(PPT)")
# add 1:1 line
abline(0, 1)
par(ask = TRUE)
############################################################
# part iv
# take the mean of the other PCs -- PC3(PPT) to PC7(PPT)
PC3_7_mean = apply(pcs_ppt[, 3:7], 2, mean)
PC3_7_mean_m = matrix(0, nrow = 111, ncol = 5)
# set all the 111 values of each PC3(PPT) to PC7(PPT) as the column mean
for (i in 1:111) {
PC3_7_mean_m[i, ] = PC3_7_mean
i = i + 1
}
############################################################
# part v
# create forecast PC(PPT) matrix: 111 years x 7 locations forecast PC1 & PC2
# are model forecasts forecast PC3 to PC7 are the mean of 7 locations
forecast_PC_ppt_m = matrix(0, nrow = 111, ncol = 7)
forecast_PC_ppt_m[, 1] = PC1_ppt_pred$fit
forecast_PC_ppt_m[, 2] = PC2_ppt_pred$fit
forecast_PC_ppt_m[, 3:7] = PC3_7_mean_m
# back transform: X = Y %*% E^T
forecast_ppt_anomaly = forecast_PC_ppt_m %*% t(zsvd_ppt$u)
# plot pptanavg & forecast pptanavg
plot(pptanavg, forecast_ppt_anomaly, main = "AZ scaled pptanavg & forecast pptanavg - 1900-2010")
# add 1:1 line
abline(0, 1)
par(ask = TRUE)
# set forecast ppt matrix
forecast_ppt = matrix(0, nrow = 111, ncol = 7)
# back transform: forecast_ppt = (forecast_ppt_anomaly * ppt_sd) + ppt_mean
for (i in 1:111) {
for (j in 1:7) {
meg = forecast_ppt_anomaly[i, j] * ppt_sd[j]
forecast_ppt[i, j] = meg + ppt_mean[j]
j = j + 1
}
i = i + 1
}
############################################################
# part vi
# plot observed ppt & forecast ppt for each location find out R^2 & RMSE
# location 1
plot(data_m[, 1], forecast_ppt[, 1], main = "Observed PPT & Forecast PPT - Location 1")
# add 1:1 line
abline(0, 1)
par(ask = TRUE)
# create ppt_mean matrix with same mean for each column
ppt_mean_m = matrix(0, nrow = 111, ncol = 7)
for (i in 1:111) {
ppt_mean_m[i, ] = ppt_mean
i = i + 1
}
# find out R^2 & RMSE SSR_1 = sum((forecast_ppt[,1] - ppt_mean_m[,1])^2)
# SST_1 = sum((data_m[,1] - ppt_mean_m[,1])^2) R2_1 = SSR_1 / SST_1 # for
# fitting
R2_1 = (cor(data_m[, 1], forecast_ppt[, 1]))^2 # for prediction
cat("R^2 at location 1 = ", R2_1, "\n", "\n")
## R^2 at location 1 = 0.1812
##
RMSE_1 = sqrt((sum((forecast_ppt[, 1] - data_m[, 1])^2))/N)
cat("RMSE at location 1 = ", RMSE_1, "\n", "\n")
## RMSE at location 1 = 0.5128
##
############################################################
# location 2
plot(data_m[, 2], forecast_ppt[, 2], main = "Observed PPT & Forecast PPT - Location 2")
abline(0, 1)
par(ask = TRUE)
R2_2 = (cor(data_m[, 2], forecast_ppt[, 2]))^2
cat("R^2 at location 2 = ", R2_2, "\n", "\n")
## R^2 at location 2 = 0.1607
##
RMSE_2 = sqrt((sum((forecast_ppt[, 2] - data_m[, 2])^2))/N)
cat("RMSE at location 2 = ", RMSE_2, "\n", "\n")
## RMSE at location 2 = 0.5339
##
############################################################
# location 3
plot(data_m[, 3], forecast_ppt[, 3], main = "Observed PPT & Forecast PPT - Location 3")
abline(0, 1)
par(ask = TRUE)
R2_3 = (cor(data_m[, 3], forecast_ppt[, 3]))^2
cat("R^2 at location 3 = ", R2_3, "\n", "\n")
## R^2 at location 3 = 0.1971
##
RMSE_3 = sqrt((sum((forecast_ppt[, 3] - data_m[, 3])^2))/N)
cat("RMSE at location 3 = ", RMSE_3, "\n", "\n")
## RMSE at location 3 = 0.7727
##
############################################################
# location 4
plot(data_m[, 4], forecast_ppt[, 4], main = "Observed PPT & Forecast PPT - Location 4")
abline(0, 1)
par(ask = TRUE)
R2_4 = (cor(data_m[, 4], forecast_ppt[, 4]))^2
cat("R^2 at location 4 = ", R2_4, "\n", "\n")
## R^2 at location 4 = 0.1927
##
RMSE_4 = sqrt((sum((forecast_ppt[, 4] - data_m[, 4])^2))/N)
cat("RMSE at location 4 = ", RMSE_4, "\n", "\n")
## RMSE at location 4 = 0.9504
##
############################################################
# location 5
plot(data_m[, 5], forecast_ppt[, 5], main = "Observed PPT & Forecast PPT - Location 5")
abline(0, 1)
par(ask = TRUE)
R2_5 = (cor(data_m[, 5], forecast_ppt[, 5]))^2
cat("R^2 at location 5 = ", R2_5, "\n", "\n")
## R^2 at location 5 = 0.1979
##
RMSE_5 = sqrt((sum((forecast_ppt[, 5] - data_m[, 5])^2))/N)
cat("RMSE at location 5 = ", RMSE_5, "\n", "\n")
## RMSE at location 5 = 0.3355
##
############################################################
# locaiton 6
plot(data_m[, 6], forecast_ppt[, 6], main = "Observed PPT & Forecast PPT - Location 6")
abline(0, 1)
par(ask = TRUE)
R2_6 = (cor(data_m[, 6], forecast_ppt[, 6]))^2
cat("R^2 at location 6 = ", R2_6, "\n", "\n")
## R^2 at location 6 = 0.1985
##
RMSE_6 = sqrt((sum((forecast_ppt[, 6] - data_m[, 6])^2))/N)
cat("RMSE at location 6 = ", RMSE_6, "\n", "\n")
## RMSE at location 6 = 0.5722
##
############################################################
# location 7
plot(data_m[, 7], forecast_ppt[, 7], main = "Observed PPT & Forecast PPT - Location 7")
abline(0, 1)
par(ask = TRUE)
R2_7 = (cor(data_m[, 7], forecast_ppt[, 7]))^2
cat("R^2 at location 7 = ", R2_7, "\n", "\n")
## R^2 at location 7 = 0.2547
##
RMSE_7 = sqrt((sum((forecast_ppt[, 7] - data_m[, 7])^2))/N)
cat("RMSE at location 7 = ", RMSE_7, "\n", "\n")
## RMSE at location 7 = 0.4829
##
############################################################
# part vii
# forecast for Winter AZ PPT 2011-2012
# SST
ntime = 1 # Dec 2011 to Mar 2012
ntimep = ntime
nglobe = nrows * ncols
# Lat - Long grid.. create grid below:
ygrid = seq(-87.5, 87.5, by = 5)
ny = length(ygrid) # = 36
xgrid = seq(27.5, 382.5, by = 5)
xgrid[xgrid > 180] = xgrid[xgrid > 180]
nx = length(xgrid) # = 72
xygrid = matrix(0, nrow = nx * ny, ncol = 2)
i = 0
for (iy in 1:ny) {
for (ix in 1:nx) {
i = i + 1
xygrid[i, 1] = ygrid[iy]
xygrid[i, 2] = xgrid[ix]
}
}
############################################################
# Read Kaplan SST data -- already annual averaged
data = readBin("C:\\Users\\jfklee90\\Desktop\\CVEN 6833\\HW 2\\Kaplan-SST-DJFM-2011-2012.r4",
what = "numeric", n = (nrows * ncols * ntime), size = 4, endian = "swap")
data <- array(data = data, dim = c(nrows, ncols, ntime))
# read data for the 1st month
data1 = data[, , 1]
# the lat -long data grid..
# all the grid points on the globe
index = 1:(nx * ny)
# only non-missing data, missing data = 1.E30 include only grid points on
# the ocean sort out NaN values
index1 = index[data1 != "NaN"]
# choose only ocean's grid points for xygrid
xygrid1 = xygrid[index1, ]
x1 = xygrid1[, 2]
nsites = length(index1)
data2 = data1[index1]
# SSTdata matrix - rows are months annd columns are locations
sstdata_11_12 = matrix(0, nrow = ntimep, ncol = nsites)
for (i in 1:ntimep) {
data1 = data[, , i]
index1 = index[data1 != "NaN"]
data2 = data1[index1]
sstdata_11_12[i, ] = data2
}
indexgrid = index1
rm("data") #remove the object data to clear up space
############################################################
# PPT
nlocation = 7 # 7 stations in AZ
# Read Arizona PPT data -- already annual averaged
data_m_11_12 = scan("C:\\Users\\jfklee90\\Desktop\\CVEN 6833\\HW 2\\AZ-cdiv-ppt-DJFM-2011-2012.txt")
############################################################
# transform to PC(SST) & predict PC(PPT) with best model
sstanavg_11_12 = sstdata_11_12
# PC(SST_11_12)
pcs_sst_11_12 = sstanavg_11_12 %*% zsvd_sst$u
############################################################
# bestfit_1 = glm(PC1_ppt ~ PC2_sst + PC8_sst + PC9_sst, family =
# gaussian(link = 'identity'))
# re-write bestfit_1 to predict PC1(SST_11_12)
PC2 = PC2_sst
PC8 = PC8_sst
PC9 = PC9_sst
xe = cbind(PC2, PC8, PC9)
bestfit_1 = glm(PC1_ppt ~ ., data = as.data.frame(xe), family = gaussian(link = "identity"))
# check with original data X = as.data.frame(pcs_sst[,1:10]) PC1_ppt_pred =
# predict(bestfit_1, X, se.fit = T) plot(PC1_ppt, PC1_ppt_pred$fit, main =
# 'PC1(PPT) & Forecast PC1(PPT)') still give the same plot
PC2 = pcs_sst_11_12[2]
PC8 = pcs_sst_11_12[8]
PC9 = pcs_sst_11_12[9]
xe = cbind(PC2, PC8, PC9)
PC1_ppt_pred_11_12 = predict(bestfit_1, newdata = as.data.frame(xe))
############################################################
# bestfit_2 = glm(PC2_ppt ~ PC2_sst + PC10_sst, family = gaussian(link =
# 'identity'))
# re-write bestfit_2 to predict PC2(SST_11_12)
PC2 = PC2_sst
PC10 = PC10_sst
xe = cbind(PC2, PC10)
bestfit_2 = glm(PC2_ppt ~ ., data = as.data.frame(xe), family = gaussian(link = "identity"))
# check with original data X = as.data.frame(pcs_sst[,1:10]) PC2_ppt_pred =
# predict(bestfit_2, X, se.fit = T) plot(PC2_ppt, PC2_ppt_pred$fit, main =
# 'PC2PPT) & Forecast PC2(PPT)') still give the same plot
PC2 = pcs_sst_11_12[2]
PC10 = pcs_sst_11_12[10]
xe = cbind(PC2, PC10)
PC2_ppt_pred_11_12 = predict(bestfit_2, newdata = as.data.frame(xe))
############################################################
# using mean of PC3 to PC7 from original data for PC3 to PC7 in 2011-12
PC3to7_ppt_11_12 = PC3_7_mean
# set up predicted PC matrix
PC_ppt_pred_11_12 = matrix(0, nrow = 1, ncol = 7)
PC_ppt_pred_11_12[1] = PC1_ppt_pred_11_12
PC_ppt_pred_11_12[2] = PC2_ppt_pred_11_12
PC_ppt_pred_11_12[3:7] = PC3to7_ppt_11_12
ppt_pred_11_12 = matrix(0, nrow = 1, ncol = 7)
# back transform: X = Y %*% E^T
ppt_pred_11_12_anomaly = PC_ppt_pred_11_12 %*% t(zsvd_ppt$u)
# back transform: ppt_pred_11_12 = (ppt_pred_11_12_anomaly * ppt_sd) +
# ppt_mean
meg = ppt_pred_11_12_anomaly * ppt_sd
ppt_pred_11_12 = meg + ppt_mean
# compare with observation
plot(data_m_11_12, ppt_pred_11_12, main = "Observed PPT & Forecast PPT - 2011-2012")
# add 1:1 line
abline(0, 1)
par(ask = TRUE)
############################################################
# forecast for Winter AZ PPT 2012-2013
# SST
ntime = 1 # Dec 2012 to Mar 2013
ntimep = ntime
nglobe = nrows * ncols
# Lat - Long grid.. create grid below:
ygrid = seq(-87.5, 87.5, by = 5)
ny = length(ygrid) # = 36
xgrid = seq(27.5, 382.5, by = 5)
xgrid[xgrid > 180] = xgrid[xgrid > 180]
nx = length(xgrid) # = 72
xygrid = matrix(0, nrow = nx * ny, ncol = 2)
i = 0
for (iy in 1:ny) {
for (ix in 1:nx) {
i = i + 1
xygrid[i, 1] = ygrid[iy]
xygrid[i, 2] = xgrid[ix]
}
}
############################################################
# Read Kaplan SST data -- already annual averaged
data = readBin("C:\\Users\\jfklee90\\Desktop\\CVEN 6833\\HW 2\\Kaplan-SST-DJFM-2012-2013.r4",
what = "numeric", n = (nrows * ncols * ntime), size = 4, endian = "swap")
data <- array(data = data, dim = c(nrows, ncols, ntime))
# read data for the 1st month
data1 = data[, , 1]
# the lat -long data grid..
# all the grid points on the globe
index = 1:(nx * ny)
# only non-missing data, missing data = 1.E30 include only grid points on
# the ocean sort out NaN values
index1 = index[data1 != "NaN"]
# choose only ocean's grid points for xygrid
xygrid1 = xygrid[index1, ]
x1 = xygrid1[, 2]
nsites = length(index1)
data2 = data1[index1]
# SSTdata matrix - rows are months annd columns are locations
sstdata_12_13 = matrix(0, nrow = ntimep, ncol = nsites)
for (i in 1:ntimep) {
data1 = data[, , i]
index1 = index[data1 != "NaN"]
data2 = data1[index1]
sstdata_12_13[i, ] = data2
}
indexgrid = index1
rm("data") #remove the object data to clear up space
############################################################
# PPT
nlocation = 7 # 7 stations in AZ
# Read Arizona PPT data -- already annual averaged
data_m_12_13 = scan("C:\\Users\\jfklee90\\Desktop\\CVEN 6833\\HW 2\\AZ-cdiv-ppt-DJFM-2012-2013.txt")
############################################################
# transform to PC(SST) & predict PC(PPT) with best model
sstanavg_12_13 = sstdata_12_13
# PC(SST_12_13)
pcs_sst_12_13 = sstanavg_12_13 %*% zsvd_sst$u
############################################################
# bestfit_1 = glm(PC1_ppt ~ PC2_sst + PC8_sst + PC9_sst, family =
# gaussian(link = 'identity'))
# re-write bestfit_1 to predict PC1(SST_12_13)
PC2 = PC2_sst
PC8 = PC8_sst
PC9 = PC9_sst
xe = cbind(PC2, PC8, PC9)
bestfit_1 = glm(PC1_ppt ~ ., data = as.data.frame(xe), family = gaussian(link = "identity"))
PC2 = pcs_sst_12_13[2]
PC8 = pcs_sst_12_13[8]
PC9 = pcs_sst_12_13[9]
xe = cbind(PC2, PC8, PC9)
PC1_ppt_pred_12_13 = predict(bestfit_1, newdata = as.data.frame(xe))
############################################################
# bestfit_2 = glm(PC2_ppt ~ PC2_sst + PC10_sst, family = gaussian(link =
# 'identity'))
# re-write bestfit_2 to predict PC2(SST_12_13)
PC2 = PC2_sst
PC10 = PC10_sst
xe = cbind(PC2, PC10)
bestfit_2 = glm(PC2_ppt ~ ., data = as.data.frame(xe), family = gaussian(link = "identity"))
PC2 = pcs_sst_12_13[2]
PC10 = pcs_sst_12_13[10]
xe = cbind(PC2, PC10)
PC2_ppt_pred_12_13 = predict(bestfit_2, newdata = as.data.frame(xe))
############################################################
# using mean of PC3 to PC7 from original data for PC3 to PC7 in 2011-12
PC3to7_ppt_12_13 = PC3_7_mean
# set up predicted PC matrix
PC_ppt_pred_12_13 = matrix(0, nrow = 1, ncol = 7)
PC_ppt_pred_12_13[1] = PC1_ppt_pred_12_13
PC_ppt_pred_12_13[2] = PC2_ppt_pred_12_13
PC_ppt_pred_12_13[3:7] = PC3to7_ppt_12_13
ppt_pred_12_13 = matrix(0, nrow = 1, ncol = 7)
# back transform: X = Y %*% E^T
ppt_pred_12_13_anomaly = PC_ppt_pred_12_13 %*% t(zsvd_ppt$u)
# back transform: ppt_pred_12_13 = (ppt_pred_12_13_anomaly * ppt_sd) +
# ppt_mean
meg = ppt_pred_12_13_anomaly * ppt_sd
ppt_pred_12_13 = meg + ppt_mean
# compare with observation
plot(data_m_12_13, ppt_pred_12_13, main = "Observed PPT & Forecast PPT - 2012-2013")
# add 1:1 line
abline(0, 1)
par(ask = TRUE)
############################################################