Rebecca Di Bari

Homework # 2

CVEN 6833

Libraries

library(akima)
library(fields)
library(locfit)
library(MASS)

Sourced Functions

source("C:/Users/Rebex/Documents/R/AdvDataAnalysis/SSTread1.txt")
source("C:/Users/Rebex/Documents/R/AdvDataAnalysis/SSTwinter12Read1.txt")

thebest = function(X, Y, deg, family) {
    nvar = length(X[1, ])
    N = length(Y)

    minalpha = 2 * (nvar * deg + 1)/N
    alpha_grid = seq(minalpha, 1, by = 0.05)

    zz = gcvplot(Y ~ X, alpha = alpha_grid, deg = deg, kern = "bisq", ev = dat(), 
        scale = TRUE, family = family)

    mygcv = sort(zz$values)[1]
    # return(mygcv)

    alpha = alpha_grid
    # pick the best alpha from the best family that gives the least GCV
    z6 = order(zz$values)[1]
    bestalpha = alpha[z6]

    myresult = list(gcv = mygcv, bestalpha = bestalpha, family = family)
}

Data

############ SST, all months ##################
nrows = 72
ncols = 36
ntimemonths = 1332
data = readBin("http://iridl.ldeo.columbia.edu/expert/SOURCES/.KAPLAN/.EXTENDED/.v2/.ssta/T/%28Jan%201900%29%28Dec%202011%29RANGE/T/12/boxAverage/T/12/STEP/data.r4", 
    what = "numeric", n = (nrows * ncols * ntimemonths), size = 4, endian = "swap")
mine = SSTread(nrows, ncols, ntimemonths, data)

SSTallavg = mine$mydata
SSTallavg = SSTallavg[1:111, ]  # 1900- 2011
xygrid = mine$xygrid
nglobe = mine$nglobe
indexgrid = mine$indexgrid

############# SST, winter months ###############
ntimemonths = 1344
data12 = readBin("http://iridl.ldeo.columbia.edu/expert/SOURCES/.KAPLAN/.EXTENDED/.v2/.ssta/T/%28Dec%201900%29%28Dec%202012%29RANGE/T/4/boxAverage/T/12/STEP/data.r4", 
    what = "numeric", n = (nrows * ncols * ntimemonths), size = 4, endian = "swap")
mine1 = SSTread(nrows, ncols, ntimemonths, data12)

SSTwinter = mine1$mydata
SSTwinter = SSTwinter[1:111, ]  # 1900 - 2011
xygridW = mine1$xygrid
indexgridW = mine1$indexgrid
SSTwinter12 = mine1$mydata  # 1900- 2012

############### Precip data ######################
Precipdata = t(matrix(scan("http://iridl.ldeo.columbia.edu/expert/expert/SOURCES/.NOAA/.NCDC/.CIRS/.ClimateDivision/.pcp/T/%28Dec%201900%29%28Dec%202012%29RANGE/T/4/boxAverage/T/12/STEP/IDIV/201/202/203/204/205/206/207/401/402/403/404/405/406/407/502/504/505/1001/1002/1003/1004/1005/1006/1007/1008/1009/1010/2401/2402/2403/2404/2405/2406/2407/2601/2602/2603/2604/2901/2902/2904/2905/2906/2907/2908/3501/3502/3503/3504/3505/3506/3507/3508/3509/4201/4202/4203/4204/4205/4206/4207/4501/4502/4503/4504/4505/4506/4507/4508/4509/4510/4801/4802/4803/4804/4805/4806/4807/4808/4809/4810/VALUES/data.ch"), 
    ncol = 112, byrow = T))

regions = c(201, 202, 203, 204, 205, 206, 207, 401, 402, 403, 404, 405, 406, 
    407, 502, 504, 505, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 
    1010, 2401, 2402, 2403, 2404, 2405, 2406, 2407, 2601, 2602, 2603, 2604, 
    2901, 2902, 2904, 2905, 2906, 2907, 2908, 3501, 3502, 3503, 3504, 3505, 
    3506, 3507, 3508, 3509, 4201, 4202, 4203, 4204, 4205, 4206, 4207, 4501, 
    4502, 4503, 4504, 4505, 4506, 4507, 4508, 4509, 4510, 4801, 4802, 4803, 
    4804, 4805, 4806, 4807, 4808, 4809, 4810)  # These are regions associated with Western US

# rename column headers
colnames(Precipdata) = regions
# rename row names
years = 1901:2012
rownames(Precipdata) = years

WestPrecip = Precipdata[1:111, ]  # 1900- 2011
WestPrecip12 = Precipdata[1:112, ]  # 1900- 2012

# AZ precip (associated to regions 200-207). Also, scale data.
AZprecip = WestPrecip[, 1:7]  # 1900- 2011

AZprecip12 = WestPrecip12[, 1:7]  # 1900- 2012
AZprecip12 = scale(AZprecip12)
AZprecip12SD = attr(AZprecip12, "scaled:scale")
AZprecip12Mean = attr(AZprecip12, "scaled:center")


# Lat Long of grid points of Western US .
wcdivlocs = matrix(scan("http://civil.colorado.edu/~balajir/CVEN6833/R-sessions/session2/files-4HW2/WesternUS-coords.txt"), 
    ncol = 5, byrow = T)

idx = 1:length(regions)
for (i in length(regions)) {
    idx[i] = which(wcdivlocs[, 1] == regions[i])
}

# put lat long in same order as WestPrecip data
long = -wcdivlocs[idx, 4]
lat = wcdivlocs[idx, 3]

Question 1: You wish to identify the space-time patterns of variability of the monthly sea surface temperature for the 1900 - 2012 period.

(i) Perform a PCA on the global annual SST anomalies

-Plot the Eigen variance spectrum for the first 25 modes

-Plot the leading 4 spatial (Eigen vectors) and temporal (PCs) modes of Variability

zs = var(SSTallavg)  #variance matrix
zsvd = svd(zs)  #Eigen decomposition
pcs = t(t(zsvd$u) %*% t(SSTallavg))  #PCs

lambdas = (zsvd$d/sum(zsvd$d))  #Eigen Values/ Fraction variance

par(mfrow = c(1, 1))
plot(1:40, lambdas[1:40], type = "l", xlab = "Modes", ylab = "Frac. Var. explained", 
    main = "EigenValues")
points(1:40, lambdas[1:40], col = "red")

plot of chunk unnamed-chunk-4

# Use the first 3 modes as it explains over 70% of the variance


# Plot Spatial and Temporal Trends

titles1 = c("Spatial: First Eignevector", "Spatial: Second Eignevector", "Spatial: Third Eignevector")
titles2 = c("Temporal: First Mode", "Temporal: Second Mode", "Temporal: Third Mode")
nrows = 72
ncols = 36

par(mfrow = c(3, 2))
for (i in 1:length(titles1)) {
    xlong = sort(unique(xygrid[, 2]))
    ylat = sort(unique(xygrid[, 1]))
    zfull = rep(NaN, nglobe)
    zfull[indexgrid] = zsvd$u[, i]
    zmat = matrix(zfull, nrow = nrows, ncol = ncols)

    image.plot(xlong, ylat, zmat, ylim = range(-40, 70))
    title(main = titles1[i])
    contour(xlong, ylat, (zmat), ylim = range(-40, 70), add = TRUE, nlev = 6, 
        lwd = 2)
    map("world2", add = T)

    plot(1901:2011, pcs[, i], type = "l", xlab = "year")
    title(main = titles2[i])
}

plot of chunk unnamed-chunk-5

(ii) Obviously one of the modes will be a tropical Pacific mode (i.e., El Nino Southern Oscillation, ENSO). Remove this mode from the data and repeat the PCA and (i) above. Compare and contrast the results.

nmodes = length(zsvd$u[1, ])  # number of modes
ndrop = c(1)  # modes to drop
E = matrix(0, nrow = nmodes, ncol = nmodes)  #Eigen vector
E[, ndrop] = zsvd$u[, ndrop]
sstanavgdrop = pcs %*% t(E)

sstanrem = SSTallavg - sstanavgdrop

# Now perform PCA on sstanrem

zs1 = var(sstanrem)  # var matrix
zsvd1 = svd(zs1)  #Eigen decomposition
pcs1 = t(t(zsvd1$u) %*% t(sstanrem))  # PCs

lambdas = (zsvd1$d/sum(zsvd1$d))  #Eigen Values/Fraction variance

par(mfrow = c(1, 1))
plot(1:40, lambdas[1:40], type = "l", xlab = "Modes", ylab = "Frac. Var. explained", 
    main = "EigenValues")
points(1:40, lambdas[1:40], col = "red")

plot of chunk unnamed-chunk-6

titles1 = c("Spatial: First Eignevector", "Spatial: Second Eignevector", "Spatial: Third Eignevector")
titles2 = c("Temporal: First Mode", "Temporal: Second Mode", "Temporal: Third Mode")

par(mfrow = c(3, 2))
for (i in 1:length(titles1)) {
    xlong = sort(unique(xygrid[, 2]))
    ylat = sort(unique(xygrid[, 1]))
    zfull = rep(NaN, nglobe)
    zfull[indexgrid] = zsvd1$u[, i]
    zmat = matrix(zfull, nrow = nrows, ncol = ncols)

    image.plot(xlong, ylat, zmat, ylim = range(-40, 70))
    title(main = titles1[i])
    contour(xlong, ylat, (zmat), ylim = range(-40, 70), add = TRUE, nlev = 6, 
        lwd = 2)
    map("world2", add = T)

    plot(1901:2011, pcs[, i], type = "l", xlab = "year")
    title(main = titles2[i])
}

plot of chunk unnamed-chunk-7

(iii) Repeat (i) for three sub periods 1900-1935; 1935-1975 and 1975-2012 to investigate the variability of the leading 4 modes

1900- 1935

sst_1900_1935 = SSTallavg[1:35, ]
zs2 = var(sst_1900_1935)  # var matrix
zsvd2 = svd(zs2)  #Eigen decomposition
pcs2 = t(t(zsvd2$u) %*% t(sst_1900_1935))  #PCs

lambdas = (zsvd2$d/sum(zsvd2$d))  #Eigen Values/ Fraction variance

par(mfrow = c(1, 1))
plot(1:40, lambdas[1:40], type = "l", xlab = "Modes", ylab = "Frac. Var. explained", 
    main = "EigenValues")
points(1:40, lambdas[1:40], col = "red")

plot of chunk unnamed-chunk-8

# Plot Spatial and Temporal trends for first 3 modes/ associated
# eigenvectors

titles1 = c("Spatial: First Eignevector", "Spatial: Second Eignevector", "Spatial: Third Eignevector")
titles2 = c("Temporal: First Mode", "Temporal: Second Mode", "Temporal: Third Mode")

par(mfrow = c(3, 2))
for (i in 1:length(titles1)) {
    xlong = sort(unique(xygrid[, 2]))
    ylat = sort(unique(xygrid[, 1]))
    zfull = rep(NaN, nglobe)
    zfull[indexgrid] = zsvd2$u[, i]
    zmat = matrix(zfull, nrow = nrows, ncol = ncols)

    image.plot(xlong, ylat, zmat, ylim = range(-40, 70))
    title(main = titles1[i])
    contour(xlong, ylat, (zmat), ylim = range(-40, 70), add = TRUE, nlev = 6, 
        lwd = 2)
    map("world2", add = T)

    plot(1901:1935, pcs2[, i], type = "l", xlab = "year")
    title(main = titles2[i])
}

plot of chunk unnamed-chunk-9

1936-1975

sst_1936_1975 = SSTallavg[36:75, ]
zs2 = var(sst_1936_1975)
zsvd2 = svd(zs2)  #Eigen decomposition
pcs2 = t(t(zsvd2$u) %*% t(sst_1936_1975))  # PCs

lambdas = (zsvd2$d/sum(zsvd2$d))  #Eigen Values/ Fraction variance

par(mfrow = c(1, 1))
plot(1:40, lambdas[1:40], type = "l", xlab = "Modes", ylab = "Frac. Var. explained", 
    main = "EigenValues")
points(1:40, lambdas[1:40], col = "red")

plot of chunk unnamed-chunk-10

# Plot Spatial and Temporal trends for first 4 modes/ associated
# eigenvectors

titles1 = c("Spatial: First Eignevector", "Spatial: Second Eignevector", "Spatial: Third Eignevector")
titles2 = c("Temporal: First Mode", "Temporal: Second Mode", "Temporal: Third Mode")

par(mfrow = c(3, 2))
for (i in 1:length(titles1)) {
    xlong = sort(unique(xygrid[, 2]))
    ylat = sort(unique(xygrid[, 1]))
    zfull = rep(NaN, nglobe)
    zfull[indexgrid] = zsvd2$u[, i]
    zmat = matrix(zfull, nrow = nrows, ncol = ncols)

    image.plot(xlong, ylat, zmat, ylim = range(-40, 70))
    title(main = titles1[i])
    contour(xlong, ylat, (zmat), ylim = range(-40, 70), add = TRUE, nlev = 6, 
        lwd = 2)
    map("world2", add = T)

    plot(1936:1975, pcs2[, i], type = "l", xlab = "year")
    title(main = titles2[i])
}

plot of chunk unnamed-chunk-11

1976-2011

sst_1976_2011 = SSTallavg[76:111, ]
zs2 = var(sst_1976_2011)  # variance matrix
zsvd2 = svd(zs2)  # egien decomposition
pcs2 = t(t(zsvd2$u) %*% t(sst_1976_2011))  #PCs

lambdas = (zsvd2$d/sum(zsvd2$d))  #Eigen Values/ Fraction variance

par(mfrow = c(1, 1))
plot(1:40, lambdas[1:40], type = "l", xlab = "Modes", ylab = "Frac. Var. explained", 
    main = "EigenValues")
points(1:40, lambdas[1:40], col = "red")

plot of chunk unnamed-chunk-12

# Plot Spatial and Temporal trends for first 4 modes/ associated
# eigenvectors

titles1 = c("Spatial: First Eignevector", "Spatial: Second Eignevector", "Spatial: Third Eignevector")
titles2 = c("Temporal: First Mode", "Temporal: Second Mode", "Temporal: Third Mode")

par(mfrow = c(3, 2))
for (i in 1:length(titles1)) {
    xlong = sort(unique(xygrid[, 2]))
    ylat = sort(unique(xygrid[, 1]))
    zfull = rep(NaN, nglobe)
    zfull[indexgrid] = zsvd2$u[, i]
    zmat = matrix(zfull, nrow = nrows, ncol = ncols)

    image.plot(xlong, ylat, zmat, ylim = range(-40, 70))
    title(main = titles1[i])
    contour(xlong, ylat, (zmat), ylim = range(-40, 70), add = TRUE, nlev = 6, 
        lwd = 2)
    map("world2", add = T)

    plot(1976:2011, pcs2[, i], type = "l", xlab = "year")
    title(main = titles2[i])
}

plot of chunk unnamed-chunk-13

The patterns between the different years seems not to have significantly changed.

Question 2: Joint patterns of variability between two fields are often desired to understand their spatial and temporal relationship. In this regard, we wish to identify patterns of joint variability of winter (Dec-Mar) average SST over the globe and winter precipitation over U.S for the period 1958 - 2008

(i) Preform PCA on SST and Precip data

# PCA on the seasonal SST
zs = var(SSTwinter)  #variance matrix
zsvd = svd(zs)  #Eigen decomposition..
pcsSST = t(t(zsvd$u) %*% t(SSTwinter))  #Principal Components

lambdas = (zsvd$d/sum(zsvd$d))  #Eigen Values/fraction variance

par(mfrow = c(1, 1))
plot(1:40, lambdas[1:40], type = "l", xlab = "Modes", ylab = "Frac. Var. explained")
points(1:40, lambdas[1:40], col = "red")

plot of chunk unnamed-chunk-14

# the data is on a grid so fill the entire globaal grid with NaN and then
# populate the ocean grids with the Eigen vector
xlong = sort(unique(xygridW[, 2]))
ylat = sort(unique(xygridW[, 1]))

titles1 = c("Spatial: First Eignevector", "Spatial: Second Eignevector", "Spatial: Third Eignevector")
titles2 = c("Temporal: First Mode", "Temporal: Second Mode", "Temporal: Third Mode")

par(mfrow = c(3, 2))
for (i in 1:length(titles1)) {
    xlong = sort(unique(xygridW[, 2]))
    ylat = sort(unique(xygridW[, 1]))
    zfull = rep(NaN, nglobe)
    zfull[indexgridW] = zsvd$u[, i]
    zmat = matrix(zfull, nrow = nrows, ncol = ncols)

    image.plot(xlong, ylat, zmat, ylim = range(-40, 70))
    title(main = titles1[i])
    contour(xlong, ylat, (zmat), ylim = range(-40, 70), add = TRUE, nlev = 6, 
        lwd = 2)
    map("world2", add = T)

    plot(1901:2011, pcsSST[, i], type = "l", xlab = "year")
    title(main = titles2[i])
}

plot of chunk unnamed-chunk-15

# PCA on Western Precip
zs = var(WestPrecip)  #var matrix
zsvd = svd(zs)  #Eigen decomposition
pcsPrecip = t(t(zsvd$u) %*% t(WestPrecip))  #PCs

lambdas = (zsvd$d/sum(zsvd$d))  #Eigen Values/ Fraction Variance

par(mfrow = c(1, 1))
plot(1:40, lambdas[1:40], type = "l", xlab = "Modes", ylab = "Western US Precip")
points(1:40, lambdas[1:40], col = "red")

plot of chunk unnamed-chunk-16

# Spatial and Temporal maps
titles1 = c("Spatial: First Eignevector", "Spatial: Second Eignevector", "Spatial: Third Eignevector")
titles2 = c("Temporal: First Mode", "Temporal: Second Mode", "Temporal: Third Mode")

par(mfrow = c(3, 2))
for (i in 1:length(titles1)) {
    z = interp(long, lat, zsvd$u[, i], duplicate = "strip")
    surface(z, xlim = range(long), ylim = range(lat))
    title(main = titles1[i])
    map("usa", add = T)
    map("state", add = T)
    title(main = titles1[i])

    plot(1901:2011, pcsPrecip[, i], type = "l", xlab = "year")
    title(main = titles2[i])
}

plot of chunk unnamed-chunk-17

(ii) Relate the leading PCs of SST with the corresponding leading PCs of precipitation - scatterplot and Local Polynomial smoothers through them.

par(mfrow = c(3, 3))
for (i in 1:3) {
    for (j in 1:3) {
        plot(pcsSST[, i], pcsPrecip[, j])
        # fit local polynomial
        mystud1 = thebest(as.matrix(pcsSST[, i]), pcsPrecip[, j], 1, "qguassian")
        bestalpha = mystud1$bestalpha
        fit = locfit(pcsPrecip[, j] ~ pcsSST[, i], alpha = bestalpha, deg = 1, 
            kern = "bisq", scale = TRUE)
        plot(fit, add = T, col = "red")
        title(main = paste("SST PC", i, ", WestPrecip PC", j))
    }
}

plot of chunk unnamed-chunk-18

(iii) Correlate the leading PC of precipitation with the SST data and vice-versa - thus producing correlation maps and inferring the patterns. Using these infer the relationships.

pc2tpcpcorr = scan("http://iridl.ldeo.columbia.edu/expert/SOURCES/.NOAA/.NCDC/.CIRS/.ClimateDivision/.pcp/T/%28Dec%201990%29%28Dec%202012%29RANGE/T/4/boxAverage/T/12/STEP/IDIV/201/202/203/204/205/206/207/401/402/403/404/405/406/407/502/504/505/1001/1002/1003/1004/1005/1006/1007/1008/1009/1010/2401/2402/2403/2404/2405/2406/2407/2601/2602/2603/2604/2901/2902/2904/2905/2906/2907/2908/3501/3502/3503/3504/3505/3506/3507/3508/3509/4201/4202/4203/4204/4205/4206/4207/4501/4502/4503/4504/4505/4506/4507/4508/4509/4510/4801/4802/4803/4804/4805/4806/4807/4808/4809/4810/VALUES/SOURCES/.KAPLAN/.EXTENDED/.v2/.ssta/T/%28Dec%201900%29%28Mar%202012%29RANGE/T/4/boxAverage/T/12/STEP%5BT%5Dstandardize%5BX/Y%5D%5BT%5Dsvd/ev/2/RANGE/.Ts%5BT%5Dcorrelate/data.ch")

par(mfrow = c(1, 1))
quilt.plot(long, lat, pc2tpcpcorr, xlim = range(-125, -100))
US(add = TRUE, col = "grey", lwd = 2, xlim = range(-125, -100))

plot of chunk unnamed-chunk-19

(iv) Given the changes in recent decades (e.g., droughts in the W. US) repeat (i) for recent period say 1975 - 2012 for the W. US precipitation (i.e. divisions to the west of 100W longitude). Comment on differences you find with (i) above.

# Precip data 1975 to 2012
West_1975_2012 = WestPrecip12[76:112, ]
zs = var(West_1975_2012)  #var matrix
zsvd = svd(zs)  #Eigen Decomposition
pcsPrecip = t(t(zsvd$u) %*% t(West_1975_2012))  #PCs

lambdas = (zsvd$d/sum(zsvd$d))  #Eigen Values

par(mfrow = c(1, 1))
plot(1:40, lambdas[1:40], type = "l", xlab = "Modes", ylab = "Western US Precip")
points(1:40, lambdas[1:40], col = "red")

plot of chunk unnamed-chunk-20

# Spatial and Temporal maps
titles1 = c("Spatial: First Eignevector", "Spatial: Second Eignevector", "Spatial: Third Eignevector")
titles2 = c("Temporal: First Mode", "Temporal: Second Mode", "Temporal: Third Mode")

par(mfrow = c(3, 2))
for (i in 1:length(titles1)) {
    z = interp(long, lat, zsvd$u[, i], duplicate = "strip")
    surface(z, xlim = range(long), ylim = range(lat))
    title(main = titles1[i])
    map("usa", add = T)
    map("state", add = T)
    title(main = titles1[i])

    plot(1975:2011, pcsPrecip[, i], type = "l", xlab = "year")
    title(main = titles2[i])
}

plot of chunk unnamed-chunk-21

Question 3: Perform an SVD analysis on the two fields - plot the hetrogeneous correlation maps for the first four modes and also plot the time coefficients. How do these compare with the PCA results (i) from above?.

# Perform SVD
C = var(WestPrecip, SSTwinter)
zz = svd(C)

# f1 and g1 are the matrix of time coefficients
f1 = as.matrix(WestPrecip) %*% zz$u
g1 = SSTwinter %*% zz$v

# hetrogenous correlation pattern.
J = min(length(SSTwinter[1, ]), length(WestPrecip[1, ]))

par(mfrow = c(1, 1))
plot(1:J, ((zz$d)^2)/sum((zz$d)^2), xlab = "Modes", ylab = "Sq. Covariance", 
    col = "red")

plot of chunk unnamed-chunk-22

### Spatial map the first hetro corrln map. TC of rain correlated on SST
par(mfrow = c(3, 2))
for (i in 1:3) {
    corrpats = cor(f1[, i], SSTwinter)

    # the data is on a grid so fill the entire globaal grid with NaN and then
    # populate the ocean grids with the Eigen vector
    xlong = sort(unique(xygridW[, 2]))
    ylat = sort(unique(xygridW[, 1]))

    zfull = rep(NaN, nglobe)
    zfull[indexgridW] = corrpats
    zmat = matrix(zfull, nrow = nrows, ncol = ncols)

    image.plot(xlong, ylat, zmat, ylim = range(-40, 40))
    contour(xlong, ylat, (zmat), ylim = range(-40, 40), add = TRUE, nlev = 6, 
        lwd = 2)
    map("world2", add = TRUE)
    title(main = "Hetrogeneous Corr. Map - TC1", i, "of Rain with SST")

    ### Spatial map the first hetro corrln map. TC of SST correlated on Rain
    corrpatr = cor(g1[, i], WestPrecip)

    z = interp(long, lat, corrpatr, duplicate = "strip")
    surface(z, xlim = range(long), ylim = range(lat))
    map("usa", add = T)
    map("state", add = T)
    title(main = paste("Hetrogeneous Corr. Map - TC", i, "of SST with Rain"))
}

plot of chunk unnamed-chunk-23

Question 4: wish to predict the winter (Nov - Mar) precipitation over 7 climate divisions of Arizona with average global winter SSTs. You can do the following (based on Regonda et al., 2006) steps are described below. Use the period 1900 - 2010.

(i) Perform PCA on the two fields, look at their Eigen spectrum to decide on the number of PCs to retain, keep a higher number.For SSTs you already have the Eigen spectrum, PCs from 2(i) above.

# Do only up to 2010- as to do latter problems

############# AZ precip 1900-2010 ###################
zs = var(AZprecip12[1:110, ])
zsvd = svd(zs)  #Eigen decomposition
evect12 = zsvd$u  #Eigen vector
pcsAZprecip12 = t(t(zsvd$u) %*% t(AZprecip12[1:110, ]))  #PCs

lambdas = (zsvd$d/sum(zsvd$d))

par(mfrow = c(2, 1))
plot(1:40, lambdas[1:40], type = "l", xlab = "Modes", ylab = "Western US Precip", 
    main = "Eigen Spectrum AZ Precip")
points(1:40, lambdas[1:40], col = "red")

############### SSTwinter data 1900-2010 ######################
zs = var(SSTwinter12[1:110, ])
zsvd = svd(zs)  #Eigen decomposition
pcsSSTwinter12 = t(t(zsvd$u) %*% t(SSTwinter12[1:110, ]))  #PCs

lambdas = (zsvd$d/sum(zsvd$d))

plot(1:40, lambdas[1:40], type = "l", xlab = "Modes", ylab = "Western US Precip", 
    main = "Eigen Spectrum SST")
points(1:40, lambdas[1:40], col = "red")

plot of chunk unnamed-chunk-24

(ii) Fit a regression model for each retained PC of winter precipitation with the SST PCs.Use your regression knowledge to identify the best predictors, model etc. - if linear model works well use it, also feel free to use local polynomial approach as well.

(iii) Model simultaneously.

# Fit a model. Use first 10 modes of SSTwinter and 1st mode of AZprecip.
X = as.matrix(pcsSSTwinter12[, 1:10])
lmfit = lm(pcsAZprecip12[, 1] ~ X)
bestmodelPC1 = stepAIC(lmfit, trace = F)

# ANOVA
summary(bestmodelPC1)

## 
## Call:
## lm(formula = pcsAZprecip12[, 1] ~ 1)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.543 -1.178  0.643  1.731  3.720 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  -0.0267     0.2413   -0.11     0.91
## 
## Residual standard error: 2.53 on 109 degrees of freedom

(iii) Forecast the retained PCs of precipitation using the fitted model.

(iv) For the other PCs take their mean.

(vi) Plot the forecasted precipitation and the observed at each location and compute performance statistics such as R2, RMSE.

N = length(pcsAZprecip12[, 1])
# for(i in 1:N) {
pc1pred = predict(bestmodelPC1)

# the remaining PCs of AZPrecip
PC2 = pcsAZprecip12[, 3]
pc2pred = rnorm(N, mean = mean(PC2), sd = sd(PC2))

PC3 = pcsAZprecip12[, 3]
pc3pred = rnorm(N, mean = mean(PC3), sd = sd(PC3))

PC4 = pcsAZprecip12[, 4]
pc4pred = rnorm(N, mean = mean(PC4), sd = sd(PC4))

PC5 = pcsAZprecip12[, 5]
pc5pred = rnorm(N, mean = mean(PC5), sd = sd(PC5))

PC6 = pcsAZprecip12[, 6]
pc6pred = rnorm(N, mean = mean(PC6), sd = sd(PC6))

PC7 = pcsAZprecip12[, 7]
pc7pred = rnorm(N, mean = mean(PC7), sd = sd(PC7))

pcppred = cbind(pc1pred, pc2pred, pc3pred, pc4pred, pc5pred, pc6pred, pc7pred)

AZpred12 = pcppred %*% t(evect12)

# Scale it back
AZpred12 = AZpred12 * AZprecip12SD + AZprecip12Mean

# Look at first 3 locations
par(mfrow = c(3, 2))
for (i in 1:3) {
    rmse = sqrt(mean((AZprecip12[1:110, i] - AZpred12[, i])^2))
    SST = sum((AZprecip12[1:110, i] - mean(AZprecip12[1:110, i]))^2)
    SSR = sum((AZprecip12[1:110, i] - AZpred12[, i])^2)
    rsq = 1 - (SSR/SST)

    assign(paste("location_", i, "_rmse", sep = ""), rmse)
    assign(paste("location_", i, "_rsq", sep = ""), rsq)
}

location_1_rmse

## [1] 1.637

location_1_rsq

## [1] -1.67

location_2_rmse

## [1] 1.608

location_2_rsq

## [1] -1.574

location_3_rmse

## [1] 1.632

location_3_rsq

## [1] -1.652


# plot predicted Precip at each location
par(mfrow = c(3, 3))
for (i in 1:length(AZprecip12[1, ])) {
    plot(AZprecip12[1:110, i], AZpred12[, i], xlab = "Orignial AZ Precip", ylab = "Predicted AZ Precip", 
        main = paste("Actucal vs Predicted", i, "th Column"))
    lines(AZprecip12[1:110, i], AZprecip12[1:110, i], col = "red")
}

plot of chunk unnamed-chunk-26

(vii) Of course, the above is more of fitting rather than actual forecast. Use the above model to predict 2011 and 2012 winter precipitation and compare with the observed.

SSTwinter2011 = SSTwinter12[111, ]
# PC 2011 SSTs. Use first 10 modes
zs = var(SSTwinter2011)
zsvd = svd(zs)
evect2011 = zsvd$u  #EigenVector of 2011
pcsSSTwinter2011 = t(t(zsvd$u) %*% t(SSTwinter2011))  #PCs

# predict PC1 from bestmodel with leading 10 modes
pc1pred2011 = predict(bestmodelPC1, newdata = as.data.frame(t(pcsSSTwinter2011[1:10])))

# the remaining PCs can be obtained as
PC2 = pcsAZprecip12[, 2]
pc2pred = mean(PC2)

PC3 = pcsAZprecip12[, 3]
pc3pred = mean(PC3)

PC4 = pcsAZprecip12[, 4]
pc4pred = mean(PC4)

PC5 = pcsAZprecip12[, 5]
pc5pred = mean(PC5)

PC6 = pcsAZprecip12[, 6]
pc6pred = mean(PC6)

PC7 = pcsAZprecip12[, 7]
pc7pred = mean(PC7)

pcppred = cbind(pc1pred2011, pc2pred, pc3pred, pc4pred, pc5pred, pc6pred, pc7pred)

AZpred2011 = pcppred %*% t(evect12)

# Scale it back
AZprecip12Mean = apply(AZprecip12, 2, mean)
AZprecip12SD = apply(AZprecip12, 2, sd)
AZpred2011 = AZpred2011 * AZprecip12SD + AZprecip12Mean

par(mfrow = c(2, 1))
AZprecip12 = WestPrecip12[, 1:7]  # 1900- 2012
AZprecip2011 = AZprecip12[111, ]
# Compare Predicted AZprecip 2011 to orginal data
plot(t(AZprecip2011), AZpred2011, xlab = "Orignial AZ Precip", ylab = "Predicted AZ Precip", 
    main = "Actucal vs Predicted Precip 2011")
lines(t(AZprecip2011), t(AZprecip2011), col = "red")

################## PC 2012 SSTs. Use first 10 modes ################
SSTwinter2012 = SSTwinter12[112, ]
zs = var(SSTwinter2012)
zsvd = svd(zs)
evect2011 = zsvd$u  #EigenVector
pcsSSTwinter2012 = t(t(zsvd$u) %*% t(SSTwinter2012))  #PCs

# predict PC1 from bestmodel with leading 10 modes
pc1pred2012 = predict(bestmodelPC1, newdata = as.data.frame(t(pcsSSTwinter2012[1:10])))

# the remaining PCs can be obtained as
PC2 = pcsAZprecip12[, 2]
pc2pred = mean(PC2)

PC3 = pcsAZprecip12[, 3]
pc3pred = mean(PC3)

PC4 = pcsAZprecip12[, 4]
pc4pred = mean(PC4)

PC5 = pcsAZprecip12[, 5]
pc5pred = mean(PC5)

PC6 = pcsAZprecip12[, 6]
pc6pred = mean(PC6)

PC7 = pcsAZprecip12[, 7]
pc7pred = mean(PC7)

pcppred = cbind(pc1pred2012, pc2pred, pc3pred, pc4pred, pc5pred, pc6pred, pc7pred)

AZpred2012 = pcppred %*% t(evect12)

# Scale it back
AZprecip12Mean = apply(AZprecip12, 2, mean)
AZprecip12SD = apply(AZprecip12, 2, sd)
AZpred2012 = AZpred2012 * AZprecip12SD + AZprecip12Mean

AZprecip12 = WestPrecip12[, 1:7]  # 1900- 2012
AZprecip2012 = AZprecip12[112, ]
plot(t(AZprecip2012), AZpred2012, ylim = range(AZpred2012), xlab = "Orignial AZ Precip", 
    ylab = "Predicted AZ Precip", main = "Actucal vs Predicted Precip 2012")
lines(t(AZprecip2012), t(AZprecip2012), col = "red")

plot of chunk unnamed-chunk-27

Question 5: Repeat 4 using CCA

zs = var(SSTwinter12[1:110, ])
zsvd = svd(zs)  #Eigen decomposition
pcsSSTwinter12 = t(t(zsvd$u) %*% t(SSTwinter12[1:110, ]))  #PCs

######## Perform CCA on the first 10 PCs of SST ##############
sstPc = pcsSSTwinter12[, 1:10]
X1 = scale(sstPc)

M = dim(sstPc)[2]
J = dim(AZprecip12[1:110, ])[2]
J = min(M, J)
N = length(AZprecip12[1:110, 1])
Qx1 = qr.Q(qr(X1))
Qy1 = qr.Q(qr(AZprecip12[1:110, ]))
T11 = qr.R(qr(X1))
T22 = qr.R(qr(AZprecip12[1:110, ]))
VV = t(Qx1) %*% Qy1
BB = svd(VV)$v
AA = svd(VV)$u
BB = solve(T22) %*% svd(VV)$v * sqrt(N - 1)
wm1 = AZprecip12 %*% BB
AA = solve(T11) %*% svd(VV)$u * sqrt(N - 1)
vm1 = X1 %*% AA
cancorln = svd(VV)$d[1:J]  #canonical correlation
Fyy = var(AZprecip12[1:110, ]) %*% BB
# ........................................ Prediction.

betahat = solve(t(AA) %*% t(X1) %*% X1 %*% AA) %*% t(AA) %*% t(X1) %*% AZprecip12[1:110, 
    ]
ypred = X1 %*% AA %*% betahat

# scale back the data mean forecast
ypred = t(t(ypred) * AZprecip12SD + AZprecip12Mean)

(vi) Plot the forecasted precipitation and the observed at each location and compute performance statistics such as R2, RMSE.

# plot predicted Precip at each location
par(mfrow = c(3, 3))
for (i in 1:7) {
    plot(AZprecip12[1:110, i], ypred[, i], xlab = "Orignial AZ Precip", ylab = "Predicted AZ Precip", 
        main = paste("Actucal vs Predicted", i, "location"))
    lines(AZprecip12[1:110, i], AZprecip12[1:110, i], col = "red")
}

# R2 and rmse. Look at first 3 locations
par(mfrow = c(3, 2))

plot of chunk unnamed-chunk-29

for (i in 1:3) {
    rmse = sqrt(mean((AZprecip12[1:110, i] - ypred[, i])^2))
    SST = sum((AZprecip12[1:110, i] - mean(AZprecip12[, i]))^2)
    SSR = sum((AZprecip12[1:110, i] - ypred[, i])^2)
    rsq = 1 - (SSR/SST)

    assign(paste("location_", i, "_rmse", sep = ""), rmse)
    assign(paste("location_", i, "_rsq", sep = ""), rsq)
}

location_1_rmse

## [1] 0.5479

location_1_rsq

## [1] 0.07174

location_2_rmse

## [1] 0.5662

location_2_rsq

## [1] 0.06369

location_3_rmse

## [1] 0.8292

location_3_rsq

## [1] 0.08126

(vii) Predict AZ precipitation in 2011 and 2012

zs = var(SSTwinter12[111, ])
zsvd = svd(zs)  #Eigen decomposition
pcsSSTwinter2011 = t(t(zsvd$u) %*% t(SSTwinter12[111, ]))  #PCs

zs = var(SSTwinter12[112, ])
zsvd = svd(zs)  #Eigen decomposition
pcsSSTwinter2012 = t(t(zsvd$u) %*% t(SSTwinter12[112, ]))  #PCs

# Predict 2011
sstPc = as.matrix(t(pcsSSTwinter2011[1:10]))
ypred = sstPc %*% AA %*% betahat
ypred2011 = t(t(ypred) * AZprecip12SD + AZprecip12Mean)  # scale back the data

# Predict 2012
sstPc = as.matrix(t(pcsSSTwinter2012[1:10]))
ypred = sstPc %*% AA %*% betahat
ypred2012 = t(t(ypred) * AZprecip12SD + AZprecip12Mean)  # scale back the data

par(mfrow = c(2, 1))
# Compare Predicted AZprecip 2011 to orginal data
plot(t(AZprecip2011), ypred2011, xlab = "Orignial AZ Precip", ylab = "Predicted AZ Precip", 
    main = "Actucal vs Predicted Precip 2011")
lines(t(AZprecip2011), t(AZprecip2011), col = "red")
plot(t(AZprecip2012), ypred2012, xlab = "Orignial AZ Precip", ylab = "Predicted AZ Precip", 
    main = "Actucal vs Predicted Precip 2012")
lines(t(AZprecip2012), t(AZprecip2012), col = "red")

plot of chunk unnamed-chunk-30

Question 6: Cluster the average Western U.S. winter precipitation. Compute the average winter precipitation for each climate division in the Western U.S - i.e., the dataset will be a single column of one average precipitation value per division. This involves two steps

(i) identify the number of clusters, say, K:

Select a desired number of clusters, say, j; cluster the data and computer the WSS; repeat for j=1:10; plot j versus WSS ; and select the number of clusters K, where the WSS starts to saturate.

library(mclust)
detach("package:mclust", unload = TRUE)
library(fields)
library(cluster)
library(ggplot2)

WestPrecip = Precipdata[1:111, ]  # 1900- 2011
WestPrecip = matrix(apply(WestPrecip, 2, mean), 81, 1)
x = WestPrecip

nk = 20  #number of clusters to consider
wss <- (nrow(x) - 1) * sum(apply(x, 2, var))
for (i in 2:nk) {
    wss[i] <- sum(kmeans(x, centers = i)$withinss)
}
par(mfrow = c(1, 1))
qplot(1:nk, wss, type = "b", xlab = "Number of Clusters", ylab = "Within groups sum of squares") + 
    theme_bw()

plot of chunk unnamed-chunk-31

WSS begins to staurate at about 5 clusters

(ii) cluster the data into K clusters and display them.

kbest = 5
zclust = kmeans(x, centers = kbest)
plot(long, lat, col = c(zclust$cluster), xlab = "", ylab = "")
US(add = TRUE, col = "grey", lwd = 2, xlim = range(-125, -100))

plot of chunk unnamed-chunk-32

(iii) Repeat (i)-(ii) by including latitude and longitude - the data set will now be a matrix of 3 columns - precipitation, latitude and longitude.

x = cbind(long, lat, WestPrecip)

nk = 20  #number of clusters to consider
wss <- (nrow(x) - 1) * sum(apply(x, 2, var))
for (i in 2:nk) {
    wss[i] <- sum(kmeans(x, centers = i)$withinss)
}
par(mfrow = c(1, 1))
qplot(1:nk, wss, type = "b", xlab = "Number of Clusters", ylab = "Within groups sum of squares") + 
    theme_bw()

plot of chunk unnamed-chunk-33

The optimal clusters in this case is 10

xbest = 10
zclust = kmeans(x, centers = kbest)
plot(long, lat, col = c(zclust$cluster), xlab = "", ylab = "")
US(add = TRUE, col = "grey", lwd = 2, xlim = range(-125, -100))

plot of chunk unnamed-chunk-34

Comments

Question 1

Surprisingly, the years, i.e. 1900-1935, 1936-1975, 1976-2012, don't seem to be singificantly different. The first three modes in each grouping look similar.

Question 2

The first mode of the winter SST data is much different than the first mode of the full year SST. Minor relationships can be seen with the related graphs, however the correlation plot tells a different story

Question 3

SVD on covariance of Precip and SSTwinter show a relationship.

Question 4 and 5

CCA forcasting is significantly better than PCA forcasting as the scatter plots for 2011 and 2012 predictions from CCA fall closer to the best fit line. This is expected becase CCA seeks to maximize correlation as opposed to covariance.

Question 6

The clustering with lat, long included looks much better than clustering with just the Precipitation Data.The clusters in the lat,long,Precip map are more even and grouped by geogrpahic location, which logically makes sense.